}
}
-static void do_nb_verlet(t_forcerec *fr,
- const interaction_const_t *ic,
- gmx_enerdata_t *enerd,
- int flags, int ilocality,
- int clearF,
- int64_t step,
- t_nrnb *nrnb,
- gmx_wallcycle_t wcycle)
+static void do_nb_verlet(t_forcerec *fr,
+ const interaction_const_t *ic,
+ gmx_enerdata_t *enerd,
+ const int flags,
+ const Nbnxm::InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle)
{
if (!(flags & GMX_FORCE_NONBONDED))
{
{
GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
- isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
- flags, eatLocal,
- haveOtherWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fshift, completionType);
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
+ flags,
+ Nbnxm::AtomLocality::Local,
+ haveOtherWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
// To get the call count right, when the task finished we
// issue a start/stop.
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
}
}
*/
int numRollingParts = nbv->listParams->numRollingParts;
GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
- int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+ int stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
bool stepIsEven = ((stepWithCurrentList & 1) == 0);
if (stepWithCurrentList > 0 &&
stepWithCurrentList < inputrec->nstlist - 1 &&
(stepIsEven || DOMAINDECOMP(cr)))
{
- nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
- stepIsEven ? eintLocal : eintNonlocal,
- numRollingParts);
+ Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+ stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
+ numRollingParts);
}
}
nullptr, 0, mdatoms->homenr, -1,
fr->cginfo, x.unpaddedArrayRef(),
0, nullptr,
- nbv->grp[eintLocal].kernel_type,
+ nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
}
wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
fr->cginfo, x.unpaddedArrayRef(),
- nbv->grp[eintNonlocal].kernel_type,
+ nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
}
if (bNS)
{
- nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
+ Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
}
- nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+ Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// higher-level object than the nb module.
fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
top->idef,
- nbnxn_gpu_get_xq(nbv->gpu_nbv),
- nbnxn_gpu_get_f(nbv->gpu_nbv),
- nbnxn_gpu_get_fshift(nbv->gpu_nbv));
+ Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+ Nbnxm::gpu_get_f(nbv->gpu_nbv),
+ Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
}
/* do local pair search */
if (bNS)
{
+ nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
+
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
&top->excls,
nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
- &nbv->grp[eintLocal].nbl_lists,
- eintLocal,
- nbv->grp[eintLocal].kernel_type,
+ &pairlistSet,
+ Nbnxm::InteractionLocality::Local,
+ nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
nrnb);
- nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+ pairlistSet.outerListCreationStep = step;
if (nbv->listParams->useDynamicPruning && !bUseGPU)
{
- nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+ nbnxnPrepareListForDynamicPruning(&pairlistSet);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
if (bUseGPU)
{
/* initialize local pair-list on the GPU */
- nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
- nbv->grp[eintLocal].nbl_lists.nblGpu[0],
- eintLocal);
+ Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+ pairlistSet.nblGpu[0],
+ Nbnxm::InteractionLocality::Local);
}
wallcycle_stop(wcycle, ewcNS);
}
else
{
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
+ FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
nbv->nbat, wcycle);
}
wallcycle_start(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// bonded work not split into separate local and non-local, so with DD
/* launch local nonbonded work on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
do non-local pair search */
if (DOMAINDECOMP(cr))
{
+ nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
+
if (bNS)
{
wallcycle_start_nocount(wcycle, ewcNS);
&top->excls,
nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
- &nbv->grp[eintNonlocal].nbl_lists,
- eintNonlocal,
- nbv->grp[eintNonlocal].kernel_type,
+ &pairlistSet,
+ Nbnxm::InteractionLocality::NonLocal,
+ nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
nrnb);
- nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+ pairlistSet.outerListCreationStep = step;
if (nbv->listParams->useDynamicPruning && !bUseGPU)
{
- nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+ nbnxnPrepareListForDynamicPruning(&pairlistSet);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
- if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
+ if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
{
/* initialize non-local pair-list on the GPU */
- nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
- nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
- eintNonlocal);
+ Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+ pairlistSet.nblGpu[0],
+ Nbnxm::InteractionLocality::NonLocal);
}
wallcycle_stop(wcycle, ewcNS);
}
{
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
+ FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
nbv->nbat, wcycle);
}
/* launch non-local nonbonded tasks on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork)
}
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (DOMAINDECOMP(cr))
{
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
- flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+ flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
}
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
- flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+ flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
if (!bUseOrEmulGPU)
{
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
step, nrnb, wcycle);
}
/* Calculate the local and non-local free energy interactions here.
* Happens here on the CPU both with and without GPU.
*/
- if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+ if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
{
- do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+ do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
}
if (DOMAINDECOMP(cr) &&
- fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+ fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
{
- do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+ do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
if (!bUseOrEmulGPU)
{
- int aloc;
-
if (DOMAINDECOMP(cr))
{
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
}
- if (!bUseOrEmulGPU)
- {
- aloc = eintLocal;
- }
- else
- {
- aloc = eintNonlocal;
- }
+ const Nbnxm::InteractionLocality iloc =
+ (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
/* Add all the non-bonded force to the normal force array.
* This can be split into a local and a non-local part when overlapping
*/
wallcycle_stop(wcycle, ewcFORCE);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
wallcycle_start_nocount(wcycle, ewcFORCE);
/* if there are multiple fshift output buffers reduce them */
if ((flags & GMX_FORCE_VIRIAL) &&
- nbv->grp[aloc].nbl_lists.nnbl > 1)
+ nbv->grp[iloc].nbl_lists.nnbl > 1)
{
/* This is not in a subcounter because it takes a
negligible and constant-sized amount of time */
if (bUseGPU)
{
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
- flags, eatNonlocal,
- ppForceWorkload->haveGpuBondedWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::NonLocal,
+ ppForceWorkload->haveGpuBondedWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
}
else
{
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
/* skip the reduction if there was no non-local work to do */
- if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
+ if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
{
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
nbv->nbat, f, wcycle);
}
}
const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
- flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
// NOTE: emulation kernel is not included in the balancing region,
// but emulation mode does not target performance anyway
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+ Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
/* Is dynamic pair-list pruning activated? */
if (nbv->listParams->useDynamicPruning)
* on the non-alternating path. */
if (bUseOrEmulGPU && !alternateGpuWait)
{
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
nbv->nbat, f, wcycle);
}
if (DOMAINDECOMP(cr))
if (printReport)
{
- auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+ auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
if (pme_gpu_task_enabled(pme))
{