Converted the interaction and atom locality enum to enum class.
This exposed and fixed a few mixings of the two enums.
Move some files into the Nbnxm namespace.
Add local/non-local organization to the GPU timers.
Note that the use of the Nbnxm namespace outside the nbnxm module
is only temporary. This should all be replaced by methods.
Change-Id: I3c891b3b2b3e14d8175b63f4191f365a5cd64b18
fr->cginfo,
state_local->x,
ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
- fr->nbv->grp[eintLocal].kernel_type,
+ fr->nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
fr->nbv->nbat);
nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
/* We always re-initialize the tables whether they are used or not */
init_interaction_const_tables(nullptr, ic, rtab);
- nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams);
+ Nbnxm::gpu_pme_loadbal_update_param(nbv, ic, listParams);
if (!pme_lb->bSepPMERanks)
{
GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
}
- init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
- cr, hardwareInfo, deviceInfo,
- mtop, box);
+ Nbnxm::init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
+ cr, hardwareInfo, deviceInfo,
+ mtop, box);
if (useGpuForBonded)
{
auto stream = DOMAINDECOMP(cr) ?
- nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintNonlocal) :
- nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintLocal);
+ Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) :
+ Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
// TODO the heap allocation is only needed while
// t_forcerec lacks a constructor.
fr->gpuBonded = new gmx::GpuBonded(mtop->ffparams,
if (isPPrankUsingGPU)
{
/* free nbnxn data in GPU memory */
- nbnxn_gpu_free(fr->nbv->gpu_nbv);
+ Nbnxm::gpu_free(fr->nbv->gpu_nbv);
delete fr->gpuBonded;
fr->gpuBonded = nullptr;
}
if (use_GPU(nbv))
{
- nbnxn_gpu_reset_timings(nbv);
+ Nbnxm::gpu_reset_timings(nbv);
}
if (pme_gpu_task_enabled(pme))
}
}
-static void do_nb_verlet(t_forcerec *fr,
- const interaction_const_t *ic,
- gmx_enerdata_t *enerd,
- int flags, int ilocality,
- int clearF,
- int64_t step,
- t_nrnb *nrnb,
- gmx_wallcycle_t wcycle)
+static void do_nb_verlet(t_forcerec *fr,
+ const interaction_const_t *ic,
+ gmx_enerdata_t *enerd,
+ const int flags,
+ const Nbnxm::InteractionLocality ilocality,
+ const int clearF,
+ const int64_t step,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle)
{
if (!(flags & GMX_FORCE_NONBONDED))
{
{
GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
- isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
- flags, eatLocal,
- haveOtherWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fshift, completionType);
+ isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
+ flags,
+ Nbnxm::AtomLocality::Local,
+ haveOtherWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
// To get the call count right, when the task finished we
// issue a start/stop.
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
}
}
*/
int numRollingParts = nbv->listParams->numRollingParts;
GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
- int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+ int stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
bool stepIsEven = ((stepWithCurrentList & 1) == 0);
if (stepWithCurrentList > 0 &&
stepWithCurrentList < inputrec->nstlist - 1 &&
(stepIsEven || DOMAINDECOMP(cr)))
{
- nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
- stepIsEven ? eintLocal : eintNonlocal,
- numRollingParts);
+ Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+ stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
+ numRollingParts);
}
}
nullptr, 0, mdatoms->homenr, -1,
fr->cginfo, x.unpaddedArrayRef(),
0, nullptr,
- nbv->grp[eintLocal].kernel_type,
+ nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
}
wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
fr->cginfo, x.unpaddedArrayRef(),
- nbv->grp[eintNonlocal].kernel_type,
+ nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
}
if (bNS)
{
- nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
+ Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
}
- nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+ Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// higher-level object than the nb module.
fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
top->idef,
- nbnxn_gpu_get_xq(nbv->gpu_nbv),
- nbnxn_gpu_get_f(nbv->gpu_nbv),
- nbnxn_gpu_get_fshift(nbv->gpu_nbv));
+ Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+ Nbnxm::gpu_get_f(nbv->gpu_nbv),
+ Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
}
/* do local pair search */
if (bNS)
{
+ nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
+
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
&top->excls,
nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
- &nbv->grp[eintLocal].nbl_lists,
- eintLocal,
- nbv->grp[eintLocal].kernel_type,
+ &pairlistSet,
+ Nbnxm::InteractionLocality::Local,
+ nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
nrnb);
- nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+ pairlistSet.outerListCreationStep = step;
if (nbv->listParams->useDynamicPruning && !bUseGPU)
{
- nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+ nbnxnPrepareListForDynamicPruning(&pairlistSet);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
if (bUseGPU)
{
/* initialize local pair-list on the GPU */
- nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
- nbv->grp[eintLocal].nbl_lists.nblGpu[0],
- eintLocal);
+ Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+ pairlistSet.nblGpu[0],
+ Nbnxm::InteractionLocality::Local);
}
wallcycle_stop(wcycle, ewcNS);
}
else
{
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
+ FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
nbv->nbat, wcycle);
}
wallcycle_start(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// bonded work not split into separate local and non-local, so with DD
/* launch local nonbonded work on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
do non-local pair search */
if (DOMAINDECOMP(cr))
{
+ nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
+
if (bNS)
{
wallcycle_start_nocount(wcycle, ewcNS);
&top->excls,
nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
- &nbv->grp[eintNonlocal].nbl_lists,
- eintNonlocal,
- nbv->grp[eintNonlocal].kernel_type,
+ &pairlistSet,
+ Nbnxm::InteractionLocality::NonLocal,
+ nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
nrnb);
- nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+ pairlistSet.outerListCreationStep = step;
if (nbv->listParams->useDynamicPruning && !bUseGPU)
{
- nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+ nbnxnPrepareListForDynamicPruning(&pairlistSet);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
- if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
+ if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
{
/* initialize non-local pair-list on the GPU */
- nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
- nbv->grp[eintNonlocal].nbl_lists.nblGpu[0],
- eintNonlocal);
+ Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+ pairlistSet.nblGpu[0],
+ Nbnxm::InteractionLocality::NonLocal);
}
wallcycle_stop(wcycle, ewcNS);
}
{
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
- nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
+ FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
nbv->nbat, wcycle);
}
/* launch non-local nonbonded tasks on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork)
}
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (DOMAINDECOMP(cr))
{
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
- flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+ flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
}
- nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
- flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+ flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
if (!bUseOrEmulGPU)
{
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
step, nrnb, wcycle);
}
/* Calculate the local and non-local free energy interactions here.
* Happens here on the CPU both with and without GPU.
*/
- if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+ if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
{
- do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+ do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
}
if (DOMAINDECOMP(cr) &&
- fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+ fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
{
- do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+ do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
inputrec->fepvals, lambda,
enerd, flags, nrnb, wcycle);
if (!bUseOrEmulGPU)
{
- int aloc;
-
if (DOMAINDECOMP(cr))
{
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
}
- if (!bUseOrEmulGPU)
- {
- aloc = eintLocal;
- }
- else
- {
- aloc = eintNonlocal;
- }
+ const Nbnxm::InteractionLocality iloc =
+ (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
/* Add all the non-bonded force to the normal force array.
* This can be split into a local and a non-local part when overlapping
*/
wallcycle_stop(wcycle, ewcFORCE);
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
wallcycle_start_nocount(wcycle, ewcFORCE);
/* if there are multiple fshift output buffers reduce them */
if ((flags & GMX_FORCE_VIRIAL) &&
- nbv->grp[aloc].nbl_lists.nnbl > 1)
+ nbv->grp[iloc].nbl_lists.nnbl > 1)
{
/* This is not in a subcounter because it takes a
negligible and constant-sized amount of time */
if (bUseGPU)
{
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
- flags, eatNonlocal,
- ppForceWorkload->haveGpuBondedWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::NonLocal,
+ ppForceWorkload->haveGpuBondedWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
}
else
{
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
/* skip the reduction if there was no non-local work to do */
- if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
+ if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
{
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
nbv->nbat, f, wcycle);
}
}
const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
- flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
// NOTE: emulation kernel is not included in the balancing region,
// but emulation mode does not target performance anyway
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+ Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
/* Is dynamic pair-list pruning activated? */
if (nbv->listParams->useDynamicPruning)
* on the non-alternating path. */
if (bUseOrEmulGPU && !alternateGpuWait)
{
- nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
nbv->nbat, f, wcycle);
}
if (DOMAINDECOMP(cr))
if (printReport)
{
- auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+ auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
if (pme_gpu_task_enabled(pme))
{
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */
-void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs,
- int locality,
- gmx_bool FillLocal,
- rvec *x,
- nbnxn_atomdata_t *nbat,
- gmx_wallcycle *wcycle)
+void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs,
+ const Nbnxm::AtomLocality locality,
+ gmx_bool FillLocal,
+ rvec *x,
+ nbnxn_atomdata_t *nbat,
+ gmx_wallcycle *wcycle)
{
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
switch (locality)
{
- case eatAll:
+ case Nbnxm::AtomLocality::All:
+ case Nbnxm::AtomLocality::Count:
g0 = 0;
g1 = nbs->grid.size();
break;
- case eatLocal:
+ case Nbnxm::AtomLocality::Local:
g0 = 0;
g1 = 1;
break;
- case eatNonlocal:
+ case Nbnxm::AtomLocality::NonLocal:
g0 = 1;
g1 = nbs->grid.size();
break;
}
/* Add the force array(s) from nbnxn_atomdata_t to f */
-void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs,
- int locality,
- nbnxn_atomdata_t *nbat,
- rvec *f,
- gmx_wallcycle *wcycle)
+void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs,
+ const Nbnxm::AtomLocality locality,
+ nbnxn_atomdata_t *nbat,
+ rvec *f,
+ gmx_wallcycle *wcycle)
{
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
switch (locality)
{
- case eatAll:
+ case Nbnxm::AtomLocality::All:
+ case Nbnxm::AtomLocality::Count:
a0 = 0;
na = nbs->natoms_nonlocal;
break;
- case eatLocal:
+ case Nbnxm::AtomLocality::Local:
a0 = 0;
na = nbs->natoms_local;
break;
- case eatNonlocal:
+ case Nbnxm::AtomLocality::NonLocal:
a0 = nbs->natoms_local;
na = nbs->natoms_nonlocal - nbs->natoms_local;
break;
if (nbat->out.size() > 1)
{
- if (locality != eatAll)
+ if (locality != Nbnxm::AtomLocality::All)
{
gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
}
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
+#include "locality.h"
+
namespace gmx
{
class MDLogger;
* FillLocal tells if the local filler particle coordinates should be zeroed.
*/
void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs,
- int locality,
+ Nbnxm::AtomLocality locality,
gmx_bool FillLocal,
rvec *x,
nbnxn_atomdata_t *nbat,
/* Add the forces stored in nbat to f, zeros the forces in nbat */
void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs,
- int locality,
+ Nbnxm::AtomLocality locality,
nbnxn_atomdata_t *nbat,
rvec *f,
gmx_wallcycle *wcycle);
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
+namespace Nbnxm
+{
+
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
const cu_nbparam_t,
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
- const nbnxn_atomdata_t *nbatom,
- int iloc,
- bool haveOtherWork)
+void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ const AtomLocality atomLocality,
+ const bool haveOtherWork)
{
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
+ "Only local and non-local xq transfers are supported");
- cu_atomdata_t *adat = nb->atdat;
- cu_plist_t *plist = nb->plist[iloc];
- cu_timers_t *t = nb->timers;
- cudaStream_t stream = nb->stream[iloc];
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
- bool bDoTime = nb->bDoTime;
+ int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+
+ cu_atomdata_t *adat = nb->atdat;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *t = nb->timers;
+ cudaStream_t stream = nb->stream[iloc];
+
+ bool bDoTime = nb->bDoTime;
/* Don't launch the non-local H2D copy if there is no dependent
work to do: neither non-local nor other (e.g. bonded) work
we always call the local local x+q copy (and the rest of the local
work in nbnxn_gpu_launch_kernel().
*/
- if (!haveOtherWork && canSkipWork(nb, iloc))
+ if (!haveOtherWork && canSkipWork(*nb, iloc))
{
plist->haveFreshList = false;
}
/* calculate the atom data index range based on locality */
- if (LOCAL_I(iloc))
+ if (atomLocality == AtomLocality::Local)
{
adat_begin = 0;
adat_len = adat->natoms_local;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->nb_h2d[iloc].openTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
}
/* HtoD x, q */
if (bDoTime)
{
- t->nb_h2d[iloc].closeTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
}
/* When we get here all misc operations issued in the local stream as well as
*/
if (nb->bUseTwoStreams)
{
- if (iloc == eintLocal)
+ if (iloc == InteractionLocality::Local)
{
cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
the local x+q H2D (and all preceding) tasks are complete and synchronize
with this event in the non-local stream before launching the non-bonded kernel.
*/
-void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
- int flags,
- int iloc)
+void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
+ const int flags,
+ const InteractionLocality iloc)
{
/* CUDA kernel launch-related stuff */
int nblock;
clearing. All these operations, except for the local interaction kernel,
are needed for the non-local interactions. The skip of the local kernel
call is taken care of later in this function. */
- if (canSkipWork(nb, iloc))
+ if (canSkipWork(*nb, iloc))
{
plist->haveFreshList = false;
(TODO: ATM that's the way the timing accounting can distinguish between
separate prune kernel and combined force+prune, maybe we need a better way?).
*/
- nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
+ gpu_launch_kernel_pruneonly(nb, iloc, 1);
}
if (plist->nsci == 0)
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->nb_k[iloc].openTimingRegion(stream);
+ t->interaction[iloc].nb_k.openTimingRegion(stream);
}
/* get the pointer to the kernel flavor we need to use */
nb_kernel = select_nbnxn_kernel(nbp->eeltype,
nbp->vdwtype,
bCalcEner,
- (plist->haveFreshList && !nb->timers->didPrune[iloc]),
+ (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
nb->dev_info);
/* Kernel launch config:
config.sharedMemorySize);
}
- auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+ auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift);
launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs);
if (bDoTime)
{
- t->nb_k[iloc].closeTimingRegion(stream);
+ t->interaction[iloc].nb_k.closeTimingRegion(stream);
}
if (GMX_NATIVE_WINDOWS)
return shmem;
}
-void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
- int iloc,
- int numParts)
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
+ const InteractionLocality iloc,
+ const int numParts)
{
cu_atomdata_t *adat = nb->atdat;
cu_nbparam_t *nbp = nb->nbparam;
GpuRegionTimer *timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
+ timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
(combined or separate 1st pass prune, rolling prune). */
if (plist->haveFreshList)
{
- plist->haveFreshList = false;
+ plist->haveFreshList = false;
/* Mark that pruning has been done */
- nb->timers->didPrune[iloc] = true;
+ nb->timers->interaction[iloc].didPrune = true;
}
else
{
/* Mark that rolling pruning has been done */
- nb->timers->didRollingPrune[iloc] = true;
+ nb->timers->interaction[iloc].didRollingPrune = true;
}
if (bDoTime)
}
}
-void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
- nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc,
- bool haveOtherWork)
+void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
+ nbnxn_atomdata_t *nbatom,
+ const int flags,
+ const AtomLocality atomLocality,
+ const bool haveOtherWork)
{
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* determine interaction locality from atom locality */
- int iloc = gpuAtomToInteractionLocality(aloc);
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+ /* extract the data */
cu_atomdata_t *adat = nb->atdat;
cu_timers_t *t = nb->timers;
bool bDoTime = nb->bDoTime;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch non-local copy-back if there was no non-local work to do */
- if (!haveOtherWork && canSkipWork(nb, iloc))
+ if (!haveOtherWork && canSkipWork(*nb, iloc))
{
return;
}
- getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
+ getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len);
/* beginning of timed D2H section */
if (bDoTime)
{
- t->nb_d2h[iloc].openTimingRegion(stream);
+ t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
}
/* With DD the local D2H transfer can only start after the non-local
kernel has finished. */
- if (iloc == eintLocal && nb->bUseTwoStreams)
+ if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
{
stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
recorded which signals that the local D2H can proceed. This event is not
placed after the non-local kernel because we want the non-local data
back first. */
- if (iloc == eintNonlocal)
+ if (iloc == InteractionLocality::NonLocal)
{
stat = cudaEventRecord(nb->nonlocal_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
}
/* only transfer energies in the local stream */
- if (LOCAL_I(iloc))
+ if (iloc == InteractionLocality::Local)
{
/* DtoH fshift */
if (bCalcFshift)
if (bDoTime)
{
- t->nb_d2h[iloc].closeTimingRegion(stream);
+ t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
}
}
-void nbnxn_cuda_set_cacheconfig()
+void cuda_set_cacheconfig()
{
cudaError_t stat;
}
}
}
+
+} // namespace Nbnxm
* \brief
* Declares nbnxn cuda cache and texture helper functions
*/
-#ifndef GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H
-#define GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H
+#ifndef GMX_NBNXN_CUDA_NBNXN_CUDA_H
+#define GMX_NBNXN_CUDA_NBNXN_CUDA_H
+
+namespace Nbnxm
+{
//! Set up the cache configuration for the non-bonded kernels.
-void nbnxn_cuda_set_cacheconfig();
+void cuda_set_cacheconfig();
+
+} // namespace Nbnxm
#endif
#include <stdio.h>
#include <stdlib.h>
+// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+// is currently declared means this has to be before gpu_types.h
+#include "nbnxm_cuda_types.h"
+
+// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/utility/smalloc.h"
#include "nbnxm_cuda.h"
-#include "nbnxm_cuda_types.h"
+
+namespace Nbnxm
+{
/* This is a heuristically determined parameter for the Kepler
* and Maxwell architectures for the minimum size of ci lists by multiplying
/*! Re-generate the GPU Ewald force table, resets rlist, and update the
* electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
- const interaction_const_t *ic,
- const NbnxnListParameters *listParams)
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams)
{
- if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+ if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU)
{
return;
}
pl->haveFreshList = false;
}
-/*! Initializes the timer data structure. */
-static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
-{
- /* The non-local counters/stream (second in the array) are needed only with DD. */
- for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
- {
- t->didPairlistH2D[i] = false;
- t->didPrune[i] = false;
- t->didRollingPrune[i] = false;
- }
-}
-
/*! Initializes the timings data structure. */
static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
{
}
/*! Initializes simulation constant data. */
-static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb,
- const interaction_const_t *ic,
- const NbnxnListParameters *listParams,
- const nbnxn_atomdata_t::Params &nbatParams)
+static void cuda_init_const(gmx_nbnxn_cuda_t *nb,
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
+ const nbnxn_atomdata_t::Params &nbatParams)
{
init_atomdata_first(nb->atdat, nbatParams.numTypes);
init_nbparam(nb->nbparam, ic, listParams, nbatParams);
nbnxn_cuda_clear_e_fshift(nb);
}
-void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb,
- const gmx_device_info_t *deviceInfo,
- const interaction_const_t *ic,
- const NbnxnListParameters *listParams,
- const nbnxn_atomdata_t *nbat,
- int /*rank*/,
- gmx_bool bLocalAndNonlocal)
+void gpu_init(gmx_nbnxn_cuda_t **p_nb,
+ const gmx_device_info_t *deviceInfo,
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
+ const nbnxn_atomdata_t *nbat,
+ int /*rank*/,
+ gmx_bool bLocalAndNonlocal)
{
cudaError_t stat;
gmx_nbnxn_cuda_t *nb;
snew(nb, 1);
snew(nb->atdat, 1);
snew(nb->nbparam, 1);
- snew(nb->plist[eintLocal], 1);
+ snew(nb->plist[InteractionLocality::Local], 1);
if (bLocalAndNonlocal)
{
- snew(nb->plist[eintNonlocal], 1);
+ snew(nb->plist[InteractionLocality::NonLocal], 1);
}
nb->bUseTwoStreams = bLocalAndNonlocal;
pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
- init_plist(nb->plist[eintLocal]);
+ init_plist(nb->plist[InteractionLocality::Local]);
/* set device info, just point it to the right GPU among the detected ones */
nb->dev_info = deviceInfo;
/* local/non-local GPU streams */
- stat = cudaStreamCreate(&nb->stream[eintLocal]);
- CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
+ stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
+ CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
if (nb->bUseTwoStreams)
{
- init_plist(nb->plist[eintNonlocal]);
+ init_plist(nb->plist[InteractionLocality::NonLocal]);
/* Note that the device we're running on does not have to support
* priorities, because we are querying the priority range which in this
stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
- stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal],
+ stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
cudaStreamDefault,
highest_priority);
- CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed");
+ CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
}
/* init events for sychronization (timing disabled for performance reasons!) */
if (nb->bDoTime)
{
- init_timers(nb->timers, nb->bUseTwoStreams);
init_timings(nb->timings);
}
/* set the kernel type for the current GPU */
/* pick L1 cache configuration */
- nbnxn_cuda_set_cacheconfig();
+ cuda_set_cacheconfig();
- nbnxn_cuda_init_const(nb, ic, listParams, nbat->params());
+ cuda_init_const(nb, ic, listParams, nbat->params());
*p_nb = nb;
}
}
-void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
- const NbnxnPairlistGpu *h_plist,
- int iloc)
+void gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
+ const NbnxnPairlistGpu *h_plist,
+ const InteractionLocality iloc)
{
char sbuf[STRLEN];
bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
}
}
+ gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+
if (bDoTime)
{
- nb->timers->pl_h2d[iloc].openTimingRegion(stream);
- nb->timers->didPairlistH2D[iloc] = true;
+ iTimers.pl_h2d.openTimingRegion(stream);
+ iTimers.didPairlistH2D = true;
}
Context context = nullptr;
&d_plist->nsci, &d_plist->sci_nalloc, context);
copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
stream, GpuApiCallBehavior::Async,
- bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
&d_plist->ncj4, &d_plist->cj4_nalloc, context);
copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
stream, GpuApiCallBehavior::Async,
- bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
&d_plist->nimask, &d_plist->imask_nalloc, context);
&d_plist->nexcl, &d_plist->excl_nalloc, context);
copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
stream, GpuApiCallBehavior::Async,
- bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
+ iTimers.pl_h2d.closeTimingRegion(stream);
}
/* the next use of thist list we be the first one, so we need to prune */
d_plist->haveFreshList = true;
}
-void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
- const nbnxn_atomdata_t *nbatom)
+void gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom)
{
cu_atomdata_t *adat = nb->atdat;
- cudaStream_t ls = nb->stream[eintLocal];
+ cudaStream_t ls = nb->stream[InteractionLocality::Local];
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
{
cudaError_t stat;
cu_atomdata_t *adat = nb->atdat;
- cudaStream_t ls = nb->stream[eintLocal];
+ cudaStream_t ls = nb->stream[InteractionLocality::Local];
stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
{
cudaError_t stat;
cu_atomdata_t *adat = nb->atdat;
- cudaStream_t ls = nb->stream[eintLocal];
+ cudaStream_t ls = nb->stream[InteractionLocality::Local];
stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
}
-void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
+void gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
{
nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
/* clear shift force array and energies if the outputs were
}
}
-void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
- const nbnxn_atomdata_t *nbat)
+void gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbat)
{
cudaError_t stat;
int nalloc, natoms;
bool bDoTime = nb->bDoTime;
cu_timers_t *timers = nb->timers;
cu_atomdata_t *d_atdat = nb->atdat;
- cudaStream_t ls = nb->stream[eintLocal];
+ cudaStream_t ls = nb->stream[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
}
}
-void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
+void gpu_free(gmx_nbnxn_cuda_t *nb)
{
cudaError_t stat;
cu_atomdata_t *atdat;
freeDeviceBuffer(&atdat->lj_comb);
/* Free plist */
- auto *plist = nb->plist[eintLocal];
+ auto *plist = nb->plist[InteractionLocality::Local];
freeDeviceBuffer(&plist->sci);
freeDeviceBuffer(&plist->cj4);
freeDeviceBuffer(&plist->imask);
sfree(plist);
if (nb->bUseTwoStreams)
{
- auto *plist_nl = nb->plist[eintNonlocal];
+ auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
freeDeviceBuffer(&plist_nl->sci);
freeDeviceBuffer(&plist_nl->cj4);
freeDeviceBuffer(&plist_nl->imask);
}
//! This function is documented in the header file
-gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
+gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_cuda_t *nb)
{
return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}
-void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
{
}
}
-int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
+int gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
{
return nb != nullptr ?
gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
}
-gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
{
return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
(nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}
-void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t *nb,
- int iloc)
+void *gpu_get_command_stream(gmx_nbnxn_gpu_t *nb,
+ const InteractionLocality iloc)
{
assert(nb);
return static_cast<void *>(&nb->stream[iloc]);
}
-void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t *nb)
+void *gpu_get_xq(gmx_nbnxn_gpu_t *nb)
{
assert(nb);
return static_cast<void *>(nb->atdat->xq);
}
-void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t *nb)
+void *gpu_get_f(gmx_nbnxn_gpu_t *nb)
{
assert(nb);
return static_cast<void *>(nb->atdat->f);
}
-rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
+rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
{
assert(nb);
return reinterpret_cast<rvec *>(nb->atdat->fshift);
}
+
+} // namespace Nbnxm
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/enumerationhelpers.h"
/*! \brief Macro definining default for the prune kernel's j4 processing concurrency.
*
/** \internal
* \brief Pair list data.
*/
-using cu_plist_t = gpu_plist;
+using cu_plist_t = Nbnxm::gpu_plist;
/** \internal
* \brief Typedef of actual timer type.
*/
-typedef struct nbnxn_gpu_timers_t cu_timers_t;
+typedef struct Nbnxm::gpu_timers_t cu_timers_t;
/** \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
struct gmx_nbnxn_cuda_t
{
- const gmx_device_info_t *dev_info; /**< CUDA device information */
- bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
- cu_atomdata_t *atdat; /**< atom data */
- cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
- cu_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */
- nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
-
- cudaStream_t stream[2]; /**< local and non-local GPU streams */
+ const gmx_device_info_t *dev_info; /**< CUDA device information */
+ bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
+ cu_atomdata_t *atdat; /**< atom data */
+ cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, cu_plist_t *> plist; /**< pair-list data structures (local and non-local) */
+ nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
+
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream; /**< local and non-local GPU streams */
/** events used for synchronization */
cudaEvent_t nonlocal_done; /**< event triggered when the non-local non-bonded kernel
#include "gpu_common_utils.h"
#include "nbnxm_gpu.h"
+namespace Nbnxm
+{
+
/*! \brief Check that atom locality values are valid for the GPU module.
*
* In the GPU module atom locality "all" is not supported, the local and
*
* \param[in] atomLocality atom locality specifier
*/
-static inline void validateGpuAtomLocality(int atomLocality)
+static inline void
+validateGpuAtomLocality(const AtomLocality atomLocality)
{
std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
- "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal);
+ "local (%d) or nonlocal (%d)",
+ static_cast<int>(atomLocality),
+ static_cast<int>(AtomLocality::Local),
+ static_cast<int>(AtomLocality::NonLocal));
- GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str());
+ GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
}
/*! \brief Convert atom locality to interaction locality.
* \param[in] atomLocality Atom locality specifier
* \returns Interaction locality corresponding to the atom locality passed.
*/
-static inline int gpuAtomToInteractionLocality(int atomLocality)
+static inline InteractionLocality
+gpuAtomToInteractionLocality(const AtomLocality atomLocality)
{
validateGpuAtomLocality(atomLocality);
/* determine interaction locality from atom locality */
- if (LOCAL_A(atomLocality))
+ if (atomLocality == AtomLocality::Local)
{
- return eintLocal;
+ return InteractionLocality::Local;
}
- else if (NONLOCAL_A(atomLocality))
+ else if (atomLocality == AtomLocality::NonLocal)
{
- return eintNonlocal;
+ return InteractionLocality::NonLocal;
}
else
{
* \param[out] atomRangeLen Atom range length in the atom data array.
*/
template <typename AtomDataT>
-static inline void getGpuAtomRange(const AtomDataT *atomData,
- int atomLocality,
- int *atomRangeBegin,
- int *atomRangeLen)
+static inline void
+getGpuAtomRange(const AtomDataT *atomData,
+ const AtomLocality atomLocality,
+ int *atomRangeBegin,
+ int *atomRangeLen)
{
assert(atomData);
validateGpuAtomLocality(atomLocality);
/* calculate the atom data index range based on locality */
- if (LOCAL_A(atomLocality))
+ if (atomLocality == AtomLocality::Local)
{
*atomRangeBegin = 0;
*atomRangeLen = atomData->natoms_local;
template <typename GpuTimers>
static void countPruneKernelTime(GpuTimers *timers,
gmx_wallclock_gpu_nbnxn_t *timings,
- const int iloc)
+ const InteractionLocality iloc)
{
+ gpu_timers_t::Interaction &iTimers = timers->interaction[iloc];
+
// We might have not done any pruning (e.g. if we skipped with empty domains).
- if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
+ if (!iTimers.didPrune &&
+ !iTimers.didRollingPrune)
{
return;
}
- if (timers->didPrune[iloc])
+ if (iTimers.didPrune)
{
timings->pruneTime.c++;
- timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
+ timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
}
- if (timers->didRollingPrune[iloc])
+ if (iTimers.didRollingPrune)
{
timings->dynamicPruneTime.c++;
- timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
+ timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
}
}
* \param[out] fshift Pointer to the array of shift forces to accumulate into
*/
template <typename StagingData>
-static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
- int iLocality,
- bool reduceEnergies,
- bool reduceFshift,
- real *e_lj,
- real *e_el,
- rvec *fshift)
+static inline void
+gpu_reduce_staged_outputs(const StagingData &nbst,
+ const InteractionLocality iLocality,
+ const bool reduceEnergies,
+ const bool reduceFshift,
+ real *e_lj,
+ real *e_el,
+ rvec *fshift)
{
/* add up energies and shift forces (only once at local F wait) */
- if (LOCAL_I(iLocality))
+ if (iLocality == InteractionLocality::Local)
{
if (reduceEnergies)
{
*
*/
template <typename GpuTimers, typename GpuPairlist>
-static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
- GpuTimers *timers,
- const GpuPairlist *plist,
- int atomLocality,
- bool didEnergyKernels,
- bool doTiming)
+static inline void
+gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
+ GpuTimers *timers,
+ const GpuPairlist *plist,
+ AtomLocality atomLocality,
+ bool didEnergyKernels,
+ bool doTiming)
{
/* timing data accumulation */
if (!doTiming)
}
/* determine interaction locality from atom locality */
- int iLocality = gpuAtomToInteractionLocality(atomLocality);
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
/* only increase counter once (at local F wait) */
- if (LOCAL_I(iLocality))
+ if (iLocality == InteractionLocality::Local)
{
timings->nb_c++;
timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
/* kernel timings */
timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
- timers->nb_k[iLocality].getLastRangeTime();
+ timers->interaction[iLocality].nb_k.getLastRangeTime();
/* X/q H2D and F D2H timings */
- timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
- timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();
+ timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
+ timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
/* Count the pruning kernel times for both cases:1st pass (at search step)
and rolling pruning (if called at the previous step).
countPruneKernelTime(timers, timings, iLocality);
/* only count atdat and pair-list H2D at pair-search step */
- if (timers->didPairlistH2D[iLocality])
+ if (timers->interaction[iLocality].didPairlistH2D)
{
/* atdat transfer timing (add only once, at local F wait) */
- if (LOCAL_A(atomLocality))
+ if (atomLocality == AtomLocality::Local)
{
timings->pl_h2d_c++;
timings->pl_h2d_t += timers->atdat.getLastRangeTime();
}
- timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();
+ timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
/* Clear the timing flag for the next step */
- timers->didPairlistH2D[iLocality] = false;
+ timers->interaction[iLocality].didPairlistH2D = false;
}
}
//TODO: move into shared source file with gmx_compile_cpp_as_cuda
//NOLINTNEXTLINE(misc-definitions-in-headers)
-bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
- int flags,
- int aloc,
- bool haveOtherWork,
- real *e_lj,
- real *e_el,
- rvec *fshift,
- GpuTaskCompletion completionKind)
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
+ const int flags,
+ const AtomLocality aloc,
+ const bool haveOtherWork,
+ real *e_lj,
+ real *e_el,
+ rvec *fshift,
+ GpuTaskCompletion completionKind)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
/* determine interaction locality from atom locality */
- int iLocality = gpuAtomToInteractionLocality(aloc);
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
// We skip when during the non-local phase there was actually no work to do.
// This is consistent with nbnxn_gpu_launch_kernel.
- if (haveOtherWork || !canSkipWork(nb, iLocality))
+ if (haveOtherWork || !canSkipWork(*nb, iLocality))
{
// Query the state of the GPU stream and return early if we're not done
if (completionKind == GpuTaskCompletion::Check)
bool calcEner = (flags & GMX_FORCE_ENERGY) != 0;
bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
- nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
- nb->bDoTime != 0);
+ gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
+ nb->bDoTime != 0);
- nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
+ gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
}
/* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
- nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false;
+ nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false;
/* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
nb->plist[iLocality]->haveFreshList = false;
* \param[out] fshift Pointer to the shift force buffer to accumulate into
*/
//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
-void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
- int flags,
- int aloc,
- bool haveOtherWork,
- real *e_lj,
- real *e_el,
- rvec *fshift)
+void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
+ int flags,
+ AtomLocality aloc,
+ bool haveOtherWork,
+ real *e_lj,
+ real *e_el,
+ rvec *fshift)
{
- nbnxn_gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
- GpuTaskCompletion::Wait);
+ gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
+ GpuTaskCompletion::Wait);
}
+} // namespace Nbnxm
+
#endif
#include "opencl/nbnxm_ocl_types.h"
#endif
+namespace Nbnxm
+{
+
/*! \brief An early return condition for empty NB GPU workloads
*
* This is currently used for non-local kernels/transfers only.
* local part of the force array also depends on the non-local kernel.
* The skip of the local kernel is taken care of separately.
*/
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t *nb, int iloc)
+static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb,
+ InteractionLocality iloc)
{
- assert(nb && nb->plist[iloc]);
- return (iloc == eintNonlocal) && (nb->plist[iloc]->nsci == 0);
+ assert(nb.plist[iloc]);
+ return (iloc == InteractionLocality::NonLocal &&
+ nb.plist[iloc]->nsci == 0);
}
+} // namespace Nbnxm
+
#endif
#include "gromacs/mdtypes/interaction_const.h"
#include "gpu_types.h"
+#include "locality.h"
struct nonbonded_verlet_group_t;
struct NbnxnPairlistGpu;
struct gmx_gpu_info_t;
struct gmx_device_info_t;
+namespace Nbnxm
+{
+
/** Initializes the data structures related to GPU nonbonded calculations. */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb,
- const gmx_device_info_t gmx_unused *deviceInfo,
- const interaction_const_t gmx_unused *ic,
- const NbnxnListParameters gmx_unused *listParams,
- const nbnxn_atomdata_t gmx_unused *nbat,
- int gmx_unused rank,
- /* true if both local and non-local are done on GPU */
- gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM
+void gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb,
+ const gmx_device_info_t gmx_unused *deviceInfo,
+ const interaction_const_t gmx_unused *ic,
+ const NbnxnListParameters gmx_unused *listParams,
+ const nbnxn_atomdata_t gmx_unused *nbat,
+ int gmx_unused rank,
+ /* true if both local and non-local are done on GPU */
+ gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM
/** Initializes pair-list data for GPU, called at every pair search step. */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb,
- const struct NbnxnPairlistGpu gmx_unused *h_nblist,
- int gmx_unused iloc) GPU_FUNC_TERM
+void gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct NbnxnPairlistGpu gmx_unused *h_nblist,
+ InteractionLocality gmx_unused iloc) GPU_FUNC_TERM
/** Initializes atom-data on the GPU, called at every pair search step. */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb,
- const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM
+void gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM
/*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the
* electrostatic type switching to twin cut-off (or back) if needed.
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
- const interaction_const_t gmx_unused *ic,
- const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM
+void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
+ const interaction_const_t gmx_unused *ic,
+ const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM
/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb,
- const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM
+void gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM
/** Clears GPU outputs: nonbonded force, shift force and energy. */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags) GPU_FUNC_TERM
+void gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags) GPU_FUNC_TERM
/** Frees all GPU resources used for the nonbonded calculations. */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM
+void gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM
/** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
GPU_FUNC_QUALIFIER
-struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr)
+struct gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr)
/** Resets nonbonded GPU timings. */
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM
+void gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM
/** Calculates the minimum size of proximity lists to improve SM load balance
* with GPU non-bonded kernels. */
-GPU_FUNC_QUALIFIER
-int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1)
+ GPU_FUNC_QUALIFIER
+int gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1)
/** Returns if analytical Ewald GPU kernels are used. */
GPU_FUNC_QUALIFIER
-gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE)
/** Returns an opaque pointer to the GPU command stream
* Note: CUDA only.
*/
CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused *nb,
+ InteractionLocality gmx_unused iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
/** Returns an opaque pointer to the GPU coordinate+charge array
* Note: CUDA only.
*/
CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
/** Returns an opaque pointer to the GPU force array
* Note: CUDA only.
*/
CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
/** Returns an opaque pointer to the GPU shift force array
* Note: CUDA only.
*/
CUDA_FUNC_QUALIFIER
-rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+ rvec *gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+
+} // namespace Nbnxm
#endif
#include "config.h"
#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+#include "locality.h"
#if GMX_GPU == GMX_GPU_OPENCL
#include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
#include "gromacs/gpu_utils/gpuregiontimer.cuh"
#endif
+namespace Nbnxm
+{
+
/*! \internal
* \brief GPU region timers used for timing GPU kernels and H2D/D2H transfers.
*
* The two-sized arrays hold the local and non-local values and should always
* be indexed with eintLocal/eintNonlocal.
*/
-struct nbnxn_gpu_timers_t
+struct gpu_timers_t
{
- GpuRegionTimer atdat; /**< timer for atom data transfer (every PS step) */
- GpuRegionTimer nb_h2d[2]; /**< timer for x/q H2D transfers (l/nl, every step) */
- GpuRegionTimer nb_d2h[2]; /**< timer for f D2H transfer (l/nl, every step) */
- GpuRegionTimer pl_h2d[2]; /**< timer for pair-list H2D transfers (l/nl, every PS step) */
- bool didPairlistH2D[2]; /**< true when a pair-list transfer has been done at this step */
- GpuRegionTimer nb_k[2]; /**< timer for non-bonded kernels (l/nl, every step) */
- GpuRegionTimer prune_k[2]; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
- bool didPrune[2]; /**< true when we timed pruning and the timings need to be accounted for */
- GpuRegionTimer rollingPrune_k[2]; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
- bool didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+ /*! \internal
+ * \brief Timers for local or non-local coordinate/force transfers
+ */
+ struct XFTransfers
+ {
+ GpuRegionTimer nb_h2d; /**< timer for x/q H2D transfers (l/nl, every step) */
+ GpuRegionTimer nb_d2h; /**< timer for f D2H transfer (l/nl, every step) */
+ };
+
+ /*! \internal
+ * \brief Timers for local or non-local interaction related operations
+ */
+ struct Interaction
+ {
+ GpuRegionTimer pl_h2d; /**< timer for pair-list H2D transfers (l/nl, every PS step) */
+ bool didPairlistH2D = false; /**< true when a pair-list transfer has been done at this step */
+ GpuRegionTimer nb_k; /**< timer for non-bonded kernels (l/nl, every step) */
+ GpuRegionTimer prune_k; /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
+ bool didPrune = false; /**< true when we timed pruning and the timings need to be accounted for */
+ GpuRegionTimer rollingPrune_k; /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
+ bool didRollingPrune = false; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+ };
+
+ //! timer for atom data transfer (every PS step)
+ GpuRegionTimer atdat;
+ //! timers for coordinate/force transfers (every step)
+ gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
+ //! timers for interaction related transfers
+ gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction;
};
struct gpu_plist
int rollingPruningPart; /**< the next part to which the roling pruning needs to be applied */
};
+} // namespace Nbnxm
+
#endif
}
}
-static void accountFlops(t_nrnb *nrnb,
- const nonbonded_verlet_t &nbv,
- const int ilocality,
- const interaction_const_t &ic,
- int forceFlags)
+static void accountFlops(t_nrnb *nrnb,
+ const nonbonded_verlet_t &nbv,
+ const Nbnxm::InteractionLocality iLocality,
+ const interaction_const_t &ic,
+ const int forceFlags)
{
- const nonbonded_verlet_group_t &nbvg = nbv.grp[ilocality];
+ const nonbonded_verlet_group_t &nbvg = nbv.grp[iLocality];
const bool usingGpuKernels = (nbvg.kernel_type == nbnxnk8x8x8_GPU);
int enr_nbnxn_kernel_ljc;
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
}
else if ((!usingGpuKernels && nbvg.ewald_excl == ewaldexclAnalytical) ||
- (usingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
+ (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
}
}
void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
- const int ilocality,
+ Nbnxm::InteractionLocality iLocality,
const interaction_const_t &ic,
int forceFlags,
int clearF,
gmx_enerdata_t *enerd,
t_nrnb *nrnb)
{
- const nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality];
+ const nonbonded_verlet_group_t &nbvg = nbv->grp[iLocality];
switch (nbvg.kernel_type)
{
break;
case nbnxnk8x8x8_GPU:
- nbnxn_gpu_launch_kernel(nbv->gpu_nbv, forceFlags, ilocality);
+ Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality);
break;
case nbnxnk8x8x8_PlainC:
}
- accountFlops(nrnb, *nbv, ilocality, ic, forceFlags);
+ accountFlops(nrnb, *nbv, iLocality, ic, forceFlags);
}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines nbnxn locality enums
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nbnxm
+ */
+
+#ifndef GMX_NBNXM_LOCALITY_H
+#define GMX_NBNXM_LOCALITY_H
+
+namespace Nbnxm
+{
+
+/*! \brief Atom locality indicator: local, non-local, all.
+ *
+ * Used for calls to:
+ * gridding, force calculation, x/f buffer operations
+ */
+enum class AtomLocality : int
+{
+ Local = 0, //!< Local atoms
+ NonLocal = 1, //!< Non-local atoms
+ All = 2, //!< Both local and non-local atoms
+ Count = 3 //!< The number of atom locality types
+};
+
+/*! \brief Interaction locality indicator: local, non-local, all.
+ *
+ * Used for calls to:
+ * pair-search, force calculation, x/f buffer operations
+ */
+enum class InteractionLocality : int
+{
+ Local = 0, //!< Interactions between local atoms only
+ NonLocal = 1, //!< Interactions between non-local and (non-)local atoms
+ Count = 2 //!< The number of interaction locality types
+};
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_LOCALITY_H
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/nbnxm/pairlistset.h"
#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/real.h"
+#include "locality.h"
+
// TODO: Remove this include and the two nbnxm includes above
#include "nbnxm_gpu.h"
nbnxnkNR
} nbnxn_kernel_type;
+namespace Nbnxm
+{
+
/*! \brief Return a string identifying the kernel type.
*
* \param [in] kernel_type nonbonded kernel types, takes values from the nbnxn_kernel_type enum
* \returns a string identifying the kernel corresponding to the type passed as argument
*/
-const char *lookup_nbnxn_kernel_name(int kernel_type);
+const char *lookup_kernel_name(int kernel_type);
+
+} // namespace Nbnxm
/*! \brief Ewald exclusion types */
enum {
ewaldexclTable, ewaldexclAnalytical
};
-/*! \brief Atom locality indicator: local, non-local, all.
- *
- * Used for calls to:
- * gridding, pair-search, force calculation, x/f buffer operations
- * */
-enum {
- eatLocal = 0, eatNonlocal = 1, eatAll
-};
-
-/*! \brief Tests for local atom range */
-#define LOCAL_A(x) ((x) == eatLocal)
-/*! \brief Tests for non-local atom range */
-#define NONLOCAL_A(x) ((x) == eatNonlocal)
-/*! \brief Tests for either local or non-local atom range */
-#define LOCAL_OR_NONLOCAL_A(x) (LOCAL_A(x) || NONLOCAL_A(x))
-
-/*! \brief Interaction locality indicator
- *
- * Used in pair-list search/calculations in the following manner:
- * - local interactions require local atom data and affect local output only;
- * - non-local interactions require both local and non-local atom data and
- * affect both local- and non-local output.
- */
-enum {
- eintLocal = 0, eintNonlocal = 1
-};
-
-/*! \brief Tests for local interaction indicator */
-#define LOCAL_I(x) ((x) == eintLocal)
-/*! \brief Tests for non-local interaction indicator */
-#define NONLOCAL_I(x) ((x) == eintNonlocal)
-
/*! \brief Flag to tell the nonbonded kernels whether to clear the force output buffers */
enum {
enbvClearFNo, enbvClearFYes
/*! \libinternal
* \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
-typedef struct nonbonded_verlet_t {
- std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
- std::unique_ptr<nbnxn_search> nbs; /**< n vs n atom pair searching data */
- int ngrp; /**< number of interaction groups */
- nonbonded_verlet_group_t grp[2]; /**< local and non-local interaction group */
- nbnxn_atomdata_t *nbat; /**< atom data */
-
- gmx_bool bUseGPU; /**< TRUE when non-bonded interactions are computed on a physical GPU */
- EmulateGpuNonbonded emulateGpu; /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
- gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */
- int min_ci_balanced; /**< pair list balancing parameter
- used for the 8x8x8 GPU kernels */
-} nonbonded_verlet_t;
+struct nonbonded_verlet_t
+{
+ std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
+ std::unique_ptr<nbnxn_search> nbs; /**< n vs n atom pair searching data */
+ int ngrp; /**< number of interaction groups */
+ //! Local and non-local interaction group
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, nonbonded_verlet_group_t> grp;
+ //! Atom data
+ nbnxn_atomdata_t *nbat;
+
+ gmx_bool bUseGPU; /**< TRUE when non-bonded interactions are computed on a physical GPU */
+ EmulateGpuNonbonded emulateGpu; /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+ gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */
+ int min_ci_balanced; /**< pair list balancing parameter
+ used for the 8x8x8 GPU kernels */
+};
+
+namespace Nbnxm
+{
/*! \brief Initializes the nbnxn module */
void init_nb_verlet(const gmx::MDLogger &mdlog,
const gmx_mtop_t *mtop,
matrix box);
+} // namespace Nbnxm
+
/*! \brief Put the atoms on the pair search grid.
*
* Only atoms atomStart to atomEnd in x are put on the grid.
* pairs beyond the pairlist inner radius and writes the result to a list that is
* to be consumed by the non-bonded kernel.
*/
-void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv,
- int ilocality,
- const rvec *shift_vec);
+void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv,
+ Nbnxm::InteractionLocality iLocality,
+ const rvec *shift_vec);
/*! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU */
-void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
- int ilocality,
- const interaction_const_t &ic,
- int forceFlags,
- int clearF,
- t_forcerec *fr,
- gmx_enerdata_t *enerd,
- t_nrnb *nrnb);
+void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
+ Nbnxm::InteractionLocality iLocality,
+ const interaction_const_t &ic,
+ int forceFlags,
+ int clearF,
+ t_forcerec *fr,
+ gmx_enerdata_t *enerd,
+ t_nrnb *nrnb);
#endif // GMX_NBNXN_NBNXN_H
#include "gromacs/utility/real.h"
#include "gpu_types.h"
+#include "locality.h"
struct nbnxn_atomdata_t;
enum class GpuTaskCompletion;
+namespace Nbnxm
+{
+
/*! \brief
* Launch asynchronously the xq buffer host to device copy.
*
*
* \param [in] nb GPU nonbonded data.
* \param [in] nbdata Host-side atom data structure.
- * \param [in] iloc Interaction locality flag.
+ * \param [in] aloc Atom locality flag.
* \param [in] haveOtherWork True if there are other tasks that require the nbnxn coordinate input.
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb,
- const struct nbnxn_atomdata_t gmx_unused *nbdata,
- int gmx_unused iloc,
- bool gmx_unused haveOtherWork) GPU_FUNC_TERM
+void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbdata,
+ AtomLocality gmx_unused aloc,
+ bool gmx_unused haveOtherWork) GPU_FUNC_TERM
/*! \brief
* Launch asynchronously the nonbonded force calculations.
*
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- int gmx_unused iloc) GPU_FUNC_TERM
+void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ InteractionLocality gmx_unused iloc) GPU_FUNC_TERM
/*! \brief
* Launch asynchronously the nonbonded prune-only kernel.
* \param [in] numParts Number of parts the pair list is split into in the rolling kernel.
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused iloc,
- int gmx_unused numParts) GPU_FUNC_TERM
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
+ InteractionLocality gmx_unused iloc,
+ int gmx_unused numParts) GPU_FUNC_TERM
/*! \brief
* Launch asynchronously the download of nonbonded forces from the GPU
* no non-bonded work.
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
- struct nbnxn_atomdata_t gmx_unused *nbatom,
- int gmx_unused flags,
- int gmx_unused aloc,
- bool gmx_unused haveOtherWork) GPU_FUNC_TERM
+void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
+ nbnxn_atomdata_t gmx_unused *nbatom,
+ int gmx_unused flags,
+ AtomLocality gmx_unused aloc,
+ bool gmx_unused haveOtherWork) GPU_FUNC_TERM
/*! \brief Attempts to complete nonbonded GPU task.
*
* \returns True if the nonbonded tasks associated with \p aloc locality have completed
*/
GPU_FUNC_QUALIFIER
-bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- int gmx_unused aloc,
- bool gmx_unused haveOtherWork,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
- rvec gmx_unused *fshift,
- GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false)
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ AtomLocality gmx_unused aloc,
+ bool gmx_unused haveOtherWork,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ rvec gmx_unused *fshift,
+ GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false)
/*! \brief Completes the nonbonded GPU task blocking until GPU tasks and data
* transfers to finish.
* \param[out] fshift Pointer to the shift force buffer to accumulate into
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- int gmx_unused aloc,
- bool gmx_unused haveOtherWork,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
- rvec gmx_unused *fshift) GPU_FUNC_TERM
+void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ AtomLocality gmx_unused aloc,
+ bool gmx_unused haveOtherWork,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ rvec gmx_unused *fshift) GPU_FUNC_TERM
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
GPU_FUNC_QUALIFIER
-int nbnxn_gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
+int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+} // namespace Nbnxm
#endif
#include "grid.h"
#include "internal.h"
+namespace Nbnxm
+{
+
/*! \brief Returns whether CPU SIMD support exists for the given inputrec
*
* If the return value is FALSE and fplog/cr != NULL, prints a fallback
/* One 256-bit FMA per cycle makes 2xNN faster */
*kernel_type = nbnxnk4xN_SIMD_2xNN;
}
-#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
+#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
if (getenv("GMX_NBNXN_SIMD_4XN") != nullptr)
}
}
-#endif // GMX_SIMD
+#endif // GMX_SIMD
}
-const char *lookup_nbnxn_kernel_name(int kernel_type)
+const char *lookup_kernel_name(int kernel_type)
{
const char *returnvalue = nullptr;
switch (kernel_type)
{
GMX_LOG(mdlog.info).asParagraph().appendTextFormatted(
"Using %s %dx%d nonbonded short-range kernels",
- lookup_nbnxn_kernel_name(*kernel_type),
+ lookup_kernel_name(*kernel_type),
nbnxn_kernel_to_cluster_i_size(*kernel_type),
nbnxn_kernel_to_cluster_j_size(*kernel_type));
GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
"WARNING: Using the slow %s kernels. This should\n"
"not happen during routine usage on supported platforms.",
- lookup_nbnxn_kernel_name(*kernel_type));
+ lookup_kernel_name(*kernel_type));
}
}
}
{
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
- nbnxn_gpu_init(&nbv->gpu_nbv,
- deviceInfo,
- fr->ic,
- nbv->listParams.get(),
- nbv->nbat,
- cr->nodeid,
- (nbv->ngrp > 1));
+ gpu_init(&nbv->gpu_nbv,
+ deviceInfo,
+ fr->ic,
+ nbv->listParams.get(),
+ nbv->nbat,
+ cr->nodeid,
+ (nbv->ngrp > 1));
if ((env = getenv("GMX_NB_MIN_CI")) != nullptr)
{
}
else
{
- nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+ nbv->min_ci_balanced = gpu_min_ci_balanced(nbv->gpu_nbv);
if (debug)
{
fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
*nb_verlet = nbv;
}
+
+} // namespace Nbnxm
#include "nbnxm_ocl_internal.h"
#include "nbnxm_ocl_types.h"
+namespace Nbnxm
+{
+
/*! \brief Convenience constants */
//@{
static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
- const nbnxn_atomdata_t *nbatom,
- int iloc,
- bool haveOtherWork)
+void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ const AtomLocality atomLocality,
+ const bool haveOtherWork)
{
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+
+ /* local/nonlocal offset and length used for xq and f */
+ int adat_begin, adat_len;
cl_atomdata_t *adat = nb->atdat;
cl_plist_t *plist = nb->plist[iloc];
cl_timers_t *t = nb->timers;
cl_command_queue stream = nb->stream[iloc];
- bool bDoTime = (nb->bDoTime) != 0;
+ bool bDoTime = (nb->bDoTime) != 0;
/* Don't launch the non-local H2D copy if there is no dependent
work to do: neither non-local nor other (e.g. bonded) work
we always call the local local x+q copy (and the rest of the local
work in nbnxn_gpu_launch_kernel().
*/
- if (!haveOtherWork && canSkipWork(nb, iloc))
+ if (!haveOtherWork && canSkipWork(*nb, iloc))
{
plist->haveFreshList = false;
}
/* calculate the atom data index range based on locality */
- if (LOCAL_I(iloc))
+ if (atomLocality == AtomLocality::Local)
{
adat_begin = 0;
adat_len = adat->natoms_local;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->nb_h2d[iloc].openTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
}
/* HtoD x, q */
ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4,
- adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr);
+ adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- t->nb_h2d[iloc].closeTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
}
/* When we get here all misc operations issues in the local stream as well as
so we record that in the local stream and wait for it in the nonlocal one. */
if (nb->bUseTwoStreams)
{
- if (iloc == eintLocal)
+ if (iloc == InteractionLocality::Local)
{
cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
assert(CL_SUCCESS == cl_error);
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
*/
-void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
- int flags,
- int iloc)
+void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
+ const int flags,
+ const Nbnxm::InteractionLocality iloc)
{
/* OpenCL kernel launch-related stuff */
cl_kernel nb_kernel = nullptr; /* fn pointer to the nonbonded kernel */
clearing. All these operations, except for the local interaction kernel,
are needed for the non-local interactions. The skip of the local kernel
call is taken care of later in this function. */
- if (canSkipWork(nb, iloc))
+ if (canSkipWork(*nb, iloc))
{
plist->haveFreshList = false;
(that's the way the timing accounting can distinguish between
separate prune kernel and combined force+prune).
*/
- nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1);
+ Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
}
if (plist->nsci == 0)
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->nb_k[iloc].openTimingRegion(stream);
+ t->interaction[iloc].nb_k.openTimingRegion(stream);
}
/* get the pointer to the kernel flavor we need to use */
nbp->eeltype,
nbp->vdwtype,
bCalcEner,
- (plist->haveFreshList && !nb->timers->didPrune[iloc]));
+ (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
/* kernel launch config */
fillin_ocl_structures(nbp, &nbparams_params);
- auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+ auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
constexpr char kernelName[] = "k_calc_nb";
if (useLjCombRule(nb->nbparam->vdwtype))
{
if (bDoTime)
{
- t->nb_k[iloc].closeTimingRegion(stream);
+ t->interaction[iloc].nb_k.closeTimingRegion(stream);
}
}
return shmem;
}
-void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
- int iloc,
- int numParts)
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
+ const InteractionLocality iloc,
+ const int numParts)
{
cl_atomdata_t *adat = nb->atdat;
cl_nbparam_t *nbp = nb->nbparam;
GpuRegionTimer *timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]);
+ timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
{
plist->haveFreshList = false;
/* Mark that pruning has been done */
- nb->timers->didPrune[iloc] = true;
+ nb->timers->interaction[iloc].didPrune = true;
}
else
{
/* Mark that rolling pruning has been done */
- nb->timers->didRollingPrune[iloc] = true;
+ nb->timers->interaction[iloc].didRollingPrune = true;
}
if (bDoTime)
* Launch asynchronously the download of nonbonded forces from the GPU
* (and energies/shift forces if required).
*/
-void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
- struct nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc,
- bool haveOtherWork)
+void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
+ struct nbnxn_atomdata_t *nbatom,
+ const int flags,
+ const AtomLocality aloc,
+ const bool haveOtherWork)
{
cl_int gmx_unused cl_error;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* determine interaction locality from atom locality */
- int iloc = gpuAtomToInteractionLocality(aloc);
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
- cl_atomdata_t *adat = nb->atdat;
- cl_timers_t *t = nb->timers;
- bool bDoTime = nb->bDoTime == CL_TRUE;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t *adat = nb->atdat;
+ cl_timers_t *t = nb->timers;
+ bool bDoTime = nb->bDoTime == CL_TRUE;
+ cl_command_queue stream = nb->stream[iloc];
- bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0;
- int bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0;
+ int bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch non-local copy-back if there was no non-local work to do */
- if (!haveOtherWork && canSkipWork(nb, iloc))
+ if (!haveOtherWork && canSkipWork(*nb, iloc))
{
/* TODO An alternative way to signal that non-local work is
complete is to use a clEnqueueMarker+clEnqueueBarrier
/* beginning of timed D2H section */
if (bDoTime)
{
- t->nb_d2h[iloc].openTimingRegion(stream);
+ t->xf[aloc].nb_d2h.openTimingRegion(stream);
}
/* With DD the local D2H transfer can only start after the non-local
has been launched. */
- if (iloc == eintLocal && nb->bNonLocalStreamActive)
+ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
{
sync_ocl_event(stream, &(nb->nonlocal_done));
}
/* DtoH f */
ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
- (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+ (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
/* kick off work */
cl_error = clFlush(stream);
recorded which signals that the local D2H can proceed. This event is not
placed after the non-local kernel because we first need the non-local
data back first. */
- if (iloc == eintNonlocal)
+ if (iloc == InteractionLocality::NonLocal)
{
cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
assert(CL_SUCCESS == cl_error);
}
/* only transfer energies in the local stream */
- if (LOCAL_I(iloc))
+ if (iloc == InteractionLocality::Local)
{
/* DtoH fshift */
if (bCalcFshift)
{
ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
- SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+ SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
/* DtoH energies */
if (bCalcEner)
{
ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
- sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+ sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
- sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr);
+ sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
}
if (bDoTime)
{
- t->nb_d2h[iloc].closeTimingRegion(stream);
+ t->xf[aloc].nb_d2h.closeTimingRegion(stream);
}
}
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
-int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
+int gpu_pick_ewald_kernel_type(const bool bTwinCut)
{
bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
int kernel_type;
return kernel_type;
}
+
+} // namespace Nbnxm
#include <cmath>
+// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+// is currently declared means this has to be before gpu_types.h
+#include "nbnxm_ocl_types.h"
+
+// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/smalloc.h"
#include "nbnxm_ocl_internal.h"
-#include "nbnxm_ocl_types.h"
+
+namespace Nbnxm
+{
/*! \brief This parameter should be determined heuristically from the
* kernel execution times
else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
{
/* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
- *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
+ *gpu_eeltype = gpu_pick_ewald_kernel_type(false);
}
else
{
}
//! This function is documented in the header file
-void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
- const interaction_const_t *ic,
- const NbnxnListParameters *listParams)
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams)
{
- if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+ if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU)
{
return;
}
set_cutoff_parameters(nbp, ic, listParams);
- nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
+ nbp->eeltype = gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
}
pl->haveFreshList = false;
}
-/*! \brief Initializes the timer data structure.
- */
-static void init_timers(cl_timers_t *t,
- bool bUseTwoStreams)
-{
- for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
- {
- t->didPairlistH2D[i] = false;
- t->didPrune[i] = false;
- t->didRollingPrune[i] = false;
- }
-}
-
/*! \brief Initializes the timings data structure.
*/
static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
cl_int cl_error;
cl_atomdata_t * adat = nb->atdat;
- cl_command_queue ls = nb->stream[eintLocal];
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
size_t local_work_size[3] = {1, 1, 1};
size_t global_work_size[3] = {1, 1, 1};
//! This function is documented in the header file
-void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
- const gmx_device_info_t *deviceInfo,
- const interaction_const_t *ic,
- const NbnxnListParameters *listParams,
- const nbnxn_atomdata_t *nbat,
- int rank,
- gmx_bool bLocalAndNonlocal)
+void gpu_init(gmx_nbnxn_ocl_t **p_nb,
+ const gmx_device_info_t *deviceInfo,
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
+ const nbnxn_atomdata_t *nbat,
+ const int rank,
+ const gmx_bool bLocalAndNonlocal)
{
gmx_nbnxn_ocl_t *nb;
cl_int cl_error;
snew(nb, 1);
snew(nb->atdat, 1);
snew(nb->nbparam, 1);
- snew(nb->plist[eintLocal], 1);
+ snew(nb->plist[InteractionLocality::Local], 1);
if (bLocalAndNonlocal)
{
- snew(nb->plist[eintNonlocal], 1);
+ snew(nb->plist[InteractionLocality::NonLocal], 1);
}
nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
- init_plist(nb->plist[eintLocal]);
+ init_plist(nb->plist[InteractionLocality::Local]);
/* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
/* local/non-local GPU streams */
- nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+ nb->stream[InteractionLocality::Local] =
+ clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
if (nb->bUseTwoStreams)
{
- init_plist(nb->plist[eintNonlocal]);
+ init_plist(nb->plist[InteractionLocality::NonLocal]);
- nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+ nb->stream[InteractionLocality::NonLocal] =
+ clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
if (nb->bDoTime)
{
- init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
init_timings(nb->timings);
}
cl_int gmx_used_in_debug cl_error;
cl_atomdata_t *atomData = nb->atdat;
- cl_command_queue ls = nb->stream[eintLocal];
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
cl_float value = 0.0f;
cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
//! This function is documented in the header file
void
-nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t *nb,
- int flags)
+gpu_clear_outputs(gmx_nbnxn_ocl_t *nb,
+ const int flags)
{
nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
/* clear shift force array and energies if the outputs were
/* kick off buffer clearing kernel to ensure concurrency with constraints/update */
cl_int gmx_unused cl_error;
- cl_error = clFlush(nb->stream[eintLocal]);
+ cl_error = clFlush(nb->stream[InteractionLocality::Local]);
assert(CL_SUCCESS == cl_error);
}
//! This function is documented in the header file
-void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb,
- const NbnxnPairlistGpu *h_plist,
- int iloc)
+void gpu_init_pairlist(gmx_nbnxn_ocl_t *nb,
+ const NbnxnPairlistGpu *h_plist,
+ const InteractionLocality iloc)
{
char sbuf[STRLEN];
// Timing accumulation should happen only if there was work to do
}
}
+ gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+
if (bDoTime)
{
- nb->timers->pl_h2d[iloc].openTimingRegion(stream);
- nb->timers->didPairlistH2D[iloc] = true;
+ iTimers.pl_h2d.openTimingRegion(stream);
+ iTimers.didPairlistH2D = true;
}
// TODO most of this function is same in CUDA and OpenCL, move into the header
&d_plist->nsci, &d_plist->sci_nalloc, context);
copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
stream, GpuApiCallBehavior::Async,
- bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
&d_plist->ncj4, &d_plist->cj4_nalloc, context);
copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
stream, GpuApiCallBehavior::Async,
- bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
&d_plist->nimask, &d_plist->imask_nalloc, context);
&d_plist->nexcl, &d_plist->excl_nalloc, context);
copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
stream, GpuApiCallBehavior::Async,
- bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- nb->timers->pl_h2d[iloc].closeTimingRegion(stream);
+ iTimers.pl_h2d.closeTimingRegion(stream);
}
/* need to prune the pair list during the next step */
}
//! This function is documented in the header file
-void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb,
- const nbnxn_atomdata_t *nbatom)
+void gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb,
+ const nbnxn_atomdata_t *nbatom)
{
cl_atomdata_t *adat = nb->atdat;
- cl_command_queue ls = nb->stream[eintLocal];
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
}
//! This function is documented in the header file
-void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb,
- const nbnxn_atomdata_t *nbat)
+void gpu_init_atomdata(gmx_nbnxn_ocl_t *nb,
+ const nbnxn_atomdata_t *nbat)
{
cl_int cl_error;
int nalloc, natoms;
bool bDoTime = nb->bDoTime == CL_TRUE;
cl_timers_t *timers = nb->timers;
cl_atomdata_t *d_atdat = nb->atdat;
- cl_command_queue ls = nb->stream[eintLocal];
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
}
//! This function is documented in the header file
-void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
+void gpu_free(gmx_nbnxn_ocl_t *nb)
{
if (nb == nullptr)
{
sfree(nb->nbparam);
/* Free plist */
- auto *plist = nb->plist[eintLocal];
+ auto *plist = nb->plist[InteractionLocality::Local];
freeDeviceBuffer(&plist->sci);
freeDeviceBuffer(&plist->cj4);
freeDeviceBuffer(&plist->imask);
sfree(plist);
if (nb->bUseTwoStreams)
{
- auto *plist_nl = nb->plist[eintNonlocal];
+ auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
freeDeviceBuffer(&plist_nl->sci);
freeDeviceBuffer(&plist_nl->cj4);
freeDeviceBuffer(&plist_nl->imask);
nb->nbst.fshift = nullptr;
/* Free command queues */
- clReleaseCommandQueue(nb->stream[eintLocal]);
- nb->stream[eintLocal] = nullptr;
+ clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
+ nb->stream[InteractionLocality::Local] = nullptr;
if (nb->bUseTwoStreams)
{
- clReleaseCommandQueue(nb->stream[eintNonlocal]);
- nb->stream[eintNonlocal] = nullptr;
+ clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
+ nb->stream[InteractionLocality::NonLocal] = nullptr;
}
/* Free other events */
if (nb->nonlocal_done)
}
//! This function is documented in the header file
-gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb)
{
return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}
//! This function is documented in the header file
-void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
{
}
//! This function is documented in the header file
-int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
{
return nb != nullptr ?
gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
}
//! This function is documented in the header file
-gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
{
return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
(nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
}
+
+} // namespace Nbnxm
#ifndef NBNXN_OCL_INTERNAL_H
#define NBNXN_OCL_INTERNAL_H
+namespace Nbnxm
+{
+
/*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
*
* \param[in] vdwType The VdW interaction/implementation type as defined by evdwOcl in nbnxn_ocl_types.h.
*/
bool useLjCombRule(int vdwType);
+} // namespace Nbnxm
+
#endif /* NBNXN_OCL_INTERNAL_H */
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/real.h"
/*! \internal
* \brief Pair list data.
*/
-using cl_plist_t = gpu_plist;
+using cl_plist_t = Nbnxm::gpu_plist;
/** \internal
* \brief Typedef of actual timer type.
*/
-typedef struct nbnxn_gpu_timers_t cl_timers_t;
+typedef struct Nbnxm::gpu_timers_t cl_timers_t;
/*! \internal
* \brief Main data structure for OpenCL nonbonded force calculations.
cl_kernel kernel_zero_e_fshift;
///@}
- cl_bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
- cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */
+ cl_bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
+ cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */
- cl_atomdata_t *atdat; /**< atom data */
- cl_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
- cl_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */
- cl_nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
+ cl_atomdata_t *atdat; /**< atom data */
+ cl_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_plist_t *> plist; /**< pair-list data structures (local and non-local) */
+ cl_nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
- cl_command_queue stream[2]; /**< local and non-local GPU queues */
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream; /**< local and non-local GPU queues */
/** events used for synchronization */
cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
+// Convience alias for partial Nbnxn namespace usage
+using InteractionLocality = Nbnxm::InteractionLocality;
/* We shift the i-particles backward for PBC.
* This leads to more conditionals than shifting forward.
}
/* Estimates the average size of a full j-list for super/sub setup */
-static void get_nsubpair_target(const nbnxn_search *nbs,
- int iloc,
- real rlist,
- int min_ci_balanced,
- int *nsubpair_target,
- float *nsubpair_tot_est)
+static void get_nsubpair_target(const nbnxn_search *nbs,
+ const InteractionLocality iloc,
+ const real rlist,
+ const int min_ci_balanced,
+ int *nsubpair_target,
+ float *nsubpair_tot_est)
{
/* The target value of 36 seems to be the optimum for Kepler.
* Maxwell is less sensitive to the exact value.
nonlocal_vol2(nbs->zones, ls, r_eff_sup);
}
- if (LOCAL_I(iloc))
+ if (iloc == InteractionLocality::Local)
{
/* Sub-cell interacts with itself */
vol_est = ls[XX]*ls[YY]*ls[ZZ];
}
/* Make a local or non-local pair-list, depending on iloc */
-void nbnxn_make_pairlist(nbnxn_search *nbs,
- nbnxn_atomdata_t *nbat,
- const t_blocka *excl,
- real rlist,
- int min_ci_balanced,
- nbnxn_pairlist_set_t *nbl_list,
- int iloc,
- int nb_kernel_type,
- t_nrnb *nrnb)
+void nbnxn_make_pairlist(nbnxn_search *nbs,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ const real rlist,
+ const int min_ci_balanced,
+ nbnxn_pairlist_set_t *nbl_list,
+ const InteractionLocality iloc,
+ const int nb_kernel_type,
+ t_nrnb *nrnb)
{
int nsubpair_target;
float nsubpair_tot_est;
nbat->bUseBufferFlags = (nbat->out.size() > 1);
/* We should re-init the flags before making the first list */
- if (nbat->bUseBufferFlags && LOCAL_I(iloc))
+ if (nbat->bUseBufferFlags && iloc == InteractionLocality::Local)
{
init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
}
int nzi;
- if (LOCAL_I(iloc))
+ if (iloc == InteractionLocality::Local)
{
/* Only zone (grid) 0 vs 0 */
nzi = 1;
int zj0;
int zj1;
- if (LOCAL_I(iloc))
+ if (iloc == InteractionLocality::Local)
{
zj0 = 0;
zj1 = 1;
/* With GPU: generate progressively smaller lists for
* load balancing for local only or non-local with 2 zones.
*/
- progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
+ progBal = (iloc == InteractionLocality::Local || nbs->zones->n <= 2);
#pragma omp parallel for num_threads(nnbl) schedule(static)
for (int th = 0; th < nnbl; th++)
}
/* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
- if (LOCAL_I(iloc))
+ if (iloc == InteractionLocality::Local)
{
nbs->search_count++;
}
if (nbs->print_cycles &&
- (!nbs->DomDec || !LOCAL_I(iloc)) &&
+ (!nbs->DomDec || iloc == InteractionLocality::NonLocal) &&
nbs->search_count % 100 == 0)
{
nbs_cycle_print(stderr, nbs);
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
+#include "locality.h"
+
struct gmx_domdec_zones_t;
struct gmx_groups_t;
struct nbnxn_atomdata_t;
* for the number of equally sized lists is below min_ci_balanced.
* With perturbed particles, also a group scheme style nbl_fep list is made.
*/
-void nbnxn_make_pairlist(nbnxn_search *nbs,
- nbnxn_atomdata_t *nbat,
- const t_blocka *excl,
- real rlist,
- int min_ci_balanced,
- nbnxn_pairlist_set_t *nbl_list,
- int iloc,
- int nb_kernel_type,
- t_nrnb *nrnb);
+void nbnxn_make_pairlist(nbnxn_search *nbs,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ real rlist,
+ int min_ci_balanced,
+ nbnxn_pairlist_set_t *nbl_list,
+ Nbnxm::InteractionLocality iloc,
+ int nb_kernel_type,
+ t_nrnb *nrnb);
/*! \brief Prepare the list-set produced by the search for dynamic pruning
*
#include "kernels_simd_4xm/kernel_prune.h"
-void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv,
- int ilocality,
- const rvec *shift_vec)
+void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv,
+ const Nbnxm::InteractionLocality ilocality,
+ const rvec *shift_vec)
{
nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality];
nbnxn_pairlist_set_t *nbl_lists = &nbvg.nbl_lists;