From: Berk Hess
Date: Tue, 15 Jan 2019 08:35:04 +0000 (+0100)
Subject: Use enum class for nbnxm locality
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=3377895771dee21f7ef2056708bd5af9cc02db26;p=alexxy%2Fgromacs.git

Use enum class for nbnxm locality

Converted the interaction and atom locality enums to enum class. This
exposed and fixed a few mixings of the two enums. Moved some files into
the Nbnxm namespace. Added local/non-local organization to the GPU
timers.

Note that the use of the Nbnxm namespace outside the nbnxm module is
only temporary. This should all be replaced by methods.

Change-Id: I3c891b3b2b3e14d8175b63f4191f365a5cd64b18
---
diff --git a/src/gromacs/domdec/partition.cpp b/src/gromacs/domdec/partition.cpp index be726af141..83ffe74855 100644 --- a/src/gromacs/domdec/partition.cpp +++ b/src/gromacs/domdec/partition.cpp @@ -3422,7 +3422,7 @@ void dd_partition_system(FILE *fplog, fr->cginfo, state_local->x, ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr, - fr->nbv->grp[eintLocal].kernel_type, + fr->nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type, fr->nbv->nbat); nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]); diff --git a/src/gromacs/ewald/pme_load_balancing.cpp b/src/gromacs/ewald/pme_load_balancing.cpp index 579fc3c7f9..778c7c2c48 100644 --- a/src/gromacs/ewald/pme_load_balancing.cpp +++ b/src/gromacs/ewald/pme_load_balancing.cpp @@ -825,7 +825,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, /* We always re-initialize the tables whether they are used or not */ init_interaction_const_tables(nullptr, ic, rtab); - nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams); + Nbnxm::gpu_pme_loadbal_update_param(nbv, ic, listParams); if (!pme_lb->bSepPMERanks) { diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 26a6a50a5c..1dda3a2c67 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -2635,15 +2635,15 @@ void init_forcerec(FILE *fp, GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical"); } - init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, - cr, hardwareInfo, deviceInfo, - mtop, box); + Nbnxm::init_nb_verlet(mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, + cr, hardwareInfo, deviceInfo, + mtop, box); if (useGpuForBonded) { auto stream = DOMAINDECOMP(cr) ? - nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintNonlocal) : - nbnxn_gpu_get_command_stream(fr->nbv->gpu_nbv, eintLocal); + Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) : + Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local); // TODO the heap allocation is only needed while // t_forcerec lacks a constructor.
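// [Editor's note] The Nbnxm::InteractionLocality and Nbnxm::AtomLocality values
// used from here on come from the new nbnxm/locality.h header, which this
// excerpt does not show. A minimal sketch of what it presumably declares,
// inferred from the call sites in this patch and from the Count sentinel that
// gmx::EnumerationArray requires (the comments are assumptions; the enumerator
// names are taken from the patch):
namespace Nbnxm
{

//! Locality of the interactions computed by a pair list or kernel
enum class InteractionLocality : int
{
    Local,    //!< interactions within the local domain only
    NonLocal, //!< interactions between the local and non-local domains
    Count     //!< sentinel required by gmx::EnumerationArray
};

//! Locality of atoms, used to select x/q and f transfer ranges
enum class AtomLocality : int
{
    Local,    //!< atoms owned by the local domain
    NonLocal, //!< halo atoms received from other domains
    All,      //!< all atoms, local and non-local
    Count     //!< sentinel required by gmx::EnumerationArray
};

} // namespace Nbnxm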
fr->gpuBonded = new gmx::GpuBonded(mtop->ffparams, @@ -2685,7 +2685,7 @@ void free_gpu_resources(t_forcerec *fr, if (isPPrankUsingGPU) { /* free nbnxn data in GPU memory */ - nbnxn_gpu_free(fr->nbv->gpu_nbv); + Nbnxm::gpu_free(fr->nbv->gpu_nbv); delete fr->gpuBonded; fr->gpuBonded = nullptr; } diff --git a/src/gromacs/mdlib/resethandler.cpp b/src/gromacs/mdlib/resethandler.cpp index 5269dfde48..5697378346 100644 --- a/src/gromacs/mdlib/resethandler.cpp +++ b/src/gromacs/mdlib/resethandler.cpp @@ -175,7 +175,7 @@ bool ResetHandler::resetCountersImpl( if (use_GPU(nbv)) { - nbnxn_gpu_reset_timings(nbv); + Nbnxm::gpu_reset_timings(nbv); } if (pme_gpu_task_enabled(pme)) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 2fd4038c55..6dedbb795b 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -387,14 +387,15 @@ static void post_process_forces(const t_commrec *cr, } } -static void do_nb_verlet(t_forcerec *fr, - const interaction_const_t *ic, - gmx_enerdata_t *enerd, - int flags, int ilocality, - int clearF, - int64_t step, - t_nrnb *nrnb, - gmx_wallcycle_t wcycle) +static void do_nb_verlet(t_forcerec *fr, + const interaction_const_t *ic, + gmx_enerdata_t *enerd, + const int flags, + const Nbnxm::InteractionLocality ilocality, + const int clearF, + const int64_t step, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle) { if (!(flags & GMX_FORCE_NONBONDED)) { @@ -859,11 +860,12 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv { GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check; wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L); - isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv, - flags, eatLocal, - haveOtherWork, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fshift, completionType); + isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv, + flags, + Nbnxm::AtomLocality::Local, + haveOtherWork, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fshift, completionType); wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); // To get the call count right, when the task finished we // issue a start/stop. @@ -874,7 +876,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal, + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local, nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle); } } @@ -905,15 +907,15 @@ static inline void launchGpuRollingPruning(const t_commrec *cr, */ int numRollingParts = nbv->listParams->numRollingParts; GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency"); - int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep; + int stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep; bool stepIsEven = ((stepWithCurrentList & 1) == 0); if (stepWithCurrentList > 0 && stepWithCurrentList < inputrec->nstlist - 1 && (stepIsEven || DOMAINDECOMP(cr))) { - nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv, - stepIsEven ? eintLocal : eintNonlocal, - numRollingParts); + Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv, + stepIsEven ? 
Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal, + numRollingParts); } } @@ -1084,7 +1086,7 @@ static void do_force_cutsVERLET(FILE *fplog, nullptr, 0, mdatoms->homenr, -1, fr->cginfo, x.unpaddedArrayRef(), 0, nullptr, - nbv->grp[eintLocal].kernel_type, + nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL); } @@ -1093,7 +1095,7 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL); nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef(), - nbv->grp[eintNonlocal].kernel_type, + nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL); } @@ -1111,10 +1113,10 @@ static void do_force_cutsVERLET(FILE *fplog, if (bNS) { - nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat); + Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat); } - nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat); + Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); @@ -1130,9 +1132,9 @@ static void do_force_cutsVERLET(FILE *fplog, // higher-level object than the nb module. fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()), top->idef, - nbnxn_gpu_get_xq(nbv->gpu_nbv), - nbnxn_gpu_get_f(nbv->gpu_nbv), - nbnxn_gpu_get_fshift(nbv->gpu_nbv)); + Nbnxm::gpu_get_xq(nbv->gpu_nbv), + Nbnxm::gpu_get_f(nbv->gpu_nbv), + Nbnxm::gpu_get_fshift(nbv->gpu_nbv)); ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions(); } @@ -1142,35 +1144,38 @@ static void do_force_cutsVERLET(FILE *fplog, /* do local pair search */ if (bNS) { + nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists; + wallcycle_start_nocount(wcycle, ewcNS); wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL); nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat, &top->excls, nbv->listParams->rlistOuter, nbv->min_ci_balanced, - &nbv->grp[eintLocal].nbl_lists, - eintLocal, - nbv->grp[eintLocal].kernel_type, + &pairlistSet, + Nbnxm::InteractionLocality::Local, + nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type, nrnb); - nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step; + pairlistSet.outerListCreationStep = step; if (nbv->listParams->useDynamicPruning && !bUseGPU) { - nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists); + nbnxnPrepareListForDynamicPruning(&pairlistSet); } wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL); if (bUseGPU) { /* initialize local pair-list on the GPU */ - nbnxn_gpu_init_pairlist(nbv->gpu_nbv, - nbv->grp[eintLocal].nbl_lists.nblGpu[0], - eintLocal); + Nbnxm::gpu_init_pairlist(nbv->gpu_nbv, + pairlistSet.nblGpu[0], + Nbnxm::InteractionLocality::Local); } wallcycle_stop(wcycle, ewcNS); } else { - nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()), + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local, + FALSE, as_rvec_array(x.unpaddedArrayRef().data()), nbv->nbat, wcycle); } @@ -1184,7 +1189,7 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_start(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork); wallcycle_sub_stop(wcycle, 
ewcsLAUNCH_GPU_NONBONDED); // bonded work not split into separate local and non-local, so with DD @@ -1198,7 +1203,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* launch local nonbonded work on GPU */ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); wallcycle_stop(wcycle, ewcLAUNCH_GPU); @@ -1217,6 +1222,8 @@ static void do_force_cutsVERLET(FILE *fplog, do non-local pair search */ if (DOMAINDECOMP(cr)) { + nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists; + if (bNS) { wallcycle_start_nocount(wcycle, ewcNS); @@ -1226,23 +1233,23 @@ static void do_force_cutsVERLET(FILE *fplog, &top->excls, nbv->listParams->rlistOuter, nbv->min_ci_balanced, - &nbv->grp[eintNonlocal].nbl_lists, - eintNonlocal, - nbv->grp[eintNonlocal].kernel_type, + &pairlistSet, + Nbnxm::InteractionLocality::NonLocal, + nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type, nrnb); - nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step; + pairlistSet.outerListCreationStep = step; if (nbv->listParams->useDynamicPruning && !bUseGPU) { - nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists); + nbnxnPrepareListForDynamicPruning(&pairlistSet); } wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL); - if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU) + if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU) { /* initialize non-local pair-list on the GPU */ - nbnxn_gpu_init_pairlist(nbv->gpu_nbv, - nbv->grp[eintNonlocal].nbl_lists.nblGpu[0], - eintNonlocal); + Nbnxm::gpu_init_pairlist(nbv->gpu_nbv, + pairlistSet.nblGpu[0], + Nbnxm::InteractionLocality::NonLocal); } wallcycle_stop(wcycle, ewcNS); } @@ -1250,7 +1257,8 @@ static void do_force_cutsVERLET(FILE *fplog, { dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); - nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()), + nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal, + FALSE, as_rvec_array(x.unpaddedArrayRef().data()), nbv->nbat, wcycle); } @@ -1260,7 +1268,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* launch non-local nonbonded tasks on GPU */ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (ppForceWorkload->haveGpuBondedWork) @@ -1271,7 +1279,7 @@ static void do_force_cutsVERLET(FILE *fplog, } wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); @@ -1286,11 +1294,11 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (DOMAINDECOMP(cr)) { - nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, - flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, + flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork); } - 
nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, - flags, eatLocal, ppForceWorkload->haveGpuBondedWork); + Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, + flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY)) @@ -1397,7 +1405,7 @@ static void do_force_cutsVERLET(FILE *fplog, if (!bUseOrEmulGPU) { - do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle); } @@ -1406,18 +1414,18 @@ static void do_force_cutsVERLET(FILE *fplog, /* Calculate the local and non-local free energy interactions here. * Happens here on the CPU both with and without GPU. */ - if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0) + if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0) { - do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists, + do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists, fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms, inputrec->fepvals, lambda, enerd, flags, nrnb, wcycle); } if (DOMAINDECOMP(cr) && - fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0) + fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0) { - do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists, + do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists, fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms, inputrec->fepvals, lambda, enerd, flags, nrnb, wcycle); @@ -1426,22 +1434,14 @@ static void do_force_cutsVERLET(FILE *fplog, if (!bUseOrEmulGPU) { - int aloc; - if (DOMAINDECOMP(cr)) { - do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); } - if (!bUseOrEmulGPU) - { - aloc = eintLocal; - } - else - { - aloc = eintNonlocal; - } + const Nbnxm::InteractionLocality iloc = + (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal); /* Add all the non-bonded force to the normal force array. 
* This can be split into a local and a non-local part when overlapping @@ -1449,13 +1449,13 @@ static void do_force_cutsVERLET(FILE *fplog, */ wallcycle_stop(wcycle, ewcFORCE); - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle); + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle); wallcycle_start_nocount(wcycle, ewcFORCE); /* if there are multiple fshift output buffers reduce them */ if ((flags & GMX_FORCE_VIRIAL) && - nbv->grp[aloc].nbl_lists.nnbl > 1) + nbv->grp[iloc].nbl_lists.nnbl > 1) { /* This is not in a subcounter because it takes a negligible and constant-sized amount of time */ @@ -1493,25 +1493,25 @@ static void do_force_cutsVERLET(FILE *fplog, if (bUseGPU) { wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL); - nbnxn_gpu_wait_finish_task(nbv->gpu_nbv, - flags, eatNonlocal, - ppForceWorkload->haveGpuBondedWork, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fr->fshift); + Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, + flags, Nbnxm::AtomLocality::NonLocal, + ppForceWorkload->haveGpuBondedWork, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL); } else { wallcycle_start_nocount(wcycle, ewcFORCE); - do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle); wallcycle_stop(wcycle, ewcFORCE); } /* skip the reduction if there was no non-local work to do */ - if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty()) + if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty()) { - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal, + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal, nbv->nbat, f, wcycle); } } @@ -1558,10 +1558,10 @@ static void do_force_cutsVERLET(FILE *fplog, const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); - nbnxn_gpu_wait_finish_task(nbv->gpu_nbv, - flags, eatLocal, ppForceWorkload->haveGpuBondedWork, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fr->fshift); + Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, + flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes) @@ -1586,7 +1586,7 @@ static void do_force_cutsVERLET(FILE *fplog, // NOTE: emulation kernel is not included in the balancing region, // but emulation mode does not target performance anyway wallcycle_start_nocount(wcycle, ewcFORCE); - do_nb_verlet(fr, ic, enerd, flags, eintLocal, + do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes, step, nrnb, wcycle); wallcycle_stop(wcycle, ewcFORCE); @@ -1602,7 +1602,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* now clear the GPU outputs while we finish the step on the CPU */ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); - nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags); + Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags); /* Is dynamic pair-list pruning activated? */ if (nbv->listParams->useDynamicPruning) @@ -1633,7 +1633,7 @@ static void do_force_cutsVERLET(FILE *fplog, * on the non-alternating path. 
*/ if (bUseOrEmulGPU && !alternateGpuWait) { - nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal, + nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local, nbv->nbat, f, wcycle); } if (DOMAINDECOMP(cr)) @@ -2744,7 +2744,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr, if (printReport) { - auto nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr; + auto nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr; gmx_wallclock_gpu_pme_t pme_gpu_timings = {}; if (pme_gpu_task_enabled(pme)) { diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index db4ad05dc6..62ba0c4014 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1004,12 +1004,12 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox, } /* Copies (and reorders) the coordinates to nbnxn_atomdata_t */ -void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, - int locality, - gmx_bool FillLocal, - rvec *x, - nbnxn_atomdata_t *nbat, - gmx_wallcycle *wcycle) +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, + const Nbnxm::AtomLocality locality, + gmx_bool FillLocal, + rvec *x, + nbnxn_atomdata_t *nbat, + gmx_wallcycle *wcycle) { wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); @@ -1019,15 +1019,16 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, switch (locality) { - case eatAll: + case Nbnxm::AtomLocality::All: + case Nbnxm::AtomLocality::Count: g0 = 0; g1 = nbs->grid.size(); break; - case eatLocal: + case Nbnxm::AtomLocality::Local: g0 = 0; g1 = 1; break; - case eatNonlocal: + case Nbnxm::AtomLocality::NonLocal: g0 = 1; g1 = nbs->grid.size(); break; @@ -1467,11 +1468,11 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat, } /* Add the force array(s) from nbnxn_atomdata_t to f */ -void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, - int locality, - nbnxn_atomdata_t *nbat, - rvec *f, - gmx_wallcycle *wcycle) +void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, + const Nbnxm::AtomLocality locality, + nbnxn_atomdata_t *nbat, + rvec *f, + gmx_wallcycle *wcycle) { wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); @@ -1482,15 +1483,16 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, switch (locality) { - case eatAll: + case Nbnxm::AtomLocality::All: + case Nbnxm::AtomLocality::Count: a0 = 0; na = nbs->natoms_nonlocal; break; - case eatLocal: + case Nbnxm::AtomLocality::Local: a0 = 0; na = nbs->natoms_local; break; - case eatNonlocal: + case Nbnxm::AtomLocality::NonLocal: a0 = nbs->natoms_local; na = nbs->natoms_nonlocal - nbs->natoms_local; break; @@ -1500,7 +1502,7 @@ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, if (nbat->out.size() > 1) { - if (locality != eatAll) + if (locality != Nbnxm::AtomLocality::All) { gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll"); } diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index ebfeee5bfb..bd210ea014 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -42,6 +42,8 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "locality.h" + namespace gmx { class MDLogger; @@ -94,7 +96,7 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool dynamic_box, * FillLocal tells if the local filler particle coordinates should be zeroed. 
*/ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, - int locality, + Nbnxm::AtomLocality locality, gmx_bool FillLocal, rvec *x, nbnxn_atomdata_t *nbat, @@ -102,7 +104,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search *nbs, /* Add the forces stored in nbat to f, zeros the forces in nbat */ void nbnxn_atomdata_add_nbat_f_to_f(nbnxn_search *nbs, - int locality, + Nbnxm::AtomLocality locality, nbnxn_atomdata_t *nbat, rvec *f, gmx_wallcycle *wcycle); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 15eb634f28..7ce9b9fa5e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -107,6 +107,9 @@ #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */ +namespace Nbnxm +{ + /*! Nonbonded kernel function pointer type */ typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const cu_nbparam_t, @@ -268,19 +271,24 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g } /*! \brief Launch asynchronously the xq buffer host to device copy. */ -void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbatom, - int iloc, - bool haveOtherWork) +void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, + const nbnxn_atomdata_t *nbatom, + const AtomLocality atomLocality, + const bool haveOtherWork) { - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, + "Only local and non-local xq transfers are supported"); - cu_atomdata_t *adat = nb->atdat; - cu_plist_t *plist = nb->plist[iloc]; - cu_timers_t *t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - bool bDoTime = nb->bDoTime; + int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + + cu_atomdata_t *adat = nb->atdat; + cu_plist_t *plist = nb->plist[iloc]; + cu_timers_t *t = nb->timers; + cudaStream_t stream = nb->stream[iloc]; + + bool bDoTime = nb->bDoTime; /* Don't launch the non-local H2D copy if there is no dependent work to do: neither non-local nor other (e.g. bonded) work @@ -291,7 +299,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, we always call the local local x+q copy (and the rest of the local work in nbnxn_gpu_launch_kernel(). 
*/ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -299,7 +307,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, } /* calculate the atom data index range based on locality */ - if (LOCAL_I(iloc)) + if (atomLocality == AtomLocality::Local) { adat_begin = 0; adat_len = adat->natoms_local; @@ -313,7 +321,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, /* beginning of timed HtoD section */ if (bDoTime) { - t->nb_h2d[iloc].openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(stream); } /* HtoD x, q */ @@ -323,7 +331,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - t->nb_h2d[iloc].closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); } /* When we get here all misc operations issued in the local stream as well as @@ -334,7 +342,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, */ if (nb->bUseTwoStreams) { - if (iloc == eintLocal) + if (iloc == InteractionLocality::Local) { cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream); CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed"); @@ -364,9 +372,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, the local x+q H2D (and all preceding) tasks are complete and synchronize with this event in the non-local stream before launching the non-bonded kernel. */ -void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, - int flags, - int iloc) +void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, + const int flags, + const InteractionLocality iloc) { /* CUDA kernel launch-related stuff */ int nblock; @@ -392,7 +400,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, clearing. All these operations, except for the local interaction kernel, are needed for the non-local interactions. The skip of the local kernel call is taken care of later in this function. */ - if (canSkipWork(nb, iloc)) + if (canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -405,7 +413,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, (TODO: ATM that's the way the timing accounting can distinguish between separate prune kernel and combined force+prune, maybe we need a better way?). */ - nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1); + gpu_launch_kernel_pruneonly(nb, iloc, 1); } if (plist->nsci == 0) @@ -417,14 +425,14 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->nb_k[iloc].openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(stream); } /* get the pointer to the kernel flavor we need to use */ nb_kernel = select_nbnxn_kernel(nbp->eeltype, nbp->vdwtype, bCalcEner, - (plist->haveFreshList && !nb->timers->didPrune[iloc]), + (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->dev_info); /* Kernel launch config: @@ -458,13 +466,13 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, config.sharedMemorySize); } - auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr; + auto *timingEvent = bDoTime ? 
t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift); launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs); if (bDoTime) { - t->nb_k[iloc].closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(stream); } if (GMX_NATIVE_WINDOWS) @@ -487,9 +495,9 @@ static inline int calc_shmem_required_prune(const int num_threads_z) return shmem; } -void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, - int iloc, - int numParts) +void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, + const InteractionLocality iloc, + const int numParts) { cu_atomdata_t *adat = nb->atdat; cu_nbparam_t *nbp = nb->nbparam; @@ -544,7 +552,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, GpuRegionTimer *timer = nullptr; if (bDoTime) { - timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]); + timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k); } /* beginning of timed prune calculation section */ @@ -589,14 +597,14 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, (combined or separate 1st pass prune, rolling prune). */ if (plist->haveFreshList) { - plist->haveFreshList = false; + plist->haveFreshList = false; /* Mark that pruning has been done */ - nb->timers->didPrune[iloc] = true; + nb->timers->interaction[iloc].didPrune = true; } else { /* Mark that rolling pruning has been done */ - nb->timers->didRollingPrune[iloc] = true; + nb->timers->interaction[iloc].didRollingPrune = true; } if (bDoTime) @@ -611,18 +619,19 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, } } -void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, - nbnxn_atomdata_t *nbatom, - int flags, - int aloc, - bool haveOtherWork) +void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, + nbnxn_atomdata_t *nbatom, + const int flags, + const AtomLocality atomLocality, + const bool haveOtherWork) { cudaError_t stat; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* determine interaction locality from atom locality */ - int iloc = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + /* extract the data */ cu_atomdata_t *adat = nb->atdat; cu_timers_t *t = nb->timers; bool bDoTime = nb->bDoTime; @@ -632,22 +641,22 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, bool bCalcFshift = flags & GMX_FORCE_VIRIAL; /* don't launch non-local copy-back if there was no non-local work to do */ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { return; } - getGpuAtomRange(adat, aloc, &adat_begin, &adat_len); + getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len); /* beginning of timed D2H section */ if (bDoTime) { - t->nb_d2h[iloc].openTimingRegion(stream); + t->xf[atomLocality].nb_d2h.openTimingRegion(stream); } /* With DD the local D2H transfer can only start after the non-local kernel has finished. */ - if (iloc == eintLocal && nb->bUseTwoStreams) + if (iloc == InteractionLocality::Local && nb->bUseTwoStreams) { stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0); CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed"); @@ -661,14 +670,14 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, recorded which signals that the local D2H can proceed. This event is not placed after the non-local kernel because we want the non-local data back first. 
*/ - if (iloc == eintNonlocal) + if (iloc == InteractionLocality::NonLocal) { stat = cudaEventRecord(nb->nonlocal_done, stream); CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed"); } /* only transfer energies in the local stream */ - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* DtoH fshift */ if (bCalcFshift) @@ -689,11 +698,11 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, if (bDoTime) { - t->nb_d2h[iloc].closeTimingRegion(stream); + t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); } } -void nbnxn_cuda_set_cacheconfig() +void cuda_set_cacheconfig() { cudaError_t stat; @@ -710,3 +719,5 @@ void nbnxn_cuda_set_cacheconfig() } } } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda.h index f732da87d7..42296f62b2 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.h @@ -36,10 +36,15 @@ * \brief * Declares nbnxn cuda cache and texture helper functions */ -#ifndef GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H -#define GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H +#ifndef GMX_NBNXN_CUDA_NBNXN_CUDA_H +#define GMX_NBNXN_CUDA_NBNXN_CUDA_H + +namespace Nbnxm +{ //! Set up the cache configuration for the non-bonded kernels. -void nbnxn_cuda_set_cacheconfig(); +void cuda_set_cacheconfig(); + +} // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 537098d479..f75b1c080f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -44,6 +44,11 @@ #include #include +// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t +// is currently declared means this has to be before gpu_types.h +#include "nbnxm_cuda_types.h" + +// TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/pmalloc_cuda.h" @@ -63,7 +68,9 @@ #include "gromacs/utility/smalloc.h" #include "nbnxm_cuda.h" -#include "nbnxm_cuda_types.h" + +namespace Nbnxm +{ /* This is a heuristically determined parameter for the Kepler * and Maxwell architectures for the minimum size of ci lists by multiplying @@ -330,11 +337,11 @@ static void init_nbparam(cu_nbparam_t *nbp, /*! Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. */ -void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, - const interaction_const_t *ic, - const NbnxnListParameters *listParams) +void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, + const interaction_const_t *ic, + const NbnxnListParameters *listParams) { - if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU) + if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU) { return; } @@ -371,18 +378,6 @@ static void init_plist(cu_plist_t *pl) pl->haveFreshList = false; } -/*! Initializes the timer data structure. */ -static void init_timers(cu_timers_t *t, bool bUseTwoStreams) -{ - /* The non-local counters/stream (second in the array) are needed only with DD. */ - for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++) - { - t->didPairlistH2D[i] = false; - t->didPrune[i] = false; - t->didRollingPrune[i] = false; - } -} - /*! Initializes the timings data structure. */ static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) { @@ -408,10 +403,10 @@ static void init_timings(gmx_wallclock_gpu_nbnxn_t *t) } /*! 
Initializes simulation constant data. */ -static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, - const interaction_const_t *ic, - const NbnxnListParameters *listParams, - const nbnxn_atomdata_t::Params &nbatParams) +static void cuda_init_const(gmx_nbnxn_cuda_t *nb, + const interaction_const_t *ic, + const NbnxnListParameters *listParams, + const nbnxn_atomdata_t::Params &nbatParams) { init_atomdata_first(nb->atdat, nbatParams.numTypes); init_nbparam(nb->nbparam, ic, listParams, nbatParams); @@ -420,13 +415,13 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, nbnxn_cuda_clear_e_fshift(nb); } -void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, - const gmx_device_info_t *deviceInfo, - const interaction_const_t *ic, - const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat, - int /*rank*/, - gmx_bool bLocalAndNonlocal) +void gpu_init(gmx_nbnxn_cuda_t **p_nb, + const gmx_device_info_t *deviceInfo, + const interaction_const_t *ic, + const NbnxnListParameters *listParams, + const nbnxn_atomdata_t *nbat, + int /*rank*/, + gmx_bool bLocalAndNonlocal) { cudaError_t stat; gmx_nbnxn_cuda_t *nb; @@ -439,10 +434,10 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, snew(nb, 1); snew(nb->atdat, 1); snew(nb->nbparam, 1); - snew(nb->plist[eintLocal], 1); + snew(nb->plist[InteractionLocality::Local], 1); if (bLocalAndNonlocal) { - snew(nb->plist[eintNonlocal], 1); + snew(nb->plist[InteractionLocality::NonLocal], 1); } nb->bUseTwoStreams = bLocalAndNonlocal; @@ -455,17 +450,17 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el)); pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift)); - init_plist(nb->plist[eintLocal]); + init_plist(nb->plist[InteractionLocality::Local]); /* set device info, just point it to the right GPU among the detected ones */ nb->dev_info = deviceInfo; /* local/non-local GPU streams */ - stat = cudaStreamCreate(&nb->stream[eintLocal]); - CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed"); + stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]); + CU_RET_ERR(stat, "cudaStreamCreate on stream[InteractionLocality::Local] failed"); if (nb->bUseTwoStreams) { - init_plist(nb->plist[eintNonlocal]); + init_plist(nb->plist[InteractionLocality::NonLocal]); /* Note that the device we're running on does not have to support * priorities, because we are querying the priority range which in this @@ -475,10 +470,10 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority); CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed"); - stat = cudaStreamCreateWithPriority(&nb->stream[eintNonlocal], + stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal], cudaStreamDefault, highest_priority); - CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[eintNonlocal] failed"); + CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed"); } /* init events for synchronization (timing disabled for performance reasons!)
*/ @@ -495,15 +490,14 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, if (nb->bDoTime) { - init_timers(nb->timers, nb->bUseTwoStreams); init_timings(nb->timings); } /* set the kernel type for the current GPU */ /* pick L1 cache configuration */ - nbnxn_cuda_set_cacheconfig(); + cuda_set_cacheconfig(); - nbnxn_cuda_init_const(nb, ic, listParams, nbat->params()); + cuda_init_const(nb, ic, listParams, nbat->params()); *p_nb = nb; @@ -513,9 +507,9 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, } } -void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, - const NbnxnPairlistGpu *h_plist, - int iloc) +void gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, + const NbnxnPairlistGpu *h_plist, + const InteractionLocality iloc) { char sbuf[STRLEN]; bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); @@ -536,10 +530,12 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, } } + gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc]; + if (bDoTime) { - nb->timers->pl_h2d[iloc].openTimingRegion(stream); - nb->timers->didPairlistH2D[iloc] = true; + iTimers.pl_h2d.openTimingRegion(stream); + iTimers.didPairlistH2D = true; } Context context = nullptr; @@ -548,13 +544,13 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, &d_plist->nsci, &d_plist->sci_nalloc, context); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit, &d_plist->nimask, &d_plist->imask_nalloc, context); @@ -563,22 +559,22 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb, &d_plist->nexcl, &d_plist->excl_nalloc, context); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - nb->timers->pl_h2d[iloc].closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(stream); } /* the next use of this list will be the first one, so we need to prune */ d_plist->haveFreshList = true; } -void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbatom) +void gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb, + const nbnxn_atomdata_t *nbatom) { cu_atomdata_t *adat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -594,7 +590,7 @@ static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear) { cudaError_t stat; cu_atomdata_t *adat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls); CU_RET_ERR(stat, "cudaMemsetAsync on f failed"); @@ -605,7 +601,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb) { cudaError_t stat; cu_atomdata_t *adat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls); CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed"); @@ -615,7 +611,7 @@ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb) CU_RET_ERR(stat, "cudaMemsetAsync on e_el failed"); } -void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) +void gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) { nbnxn_cuda_clear_f(nb, nb->atdat->natoms); /* clear shift force array and energies if the outputs were @@ -626,8 +622,8 @@ void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags) } } -void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, - const nbnxn_atomdata_t *nbat) +void gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, + const nbnxn_atomdata_t *nbat) { cudaError_t stat; int nalloc, natoms; @@ -635,7 +631,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, bool bDoTime = nb->bDoTime; cu_timers_t *timers = nb->timers; cu_atomdata_t *d_atdat = nb->atdat; - cudaStream_t ls = nb->stream[eintLocal]; + cudaStream_t ls = nb->stream[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -714,7 +710,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam) } } -void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) +void gpu_free(gmx_nbnxn_cuda_t *nb) { cudaError_t stat; cu_atomdata_t *atdat; @@ -773,7 +769,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) freeDeviceBuffer(&atdat->lj_comb); /* Free plist */ - auto *plist = nb->plist[eintLocal]; + auto *plist = nb->plist[InteractionLocality::Local]; freeDeviceBuffer(&plist->sci); freeDeviceBuffer(&plist->cj4); freeDeviceBuffer(&plist->imask); @@ -781,7 +777,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) sfree(plist); if (nb->bUseTwoStreams) { - auto *plist_nl = nb->plist[eintNonlocal]; + auto *plist_nl = nb->plist[InteractionLocality::NonLocal]; freeDeviceBuffer(&plist_nl->sci); freeDeviceBuffer(&plist_nl->cj4); freeDeviceBuffer(&plist_nl->imask); @@ -811,12 +807,12 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) } //! This function is documented in the header file -gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb) +gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_cuda_t *nb) { return (nb != nullptr && nb->bDoTime) ?
nb->timings : nullptr; } -void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) +void gpu_reset_timings(nonbonded_verlet_t* nbv) { if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime) { @@ -824,44 +820,46 @@ } -int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb) +int gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb) { return nb != nullptr ? gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0; } -gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb) +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb) { return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN)); } -void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t *nb, - int iloc) +void *gpu_get_command_stream(gmx_nbnxn_gpu_t *nb, + const InteractionLocality iloc) { assert(nb); return static_cast<void *>(&nb->stream[iloc]); } -void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t *nb) +void *gpu_get_xq(gmx_nbnxn_gpu_t *nb) { assert(nb); return static_cast<void *>(nb->atdat->xq); } -void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t *nb) +void *gpu_get_f(gmx_nbnxn_gpu_t *nb) { assert(nb); return static_cast<void *>(nb->atdat->f); } -rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t *nb) +rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb) { assert(nb); return reinterpret_cast<rvec *>(nb->atdat->fshift); } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 3d490c395a..5a8380f5b5 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -52,8 +52,10 @@ #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/nbnxm/gpu_types_common.h" +#include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/pairlist.h" #include "gromacs/timing/gpu_timing.h" +#include "gromacs/utility/enumerationhelpers.h" /*! \brief Macro defining default for the prune kernel's j4 processing concurrency. * @@ -197,26 +199,26 @@ struct cu_nbparam /** \internal * \brief Pair list data. */ -using cu_plist_t = gpu_plist; +using cu_plist_t = Nbnxm::gpu_plist; /** \internal * \brief Typedef of actual timer type. */ -typedef struct nbnxn_gpu_timers_t cu_timers_t; +typedef struct Nbnxm::gpu_timers_t cu_timers_t; /** \internal * \brief Main data structure for CUDA nonbonded force calculations. */ struct gmx_nbnxn_cuda_t { - const gmx_device_info_t *dev_info; /**< CUDA device information */ - bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */ - cu_atomdata_t *atdat; /**< atom data */ - cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */ - cu_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */ - nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */ - - cudaStream_t stream[2]; /**< local and non-local GPU streams */ + const gmx_device_info_t *dev_info; /**< CUDA device information */ + bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */ + cu_atomdata_t *atdat; /**< atom data */ + cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc.
*/ + gmx::EnumerationArray<InteractionLocality, cu_plist_t *> plist; /**< pair-list data structures (local and non-local) */ + nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */ + + gmx::EnumerationArray<InteractionLocality, cudaStream_t> stream; /**< local and non-local GPU streams */ /** events used for synchronization */ cudaEvent_t nonlocal_done; /**< event triggered when the non-local non-bonded kernel diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index cd64c8ca1a..1624c56c8c 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -67,6 +67,9 @@ #include "gpu_common_utils.h" #include "nbnxm_gpu.h" +namespace Nbnxm +{ + /*! \brief Check that atom locality values are valid for the GPU module. * * In the GPU module atom locality "all" is not supported, the local and @@ -74,12 +77,16 @@ * * \param[in] atomLocality atom locality specifier */ -static inline void validateGpuAtomLocality(int atomLocality) +static inline void +validateGpuAtomLocality(const AtomLocality atomLocality) { std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only " - "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal); + "local (%d) or nonlocal (%d)", + static_cast<int>(atomLocality), + static_cast<int>(AtomLocality::Local), + static_cast<int>(AtomLocality::NonLocal)); - GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str()); + GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str()); } /*! \brief Convert atom locality to interaction locality. @@ -90,18 +97,19 @@ static inline void validateGpuAtomLocality(int atomLocality) * \param[in] atomLocality Atom locality specifier * \returns Interaction locality corresponding to the atom locality passed. */ -static inline int gpuAtomToInteractionLocality(int atomLocality) +static inline InteractionLocality +gpuAtomToInteractionLocality(const AtomLocality atomLocality) { validateGpuAtomLocality(atomLocality); /* determine interaction locality from atom locality */ - if (LOCAL_A(atomLocality)) + if (atomLocality == AtomLocality::Local) { - return eintLocal; + return InteractionLocality::Local; } - else if (NONLOCAL_A(atomLocality)) + else if (atomLocality == AtomLocality::NonLocal) { - return eintNonlocal; + return InteractionLocality::NonLocal; } else { @@ -117,16 +125,17 @@ static inline int gpuAtomToInteractionLocality(int atomLocality) * \param[out] atomRangeLen Atom range length in the atom data array. */ template <typename AtomDataT> -static inline void getGpuAtomRange(const AtomDataT *atomData, - int atomLocality, - int *atomRangeBegin, - int *atomRangeLen) +static inline void +getGpuAtomRange(const AtomDataT *atomData, + const AtomLocality atomLocality, + int *atomRangeBegin, + int *atomRangeLen) { assert(atomData); validateGpuAtomLocality(atomLocality); /* calculate the atom data index range based on locality */ - if (LOCAL_A(atomLocality)) + if (atomLocality == AtomLocality::Local) { *atomRangeBegin = 0; *atomRangeLen = atomData->natoms_local; @@ -155,24 +164,27 @@ static inline void getGpuAtomRange(const AtomDataT *atomData, template <typename GpuTimers> static void countPruneKernelTime(GpuTimers *timers, gmx_wallclock_gpu_nbnxn_t *timings, - const int iloc) + const InteractionLocality iloc) { + gpu_timers_t::Interaction &iTimers = timers->interaction[iloc]; + // We might have not done any pruning (e.g. if we skipped with empty domains).
- if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) + if (!iTimers.didPrune && + !iTimers.didRollingPrune) { return; } - if (timers->didPrune[iloc]) + if (iTimers.didPrune) { timings->pruneTime.c++; - timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); + timings->pruneTime.t += iTimers.prune_k.getLastRangeTime(); } - if (timers->didRollingPrune[iloc]) + if (iTimers.didRollingPrune) { timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); + timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime(); } } @@ -195,16 +207,17 @@ static void countPruneKernelTime(GpuTimers *timers, * \param[out] fshift Pointer to the array of shift forces to accumulate into */ template <typename StagingData> -static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst, - int iLocality, - bool reduceEnergies, - bool reduceFshift, - real *e_lj, - real *e_el, - rvec *fshift) +static inline void +gpu_reduce_staged_outputs(const StagingData &nbst, + const InteractionLocality iLocality, + const bool reduceEnergies, + const bool reduceFshift, + real *e_lj, + real *e_el, + rvec *fshift) { /* add up energies and shift forces (only once at local F wait) */ - if (LOCAL_I(iLocality)) + if (iLocality == InteractionLocality::Local) { if (reduceEnergies) { @@ -244,12 +257,13 @@ static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst, * */ template <typename GpuTimers, typename GpuPairlist> -static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings, - GpuTimers *timers, - const GpuPairlist *plist, - int atomLocality, - bool didEnergyKernels, - bool doTiming) +static inline void +gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings, + GpuTimers *timers, + const GpuPairlist *plist, + AtomLocality atomLocality, + bool didEnergyKernels, + bool doTiming) { /* timing data accumulation */ if (!doTiming) { @@ -258,10 +272,10 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin } /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(atomLocality); + const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); /* only increase counter once (at local F wait) */ - if (LOCAL_I(iLocality)) + if (iLocality == InteractionLocality::Local) { timings->nb_c++; timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1; @@ -269,11 +283,11 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin /* kernel timings */ timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t += - timers->nb_k[iLocality].getLastRangeTime(); + timers->interaction[iLocality].nb_k.getLastRangeTime(); /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime(); - timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime(); + timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime(); + timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime(); /* Count the pruning kernel times for both cases: 1st pass (at search step) and rolling pruning (if called at the previous step).
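// [Editor's sketch] What the enum-class conversion buys in the helpers above:
// locality-indexed data becomes a gmx::EnumerationArray instead of a raw [2]
// array indexed with eintLocal/eintNonlocal ints, so indexing with the wrong
// locality type no longer compiles. A small, hypothetical illustration
// (PerLocalityData is a stand-in type, not a GROMACS struct; the header paths
// are inferred from the includes added elsewhere in this patch):
#include "gromacs/nbnxm/locality.h"
#include "gromacs/utility/enumerationhelpers.h"

struct PerLocalityData {};

//! Mirrors gpuAtomToInteractionLocality() above; AtomLocality::All is invalid here,
//! exactly as validateGpuAtomLocality() enforces.
static PerLocalityData &selectByLocality(
        gmx::EnumerationArray<Nbnxm::InteractionLocality, PerLocalityData> &data,
        const Nbnxm::AtomLocality atomLocality)
{
    const Nbnxm::InteractionLocality iloc =
            (atomLocality == Nbnxm::AtomLocality::Local) ? Nbnxm::InteractionLocality::Local
                                                         : Nbnxm::InteractionLocality::NonLocal;
    return data[iloc]; // data[atomLocality] would now be a compile-time error
}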
@@ -284,39 +298,41 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin countPruneKernelTime(timers, timings, iLocality); /* only count atdat and pair-list H2D at pair-search step */ - if (timers->didPairlistH2D[iLocality]) + if (timers->interaction[iLocality].didPairlistH2D) { /* atdat transfer timing (add only once, at local F wait) */ - if (LOCAL_A(atomLocality)) + if (atomLocality == AtomLocality::Local) { timings->pl_h2d_c++; timings->pl_h2d_t += timers->atdat.getLastRangeTime(); } - timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime(); + timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime(); /* Clear the timing flag for the next step */ - timers->didPairlistH2D[iLocality] = false; + timers->interaction[iLocality].didPairlistH2D = false; } } //TODO: move into shared source file with gmx_compile_cpp_as_cuda //NOLINTNEXTLINE(misc-definitions-in-headers) -bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, - int flags, - int aloc, - bool haveOtherWork, - real *e_lj, - real *e_el, - rvec *fshift, - GpuTaskCompletion completionKind) +bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, + const int flags, + const AtomLocality aloc, + const bool haveOtherWork, + real *e_lj, + real *e_el, + rvec *fshift, + GpuTaskCompletion completionKind) { + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc); // We skip when during the non-local phase there was actually no work to do. // This is consistent with nbnxn_gpu_launch_kernel. - if (haveOtherWork || !canSkipWork(nb, iLocality)) + if (haveOtherWork || !canSkipWork(*nb, iLocality)) { // Query the state of the GPU stream and return early if we're not done if (completionKind == GpuTaskCompletion::Check) @@ -336,14 +352,14 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, bool calcEner = (flags & GMX_FORCE_ENERGY) != 0; bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0; - nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, - nb->bDoTime != 0); + gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, + nb->bDoTime != 0); - nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); + gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); } /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */ - nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false; + nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false; /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). 
*/ nb->plist[iLocality]->haveFreshList = false; @@ -368,16 +384,18 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, * \param[out] fshift Pointer to the shift force buffer to accumulate into */ //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file -void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, - int flags, - int aloc, - bool haveOtherWork, - real *e_lj, - real *e_el, - rvec *fshift) +void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, + int flags, + AtomLocality aloc, + bool haveOtherWork, + real *e_lj, + real *e_el, + rvec *fshift) { - nbnxn_gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift, - GpuTaskCompletion::Wait); + gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift, + GpuTaskCompletion::Wait); } +} // namespace Nbnxm + #endif diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h index 40bffbb7b3..02febb47ab 100644 --- a/src/gromacs/nbnxm/gpu_common_utils.h +++ b/src/gromacs/nbnxm/gpu_common_utils.h @@ -54,6 +54,9 @@ #include "opencl/nbnxm_ocl_types.h" #endif +namespace Nbnxm +{ + /*! \brief An early return condition for empty NB GPU workloads * * This is currently used for non-local kernels/transfers only. @@ -61,10 +64,14 @@ * local part of the force array also depends on the non-local kernel. * The skip of the local kernel is taken care of separately. */ -static inline bool canSkipWork(const gmx_nbnxn_gpu_t *nb, int iloc) +static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb, + InteractionLocality iloc) { - assert(nb && nb->plist[iloc]); - return (iloc == eintNonlocal) && (nb->plist[iloc]->nsci == 0); + assert(nb.plist[iloc]); + return (iloc == InteractionLocality::NonLocal && + nb.plist[iloc]->nsci == 0); } +} // namespace Nbnxm + #endif diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h index 94d92a77e3..f578cf5cb9 100644 --- a/src/gromacs/nbnxm/gpu_data_mgmt.h +++ b/src/gromacs/nbnxm/gpu_data_mgmt.h @@ -48,6 +48,7 @@ #include "gromacs/mdtypes/interaction_const.h" #include "gpu_types.h" +#include "locality.h" struct nonbonded_verlet_group_t; struct NbnxnPairlistGpu; @@ -57,90 +58,95 @@ struct gmx_wallclock_gpu_nbnxn_t; struct gmx_gpu_info_t; struct gmx_device_info_t; +namespace Nbnxm +{ + /** Initializes the data structures related to GPU nonbonded calculations. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb, - const gmx_device_info_t gmx_unused *deviceInfo, - const interaction_const_t gmx_unused *ic, - const NbnxnListParameters gmx_unused *listParams, - const nbnxn_atomdata_t gmx_unused *nbat, - int gmx_unused rank, - /* true if both local and non-local are done on GPU */ - gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM +void gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb, + const gmx_device_info_t gmx_unused *deviceInfo, + const interaction_const_t gmx_unused *ic, + const NbnxnListParameters gmx_unused *listParams, + const nbnxn_atomdata_t gmx_unused *nbat, + int gmx_unused rank, + /* true if both local and non-local are done on GPU */ + gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM /** Initializes pair-list data for GPU, called at every pair search step. 
*/ GPU_FUNC_QUALIFIER -void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct NbnxnPairlistGpu gmx_unused *h_nblist, - int gmx_unused iloc) GPU_FUNC_TERM +void gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb, + const struct NbnxnPairlistGpu gmx_unused *h_nblist, + InteractionLocality gmx_unused iloc) GPU_FUNC_TERM /** Initializes atom-data on the GPU, called at every pair search step. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb, - const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM +void gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb, + const nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM /*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the * electrostatic type switching to twin cut-off (or back) if needed. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv, - const interaction_const_t gmx_unused *ic, - const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM +void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv, + const interaction_const_t gmx_unused *ic, + const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb, - const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM +void gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb, + const nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM /** Clears GPU outputs: nonbonded force, shift force and energy. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags) GPU_FUNC_TERM +void gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags) GPU_FUNC_TERM /** Frees all GPU resources used for the nonbonded calculations. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM +void gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM /** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */ GPU_FUNC_QUALIFIER -struct gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr) +struct gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(nullptr) /** Resets nonbonded GPU timings. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM +void gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM /** Calculates the minimum size of proximity lists to improve SM load balance * with GPU non-bonded kernels. */ -GPU_FUNC_QUALIFIER -int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1) + GPU_FUNC_QUALIFIER +int gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1) /** Returns if analytical Ewald GPU kernels are used. */ GPU_FUNC_QUALIFIER -gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE) +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE) /** Returns an opaque pointer to the GPU command stream * Note: CUDA only. 
 */
 CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused *nb,
-                                   int gmx_unused              iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_command_stream(gmx_nbnxn_gpu_t gmx_unused     *nb,
+                             InteractionLocality gmx_unused  iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr)

 /** Returns an opaque pointer to the GPU coordinate+charge array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_xq(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)

 /** Returns an opaque pointer to the GPU force array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-void *nbnxn_gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+void *gpu_get_f(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)

 /** Returns an opaque pointer to the GPU shift force array
  *  Note: CUDA only.
  */
 CUDA_FUNC_QUALIFIER
-rvec *nbnxn_gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+rvec *gpu_get_fshift(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr)
+
+} // namespace Nbnxm

 #endif
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index 78893704bb..4c57f22bcd 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -45,6 +45,9 @@
 #include "config.h"

 #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+#include "locality.h"

 #if GMX_GPU == GMX_GPU_OPENCL
 #include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
@@ -54,24 +57,46 @@
 #include "gromacs/gpu_utils/gpuregiontimer.cuh"
 #endif

+namespace Nbnxm
+{
+
 /*! \internal
  * \brief GPU region timers used for timing GPU kernels and H2D/D2H transfers.
  *
- * The two-sized arrays hold the local and non-local values and should always
- * be indexed with eintLocal/eintNonlocal.
+ * The local and non-local values are held in gmx::EnumerationArray members,
+ * indexed with AtomLocality or InteractionLocality.
  */
-struct nbnxn_gpu_timers_t
+struct gpu_timers_t
 {
-    GpuRegionTimer atdat;              /**< timer for atom data transfer (every PS step) */
-    GpuRegionTimer nb_h2d[2];          /**< timer for x/q H2D transfers (l/nl, every step) */
-    GpuRegionTimer nb_d2h[2];          /**< timer for f D2H transfer (l/nl, every step) */
-    GpuRegionTimer pl_h2d[2];          /**< timer for pair-list H2D transfers (l/nl, every PS step) */
-    bool           didPairlistH2D[2];  /**< true when a pair-list transfer has been done at this step */
-    GpuRegionTimer nb_k[2];            /**< timer for non-bonded kernels (l/nl, every step) */
-    GpuRegionTimer prune_k[2];         /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
-    bool           didPrune[2];        /**< true when we timed pruning and the timings need to be accounted for */
-    GpuRegionTimer rollingPrune_k[2];  /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
-    bool           didRollingPrune[2]; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+    /*! \internal
+     * \brief Timers for local or non-local coordinate/force transfers
+     */
+    struct XFTransfers
+    {
+        GpuRegionTimer nb_h2d; /**< timer for x/q H2D transfers (l/nl, every step) */
+        GpuRegionTimer nb_d2h; /**< timer for f D2H transfer (l/nl, every step) */
+    };
+
+    /*! \internal
+     * \brief Timers for local or non-local interaction related operations
+     */
+    struct Interaction
+    {
+        GpuRegionTimer pl_h2d;                  /**< timer for pair-list H2D transfers (l/nl, every PS step) */
+        bool           didPairlistH2D = false;  /**< true when a pair-list transfer has been done at this step */
+        GpuRegionTimer nb_k;                    /**< timer for non-bonded kernels (l/nl, every step) */
+        GpuRegionTimer prune_k;                 /**< timer for the 1st pass list pruning kernel (l/nl, every PS step) */
+        bool           didPrune = false;        /**< true when we timed pruning and the timings need to be accounted for */
+        GpuRegionTimer rollingPrune_k;          /**< timer for rolling pruning kernels (l/nl, frequency depends on chunk size) */
+        bool           didRollingPrune = false; /**< true when we timed rolling pruning (at the previous step) and the timings need to be accounted for */
+    };
+
+    //! timer for atom data transfer (every PS step)
+    GpuRegionTimer atdat;
+    //! timers for coordinate/force transfers (every step)
+    gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
+    //! timers for interaction related operations
+    gmx::EnumerationArray<InteractionLocality, Interaction> interaction;
 };

 struct gpu_plist
@@ -99,4 +124,6 @@ struct gpu_plist
     int rollingPruningPart; /**< the next part to which the rolling pruning needs to be applied */
 };

+} // namespace Nbnxm
+
 #endif
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp
index 276a2e69e3..14402aca91 100644
--- a/src/gromacs/nbnxm/kerneldispatch.cpp
+++ b/src/gromacs/nbnxm/kerneldispatch.cpp
@@ -413,13 +413,13 @@ nbnxn_kernel_cpu(const nonbonded_verlet_group_t *nbvg,
     }
 }

-static void accountFlops(t_nrnb                    *nrnb,
-                         const nonbonded_verlet_t  &nbv,
-                         const int                  ilocality,
-                         const interaction_const_t &ic,
-                         int                        forceFlags)
+static void accountFlops(t_nrnb                           *nrnb,
+                         const nonbonded_verlet_t         &nbv,
+                         const Nbnxm::InteractionLocality  iLocality,
+                         const interaction_const_t        &ic,
+                         const int                         forceFlags)
 {
-    const nonbonded_verlet_group_t &nbvg = nbv.grp[ilocality];
+    const nonbonded_verlet_group_t &nbvg = nbv.grp[iLocality];
     const bool usingGpuKernels = (nbvg.kernel_type == nbnxnk8x8x8_GPU);

     int enr_nbnxn_kernel_ljc;
@@ -428,7 +428,7 @@ static void accountFlops(t_nrnb *nrnb,
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
     }
     else if ((!usingGpuKernels && nbvg.ewald_excl == ewaldexclAnalytical) ||
-             (usingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
+             (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
     }
@@ -474,7 +474,7 @@ static void accountFlops(t_nrnb *nrnb,
 }

 void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
-                         const int                  ilocality,
+                         Nbnxm::InteractionLocality iLocality,
                          const interaction_const_t &ic,
                          int                        forceFlags,
                          int                        clearF,
@@ -482,7 +482,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
                          gmx_enerdata_t            *enerd,
                          t_nrnb                    *nrnb)
 {
-    const nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality];
+    const nonbonded_verlet_group_t &nbvg = nbv->grp[iLocality];

     switch (nbvg.kernel_type)
     {
@@ -503,7 +503,7 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
             break;

         case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, forceFlags, ilocality);
+            Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality);
             break;

         case nbnxnk8x8x8_PlainC:
@@ -525,5 +525,5 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv,
     }

-    accountFlops(nrnb, *nbv, ilocality, ic, forceFlags);
+    accountFlops(nrnb, *nbv, iLocality, ic, forceFlags);
 }
diff --git a/src/gromacs/nbnxm/locality.h b/src/gromacs/nbnxm/locality.h
new file mode 100644
index 0000000000..6ebd4fb409
--- /dev/null
+++
b/src/gromacs/nbnxm/locality.h @@ -0,0 +1,76 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \libinternal \file + * \brief Defines nbnxn locality enums + * + * \author Berk Hess + * \ingroup module_nbnxm + */ + +#ifndef GMX_NBNXM_LOCALITY_H +#define GMX_NBNXM_LOCALITY_H + +namespace Nbnxm +{ + +/*! \brief Atom locality indicator: local, non-local, all. + * + * Used for calls to: + * gridding, force calculation, x/f buffer operations + */ +enum class AtomLocality : int +{ + Local = 0, //!< Local atoms + NonLocal = 1, //!< Non-local atoms + All = 2, //!< Both local and non-local atoms + Count = 3 //!< The number of atom locality types +}; + +/*! \brief Interaction locality indicator: local, non-local, all. + * + * Used for calls to: + * pair-search, force calculation, x/f buffer operations + */ +enum class InteractionLocality : int +{ + Local = 0, //!< Interactions between local atoms only + NonLocal = 1, //!< Interactions between non-local and (non-)local atoms + Count = 2 //!< The number of interaction locality types +}; + +} // namespace Nbnxm + +#endif // GMX_NBNXM_LOCALITY_H diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 59886a5f8b..9e0138e162 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -104,8 +104,11 @@ #include "gromacs/nbnxm/pairlist.h" #include "gromacs/nbnxm/pairlistset.h" #include "gromacs/utility/arrayref.h" +#include "gromacs/utility/enumerationhelpers.h" #include "gromacs/utility/real.h" +#include "locality.h" + // TODO: Remove this include and the two nbnxm includes above #include "nbnxm_gpu.h" @@ -148,50 +151,23 @@ typedef enum nbnxnkNR } nbnxn_kernel_type; +namespace Nbnxm +{ + /*! \brief Return a string identifying the kernel type. 
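The explicit Count members exist because gmx::EnumerationArray (gromacs/utility/enumerationhelpers.h) takes its size from them, which is what turns the [2]-sized arrays elsewhere in this change into type-checked containers. A minimal sketch of the idiom, assuming only the two headers named here:

    #include "gromacs/nbnxm/locality.h"
    #include "gromacs/utility/enumerationhelpers.h"

    // One slot per InteractionLocality value; the size is taken from
    // InteractionLocality::Count at compile time.
    gmx::EnumerationArray<Nbnxm::InteractionLocality, int> numPairs;

    numPairs[Nbnxm::InteractionLocality::Local]    = 0;
    numPairs[Nbnxm::InteractionLocality::NonLocal] = 0;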
 *
 * \param [in] kernel_type nonbonded kernel types, takes values from the nbnxn_kernel_type enum
 * \returns a string identifying the kernel corresponding to the type passed as argument
 */
-const char *lookup_nbnxn_kernel_name(int kernel_type);
+const char *lookup_kernel_name(int kernel_type);
+
+} // namespace Nbnxm

 /*! \brief Ewald exclusion types */
 enum {
     ewaldexclTable, ewaldexclAnalytical
 };

-/*! \brief Atom locality indicator: local, non-local, all.
- *
- * Used for calls to:
- * gridding, pair-search, force calculation, x/f buffer operations
- * */
-enum {
-    eatLocal = 0, eatNonlocal = 1, eatAll
-};
-
-/*! \brief Tests for local atom range */
-#define LOCAL_A(x)               ((x) == eatLocal)
-/*! \brief Tests for non-local atom range */
-#define NONLOCAL_A(x)            ((x) == eatNonlocal)
-/*! \brief Tests for either local or non-local atom range */
-#define LOCAL_OR_NONLOCAL_A(x)   (LOCAL_A(x) || NONLOCAL_A(x))
-
-/*! \brief Interaction locality indicator
- *
- * Used in pair-list search/calculations in the following manner:
- * - local interactions require local atom data and affect local output only;
- * - non-local interactions require both local and non-local atom data and
- *   affect both local- and non-local output.
- */
-enum {
-    eintLocal = 0, eintNonlocal = 1
-};
-
-/*! \brief Tests for local interaction indicator */
-#define LOCAL_I(x)               ((x) == eintLocal)
-/*! \brief Tests for non-local interaction indicator */
-#define NONLOCAL_I(x)            ((x) == eintNonlocal)
-
 /*! \brief Flag to tell the nonbonded kernels whether to clear the force output buffers */
 enum {
     enbvClearFNo, enbvClearFYes
@@ -207,19 +183,25 @@ typedef struct nonbonded_verlet_group_t {
 /*! \libinternal
  * \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
-typedef struct nonbonded_verlet_t {
-    std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
-    std::unique_ptr<nbnxn_search>        nbs;        /**< n vs n atom pair searching data       */
-    int                                  ngrp;       /**< number of interaction groups          */
-    nonbonded_verlet_group_t             grp[2];     /**< local and non-local interaction group */
-    nbnxn_atomdata_t                    *nbat;       /**< atom data                             */
-
-    gmx_bool             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
-    EmulateGpuNonbonded  emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
-    gmx_nbnxn_gpu_t     *gpu_nbv;         /**< pointer to GPU nb verlet data */
-    int                  min_ci_balanced; /**< pair list balancing parameter
-                                               used for the 8x8x8 GPU kernels */
-} nonbonded_verlet_t;
+struct nonbonded_verlet_t
+{
+    std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
+    std::unique_ptr<nbnxn_search>        nbs;        /**< n vs n atom pair searching data */
+    int                                  ngrp;       /**< number of interaction groups */
+    //! Local and non-local interaction group
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, nonbonded_verlet_group_t> grp;
+    //! Atom data
+    nbnxn_atomdata_t                    *nbat;
+
+    gmx_bool             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
+    EmulateGpuNonbonded  emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+    gmx_nbnxn_gpu_t     *gpu_nbv;         /**< pointer to GPU nb verlet data */
+    int                  min_ci_balanced; /**< pair list balancing parameter
+                                               used for the 8x8x8 GPU kernels */
+};
+
+namespace Nbnxm
+{

 /*! \brief Initializes the nbnxn module */
 void init_nb_verlet(const gmx::MDLogger     &mdlog,
@@ -233,6 +215,8 @@ void init_nb_verlet(const gmx::MDLogger &mdlog,
                     const gmx_mtop_t        *mtop,
                     matrix                   box);

+} // namespace Nbnxm
+
 /*! \brief Put the atoms on the pair search grid.
* * Only atoms atomStart to atomEnd in x are put on the grid. @@ -290,18 +274,18 @@ gmx::ArrayRef nbnxn_get_gridindices(const nbnxn_search* nbs); * pairs beyond the pairlist inner radius and writes the result to a list that is * to be consumed by the non-bonded kernel. */ -void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, - int ilocality, - const rvec *shift_vec); +void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, + Nbnxm::InteractionLocality iLocality, + const rvec *shift_vec); /*! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU */ -void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, - int ilocality, - const interaction_const_t &ic, - int forceFlags, - int clearF, - t_forcerec *fr, - gmx_enerdata_t *enerd, - t_nrnb *nrnb); +void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, + Nbnxm::InteractionLocality iLocality, + const interaction_const_t &ic, + int forceFlags, + int clearF, + t_forcerec *fr, + gmx_enerdata_t *enerd, + t_nrnb *nrnb); #endif // GMX_NBNXN_NBNXN_H diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 6aa7c21752..8e5f77a9af 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -49,10 +49,14 @@ #include "gromacs/utility/real.h" #include "gpu_types.h" +#include "locality.h" struct nbnxn_atomdata_t; enum class GpuTaskCompletion; +namespace Nbnxm +{ + /*! \brief * Launch asynchronously the xq buffer host to device copy. * @@ -61,14 +65,14 @@ enum class GpuTaskCompletion; * * \param [in] nb GPU nonbonded data. * \param [in] nbdata Host-side atom data structure. - * \param [in] iloc Interaction locality flag. + * \param [in] aloc Atom locality flag. * \param [in] haveOtherWork True if there are other tasks that require the nbnxn coordinate input. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, - const struct nbnxn_atomdata_t gmx_unused *nbdata, - int gmx_unused iloc, - bool gmx_unused haveOtherWork) GPU_FUNC_TERM +void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, + const struct nbnxn_atomdata_t gmx_unused *nbdata, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork) GPU_FUNC_TERM /*! \brief * Launch asynchronously the nonbonded force calculations. @@ -81,9 +85,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, * */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - int gmx_unused iloc) GPU_FUNC_TERM +void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + InteractionLocality gmx_unused iloc) GPU_FUNC_TERM /*! \brief * Launch asynchronously the nonbonded prune-only kernel. @@ -121,9 +125,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, * \param [in] numParts Number of parts the pair list is split into in the rolling kernel. */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused iloc, - int gmx_unused numParts) GPU_FUNC_TERM +void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, + InteractionLocality gmx_unused iloc, + int gmx_unused numParts) GPU_FUNC_TERM /*! \brief * Launch asynchronously the download of nonbonded forces from the GPU @@ -132,11 +136,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, * no non-bonded work. 
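Several of the entry points above take an AtomLocality although the work runs in a stream selected per interaction locality; the implementations translate one into the other with gpuAtomToInteractionLocality(). A sketch of the mapping that helper has to perform; the real one lives in gpu_common.h and may differ in how it rejects AtomLocality::All, which has no single stream:

    using Nbnxm::AtomLocality;
    using Nbnxm::InteractionLocality;

    static InteractionLocality atomToInteractionLocality(const AtomLocality aloc)
    {
        // Local atoms feed the local stream, non-local atoms the non-local one.
        return (aloc == AtomLocality::Local) ? InteractionLocality::Local
                                             : InteractionLocality::NonLocal;
    }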
*/ GPU_FUNC_QUALIFIER -void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, - struct nbnxn_atomdata_t gmx_unused *nbatom, - int gmx_unused flags, - int gmx_unused aloc, - bool gmx_unused haveOtherWork) GPU_FUNC_TERM +void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, + nbnxn_atomdata_t gmx_unused *nbatom, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork) GPU_FUNC_TERM /*! \brief Attempts to complete nonbonded GPU task. * @@ -172,14 +176,14 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, * \returns True if the nonbonded tasks associated with \p aloc locality have completed */ GPU_FUNC_QUALIFIER -bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - int gmx_unused aloc, - bool gmx_unused haveOtherWork, - real gmx_unused *e_lj, - real gmx_unused *e_el, - rvec gmx_unused *fshift, - GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false) +bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork, + real gmx_unused *e_lj, + real gmx_unused *e_el, + rvec gmx_unused *fshift, + GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false) /*! \brief Completes the nonbonded GPU task blocking until GPU tasks and data * transfers to finish. @@ -197,16 +201,18 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[out] fshift Pointer to the shift force buffer to accumulate into */ GPU_FUNC_QUALIFIER -void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - int gmx_unused aloc, - bool gmx_unused haveOtherWork, - real gmx_unused *e_lj, - real gmx_unused *e_el, - rvec gmx_unused *fshift) GPU_FUNC_TERM +void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, + int gmx_unused flags, + AtomLocality gmx_unused aloc, + bool gmx_unused haveOtherWork, + real gmx_unused *e_lj, + real gmx_unused *e_el, + rvec gmx_unused *fshift) GPU_FUNC_TERM /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ GPU_FUNC_QUALIFIER -int nbnxn_gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1) +int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1) + +} // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index d4dafe11c7..0565d3faf0 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -62,6 +62,9 @@ #include "grid.h" #include "internal.h" +namespace Nbnxm +{ + /*! 
\brief Returns whether CPU SIMD support exists for the given inputrec * * If the return value is FALSE and fplog/cr != NULL, prints a fallback @@ -134,7 +137,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, /* One 256-bit FMA per cycle makes 2xNN faster */ *kernel_type = nbnxnk4xN_SIMD_2xNN; } -#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */ +#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */ if (getenv("GMX_NBNXN_SIMD_4XN") != nullptr) @@ -181,10 +184,10 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, } } -#endif // GMX_SIMD +#endif // GMX_SIMD } -const char *lookup_nbnxn_kernel_name(int kernel_type) +const char *lookup_kernel_name(int kernel_type) { const char *returnvalue = nullptr; switch (kernel_type) @@ -260,7 +263,7 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog, { GMX_LOG(mdlog.info).asParagraph().appendTextFormatted( "Using %s %dx%d nonbonded short-range kernels", - lookup_nbnxn_kernel_name(*kernel_type), + lookup_kernel_name(*kernel_type), nbnxn_kernel_to_cluster_i_size(*kernel_type), nbnxn_kernel_to_cluster_j_size(*kernel_type)); @@ -270,7 +273,7 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog, GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( "WARNING: Using the slow %s kernels. This should\n" "not happen during routine usage on supported platforms.", - lookup_nbnxn_kernel_name(*kernel_type)); + lookup_kernel_name(*kernel_type)); } } } @@ -388,13 +391,13 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, { /* init the NxN GPU data; the last argument tells whether we'll have * both local and non-local NB calculation on GPU */ - nbnxn_gpu_init(&nbv->gpu_nbv, - deviceInfo, - fr->ic, - nbv->listParams.get(), - nbv->nbat, - cr->nodeid, - (nbv->ngrp > 1)); + gpu_init(&nbv->gpu_nbv, + deviceInfo, + fr->ic, + nbv->listParams.get(), + nbv->nbat, + cr->nodeid, + (nbv->ngrp > 1)); if ((env = getenv("GMX_NB_MIN_CI")) != nullptr) { @@ -414,7 +417,7 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, } else { - nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv); + nbv->min_ci_balanced = gpu_min_ci_balanced(nbv->gpu_nbv); if (debug) { fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n", @@ -426,3 +429,5 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, *nb_verlet = nbv; } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 8aac82d3d3..34132dbbd7 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -88,6 +88,9 @@ #include "nbnxm_ocl_internal.h" #include "nbnxm_ocl_types.h" +namespace Nbnxm +{ + /*! \brief Convenience constants */ //@{ static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster; @@ -359,19 +362,22 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event) } /*! \brief Launch asynchronously the xq buffer host to device copy. 
*/ -void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, - const nbnxn_atomdata_t *nbatom, - int iloc, - bool haveOtherWork) +void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, + const nbnxn_atomdata_t *nbatom, + const AtomLocality atomLocality, + const bool haveOtherWork) { - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + + /* local/nonlocal offset and length used for xq and f */ + int adat_begin, adat_len; cl_atomdata_t *adat = nb->atdat; cl_plist_t *plist = nb->plist[iloc]; cl_timers_t *t = nb->timers; cl_command_queue stream = nb->stream[iloc]; - bool bDoTime = (nb->bDoTime) != 0; + bool bDoTime = (nb->bDoTime) != 0; /* Don't launch the non-local H2D copy if there is no dependent work to do: neither non-local nor other (e.g. bonded) work @@ -382,7 +388,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, we always call the local local x+q copy (and the rest of the local work in nbnxn_gpu_launch_kernel(). */ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -390,7 +396,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, } /* calculate the atom data index range based on locality */ - if (LOCAL_I(iloc)) + if (atomLocality == AtomLocality::Local) { adat_begin = 0; adat_len = adat->natoms_local; @@ -404,16 +410,16 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, /* beginning of timed HtoD section */ if (bDoTime) { - t->nb_h2d[iloc].openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(stream); } /* HtoD x, q */ ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4, - adat_len * sizeof(float) * 4, stream, bDoTime ? t->nb_h2d[iloc].fetchNextEvent() : nullptr); + adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - t->nb_h2d[iloc].closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); } /* When we get here all misc operations issues in the local stream as well as @@ -421,7 +427,7 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, so we record that in the local stream and wait for it in the nonlocal one. */ if (nb->bUseTwoStreams) { - if (iloc == eintLocal) + if (iloc == InteractionLocality::Local) { cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); assert(CL_SUCCESS == cl_error); @@ -459,9 +465,9 @@ void nbnxn_gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, misc_ops_done event to record the point in time when the above operations are finished and synchronize with this event in the non-local stream. */ -void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, - int flags, - int iloc) +void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, + const int flags, + const Nbnxm::InteractionLocality iloc) { /* OpenCL kernel launch-related stuff */ cl_kernel nb_kernel = nullptr; /* fn pointer to the nonbonded kernel */ @@ -487,7 +493,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, clearing. All these operations, except for the local interaction kernel, are needed for the non-local interactions. The skip of the local kernel call is taken care of later in this function. 
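The locality test above selects the slice of the packed atom arrays to transfer: local atoms occupy the front of the xq/f buffers, non-local atoms the tail. Restated as a small helper for clarity; the name and struct are illustrative, the patch computes adat_begin/adat_len inline, and the non-local branch falls outside the quoted hunk:

    struct AtomRange
    {
        int begin; // first atom index of the slice
        int len;   // number of atoms in the slice
    };

    static AtomRange atomRange(const cl_atomdata_t       &adat,
                               const Nbnxm::AtomLocality  aloc)
    {
        if (aloc == Nbnxm::AtomLocality::Local)
        {
            return { 0, adat.natoms_local };               // [0, natoms_local)
        }
        return { adat.natoms_local,
                 adat.natoms - adat.natoms_local };        // the non-local tail
    }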
*/ - if (canSkipWork(nb, iloc)) + if (canSkipWork(*nb, iloc)) { plist->haveFreshList = false; @@ -500,7 +506,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, (that's the way the timing accounting can distinguish between separate prune kernel and combined force+prune). */ - nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, 1); + Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1); } if (plist->nsci == 0) @@ -513,7 +519,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->nb_k[iloc].openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(stream); } /* get the pointer to the kernel flavor we need to use */ @@ -521,7 +527,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, nbp->eeltype, nbp->vdwtype, bCalcEner, - (plist->haveFreshList && !nb->timers->didPrune[iloc])); + (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune)); /* kernel launch config */ @@ -545,7 +551,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, fillin_ocl_structures(nbp, &nbparams_params); - auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr; + auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; constexpr char kernelName[] = "k_calc_nb"; if (useLjCombRule(nb->nbparam->vdwtype)) { @@ -570,7 +576,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, if (bDoTime) { - t->nb_k[iloc].closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(stream); } } @@ -600,9 +606,9 @@ static inline int calc_shmem_required_prune(const int num_threads_z) return shmem; } -void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, - int iloc, - int numParts) +void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, + const InteractionLocality iloc, + const int numParts) { cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; @@ -656,7 +662,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, GpuRegionTimer *timer = nullptr; if (bDoTime) { - timer = &(plist->haveFreshList ? t->prune_k[iloc] : t->rollingPrune_k[iloc]); + timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k); } /* beginning of timed prune calculation section */ @@ -708,12 +714,12 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, { plist->haveFreshList = false; /* Mark that pruning has been done */ - nb->timers->didPrune[iloc] = true; + nb->timers->interaction[iloc].didPrune = true; } else { /* Mark that rolling pruning has been done */ - nb->timers->didRollingPrune[iloc] = true; + nb->timers->interaction[iloc].didRollingPrune = true; } if (bDoTime) @@ -726,29 +732,29 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, * Launch asynchronously the download of nonbonded forces from the GPU * (and energies/shift forces if required). 
*/ -void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, - struct nbnxn_atomdata_t *nbatom, - int flags, - int aloc, - bool haveOtherWork) +void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, + struct nbnxn_atomdata_t *nbatom, + const int flags, + const AtomLocality aloc, + const bool haveOtherWork) { cl_int gmx_unused cl_error; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* determine interaction locality from atom locality */ - int iloc = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc); - cl_atomdata_t *adat = nb->atdat; - cl_timers_t *t = nb->timers; - bool bDoTime = nb->bDoTime == CL_TRUE; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t *adat = nb->atdat; + cl_timers_t *t = nb->timers; + bool bDoTime = nb->bDoTime == CL_TRUE; + cl_command_queue stream = nb->stream[iloc]; - bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; - int bCalcFshift = flags & GMX_FORCE_VIRIAL; + bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; + int bCalcFshift = flags & GMX_FORCE_VIRIAL; /* don't launch non-local copy-back if there was no non-local work to do */ - if (!haveOtherWork && canSkipWork(nb, iloc)) + if (!haveOtherWork && canSkipWork(*nb, iloc)) { /* TODO An alternative way to signal that non-local work is complete is to use a clEnqueueMarker+clEnqueueBarrier @@ -767,19 +773,19 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, /* beginning of timed D2H section */ if (bDoTime) { - t->nb_d2h[iloc].openTimingRegion(stream); + t->xf[aloc].nb_d2h.openTimingRegion(stream); } /* With DD the local D2H transfer can only start after the non-local has been launched. */ - if (iloc == eintLocal && nb->bNonLocalStreamActive) + if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive) { sync_ocl_event(stream, &(nb->nonlocal_done)); } /* DtoH f */ ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float), - (adat_len)* adat->f_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(stream); @@ -789,7 +795,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, recorded which signals that the local D2H can proceed. This event is not placed after the non-local kernel because we first need the non-local data back first. */ - if (iloc == eintNonlocal) + if (iloc == InteractionLocality::NonLocal) { cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done)); assert(CL_SUCCESS == cl_error); @@ -797,35 +803,35 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, } /* only transfer energies in the local stream */ - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* DtoH fshift */ if (bCalcFshift) { ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, - SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ if (bCalcEner) { ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, - sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, - sizeof(float), stream, bDoTime ? t->nb_d2h[iloc].fetchNextEvent() : nullptr); + sizeof(float), stream, bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } } if (bDoTime) { - t->nb_d2h[iloc].closeTimingRegion(stream); + t->xf[aloc].nb_d2h.closeTimingRegion(stream); } } /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ -int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut) +int gpu_pick_ewald_kernel_type(const bool bTwinCut) { bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald; int kernel_type; @@ -879,3 +885,5 @@ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut) return kernel_type; } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 27a2cd806e..72be765441 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -50,6 +50,11 @@ #include +// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t +// is currently declared means this has to be before gpu_types.h +#include "nbnxm_ocl_types.h" + +// TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/hardware/gpu_hw_info.h" @@ -70,7 +75,9 @@ #include "gromacs/utility/smalloc.h" #include "nbnxm_ocl_internal.h" -#include "nbnxm_ocl_types.h" + +namespace Nbnxm +{ /*! \brief This parameter should be determined heuristically from the * kernel execution times @@ -276,7 +283,7 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic, else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD)) { /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */ - *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false); + *gpu_eeltype = gpu_pick_ewald_kernel_type(false); } else { @@ -398,11 +405,11 @@ static void init_nbparam(cl_nbparam_t *nbp, } //! This function is documented in the header file -void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, - const interaction_const_t *ic, - const NbnxnListParameters *listParams) +void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, + const interaction_const_t *ic, + const NbnxnListParameters *listParams) { - if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU) + if (!nbv || nbv->grp[InteractionLocality::Local].kernel_type != nbnxnk8x8x8_GPU) { return; } @@ -411,7 +418,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, set_cutoff_parameters(nbp, ic, listParams); - nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw); + nbp->eeltype = gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw); init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata); } @@ -440,19 +447,6 @@ static void init_plist(cl_plist_t *pl) pl->haveFreshList = false; } -/*! \brief Initializes the timer data structure. - */ -static void init_timers(cl_timers_t *t, - bool bUseTwoStreams) -{ - for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++) - { - t->didPairlistH2D[i] = false; - t->didPrune[i] = false; - t->didRollingPrune[i] = false; - } -} - /*! \brief Initializes the timings data structure. 
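Deleting init_timers() (just above) is safe because the flags in the new Interaction sub-struct of gpu_timers_t carry in-class initializers, so a freshly constructed timer object already has them cleared. The idiom in isolation, trimmed to the flags only:

    struct Interaction
    {
        bool didPairlistH2D  = false; // cleared on construction, so no
        bool didPrune        = false; // separate init_timers()-style loop
        bool didRollingPrune = false; // is needed any more
    };

    Interaction t; // t.didPrune == false without any explicit initialization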
 */
 static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
@@ -547,7 +541,7 @@ nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
     cl_int           cl_error;
     cl_atomdata_t   *adat = nb->atdat;
-    cl_command_queue ls   = nb->stream[eintLocal];
+    cl_command_queue ls   = nb->stream[InteractionLocality::Local];

     size_t           local_work_size[3]  = {1, 1, 1};
     size_t           global_work_size[3] = {1, 1, 1};
@@ -613,13 +607,13 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t *nb,

 //! This function is documented in the header file
-void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
-                    const gmx_device_info_t   *deviceInfo,
-                    const interaction_const_t *ic,
-                    const NbnxnListParameters *listParams,
-                    const nbnxn_atomdata_t    *nbat,
-                    int                        rank,
-                    gmx_bool                   bLocalAndNonlocal)
+void gpu_init(gmx_nbnxn_ocl_t          **p_nb,
+              const gmx_device_info_t   *deviceInfo,
+              const interaction_const_t *ic,
+              const NbnxnListParameters *listParams,
+              const nbnxn_atomdata_t    *nbat,
+              const int                  rank,
+              const gmx_bool             bLocalAndNonlocal)
 {
     gmx_nbnxn_ocl_t *nb;
     cl_int           cl_error;
@@ -635,10 +629,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
     snew(nb, 1);
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
-    snew(nb->plist[eintLocal], 1);
+    snew(nb->plist[InteractionLocality::Local], 1);
     if (bLocalAndNonlocal)
     {
-        snew(nb->plist[eintNonlocal], 1);
+        snew(nb->plist[InteractionLocality::NonLocal], 1);
     }

     nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
@@ -655,7 +649,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
     pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));

-    init_plist(nb->plist[eintLocal]);
+    init_plist(nb->plist[InteractionLocality::Local]);

     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
     nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
@@ -673,7 +667,8 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
     nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);

     /* local/non-local GPU streams */
-    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    nb->stream[InteractionLocality::Local] =
+        clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
     if (CL_SUCCESS != cl_error)
     {
         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -684,9 +679,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,

     if (nb->bUseTwoStreams)
     {
-        init_plist(nb->plist[eintNonlocal]);
+        init_plist(nb->plist[InteractionLocality::NonLocal]);

-        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        nb->stream[InteractionLocality::NonLocal] =
+            clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
         if (CL_SUCCESS != cl_error)
         {
             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -698,7 +694,6 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,

     if (nb->bDoTime)
     {
-        init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
         init_timings(nb->timings);
     }

@@ -742,7 +737,7 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
     cl_int gmx_used_in_debug cl_error;

     cl_atomdata_t           *atomData = nb->atdat;
-    cl_command_queue         ls       = nb->stream[eintLocal];
+    cl_command_queue         ls       = nb->stream[InteractionLocality::Local];
     cl_float                 value    = 0.0f;

     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
@@
-753,8 +748,8 @@ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear) //! This function is documented in the header file void -nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t *nb, - int flags) +gpu_clear_outputs(gmx_nbnxn_ocl_t *nb, + const int flags) { nbnxn_ocl_clear_f(nb, nb->atdat->natoms); /* clear shift force array and energies if the outputs were @@ -766,14 +761,14 @@ nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t *nb, /* kick off buffer clearing kernel to ensure concurrency with constraints/update */ cl_int gmx_unused cl_error; - cl_error = clFlush(nb->stream[eintLocal]); + cl_error = clFlush(nb->stream[InteractionLocality::Local]); assert(CL_SUCCESS == cl_error); } //! This function is documented in the header file -void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, - const NbnxnPairlistGpu *h_plist, - int iloc) +void gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, + const NbnxnPairlistGpu *h_plist, + const InteractionLocality iloc) { char sbuf[STRLEN]; // Timing accumulation should happen only if there was work to do @@ -797,10 +792,12 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } } + gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc]; + if (bDoTime) { - nb->timers->pl_h2d[iloc].openTimingRegion(stream); - nb->timers->didPairlistH2D[iloc] = true; + iTimers.pl_h2d.openTimingRegion(stream); + iTimers.didPairlistH2D = true; } // TODO most of this function is same in CUDA and OpenCL, move into the header @@ -810,13 +807,13 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, &d_plist->nsci, &d_plist->sci_nalloc, context); copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context); copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit, &d_plist->nimask, &d_plist->imask_nalloc, context); @@ -825,11 +822,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, &d_plist->nexcl, &d_plist->excl_nalloc, context); copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, GpuApiCallBehavior::Async, - bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr); + bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - nb->timers->pl_h2d[iloc].closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(stream); } /* need to prune the pair list during the next step */ @@ -837,11 +834,11 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t *nb, } //! This function is documented in the header file -void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, - const nbnxn_atomdata_t *nbatom) +void gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, + const nbnxn_atomdata_t *nbatom) { cl_atomdata_t *adat = nb->atdat; - cl_command_queue ls = nb->stream[eintLocal]; + cl_command_queue ls = nb->stream[InteractionLocality::Local]; /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -853,8 +850,8 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb, } //! 
This function is documented in the header file -void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, - const nbnxn_atomdata_t *nbat) +void gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, + const nbnxn_atomdata_t *nbat) { cl_int cl_error; int nalloc, natoms; @@ -862,7 +859,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t *nb, bool bDoTime = nb->bDoTime == CL_TRUE; cl_timers_t *timers = nb->timers; cl_atomdata_t *d_atdat = nb->atdat; - cl_command_queue ls = nb->stream[eintLocal]; + cl_command_queue ls = nb->stream[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -1011,7 +1008,7 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData) } //! This function is documented in the header file -void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) +void gpu_free(gmx_nbnxn_ocl_t *nb) { if (nb == nullptr) { @@ -1051,7 +1048,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) sfree(nb->nbparam); /* Free plist */ - auto *plist = nb->plist[eintLocal]; + auto *plist = nb->plist[InteractionLocality::Local]; freeDeviceBuffer(&plist->sci); freeDeviceBuffer(&plist->cj4); freeDeviceBuffer(&plist->imask); @@ -1059,7 +1056,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) sfree(plist); if (nb->bUseTwoStreams) { - auto *plist_nl = nb->plist[eintNonlocal]; + auto *plist_nl = nb->plist[InteractionLocality::NonLocal]; freeDeviceBuffer(&plist_nl->sci); freeDeviceBuffer(&plist_nl->cj4); freeDeviceBuffer(&plist_nl->imask); @@ -1078,12 +1075,12 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) nb->nbst.fshift = nullptr; /* Free command queues */ - clReleaseCommandQueue(nb->stream[eintLocal]); - nb->stream[eintLocal] = nullptr; + clReleaseCommandQueue(nb->stream[InteractionLocality::Local]); + nb->stream[InteractionLocality::Local] = nullptr; if (nb->bUseTwoStreams) { - clReleaseCommandQueue(nb->stream[eintNonlocal]); - nb->stream[eintNonlocal] = nullptr; + clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]); + nb->stream[InteractionLocality::NonLocal] = nullptr; } /* Free other events */ if (nb->nonlocal_done) @@ -1112,13 +1109,13 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb) } //! This function is documented in the header file -gmx_wallclock_gpu_nbnxn_t *nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb) +gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb) { return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr; } //! This function is documented in the header file -void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) +void gpu_reset_timings(nonbonded_verlet_t* nbv) { if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime) { @@ -1127,15 +1124,17 @@ void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv) } //! This function is documented in the header file -int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb) +int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb) { return nb != nullptr ? gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0; } //! This function is documented in the header file -gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb) +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb) { return ((nb->nbparam->eeltype == eelOclEWALD_ANA) || (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN)); } + +} // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h index 318e72623d..c735ab5d48 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_internal.h @@ -46,6 +46,9 @@ #ifndef NBNXN_OCL_INTERNAL_H #define NBNXN_OCL_INTERNAL_H +namespace Nbnxm +{ + /*! 
\brief Returns true if LJ combination rules are used in the non-bonded kernels.
 *
 * \param[in] vdwType  The VdW interaction/implementation type as defined by evdwOcl in nbnxn_ocl_types.h.
 *
 */
 bool useLjCombRule(int vdwType);

+} // namespace Nbnxm
+
 #endif /* NBNXN_OCL_INTERNAL_H */
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 75279fb3cd..57d945bdcb 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -51,7 +51,9 @@
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/real.h"

@@ -243,12 +245,12 @@ typedef struct cl_nbparam_params
 /*! \internal
  * \brief Pair list data.
  */
-using cl_plist_t = gpu_plist;
+using cl_plist_t = Nbnxm::gpu_plist;

 /** \internal
  * \brief Typedef of actual timer type.
  */
-typedef struct nbnxn_gpu_timers_t cl_timers_t;
+typedef struct Nbnxm::gpu_timers_t cl_timers_t;

 /*! \internal
  * \brief Main data structure for OpenCL nonbonded force calculations.
@@ -278,15 +280,15 @@ struct gmx_nbnxn_ocl_t
     cl_kernel kernel_zero_e_fshift;
     ///@}

-    cl_bool bUseTwoStreams;        /**< true if doing both local/non-local NB work on GPU        */
-    cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */
+    cl_bool bUseTwoStreams;        /**< true if doing both local/non-local NB work on GPU        */
+    cl_bool bNonLocalStreamActive; /**< true indicates that the nonlocal_done event was enqueued */

-    cl_atomdata_t   *atdat;    /**< atom data */
-    cl_nbparam_t    *nbparam;  /**< parameters required for the non-bonded calc. */
-    cl_plist_t      *plist[2]; /**< pair-list data structures (local and non-local) */
-    cl_nb_staging_t  nbst;     /**< staging area where fshift/energies get downloaded */
+    cl_atomdata_t   *atdat;   /**< atom data */
+    cl_nbparam_t    *nbparam; /**< parameters required for the non-bonded calc. */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_plist_t *> plist; /**< pair-list data structures (local and non-local) */
+    cl_nb_staging_t  nbst;    /**< staging area where fshift/energies get downloaded */

-    cl_command_queue stream[2]; /**< local and non-local GPU queues */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream; /**< local and non-local GPU queues */

     /** events used for synchronization */
     cl_event nonlocal_done;    /**< event triggered when the non-local non-bonded kernel
diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp
index 17dd8e09fd..16f940a71a 100644
--- a/src/gromacs/nbnxm/pairlist.cpp
+++ b/src/gromacs/nbnxm/pairlist.cpp
@@ -75,6 +75,8 @@

 using namespace gmx;        // TODO: Remove when this file is moved into gmx namespace

+// Convenience alias for partial Nbnxm namespace usage
+using InteractionLocality = Nbnxm::InteractionLocality;

 /* We shift the i-particles backward for PBC.
  * This leads to more conditionals than shifting forward.
@@ -2650,12 +2652,12 @@ static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, } /* Estimates the average size of a full j-list for super/sub setup */ -static void get_nsubpair_target(const nbnxn_search *nbs, - int iloc, - real rlist, - int min_ci_balanced, - int *nsubpair_target, - float *nsubpair_tot_est) +static void get_nsubpair_target(const nbnxn_search *nbs, + const InteractionLocality iloc, + const real rlist, + const int min_ci_balanced, + int *nsubpair_target, + float *nsubpair_tot_est) { /* The target value of 36 seems to be the optimum for Kepler. * Maxwell is less sensitive to the exact value. @@ -2702,7 +2704,7 @@ static void get_nsubpair_target(const nbnxn_search *nbs, nonlocal_vol2(nbs->zones, ls, r_eff_sup); } - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* Sub-cell interacts with itself */ vol_est = ls[XX]*ls[YY]*ls[ZZ]; @@ -4077,15 +4079,15 @@ static void sort_sci(NbnxnPairlistGpu *nbl) } /* Make a local or non-local pair-list, depending on iloc */ -void nbnxn_make_pairlist(nbnxn_search *nbs, - nbnxn_atomdata_t *nbat, - const t_blocka *excl, - real rlist, - int min_ci_balanced, - nbnxn_pairlist_set_t *nbl_list, - int iloc, - int nb_kernel_type, - t_nrnb *nrnb) +void nbnxn_make_pairlist(nbnxn_search *nbs, + nbnxn_atomdata_t *nbat, + const t_blocka *excl, + const real rlist, + const int min_ci_balanced, + nbnxn_pairlist_set_t *nbl_list, + const InteractionLocality iloc, + const int nb_kernel_type, + t_nrnb *nrnb) { int nsubpair_target; float nsubpair_tot_est; @@ -4105,13 +4107,13 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, nbat->bUseBufferFlags = (nbat->out.size() > 1); /* We should re-init the flags before making the first list */ - if (nbat->bUseBufferFlags && LOCAL_I(iloc)) + if (nbat->bUseBufferFlags && iloc == InteractionLocality::Local) { init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms()); } int nzi; - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { /* Only zone (grid) 0 vs 0 */ nzi = 1; @@ -4156,7 +4158,7 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, int zj0; int zj1; - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { zj0 = 0; zj1 = 1; @@ -4186,7 +4188,7 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, /* With GPU: generate progressively smaller lists for * load balancing for local only or non-local with 2 zones. */ - progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2); + progBal = (iloc == InteractionLocality::Local || nbs->zones->n <= 2); #pragma omp parallel for num_threads(nnbl) schedule(static) for (int th = 0; th < nnbl; th++) @@ -4342,12 +4344,12 @@ void nbnxn_make_pairlist(nbnxn_search *nbs, } /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */ - if (LOCAL_I(iloc)) + if (iloc == InteractionLocality::Local) { nbs->search_count++; } if (nbs->print_cycles && - (!nbs->DomDec || !LOCAL_I(iloc)) && + (!nbs->DomDec || iloc == InteractionLocality::NonLocal) && nbs->search_count % 100 == 0) { nbs_cycle_print(stderr, nbs); diff --git a/src/gromacs/nbnxm/pairlistset.h b/src/gromacs/nbnxm/pairlistset.h index 0e92eb3cde..885e8f07be 100644 --- a/src/gromacs/nbnxm/pairlistset.h +++ b/src/gromacs/nbnxm/pairlistset.h @@ -40,6 +40,8 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "locality.h" + struct gmx_domdec_zones_t; struct gmx_groups_t; struct nbnxn_atomdata_t; @@ -86,15 +88,15 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, * for the number of equally sized lists is below min_ci_balanced. 
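With the macros gone, every locality test in the search code is an ordinary comparison against the enum class, as in iloc == InteractionLocality::Local above; unlike LOCAL_I(), such a comparison cannot be handed an AtomLocality or a bare int. A before/after sketch wrapped in a hypothetical helper to make it self-contained:

    using Nbnxm::InteractionLocality;

    static bool isLocalSearch(const InteractionLocality iloc)
    {
        // Old: return LOCAL_I(iloc);   // iloc was a plain int
        // New: a typed comparison; an AtomLocality argument no longer compiles.
        return iloc == InteractionLocality::Local;
    }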
* With perturbed particles, also a group scheme style nbl_fep list is made. */ -void nbnxn_make_pairlist(nbnxn_search *nbs, - nbnxn_atomdata_t *nbat, - const t_blocka *excl, - real rlist, - int min_ci_balanced, - nbnxn_pairlist_set_t *nbl_list, - int iloc, - int nb_kernel_type, - t_nrnb *nrnb); +void nbnxn_make_pairlist(nbnxn_search *nbs, + nbnxn_atomdata_t *nbat, + const t_blocka *excl, + real rlist, + int min_ci_balanced, + nbnxn_pairlist_set_t *nbl_list, + Nbnxm::InteractionLocality iloc, + int nb_kernel_type, + t_nrnb *nrnb); /*! \brief Prepare the list-set produced by the search for dynamic pruning * diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp index 96a1883701..76938e744d 100644 --- a/src/gromacs/nbnxm/prunekerneldispatch.cpp +++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp @@ -45,9 +45,9 @@ #include "kernels_simd_4xm/kernel_prune.h" -void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, - int ilocality, - const rvec *shift_vec) +void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, + const Nbnxm::InteractionLocality ilocality, + const rvec *shift_vec) { nonbonded_verlet_group_t &nbvg = nbv->grp[ilocality]; nbnxn_pairlist_set_t *nbl_lists = &nbvg.nbl_lists;
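Taken together, the renames mean a caller drives one locality of GPU work entirely through namespace-qualified entry points with typed locality arguments. A representative local-only sequence, assuming nbv, flags, haveOtherWork, e_lj, e_el and fshift are set up as in the mdlib callers above:

    Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat,
                              Nbnxm::AtomLocality::Local, haveOtherWork);
    Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, flags,
                             Nbnxm::InteractionLocality::Local);
    Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat, flags,
                              Nbnxm::AtomLocality::Local, haveOtherWork);
    Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, flags,
                                Nbnxm::AtomLocality::Local, haveOtherWork,
                                e_lj, e_el, fshift);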