Merge branch 'release-2019' into master
[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
index 4b0992916ab6f12b19a59b5ead139840cf39b156..3d5578a664b07f0849877da60a1a25f0b68985c1 100644 (file)
@@ -49,6 +49,7 @@
 
 #include "gromacs/awh/awh.h"
 #include "gromacs/domdec/dlbtiming.h"
+#include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/imd/imd.h"
-#include "gromacs/listed-forces/bonded.h"
-#include "gromacs/listed-forces/disre.h"
-#include "gromacs/listed-forces/gpubonded.h"
-#include "gromacs/listed-forces/manage-threading.h"
-#include "gromacs/listed-forces/orires.h"
+#include "gromacs/listed_forces/bonded.h"
+#include "gromacs/listed_forces/disre.h"
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/manage_threading.h"
+#include "gromacs/listed_forces/orires.h"
 #include "gromacs/math/arrayrefwithpadding.h"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/units.h"
 #include "gromacs/mdlib/forcerec.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/mdrun.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_atomdata.h"
-#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
-#include "gromacs/mdlib/nbnxn_grid.h"
-#include "gromacs/mdlib/nbnxn_search.h"
 #include "gromacs/mdlib/ppforceworkload.h"
 #include "gromacs/mdlib/qmmm.h"
 #include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
 #include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/forceoutput.h"
 #include "gromacs/mdtypes/iforceprovider.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/mdtypes/state.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/mshift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/logger.h"
-#include "gromacs/utility/pleasecite.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/strconvert.h"
 #include "gromacs/utility/sysinfo.h"
 
-#include "nbnxn_gpu.h"
-#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
-#include "nbnxn_kernels/nbnxn_kernel_prune.h"
-
 // TODO: this environment variable allows us to verify before release
 // that on less common architectures the total cost of polling is not larger than
 // a blocking wait (so polling does not introduce overhead when the static
@@ -393,14 +387,15 @@ static void post_process_forces(const t_commrec           *cr,
     }
 }
 
-static void do_nb_verlet(const t_forcerec *fr,
-                         const interaction_const_t *ic,
-                         gmx_enerdata_t *enerd,
-                         int flags, int ilocality,
-                         int clearF,
-                         int64_t step,
-                         t_nrnb *nrnb,
-                         gmx_wallcycle_t wcycle)
+static void do_nb_verlet(t_forcerec                       *fr,
+                         const interaction_const_t        *ic,
+                         gmx_enerdata_t                   *enerd,
+                         const int                         flags,
+                         const Nbnxm::InteractionLocality  ilocality,
+                         const int                         clearF,
+                         const int64_t                     step,
+                         t_nrnb                           *nrnb,
+                         gmx_wallcycle_t                   wcycle)
 {
     if (!(flags & GMX_FORCE_NONBONDED))
     {
@@ -431,106 +426,19 @@ static void do_nb_verlet(const t_forcerec *fr,
              * the current coordinates of the atoms.
              */
             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
-            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
+            NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
         }
 
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
 
-    switch (nbvg->kernel_type)
-    {
-        case nbnxnk4x4_PlainC:
-        case nbnxnk4xN_SIMD_4xN:
-        case nbnxnk4xN_SIMD_2xNN:
-            nbnxn_kernel_cpu(nbvg,
-                             nbv->nbat,
-                             ic,
-                             fr->shift_vec,
-                             flags,
-                             clearF,
-                             fr->fshift[0],
-                             enerd->grpp.ener[egCOULSR],
-                             fr->bBHAM ?
-                             enerd->grpp.ener[egBHAMSR] :
-                             enerd->grpp.ener[egLJSR]);
-            break;
-
-        case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
-            break;
+    NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
 
-        case nbnxnk8x8x8_PlainC:
-            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
-                                 nbv->nbat, ic,
-                                 fr->shift_vec,
-                                 flags,
-                                 clearF,
-                                 nbv->nbat->out[0].f,
-                                 fr->fshift[0],
-                                 enerd->grpp.ener[egCOULSR],
-                                 fr->bBHAM ?
-                                 enerd->grpp.ener[egBHAMSR] :
-                                 enerd->grpp.ener[egLJSR]);
-            break;
-
-        default:
-            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
-
-    }
     if (!bUsingGpuKernels)
     {
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
-
-    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
-    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
-    {
-        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
-    }
-    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
-             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
-    {
-        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
-    }
-    else
-    {
-        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
-    }
-    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
-    if (flags & GMX_FORCE_ENERGY)
-    {
-        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
-        enr_nbnxn_kernel_ljc += 1;
-        enr_nbnxn_kernel_lj  += 1;
-    }
-
-    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
-             nbvg->nbl_lists.natpair_ljq);
-    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
-             nbvg->nbl_lists.natpair_lj);
-    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
-    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
-             nbvg->nbl_lists.natpair_q);
-
-    if (ic->vdw_modifier == eintmodFORCESWITCH)
-    {
-        /* We add up the switch cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
-                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
-    }
-    if (ic->vdw_modifier == eintmodPOTSWITCH)
-    {
-        /* We add up the switch cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
-                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
-    }
-    if (ic->vdwtype == evdwPME)
-    {
-        /* We add up the LJ Ewald cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
-                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
-    }
 }
 
 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
@@ -952,11 +860,12 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
         {
             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
-            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
-                                                    flags, eatLocal,
-                                                    haveOtherWork,
-                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                                    fshift, completionType);
+            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
+                                                     flags,
+                                                     Nbnxm::AtomLocality::Local,
+                                                     haveOtherWork,
+                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                                     fshift, completionType);
             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
             // To get the call count right, when the task finished we
             // issue a start/stop.
@@ -967,7 +876,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
-                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                                nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
             }
         }
@@ -998,15 +907,15 @@ static inline void launchGpuRollingPruning(const t_commrec          *cr,
      */
     int  numRollingParts     = nbv->listParams->numRollingParts;
     GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
-    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+    int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
     bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
     if (stepWithCurrentList > 0 &&
         stepWithCurrentList < inputrec->nstlist - 1 &&
         (stepIsEven || DOMAINDECOMP(cr)))
     {
-        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
-                                          stepIsEven ? eintLocal : eintNonlocal,
-                                          numRollingParts);
+        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
+                                           numRollingParts);
     }
 }
 
@@ -1049,7 +958,7 @@ static void do_force_cutsVERLET(FILE *fplog,
     nonbonded_verlet_t *nbv = fr->nbv;
 
     bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
-    bNS           = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
+    bNS           = ((flags & GMX_FORCE_NS) != 0);
     bFillGrid     = (bNS && bStateChanged);
     bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
     bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
@@ -1177,7 +1086,7 @@ static void do_force_cutsVERLET(FILE *fplog,
                               nullptr, 0, mdatoms->homenr, -1,
                               fr->cginfo, x.unpaddedArrayRef(),
                               0, nullptr,
-                              nbv->grp[eintLocal].kernel_type,
+                              nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                               nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
         }
@@ -1186,7 +1095,7 @@ static void do_force_cutsVERLET(FILE *fplog,
             wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
             nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                        fr->cginfo, x.unpaddedArrayRef(),
-                                       nbv->grp[eintNonlocal].kernel_type,
+                                       nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                        nbv->nbat);
             wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
         }
@@ -1204,10 +1113,10 @@ static void do_force_cutsVERLET(FILE *fplog,
 
         if (bNS)
         {
-            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
+            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
         }
 
-        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
 
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
@@ -1223,9 +1132,9 @@ static void do_force_cutsVERLET(FILE *fplog,
             // higher-level object than the nb module.
             fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                   top->idef,
-                                                                  nbnxn_gpu_get_xq(nbv->gpu_nbv),
-                                                                  nbnxn_gpu_get_f(nbv->gpu_nbv),
-                                                                  nbnxn_gpu_get_fshift(nbv->gpu_nbv));
+                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
+                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
             ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
         }
 
@@ -1235,35 +1144,38 @@ static void do_force_cutsVERLET(FILE *fplog,
     /* do local pair search */
     if (bNS)
     {
+        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
+
         wallcycle_start_nocount(wcycle, ewcNS);
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
         nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                             &top->excls,
                             nbv->listParams->rlistOuter,
                             nbv->min_ci_balanced,
-                            &nbv->grp[eintLocal].nbl_lists,
-                            eintLocal,
-                            nbv->grp[eintLocal].kernel_type,
+                            &pairlistSet,
+                            Nbnxm::InteractionLocality::Local,
+                            nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                             nrnb);
-        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+        pairlistSet.outerListCreationStep = step;
         if (nbv->listParams->useDynamicPruning && !bUseGPU)
         {
-            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+            nbnxnPrepareListForDynamicPruning(&pairlistSet);
         }
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
 
         if (bUseGPU)
         {
             /* initialize local pair-list on the GPU */
-            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
-                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
-                                    eintLocal);
+            Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+                                     pairlistSet.nblGpu[0],
+                                     Nbnxm::InteractionLocality::Local);
         }
         wallcycle_stop(wcycle, ewcNS);
     }
     else
     {
-        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
+                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                         nbv->nbat, wcycle);
     }
 
@@ -1277,7 +1189,7 @@ static void do_force_cutsVERLET(FILE *fplog,
         wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
+        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         // bonded work not split into separate local and non-local, so with DD
@@ -1291,7 +1203,7 @@ static void do_force_cutsVERLET(FILE *fplog,
 
         /* launch local nonbonded work on GPU */
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                      step, nrnb, wcycle);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1310,6 +1222,8 @@ static void do_force_cutsVERLET(FILE *fplog,
        do non-local pair search */
     if (DOMAINDECOMP(cr))
     {
+        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
+
         if (bNS)
         {
             wallcycle_start_nocount(wcycle, ewcNS);
@@ -1319,23 +1233,23 @@ static void do_force_cutsVERLET(FILE *fplog,
                                 &top->excls,
                                 nbv->listParams->rlistOuter,
                                 nbv->min_ci_balanced,
-                                &nbv->grp[eintNonlocal].nbl_lists,
-                                eintNonlocal,
-                                nbv->grp[eintNonlocal].kernel_type,
+                                &pairlistSet,
+                                Nbnxm::InteractionLocality::NonLocal,
+                                nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                 nrnb);
-            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+            pairlistSet.outerListCreationStep = step;
             if (nbv->listParams->useDynamicPruning && !bUseGPU)
             {
-                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+                nbnxnPrepareListForDynamicPruning(&pairlistSet);
             }
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
 
-            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
+            if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
             {
                 /* initialize non-local pair-list on the GPU */
-                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
-                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
-                                        eintNonlocal);
+                Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+                                         pairlistSet.nblGpu[0],
+                                         Nbnxm::InteractionLocality::NonLocal);
             }
             wallcycle_stop(wcycle, ewcNS);
         }
@@ -1343,7 +1257,8 @@ static void do_force_cutsVERLET(FILE *fplog,
         {
             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
 
-            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
+                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                             nbv->nbat, wcycle);
         }
 
@@ -1353,7 +1268,7 @@ static void do_force_cutsVERLET(FILE *fplog,
 
             /* launch non-local nonbonded tasks on GPU */
             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
             if (ppForceWorkload->haveGpuBondedWork)
@@ -1364,7 +1279,7 @@ static void do_force_cutsVERLET(FILE *fplog,
             }
 
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
@@ -1379,11 +1294,11 @@ static void do_force_cutsVERLET(FILE *fplog,
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         if (DOMAINDECOMP(cr))
         {
-            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
-                                     flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
         }
-        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
-                                 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
+        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1490,7 +1405,7 @@ static void do_force_cutsVERLET(FILE *fplog,
 
     if (!bUseOrEmulGPU)
     {
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                      step, nrnb, wcycle);
     }
 
@@ -1499,18 +1414,18 @@ static void do_force_cutsVERLET(FILE *fplog,
         /* Calculate the local and non-local free energy interactions here.
          * Happens here on the CPU both with and without GPU.
          */
-        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
         {
-            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                              inputrec->fepvals, lambda,
                              enerd, flags, nrnb, wcycle);
         }
 
         if (DOMAINDECOMP(cr) &&
-            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+            fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
         {
-            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
                              fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                              inputrec->fepvals, lambda,
                              enerd, flags, nrnb, wcycle);
@@ -1519,22 +1434,14 @@ static void do_force_cutsVERLET(FILE *fplog,
 
     if (!bUseOrEmulGPU)
     {
-        int aloc;
-
         if (DOMAINDECOMP(cr))
         {
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                          step, nrnb, wcycle);
         }
 
-        if (!bUseOrEmulGPU)
-        {
-            aloc = eintLocal;
-        }
-        else
-        {
-            aloc = eintNonlocal;
-        }
+        const Nbnxm::InteractionLocality iloc =
+            (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
 
         /* Add all the non-bonded force to the normal force array.
          * This can be split into a local and a non-local part when overlapping
@@ -1542,13 +1449,13 @@ static void do_force_cutsVERLET(FILE *fplog,
          */
         wallcycle_stop(wcycle, ewcFORCE);
 
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
 
         wallcycle_start_nocount(wcycle, ewcFORCE);
 
         /* if there are multiple fshift output buffers reduce them */
         if ((flags & GMX_FORCE_VIRIAL) &&
-            nbv->grp[aloc].nbl_lists.nnbl > 1)
+            nbv->grp[iloc].nbl_lists.nnbl > 1)
         {
             /* This is not in a subcounter because it takes a
                negligible and constant-sized amount of time */
@@ -1586,25 +1493,25 @@ static void do_force_cutsVERLET(FILE *fplog,
             if (bUseGPU)
             {
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
-                                           flags, eatNonlocal,
-                                           ppForceWorkload->haveGpuBondedWork,
-                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                           fr->fshift);
+                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                            flags, Nbnxm::AtomLocality::NonLocal,
+                                            ppForceWorkload->haveGpuBondedWork,
+                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                            fr->fshift);
                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
             }
             else
             {
                 wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+                do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                              step, nrnb, wcycle);
                 wallcycle_stop(wcycle, ewcFORCE);
             }
 
             /* skip the reduction if there was no non-local work to do */
-            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+            if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
             {
-                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                                nbv->nbat, f, wcycle);
             }
         }
@@ -1651,10 +1558,10 @@ static void do_force_cutsVERLET(FILE *fplog,
         const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
 
         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
-                                   flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
-                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                   fr->fshift);
+        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                    fr->fshift);
         float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
         if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
@@ -1679,7 +1586,7 @@ static void do_force_cutsVERLET(FILE *fplog,
         // NOTE: emulation kernel is not included in the balancing region,
         // but emulation mode does not target performance anyway
         wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                      DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                      step, nrnb, wcycle);
         wallcycle_stop(wcycle, ewcFORCE);
@@ -1695,7 +1602,7 @@ static void do_force_cutsVERLET(FILE *fplog,
         /* now clear the GPU outputs while we finish the step on the CPU */
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
 
         /* Is dynamic pair-list pruning activated? */
         if (nbv->listParams->useDynamicPruning)
@@ -1726,7 +1633,7 @@ static void do_force_cutsVERLET(FILE *fplog,
      * on the non-alternating path. */
     if (bUseOrEmulGPU && !alternateGpuWait)
     {
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                        nbv->nbat, f, wcycle);
     }
     if (DOMAINDECOMP(cr))
@@ -1836,7 +1743,7 @@ static void do_force_cutsGROUP(FILE *fplog,
     }
 
     bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
-    bNS            = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
+    bNS            = ((flags & GMX_FORCE_NS) != 0);
     /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
     bFillGrid      = (bNS && bStateChanged);
     bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
@@ -2837,7 +2744,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
 
     if (printReport)
     {
-        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
         if (pme_gpu_task_enabled(pme))
         {
@@ -2871,49 +2778,59 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
     }
 }
 
-extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
+void initialize_lambdas(FILE               *fplog,
+                        const t_inputrec   &ir,
+                        bool                isMaster,
+                        int                *fep_state,
+                        gmx::ArrayRef<real> lambda,
+                        double             *lam0)
 {
-    /* this function works, but could probably use a logic rewrite to keep all the different
-       types of efep straight. */
+    /* TODO: Clean up initialization of fep_state and lambda in
+       t_state.  This function works, but could probably use a logic
+       rewrite to keep all the different types of efep straight. */
 
-    if ((ir->efep == efepNO) && (!ir->bSimTemp))
+    if ((ir.efep == efepNO) && (!ir.bSimTemp))
     {
         return;
     }
 
-    t_lambda *fep = ir->fepvals;
-    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
-                                            if checkpoint is set -- a kludge is in for now
-                                            to prevent this.*/
+    const t_lambda *fep = ir.fepvals;
+    if (isMaster)
+    {
+        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
+                                             if checkpoint is set -- a kludge is in for now
+                                             to prevent this.*/
+    }
 
     for (int i = 0; i < efptNR; i++)
     {
+        double thisLambda;
         /* overwrite lambda state with init_lambda for now for backwards compatibility */
-        if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
+        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
         {
-            lambda[i] = fep->init_lambda;
-            if (lam0)
-            {
-                lam0[i] = lambda[i];
-            }
+            thisLambda = fep->init_lambda;
         }
         else
         {
-            lambda[i] = fep->all_lambda[i][*fep_state];
-            if (lam0)
-            {
-                lam0[i] = lambda[i];
-            }
+            thisLambda = fep->all_lambda[i][fep->init_fep_state];
+        }
+        if (isMaster)
+        {
+            lambda[i] = thisLambda;
+        }
+        if (lam0 != nullptr)
+        {
+            lam0[i] = thisLambda;
         }
     }
-    if (ir->bSimTemp)
+    if (ir.bSimTemp)
     {
         /* need to rescale control temperatures to match current state */
-        for (int i = 0; i < ir->opts.ngtc; i++)
+        for (int i = 0; i < ir.opts.ngtc; i++)
         {
-            if (ir->opts.ref_t[i] > 0)
+            if (ir.opts.ref_t[i] > 0)
             {
-                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
+                ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
             }
         }
     }
@@ -2929,135 +2846,3 @@ extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx:
         fprintf(fplog, "]\n");
     }
 }
-
-
-void init_md(FILE *fplog,
-             const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
-             t_inputrec *ir, const gmx_output_env_t *oenv,
-             const MdrunOptions &mdrunOptions,
-             double *t, double *t0,
-             t_state *globalState, double *lam0,
-             t_nrnb *nrnb, gmx_mtop_t *mtop,
-             gmx_update_t **upd,
-             gmx::BoxDeformation *deform,
-             int nfile, const t_filenm fnm[],
-             gmx_mdoutf_t *outf, t_mdebin **mdebin,
-             tensor force_vir, tensor shake_vir,
-             tensor total_vir, tensor pres, rvec mu_tot,
-             gmx_bool *bSimAnn, t_vcm **vcm,
-             gmx_wallcycle_t wcycle)
-{
-    int  i;
-
-    /* Initial values */
-    *t = *t0       = ir->init_t;
-
-    *bSimAnn = FALSE;
-    for (i = 0; i < ir->opts.ngtc; i++)
-    {
-        /* set bSimAnn if any group is being annealed */
-        if (ir->opts.annealing[i] != eannNO)
-        {
-            *bSimAnn = TRUE;
-        }
-    }
-
-    /* Initialize lambda variables */
-    /* TODO: Clean up initialization of fep_state and lambda in t_state.
-     * We currently need to call initialize_lambdas on non-master ranks
-     * to initialize lam0.
-     */
-    if (MASTER(cr))
-    {
-        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
-    }
-    else
-    {
-        int                      tmpFepState;
-        std::array<real, efptNR> tmpLambda;
-        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
-    }
-
-    // TODO upd is never NULL in practice, but the analysers don't know that
-    if (upd)
-    {
-        *upd = init_update(ir, deform);
-    }
-    if (*bSimAnn)
-    {
-        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
-    }
-
-    if (vcm != nullptr)
-    {
-        *vcm = init_vcm(fplog, &mtop->groups, ir);
-    }
-
-    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
-    {
-        if (ir->etc == etcBERENDSEN)
-        {
-            please_cite(fplog, "Berendsen84a");
-        }
-        if (ir->etc == etcVRESCALE)
-        {
-            please_cite(fplog, "Bussi2007a");
-        }
-        if (ir->eI == eiSD1)
-        {
-            please_cite(fplog, "Goga2012");
-        }
-    }
-    init_nrnb(nrnb);
-
-    if (nfile != -1)
-    {
-        *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
-
-        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
-                              mtop, ir, mdoutf_get_fp_dhdl(*outf));
-    }
-
-    /* Initiate variables */
-    clear_mat(force_vir);
-    clear_mat(shake_vir);
-    clear_rvec(mu_tot);
-    clear_mat(total_vir);
-    clear_mat(pres);
-}
-
-void init_rerun(FILE *fplog,
-                const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
-                t_inputrec *ir, const gmx_output_env_t *oenv,
-                const MdrunOptions &mdrunOptions,
-                t_state *globalState, double *lam0,
-                t_nrnb *nrnb, gmx_mtop_t *mtop,
-                int nfile, const t_filenm fnm[],
-                gmx_mdoutf_t *outf, t_mdebin **mdebin,
-                gmx_wallcycle_t wcycle)
-{
-    /* Initialize lambda variables */
-    /* TODO: Clean up initialization of fep_state and lambda in t_state.
-     * We currently need to call initialize_lambdas on non-master ranks
-     * to initialize lam0.
-     */
-    if (MASTER(cr))
-    {
-        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
-    }
-    else
-    {
-        int                      tmpFepState;
-        std::array<real, efptNR> tmpLambda;
-        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
-    }
-
-    init_nrnb(nrnb);
-
-    if (nfile != -1)
-    {
-        *outf   = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
-        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
-                              mtop, ir, mdoutf_get_fp_dhdl(*outf), true);
-    }
-}