Merge branch 'release-2019' into master

[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.cpp
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 4b0992916ab6f12b19a59b5ead139840cf39b156..3d5578a664b07f0849877da60a1a25f0b68985c1 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -49,6 +49,7 @@
  
  #include "gromacs/awh/awh.h"
  #include "gromacs/domdec/dlbtiming.h"
+#include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_struct.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
@@ -61,11 +62,11 @@
  #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/imd/imd.h"
-#include "gromacs/listed-forces/bonded.h"
-#include "gromacs/listed-forces/disre.h"
-#include "gromacs/listed-forces/gpubonded.h"
-#include "gromacs/listed-forces/manage-threading.h"
-#include "gromacs/listed-forces/orires.h"
+#include "gromacs/listed_forces/bonded.h"
+#include "gromacs/listed_forces/disre.h"
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/manage_threading.h"
+#include "gromacs/listed_forces/orires.h"
  #include "gromacs/math/arrayrefwithpadding.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/math/units.h"
@@ -78,21 +79,19 @@
  #include "gromacs/mdlib/forcerec.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
  #include "gromacs/mdlib/mdrun.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_atomdata.h"
-#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
-#include "gromacs/mdlib/nbnxn_grid.h"
-#include "gromacs/mdlib/nbnxn_search.h"
  #include "gromacs/mdlib/ppforceworkload.h"
  #include "gromacs/mdlib/qmmm.h"
  #include "gromacs/mdlib/update.h"
-#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
  #include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
  #include "gromacs/mdtypes/forceoutput.h"
  #include "gromacs/mdtypes/iforceprovider.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/mdtypes/state.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/pbcutil/ishift.h"
  #include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
@@ -112,15 +111,10 @@
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/logger.h"
-#include "gromacs/utility/pleasecite.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/strconvert.h"
  #include "gromacs/utility/sysinfo.h"
  
-#include "nbnxn_gpu.h"
-#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
-#include "nbnxn_kernels/nbnxn_kernel_prune.h"
-
  // TODO: this environment variable allows us to verify before release
  // that on less common architectures the total cost of polling is not larger than
  // a blocking wait (so polling does not introduce overhead when the static
@@ -393,14 +387,15 @@ static void post_process_forces(const t_commrec           *cr,
      }
  }
  
-static void do_nb_verlet(const t_forcerec *fr,
-                         const interaction_const_t *ic,
-                         gmx_enerdata_t *enerd,
-                         int flags, int ilocality,
-                         int clearF,
-                         int64_t step,
-                         t_nrnb *nrnb,
-                         gmx_wallcycle_t wcycle)
+static void do_nb_verlet(t_forcerec                       *fr,
+                         const interaction_const_t        *ic,
+                         gmx_enerdata_t                   *enerd,
+                         const int                         flags,
+                         const Nbnxm::InteractionLocality  ilocality,
+                         const int                         clearF,
+                         const int64_t                     step,
+                         t_nrnb                           *nrnb,
+                         gmx_wallcycle_t                   wcycle)
  {
      if (!(flags & GMX_FORCE_NONBONDED))
      {
@@ -431,106 +426,19 @@ static void do_nb_verlet(const t_forcerec *fr,
               * the current coordinates of the atoms.
               */
              wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
-            nbnxn_kernel_cpu_prune(nbvg, nbv->nbat, fr->shift_vec, nbv->listParams->rlistInner);
+            NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec);
              wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
          }
  
          wallcycle_sub_start(wcycle, ewcsNONBONDED);
      }
  
-    switch (nbvg->kernel_type)
-    {
-        case nbnxnk4x4_PlainC:
-        case nbnxnk4xN_SIMD_4xN:
-        case nbnxnk4xN_SIMD_2xNN:
-            nbnxn_kernel_cpu(nbvg,
-                             nbv->nbat,
-                             ic,
-                             fr->shift_vec,
-                             flags,
-                             clearF,
-                             fr->fshift[0],
-                             enerd->grpp.ener[egCOULSR],
-                             fr->bBHAM ?
-                             enerd->grpp.ener[egBHAMSR] :
-                             enerd->grpp.ener[egLJSR]);
-            break;
-
-        case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, flags, ilocality);
-            break;
+    NbnxnDispatchKernel(nbv, ilocality, *ic, flags, clearF, fr, enerd, nrnb);
  
-        case nbnxnk8x8x8_PlainC:
-            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
-                                 nbv->nbat, ic,
-                                 fr->shift_vec,
-                                 flags,
-                                 clearF,
-                                 nbv->nbat->out[0].f,
-                                 fr->fshift[0],
-                                 enerd->grpp.ener[egCOULSR],
-                                 fr->bBHAM ?
-                                 enerd->grpp.ener[egBHAMSR] :
-                                 enerd->grpp.ener[egLJSR]);
-            break;
-
-        default:
-            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
-
-    }
      if (!bUsingGpuKernels)
      {
          wallcycle_sub_stop(wcycle, ewcsNONBONDED);
      }
-
-    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
-    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
-    {
-        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
-    }
-    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
-             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
-    {
-        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
-    }
-    else
-    {
-        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
-    }
-    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
-    if (flags & GMX_FORCE_ENERGY)
-    {
-        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
-        enr_nbnxn_kernel_ljc += 1;
-        enr_nbnxn_kernel_lj  += 1;
-    }
-
-    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
-             nbvg->nbl_lists.natpair_ljq);
-    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
-             nbvg->nbl_lists.natpair_lj);
-    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
-    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
-             nbvg->nbl_lists.natpair_q);
-
-    if (ic->vdw_modifier == eintmodFORCESWITCH)
-    {
-        /* We add up the switch cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
-                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
-    }
-    if (ic->vdw_modifier == eintmodPOTSWITCH)
-    {
-        /* We add up the switch cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
-                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
-    }
-    if (ic->vdwtype == evdwPME)
-    {
-        /* We add up the LJ Ewald cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
-                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
-    }
  }
  
  static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
@@ -952,11 +860,12 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
          {
              GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
              wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
-            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
-                                                    flags, eatLocal,
-                                                    haveOtherWork,
-                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                                    fshift, completionType);
+            isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
+                                                     flags,
+                                                     Nbnxm::AtomLocality::Local,
+                                                     haveOtherWork,
+                                                     enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                                     fshift, completionType);
              wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
              // To get the call count right, when the task finished we
              // issue a start/stop.
@@ -967,7 +876,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
                  wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
-                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                                 nbv->nbat, as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
              }
          }
@@ -998,15 +907,15 @@ static inline void launchGpuRollingPruning(const t_commrec          *cr,
       */
      int  numRollingParts     = nbv->listParams->numRollingParts;
      GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
-    int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+    int  stepWithCurrentList = step - nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.outerListCreationStep;
      bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
      if (stepWithCurrentList > 0 &&
          stepWithCurrentList < inputrec->nstlist - 1 &&
          (stepIsEven || DOMAINDECOMP(cr)))
      {
-        nbnxn_gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
-                                          stepIsEven ? eintLocal : eintNonlocal,
-                                          numRollingParts);
+        Nbnxm::gpu_launch_kernel_pruneonly(nbv->gpu_nbv,
+                                           stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
+                                           numRollingParts);
      }
  }
  
@@ -1049,7 +958,7 @@ static void do_force_cutsVERLET(FILE *fplog,
      nonbonded_verlet_t *nbv = fr->nbv;
  
      bStateChanged = ((flags & GMX_FORCE_STATECHANGED) != 0);
-    bNS           = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
+    bNS           = ((flags & GMX_FORCE_NS) != 0);
      bFillGrid     = (bNS && bStateChanged);
      bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
      bDoForces     = ((flags & GMX_FORCE_FORCES) != 0);
@@ -1177,7 +1086,7 @@ static void do_force_cutsVERLET(FILE *fplog,
                                nullptr, 0, mdatoms->homenr, -1,
                                fr->cginfo, x.unpaddedArrayRef(),
                                0, nullptr,
-                              nbv->grp[eintLocal].kernel_type,
+                              nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                                nbv->nbat);
              wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
          }
@@ -1186,7 +1095,7 @@ static void do_force_cutsVERLET(FILE *fplog,
              wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
              nbnxn_put_on_grid_nonlocal(nbv->nbs.get(), domdec_zones(cr->dd),
                                         fr->cginfo, x.unpaddedArrayRef(),
-                                       nbv->grp[eintNonlocal].kernel_type,
+                                       nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                         nbv->nbat);
              wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
          }
@@ -1204,10 +1113,10 @@ static void do_force_cutsVERLET(FILE *fplog,
  
          if (bNS)
          {
-            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
+            Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat);
          }
  
-        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
+        Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat);
  
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
@@ -1223,9 +1132,9 @@ static void do_force_cutsVERLET(FILE *fplog,
              // higher-level object than the nb module.
              fr->gpuBonded->updateInteractionListsAndDeviceBuffers(nbnxn_get_gridindices(fr->nbv->nbs.get()),
                                                                    top->idef,
-                                                                  nbnxn_gpu_get_xq(nbv->gpu_nbv),
-                                                                  nbnxn_gpu_get_f(nbv->gpu_nbv),
-                                                                  nbnxn_gpu_get_fshift(nbv->gpu_nbv));
+                                                                  Nbnxm::gpu_get_xq(nbv->gpu_nbv),
+                                                                  Nbnxm::gpu_get_f(nbv->gpu_nbv),
+                                                                  Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
              ppForceWorkload->haveGpuBondedWork = fr->gpuBonded->haveInteractions();
          }
  
@@ -1235,35 +1144,38 @@ static void do_force_cutsVERLET(FILE *fplog,
      /* do local pair search */
      if (bNS)
      {
+        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists;
+
          wallcycle_start_nocount(wcycle, ewcNS);
          wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
          nbnxn_make_pairlist(nbv->nbs.get(), nbv->nbat,
                              &top->excls,
                              nbv->listParams->rlistOuter,
                              nbv->min_ci_balanced,
-                            &nbv->grp[eintLocal].nbl_lists,
-                            eintLocal,
-                            nbv->grp[eintLocal].kernel_type,
+                            &pairlistSet,
+                            Nbnxm::InteractionLocality::Local,
+                            nbv->grp[Nbnxm::InteractionLocality::Local].kernel_type,
                              nrnb);
-        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+        pairlistSet.outerListCreationStep = step;
          if (nbv->listParams->useDynamicPruning && !bUseGPU)
          {
-            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+            nbnxnPrepareListForDynamicPruning(&pairlistSet);
          }
          wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
  
          if (bUseGPU)
          {
              /* initialize local pair-list on the GPU */
-            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
-                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
-                                    eintLocal);
+            Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+                                     pairlistSet.nblGpu[0],
+                                     Nbnxm::InteractionLocality::Local);
          }
          wallcycle_stop(wcycle, ewcNS);
      }
      else
      {
-        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatLocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
+                                        FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                          nbv->nbat, wcycle);
      }
  
@@ -1277,7 +1189,7 @@ static void do_force_cutsVERLET(FILE *fplog,
          wallcycle_start(wcycle, ewcLAUNCH_GPU);
  
          wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatLocal, ppForceWorkload->haveGpuBondedWork);
+        Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          // bonded work not split into separate local and non-local, so with DD
@@ -1291,7 +1203,7 @@ static void do_force_cutsVERLET(FILE *fplog,
  
          /* launch local nonbonded work on GPU */
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                       step, nrnb, wcycle);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -1310,6 +1222,8 @@ static void do_force_cutsVERLET(FILE *fplog,
         do non-local pair search */
      if (DOMAINDECOMP(cr))
      {
+        nbnxn_pairlist_set_t &pairlistSet = nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists;
+
          if (bNS)
          {
              wallcycle_start_nocount(wcycle, ewcNS);
@@ -1319,23 +1233,23 @@ static void do_force_cutsVERLET(FILE *fplog,
                                  &top->excls,
                                  nbv->listParams->rlistOuter,
                                  nbv->min_ci_balanced,
-                                &nbv->grp[eintNonlocal].nbl_lists,
-                                eintNonlocal,
-                                nbv->grp[eintNonlocal].kernel_type,
+                                &pairlistSet,
+                                Nbnxm::InteractionLocality::NonLocal,
+                                nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type,
                                  nrnb);
-            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+            pairlistSet.outerListCreationStep = step;
              if (nbv->listParams->useDynamicPruning && !bUseGPU)
              {
-                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+                nbnxnPrepareListForDynamicPruning(&pairlistSet);
              }
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
  
-            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
+            if (nbv->grp[Nbnxm::InteractionLocality::NonLocal].kernel_type == nbnxnk8x8x8_GPU)
              {
                  /* initialize non-local pair-list on the GPU */
-                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
-                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
-                                        eintNonlocal);
+                Nbnxm::gpu_init_pairlist(nbv->gpu_nbv,
+                                         pairlistSet.nblGpu[0],
+                                         Nbnxm::InteractionLocality::NonLocal);
              }
              wallcycle_stop(wcycle, ewcNS);
          }
@@ -1343,7 +1257,8 @@ static void do_force_cutsVERLET(FILE *fplog,
          {
              dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
  
-            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), eatNonlocal, FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
+            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
+                                            FALSE, as_rvec_array(x.unpaddedArrayRef().data()),
                                              nbv->nbat, wcycle);
          }
  
@@ -1353,7 +1268,7 @@ static void do_force_cutsVERLET(FILE *fplog,
  
              /* launch non-local nonbonded tasks on GPU */
              wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
              if (ppForceWorkload->haveGpuBondedWork)
@@ -1364,7 +1279,7 @@ static void do_force_cutsVERLET(FILE *fplog,
              }
  
              wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                           step, nrnb, wcycle);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
@@ -1379,11 +1294,11 @@ static void do_force_cutsVERLET(FILE *fplog,
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          if (DOMAINDECOMP(cr))
          {
-            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
-                                     flags, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+            Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
          }
-        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
-                                 flags, eatLocal, ppForceWorkload->haveGpuBondedWork);
+        Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat,
+                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
          if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1490,7 +1405,7 @@ static void do_force_cutsVERLET(FILE *fplog,
  
      if (!bUseOrEmulGPU)
      {
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
                       step, nrnb, wcycle);
      }
  
@@ -1499,18 +1414,18 @@ static void do_force_cutsVERLET(FILE *fplog,
          /* Calculate the local and non-local free energy interactions here.
           * Happens here on the CPU both with and without GPU.
           */
-        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        if (fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists.nbl_fep[0]->nrj > 0)
          {
-            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::Local].nbl_lists,
                               fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                               inputrec->fepvals, lambda,
                               enerd, flags, nrnb, wcycle);
          }
  
          if (DOMAINDECOMP(cr) &&
-            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+            fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nbl_fep[0]->nrj > 0)
          {
-            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+            do_nb_verlet_fep(&fr->nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists,
                               fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms,
                               inputrec->fepvals, lambda,
                               enerd, flags, nrnb, wcycle);
@@ -1519,22 +1434,14 @@ static void do_force_cutsVERLET(FILE *fplog,
  
      if (!bUseOrEmulGPU)
      {
-        int aloc;
-
          if (DOMAINDECOMP(cr))
          {
-            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+            do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
                           step, nrnb, wcycle);
          }
  
-        if (!bUseOrEmulGPU)
-        {
-            aloc = eintLocal;
-        }
-        else
-        {
-            aloc = eintNonlocal;
-        }
+        const Nbnxm::InteractionLocality iloc =
+            (!bUseOrEmulGPU ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal);
  
          /* Add all the non-bonded force to the normal force array.
           * This can be split into a local and a non-local part when overlapping
@@ -1542,13 +1449,13 @@ static void do_force_cutsVERLET(FILE *fplog,
           */
          wallcycle_stop(wcycle, ewcFORCE);
  
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatAll, nbv->nbat, f, wcycle);
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::All, nbv->nbat, f, wcycle);
  
          wallcycle_start_nocount(wcycle, ewcFORCE);
  
          /* if there are multiple fshift output buffers reduce them */
          if ((flags & GMX_FORCE_VIRIAL) &&
-            nbv->grp[aloc].nbl_lists.nnbl > 1)
+            nbv->grp[iloc].nbl_lists.nnbl > 1)
          {
              /* This is not in a subcounter because it takes a
                 negligible and constant-sized amount of time */
@@ -1586,25 +1493,25 @@ static void do_force_cutsVERLET(FILE *fplog,
              if (bUseGPU)
              {
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
-                                           flags, eatNonlocal,
-                                           ppForceWorkload->haveGpuBondedWork,
-                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                           fr->fshift);
+                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                            flags, Nbnxm::AtomLocality::NonLocal,
+                                            ppForceWorkload->haveGpuBondedWork,
+                                            enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                            fr->fshift);
                  cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
              }
              else
              {
                  wallcycle_start_nocount(wcycle, ewcFORCE);
-                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+                do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
                               step, nrnb, wcycle);
                  wallcycle_stop(wcycle, ewcFORCE);
              }
  
              /* skip the reduction if there was no non-local work to do */
-            if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+            if (!nbv->grp[Nbnxm::InteractionLocality::NonLocal].nbl_lists.nblGpu[0]->sci.empty())
              {
-                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal,
                                                 nbv->nbat, f, wcycle);
              }
          }
@@ -1651,10 +1558,10 @@ static void do_force_cutsVERLET(FILE *fplog,
          const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
  
          wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-        nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
-                                   flags, eatLocal, ppForceWorkload->haveGpuBondedWork,
-                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                   fr->fshift);
+        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                    fr->fshift);
          float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
          if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
@@ -1679,7 +1586,7 @@ static void do_force_cutsVERLET(FILE *fplog,
          // NOTE: emulation kernel is not included in the balancing region,
          // but emulation mode does not target performance anyway
          wallcycle_start_nocount(wcycle, ewcFORCE);
-        do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
                       DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
                       step, nrnb, wcycle);
          wallcycle_stop(wcycle, ewcFORCE);
@@ -1695,7 +1602,7 @@ static void do_force_cutsVERLET(FILE *fplog,
          /* now clear the GPU outputs while we finish the step on the CPU */
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
          wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+        Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, flags);
  
          /* Is dynamic pair-list pruning activated? */
          if (nbv->listParams->useDynamicPruning)
@@ -1726,7 +1633,7 @@ static void do_force_cutsVERLET(FILE *fplog,
       * on the non-alternating path. */
      if (bUseOrEmulGPU && !alternateGpuWait)
      {
-        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatLocal,
+        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::Local,
                                         nbv->nbat, f, wcycle);
      }
      if (DOMAINDECOMP(cr))
@@ -1836,7 +1743,7 @@ static void do_force_cutsGROUP(FILE *fplog,
      }
  
      bStateChanged  = ((flags & GMX_FORCE_STATECHANGED) != 0);
-    bNS            = ((flags & GMX_FORCE_NS) != 0) && (!fr->bAllvsAll);
+    bNS            = ((flags & GMX_FORCE_NS) != 0);
      /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
      bFillGrid      = (bNS && bStateChanged);
      bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
@@ -2837,7 +2744,7 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
  
      if (printReport)
      {
-        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
+        auto                    nbnxn_gpu_timings = use_GPU(nbv) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
          gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
          if (pme_gpu_task_enabled(pme))
          {
@@ -2871,49 +2778,59 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
      }
  }
  
-extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
+void initialize_lambdas(FILE               *fplog,
+                        const t_inputrec   &ir,
+                        bool                isMaster,
+                        int                *fep_state,
+                        gmx::ArrayRef<real> lambda,
+                        double             *lam0)
  {
-    /* this function works, but could probably use a logic rewrite to keep all the different
-       types of efep straight. */
+    /* TODO: Clean up initialization of fep_state and lambda in
+       t_state.  This function works, but could probably use a logic
+       rewrite to keep all the different types of efep straight. */
  
-    if ((ir->efep == efepNO) && (!ir->bSimTemp))
+    if ((ir.efep == efepNO) && (!ir.bSimTemp))
      {
          return;
      }
  
-    t_lambda *fep = ir->fepvals;
-    *fep_state    = fep->init_fep_state; /* this might overwrite the checkpoint
-                                            if checkpoint is set -- a kludge is in for now
-                                            to prevent this.*/
+    const t_lambda *fep = ir.fepvals;
+    if (isMaster)
+    {
+        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
+                                             if checkpoint is set -- a kludge is in for now
+                                             to prevent this.*/
+    }
  
      for (int i = 0; i < efptNR; i++)
      {
+        double thisLambda;
          /* overwrite lambda state with init_lambda for now for backwards compatibility */
-        if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
+        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
          {
-            lambda[i] = fep->init_lambda;
-            if (lam0)
-            {
-                lam0[i] = lambda[i];
-            }
+            thisLambda = fep->init_lambda;
          }
          else
          {
-            lambda[i] = fep->all_lambda[i][*fep_state];
-            if (lam0)
-            {
-                lam0[i] = lambda[i];
-            }
+            thisLambda = fep->all_lambda[i][fep->init_fep_state];
+        }
+        if (isMaster)
+        {
+            lambda[i] = thisLambda;
+        }
+        if (lam0 != nullptr)
+        {
+            lam0[i] = thisLambda;
          }
      }
-    if (ir->bSimTemp)
+    if (ir.bSimTemp)
      {
          /* need to rescale control temperatures to match current state */
-        for (int i = 0; i < ir->opts.ngtc; i++)
+        for (int i = 0; i < ir.opts.ngtc; i++)
          {
-            if (ir->opts.ref_t[i] > 0)
+            if (ir.opts.ref_t[i] > 0)
              {
-                ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
+                ir.opts.ref_t[i] = ir.simtempvals->temperatures[fep->init_fep_state];
              }
          }
      }
@@ -2929,135 +2846,3 @@ extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx:
          fprintf(fplog, "]\n");
      }
  }
-
-
-void init_md(FILE *fplog,
-             const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
-             t_inputrec *ir, const gmx_output_env_t *oenv,
-             const MdrunOptions &mdrunOptions,
-             double *t, double *t0,
-             t_state *globalState, double *lam0,
-             t_nrnb *nrnb, gmx_mtop_t *mtop,
-             gmx_update_t **upd,
-             gmx::BoxDeformation *deform,
-             int nfile, const t_filenm fnm[],
-             gmx_mdoutf_t *outf, t_mdebin **mdebin,
-             tensor force_vir, tensor shake_vir,
-             tensor total_vir, tensor pres, rvec mu_tot,
-             gmx_bool *bSimAnn, t_vcm **vcm,
-             gmx_wallcycle_t wcycle)
-{
-    int  i;
-
-    /* Initial values */
-    *t = *t0       = ir->init_t;
-
-    *bSimAnn = FALSE;
-    for (i = 0; i < ir->opts.ngtc; i++)
-    {
-        /* set bSimAnn if any group is being annealed */
-        if (ir->opts.annealing[i] != eannNO)
-        {
-            *bSimAnn = TRUE;
-        }
-    }
-
-    /* Initialize lambda variables */
-    /* TODO: Clean up initialization of fep_state and lambda in t_state.
-     * We currently need to call initialize_lambdas on non-master ranks
-     * to initialize lam0.
-     */
-    if (MASTER(cr))
-    {
-        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
-    }
-    else
-    {
-        int                      tmpFepState;
-        std::array<real, efptNR> tmpLambda;
-        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
-    }
-
-    // TODO upd is never NULL in practice, but the analysers don't know that
-    if (upd)
-    {
-        *upd = init_update(ir, deform);
-    }
-    if (*bSimAnn)
-    {
-        update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
-    }
-
-    if (vcm != nullptr)
-    {
-        *vcm = init_vcm(fplog, &mtop->groups, ir);
-    }
-
-    if (EI_DYNAMICS(ir->eI) && !mdrunOptions.continuationOptions.appendFiles)
-    {
-        if (ir->etc == etcBERENDSEN)
-        {
-            please_cite(fplog, "Berendsen84a");
-        }
-        if (ir->etc == etcVRESCALE)
-        {
-            please_cite(fplog, "Bussi2007a");
-        }
-        if (ir->eI == eiSD1)
-        {
-            please_cite(fplog, "Goga2012");
-        }
-    }
-    init_nrnb(nrnb);
-
-    if (nfile != -1)
-    {
-        *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
-
-        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
-                              mtop, ir, mdoutf_get_fp_dhdl(*outf));
-    }
-
-    /* Initiate variables */
-    clear_mat(force_vir);
-    clear_mat(shake_vir);
-    clear_rvec(mu_tot);
-    clear_mat(total_vir);
-    clear_mat(pres);
-}
-
-void init_rerun(FILE *fplog,
-                const t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
-                t_inputrec *ir, const gmx_output_env_t *oenv,
-                const MdrunOptions &mdrunOptions,
-                t_state *globalState, double *lam0,
-                t_nrnb *nrnb, gmx_mtop_t *mtop,
-                int nfile, const t_filenm fnm[],
-                gmx_mdoutf_t *outf, t_mdebin **mdebin,
-                gmx_wallcycle_t wcycle)
-{
-    /* Initialize lambda variables */
-    /* TODO: Clean up initialization of fep_state and lambda in t_state.
-     * We currently need to call initialize_lambdas on non-master ranks
-     * to initialize lam0.
-     */
-    if (MASTER(cr))
-    {
-        initialize_lambdas(fplog, ir, &globalState->fep_state, globalState->lambda, lam0);
-    }
-    else
-    {
-        int                      tmpFepState;
-        std::array<real, efptNR> tmpLambda;
-        initialize_lambdas(fplog, ir, &tmpFepState, tmpLambda, lam0);
-    }
-
-    init_nrnb(nrnb);
-
-    if (nfile != -1)
-    {
-        *outf   = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, mtop, oenv, wcycle);
-        *mdebin = init_mdebin(mdrunOptions.continuationOptions.appendFiles ? nullptr : mdoutf_get_fp_ene(*outf),
-                              mtop, ir, mdoutf_get_fp_dhdl(*outf), true);
-    }
-}