* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/math/functions.h"
+#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/nblist.h"
+#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/listoflists.h"
#include "gromacs/utility/smalloc.h"
-#include "atomdata.h"
#include "boundingboxes.h"
#include "clusterdistancekerneltype.h"
#include "gridset.h"
using Grid = Nbnxm::Grid; // TODO: Remove when refactoring this file
-// Convience alias for partial Nbnxn namespace usage
+// Convenience alias for partial Nbnxn namespace usage
using InteractionLocality = gmx::InteractionLocality;
/* We shift the i-particles backward for PBC.
Gpu8x8x8 // i-cluster size 8, j-cluster size 8 + super-clustering
};
-#if GMX_SIMD
+#if defined(GMX_NBNXN_SIMD_4XN) || defined(GMX_NBNXN_SIMD_2XNN)
/* Returns the j-cluster size */
template<NbnxnLayout layout>
static constexpr int jClusterSize()
*
* \tparam jClusterSize The number of atoms in a j-cluster
* \tparam jSubClusterIndex The j-sub-cluster index (0/1), used when size(j-cluster) <
- * size(i-cluster) \param[in] ci The i-cluster index
+ * size(i-cluster)
+ * \param[in] ci The i-cluster index
*/
template<int jClusterSize, int jSubClusterIndex>
static inline int cjFromCi(int ci)
*
* \tparam layout The pair-list layout
* \tparam jSubClusterIndex The j-sub-cluster index (0/1), used when size(j-cluster) <
- * size(i-cluster) \param[in] ci The i-cluster index
+ * size(i-cluster)
+ * \param[in] ci The i-cluster index
*/
template<NbnxnLayout layout, int jSubClusterIndex>
static inline int cjFromCi(int ci)
return cj * STRIDE_P8;
}
}
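+/* Worked example (illustrative, sizes assumed): with an i-cluster size of 4
+ * and a j-cluster size of 8 (the 2xNN layout), two consecutive i-clusters
+ * map to one j-cluster, so cjFromCi halves ci; with a j-cluster size of 2,
+ * each i-cluster splits in two and jSubClusterIndex (0/1) selects the half.
+ */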
-#endif // GMX_SIMD
-
+#endif // defined(GMX_NBNXN_SIMD_4XN) || defined(GMX_NBNXN_SIMD_2XNN)
-void nbnxn_init_pairlist_fep(t_nblist* nl)
+static constexpr int sizeNeededForBufferFlags(const int numAtoms)
{
- nl->type = GMX_NBLIST_INTERACTION_FREE_ENERGY;
- nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
- /* The interaction functions are set in the free energy kernel fuction */
- nl->ivdw = -1;
- nl->ivdwmod = -1;
- nl->ielec = -1;
- nl->ielecmod = -1;
-
- nl->maxnri = 0;
- nl->maxnrj = 0;
- nl->nri = 0;
- nl->nrj = 0;
- nl->iinr = nullptr;
- nl->gid = nullptr;
- nl->shift = nullptr;
- nl->jindex = nullptr;
- nl->jjnr = nullptr;
- nl->excl_fep = nullptr;
+ return (numAtoms + NBNXN_BUFFERFLAG_SIZE - 1) / NBNXN_BUFFERFLAG_SIZE;
}
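+
+// Illustrative compile-time sanity sketch (relies only on the constexpr
+// function above): the ceiling division gives any partial block its own flag.
+static_assert(sizeNeededForBufferFlags(1) == 1, "one atom needs one flag block");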
-static void init_buffer_flags(nbnxn_buffer_flags_t* flags, int natoms)
+// Resets current flags to 0 and adds more flags if needed.
+static void resizeAndZeroBufferFlags(std::vector<gmx_bitmask_t>* flags, const int numAtoms)
{
- flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1) / NBNXN_BUFFERFLAG_SIZE;
- if (flags->nflag > flags->flag_nalloc)
- {
- flags->flag_nalloc = over_alloc_large(flags->nflag);
- srenew(flags->flag, flags->flag_nalloc);
- }
- for (int b = 0; b < flags->nflag; b++)
- {
- bitmask_clear(&(flags->flag[b]));
- }
+ flags->clear();
+ flags->resize(sizeNeededForBufferFlags(numAtoms), gmx_bitmask_t{ 0 });
}
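+/* Usage sketch (mirrors the call sites further below): a caller owning the
+ * buffer does
+ *   resizeAndZeroBufferFlags(&nbat->buffer_flags, nbat->numAtoms());
+ * where clear() followed by resize() zeroes retained blocks as well as new ones.
+ */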
+
/* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
*
* Given a cutoff distance between atoms, this function returns the cutoff
static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
"A cluster is hard-coded to 4/8 atoms.");
- Simd4Real rc2_S = Simd4Real(rlist2);
+ Simd4Real rc2_S{ rlist2 };
const real* x_i = work.iSuperClusterData.xSimd.data();
}
// TODO: Move to pairlistset.cpp
-PairlistSet::PairlistSet(const InteractionLocality locality, const PairlistParams& pairlistParams) :
- locality_(locality),
- params_(pairlistParams)
+PairlistSet::PairlistSet(const PairlistParams& pairlistParams) :
+ params_(pairlistParams),
+ combineLists_(sc_isGpuPairListType[pairlistParams.pairlistType]), // Currently GPU lists are always combined
+ isCpuType_(!sc_isGpuPairListType[pairlistParams.pairlistType])
{
- isCpuType_ = (params_.pairlistType == PairlistType::Simple4x2
- || params_.pairlistType == PairlistType::Simple4x4
- || params_.pairlistType == PairlistType::Simple4x8);
- // Currently GPU lists are always combined
- combineLists_ = !isCpuType_;
- const int numLists = gmx_omp_nthreads_get(emntNonbonded);
+ const int numLists = gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded);
if (!combineLists_ && numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
{
"%d OpenMP threads were requested. Since the non-bonded force buffer reduction "
"is prohibitively slow with more than %d threads, we do not allow this. Use %d "
"or less OpenMP threads.",
- numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+ numLists,
+ NBNXN_BUFFERFLAG_MAX_THREADS,
+ NBNXN_BUFFERFLAG_MAX_THREADS);
}
if (isCpuType_)
* impacts performance.
*/
fepLists_[i] = std::make_unique<t_nblist>();
- nbnxn_init_pairlist_fep(fepLists_[i].get());
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
fprintf(fp, "nbl nci %zu ncj %d\n", nbl.ci.size(), nbl.ncjInUse);
const int numAtomsJCluster = grid.geometry().numAtomsJCluster;
+
+ if (grid.numCells() == 0)
+ {
+ return;
+ }
+
const double numAtomsPerCell = nbl.ncjInUse / static_cast<double>(grid.numCells()) * numAtomsJCluster;
- fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n", nbl.na_cj, rl,
- nbl.ncjInUse, nbl.ncjInUse / static_cast<double>(grid.numCells()), numAtomsPerCell,
+ fprintf(fp,
+ "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ nbl.na_cj,
+ rl,
+ nbl.ncjInUse,
+ nbl.ncjInUse / static_cast<double>(grid.numCells()),
+ numAtomsPerCell,
numAtomsPerCell
/ (0.5 * 4.0 / 3.0 * M_PI * rl * rl * rl * grid.numCells() * numAtomsJCluster
/ (dims.gridSize[XX] * dims.gridSize[YY] * dims.gridSize[ZZ])));
- fprintf(fp, "nbl average j cell list length %.1f\n",
+ fprintf(fp,
+ "nbl average j cell list length %.1f\n",
0.25 * nbl.ncjInUse / std::max(static_cast<double>(nbl.ci.size()), 1.0));
- int cs[SHIFTS] = { 0 };
- int npexcl = 0;
+ int cs[gmx::c_numShiftVectors] = { 0 };
+ int npexcl = 0;
for (const nbnxn_ci_t& ciEntry : nbl.ci)
{
cs[ciEntry.shift & NBNXN_CI_SHIFT] += ciEntry.cj_ind_end - ciEntry.cj_ind_start;
j++;
}
}
- fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n", nbl.cj.size(), npexcl,
+ fprintf(fp,
+ "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
+ nbl.cj.size(),
+ npexcl,
100 * npexcl / std::max(static_cast<double>(nbl.cj.size()), 1.0));
- for (int s = 0; s < SHIFTS; s++)
+ for (int s = 0; s < gmx::c_numShiftVectors; s++)
{
if (cs[s] > 0)
{
const Grid& grid = gridSet.grids()[0];
const Grid::Dimensions& dims = grid.dimensions();
- fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n", nbl.sci.size(), nbl.cj4.size(),
- nbl.nci_tot, nbl.excl.size());
+ fprintf(fp,
+ "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
+ nbl.sci.size(),
+ nbl.cj4.size(),
+ nbl.nci_tot,
+ nbl.excl.size());
const int numAtomsCluster = grid.geometry().numAtomsICluster;
const double numAtomsPerCell = nbl.nci_tot / static_cast<double>(grid.numClusters()) * numAtomsCluster;
- fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n", nbl.na_ci, rl,
- nbl.nci_tot, nbl.nci_tot / static_cast<double>(grid.numClusters()), numAtomsPerCell,
+ fprintf(fp,
+ "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ nbl.na_ci,
+ rl,
+ nbl.nci_tot,
+ nbl.nci_tot / static_cast<double>(grid.numClusters()),
+ numAtomsPerCell,
numAtomsPerCell
/ (0.5 * 4.0 / 3.0 * M_PI * rl * rl * rl * grid.numClusters() * numAtomsCluster
/ (dims.gridSize[XX] * dims.gridSize[YY] * dims.gridSize[ZZ])));
sum_nsp /= nbl.sci.size();
sum_nsp2 /= nbl.sci.size();
}
- fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n", sum_nsp,
- std::sqrt(sum_nsp2 - sum_nsp * sum_nsp), nsp_max);
+ fprintf(fp,
+ "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
+ sum_nsp,
+ std::sqrt(sum_nsp2 - sum_nsp * sum_nsp),
+ nsp_max);
if (!nbl.cj4.empty())
{
for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
{
- fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n", b, c[b],
+ fprintf(fp,
+ "nbl j-list #i-subcell %d %7d %4.1f\n",
+ b,
+ c[b],
100.0 * c[b] / size_t{ nbl.cj4.size() * c_nbnxnGpuJgroupSize });
}
}
* \param[in] rbb2 The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
* \param[in,out] numDistanceChecks The number of distance checks performed
*/
-static void makeClusterListSimple(const Grid& jGrid,
- NbnxnPairlistCpu* nbl,
- int icluster,
- int jclusterFirst,
- int jclusterLast,
- bool excludeSubDiagonal,
+static void makeClusterListSimple(const Grid& jGrid,
+ NbnxnPairlistCpu* nbl,
+ int icluster,
+ int jclusterFirst,
+ int jclusterLast,
+ bool excludeSubDiagonal,
const real* gmx_restrict x_j,
real rlist2,
float rbb2,
- int* gmx_restrict numDistanceChecks)
+ int* gmx_restrict numDistanceChecks)
{
const BoundingBox* gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
- const real* gmx_restrict x_ci = nbl->work->iClusterData.x.data();
-
- gmx_bool InRange;
+ const real* gmx_restrict x_ci = nbl->work->iClusterData.x.data();
- InRange = FALSE;
+ bool InRange = false;
while (!InRange && jclusterFirst <= jclusterLast)
{
real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterFirst]);
*/
if (d2 < rbb2)
{
- InRange = TRUE;
+ InRange = true;
}
else if (d2 < rlist2)
{
return;
}
- InRange = FALSE;
+ InRange = false;
while (!InRange && jclusterLast > jclusterFirst)
{
real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterLast]);
*/
if (d2 < rbb2)
{
- InRange = TRUE;
+ InRange = true;
}
else if (d2 < rlist2)
{
const int cj_gl = jGrid.cellOffset() * c_gpuNumClusterPerCell + cj;
- int ci1;
- if (excludeSubDiagonal && sci == scj)
- {
- ci1 = subc + 1;
- }
- else
- {
- ci1 = iGrid.numClustersPerCell()[sci];
- }
+ int ci1 = (excludeSubDiagonal && sci == scj) ? subc + 1 : iGrid.numClustersPerCell()[sci];
+
#if NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SIMD4 */
const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1));
- clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset, ci1,
- pbb_ci, d2l);
+ clusterBoundingBoxDistance2_xxxx_simd4(
+ jGrid.packedBoundingBoxes().data() + offset, ci1, pbb_ci, d2l);
*numDistanceChecks += c_nbnxnGpuClusterSize * 2;
#endif
#ifndef DOXYGEN
template<typename CjListType>
JListRanges::JListRanges(int cjIndexStart, int cjIndexEnd, gmx::ArrayRef<const CjListType> cjList) :
- cjIndexStart(cjIndexStart),
- cjIndexEnd(cjIndexEnd)
+ cjIndexStart(cjIndexStart), cjIndexEnd(cjIndexEnd)
{
GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");
const JListRanges& ranges,
gmx::ArrayRef<const CjListType> cjList)
{
- int index;
-
if (jCluster < ranges.cjFirst + ranges.numDirect)
{
/* We can calculate the index directly using the offset */
- index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
+ return ranges.cjIndexStart + jCluster - ranges.cjFirst;
}
else
{
/* Search for jCluster using bisection */
- index = -1;
+ int index = -1;
int rangeStart = ranges.cjIndexStart + ranges.numDirect;
int rangeEnd = ranges.cjIndexEnd;
- int rangeMiddle;
while (index == -1 && rangeStart < rangeEnd)
{
- rangeMiddle = (rangeStart + rangeEnd) >> 1;
+ int rangeMiddle = (rangeStart + rangeEnd) >> 1;
const int clusterMiddle = nblCj(cjList, rangeMiddle);
rangeStart = rangeMiddle + 1;
}
}
+ return index;
}
-
- return index;
}
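+/* Example (indices assumed for illustration): with cjIndexStart = 0,
+ * cjFirst = 10 and numDirect = 4, jCluster 12 resolves directly to index 2,
+ * while jCluster 20 is located by bisecting the sorted tail of the list.
+ */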
// TODO: Get rid of the two functions below by renaming sci to ci (or something better)
/* Reallocate FEP list for size nl->maxnri, TODO: replace by C++ */
static void reallocate_nblist(t_nblist* nl)
{
- if (gmx_debug_at)
- {
- fprintf(debug,
- "reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
- nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
- }
- srenew(nl->iinr, nl->maxnri);
- srenew(nl->gid, nl->maxnri);
- srenew(nl->shift, nl->maxnri);
- srenew(nl->jindex, nl->maxnri + 1);
+ nl->iinr.resize(nl->maxnri);
+ nl->gid.resize(nl->maxnri);
+ nl->shift.resize(nl->maxnri);
+ nl->jindex.resize(nl->maxnri + 1);
}
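+/* Sizing note: jindex stores i-entry boundaries, so entry i's j-range is
+ * [jindex[i], jindex[i+1]); e.g. two i-entries with j-ranges [0,3) and [3,7)
+ * give jindex = {0, 3, 7}, hence the maxnri + 1 elements.
+ */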
/* For load balancing of the free-energy lists over threads, we set
NbnxnPairlistCpu* nbl,
gmx_bool bDiagRemoved,
nbnxn_ci_t* nbl_ci,
- real gmx_unused shx,
- real gmx_unused shy,
- real gmx_unused shz,
- real gmx_unused rlist_fep2,
- const Grid& iGrid,
- const Grid& jGrid,
- t_nblist* nlist)
+ real gmx_unused shx,
+ real gmx_unused shy,
+ real gmx_unused shz,
+ real gmx_unused rlist_fep2,
+ const Grid& iGrid,
+ const Grid& jGrid,
+ t_nblist* nlist)
{
- int ci, cj_ind_start, cj_ind_end, cja, cjr;
- int nri_max;
- int gid_i = 0, gid_j, gid;
- int egp_shift, egp_mask;
- int gid_cj = 0;
- int ind_i, ind_j, ai, aj;
- int nri;
- gmx_bool bFEP_i, bFEP_i_all;
+ int gid_i = 0;
+ int gid_cj = 0;
if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
{
return;
}
- ci = nbl_ci->ci;
+ const int ci = nbl_ci->ci;
- cj_ind_start = nbl_ci->cj_ind_start;
- cj_ind_end = nbl_ci->cj_ind_end;
+ const int cj_ind_start = nbl_ci->cj_ind_start;
+ const int cj_ind_end = nbl_ci->cj_ind_end;
/* In the worst case we have alternating energy groups
* and create #atom-pair lists, which means we need the size
* of a cluster pair (na_ci*na_cj) times the number of cj's.
*/
- nri_max = nbl->na_ci * nbl->na_cj * (cj_ind_end - cj_ind_start);
+ const int nri_max = nbl->na_ci * nbl->na_cj * (cj_ind_end - cj_ind_start);
if (nlist->nri + nri_max > nlist->maxnri)
{
nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
gmx_fatal(FARGS,
"The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu "
"energy groups",
- iGrid.geometry().numAtomsICluster, numAtomsJCluster,
+ iGrid.geometry().numAtomsICluster,
+ numAtomsJCluster,
(sizeof(gid_cj) * 8) / numAtomsJCluster);
}
- egp_shift = nbatParams.neg_2log;
- egp_mask = (1 << egp_shift) - 1;
+ const int egp_shift = nbatParams.neg_2log;
+ const int egp_mask = (1 << egp_shift) - 1;
/* Loop over the atoms in the i sub-cell */
- bFEP_i_all = TRUE;
+ bool bFEP_i_all = true;
for (int i = 0; i < nbl->na_ci; i++)
{
- ind_i = ci * nbl->na_ci + i;
- ai = atomIndices[ind_i];
+ const int ind_i = ci * nbl->na_ci + i;
+ const int ai = atomIndices[ind_i];
if (ai >= 0)
{
- nri = nlist->nri;
+ int nri = nlist->nri;
nlist->jindex[nri + 1] = nlist->jindex[nri];
nlist->iinr[nri] = ai;
/* The actual energy group pair index is set later */
nlist->gid[nri] = 0;
nlist->shift[nri] = nbl_ci->shift & NBNXN_CI_SHIFT;
- bFEP_i = iGrid.atomIsPerturbed(ci - iGrid.cellOffset(), i);
+ bool bFEP_i = iGrid.atomIsPerturbed(ci - iGrid.cellOffset(), i);
bFEP_i_all = bFEP_i_all && bFEP_i;
if (nlist->nrj + (cj_ind_end - cj_ind_start) * nbl->na_cj > nlist->maxnrj)
{
nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start) * nbl->na_cj);
- srenew(nlist->jjnr, nlist->maxnrj);
- srenew(nlist->excl_fep, nlist->maxnrj);
+ nlist->jjnr.resize(nlist->maxnrj);
+ nlist->excl_fep.resize(nlist->maxnrj);
}
if (ngid > 1)
for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
{
- unsigned int fep_cj;
+ unsigned int fep_cj = 0U;
+ gid_cj = 0;
- cja = nbl->cj[cj_ind].cj;
+ const int cja = nbl->cj[cj_ind].cj;
if (numAtomsJCluster == jGrid.geometry().numAtomsICluster)
{
- cjr = cja - jGrid.cellOffset();
- fep_cj = jGrid.fepBits(cjr);
+ const int cjr = cja - jGrid.cellOffset();
+ fep_cj = jGrid.fepBits(cjr);
if (ngid > 1)
{
gid_cj = nbatParams.energrp[cja];
}
else if (2 * numAtomsJCluster == jGrid.geometry().numAtomsICluster)
{
- cjr = cja - jGrid.cellOffset() * 2;
+ const int cjr = cja - jGrid.cellOffset() * 2;
/* Extract half of the ci fep/energrp mask */
fep_cj = (jGrid.fepBits(cjr >> 1) >> ((cjr & 1) * numAtomsJCluster))
& ((1 << numAtomsJCluster) - 1);
}
else
{
- cjr = cja - (jGrid.cellOffset() >> 1);
+ const int cjr = cja - (jGrid.cellOffset() >> 1);
/* Combine two ci fep masks/energrp */
fep_cj = jGrid.fepBits(cjr * 2)
+ (jGrid.fepBits(cjr * 2 + 1) << jGrid.geometry().numAtomsICluster);
for (int j = 0; j < nbl->na_cj; j++)
{
/* Is this interaction perturbed and not excluded? */
- ind_j = cja * nbl->na_cj + j;
- aj = atomIndices[ind_j];
+ const int ind_j = cja * nbl->na_cj + j;
+ const int aj = atomIndices[ind_j];
if (aj >= 0 && (bFEP_i || (fep_cj & (1 << j))) && (!bDiagRemoved || ind_j >= ind_i))
{
if (ngid > 1)
{
- gid_j = (gid_cj >> (j * egp_shift)) & egp_mask;
- gid = GID(gid_i, gid_j, ngid);
+ const int gid_j = (gid_cj >> (j * egp_shift)) & egp_mask;
+ const int gid = GID(gid_i, gid_j, ngid);
if (nlist->nrj > nlist->jindex[nri] && nlist->gid[nri] != gid)
{
const Grid& jGrid,
t_nblist* nlist)
{
- int nri_max;
- int c_abs;
- int ind_i, ind_j, ai, aj;
- int nri;
- gmx_bool bFEP_i;
- real xi, yi, zi;
- const nbnxn_cj4_t* cj4;
-
const int numJClusterGroups = nbl_sci->numJClusterGroups();
if (numJClusterGroups == 0)
{
* So for each of the na_sc i-atoms, we need at most one FEP list
* per max_nrj_fep j-atoms.
*/
- nri_max = nbl->na_sc * nbl->na_cj * (1 + (numJClusterGroups * c_nbnxnGpuJgroupSize) / max_nrj_fep);
+ const int nri_max =
+ nbl->na_sc * nbl->na_cj * (1 + (numJClusterGroups * c_nbnxnGpuJgroupSize) / max_nrj_fep);
if (nlist->nri + nri_max > nlist->maxnri)
{
nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
/* Loop over the atoms in the i super-cluster */
for (int c = 0; c < c_gpuNumClusterPerCell; c++)
{
- c_abs = sci * c_gpuNumClusterPerCell + c;
+ const int c_abs = sci * c_gpuNumClusterPerCell + c;
for (int i = 0; i < nbl->na_ci; i++)
{
- ind_i = c_abs * nbl->na_ci + i;
- ai = atomIndices[ind_i];
+ const int ind_i = c_abs * nbl->na_ci + i;
+ const int ai = atomIndices[ind_i];
if (ai >= 0)
{
- nri = nlist->nri;
+ int nri = nlist->nri;
nlist->jindex[nri + 1] = nlist->jindex[nri];
nlist->iinr[nri] = ai;
/* With GPUs, energy groups are not supported */
nlist->gid[nri] = 0;
nlist->shift[nri] = nbl_sci->shift & NBNXN_CI_SHIFT;
- bFEP_i = iGrid.atomIsPerturbed(c_abs - iGrid.cellOffset() * c_gpuNumClusterPerCell, i);
+ const bool bFEP_i =
+ iGrid.atomIsPerturbed(c_abs - iGrid.cellOffset() * c_gpuNumClusterPerCell, i);
- xi = nbat->x()[ind_i * nbat->xstride + XX] + shx;
- yi = nbat->x()[ind_i * nbat->xstride + YY] + shy;
- zi = nbat->x()[ind_i * nbat->xstride + ZZ] + shz;
+ real xi = nbat->x()[ind_i * nbat->xstride + XX] + shx;
+ real yi = nbat->x()[ind_i * nbat->xstride + YY] + shy;
+ real zi = nbat->x()[ind_i * nbat->xstride + ZZ] + shz;
const int nrjMax = nlist->nrj + numJClusterGroups * c_nbnxnGpuJgroupSize * nbl->na_cj;
if (nrjMax > nlist->maxnrj)
{
nlist->maxnrj = over_alloc_small(nrjMax);
- srenew(nlist->jjnr, nlist->maxnrj);
- srenew(nlist->excl_fep, nlist->maxnrj);
+ nlist->jjnr.resize(nlist->maxnrj);
+ nlist->excl_fep.resize(nlist->maxnrj);
}
for (int cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
{
- cj4 = &nbl->cj4[cj4_ind];
+ const nbnxn_cj4_t* cj4 = &nbl->cj4[cj4_ind];
for (int gcj = 0; gcj < c_nbnxnGpuJgroupSize; gcj++)
{
for (int j = 0; j < nbl->na_cj; j++)
{
/* Is this interaction perturbed and not excluded? */
- ind_j = (jGrid.cellOffset() * c_gpuNumClusterPerCell + cjr) * nbl->na_cj + j;
- aj = atomIndices[ind_j];
+ const int ind_j =
+ (jGrid.cellOffset() * c_gpuNumClusterPerCell + cjr) * nbl->na_cj + j;
+ const int aj = atomIndices[ind_j];
if (aj >= 0 && (bFEP_i || jGrid.atomIsPerturbed(cjr, j))
&& (!bDiagRemoved || ind_j >= ind_i))
{
- int excl_pair;
- unsigned int excl_bit;
- real dx, dy, dz;
-
const int jHalf =
j / (c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit);
nbnxn_excl_t& excl = get_exclusion_mask(nbl, cj4_ind, jHalf);
- excl_pair = a_mod_wj(j) * nbl->na_ci + i;
- excl_bit = (1U << (gcj * c_gpuNumClusterPerCell + c));
+ int excl_pair = a_mod_wj(j) * nbl->na_ci + i;
+ unsigned int excl_bit = (1U << (gcj * c_gpuNumClusterPerCell + c));
- dx = nbat->x()[ind_j * nbat->xstride + XX] - xi;
- dy = nbat->x()[ind_j * nbat->xstride + YY] - yi;
- dz = nbat->x()[ind_j * nbat->xstride + ZZ] - zi;
+ real dx = nbat->x()[ind_j * nbat->xstride + XX] - xi;
+ real dy = nbat->x()[ind_j * nbat->xstride + YY] - yi;
+ real dz = nbat->x()[ind_j * nbat->xstride + ZZ] - zi;
/* The unpruned GPU list has more than 2/3
* of the atom pairs beyond rlist. Using
* Sets all atom-pair exclusions from the topology stored in exclusions
* as masks in the pair-list for i-super-cluster list entry iEntry.
*/
-static void setExclusionsForIEntry(const Nbnxm::GridSet& gridSet,
- NbnxnPairlistGpu* nbl,
- gmx_bool diagRemoved,
+static void setExclusionsForIEntry(const Nbnxm::GridSet& gridSet,
+ NbnxnPairlistGpu* nbl,
+ gmx_bool diagRemoved,
int gmx_unused na_cj_2log,
const nbnxn_sci_t& iEntry,
const ListOfLists<int>& exclusions)
* Note that here we cannot use cj4_ind_end, since the last cj4
* can be only partially filled, so we use cj_ind.
*/
- const JListRanges ranges(iEntry.cj4_ind_start * c_nbnxnGpuJgroupSize, nbl->work->cj_ind,
+ const JListRanges ranges(iEntry.cj4_ind_start * c_nbnxnGpuJgroupSize,
+ nbl->work->cj_ind,
gmx::makeConstArrayRef(nbl->cj4));
GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
}
/* Close this simple list i entry */
-static void closeIEntry(NbnxnPairlistCpu* nbl,
- int gmx_unused sp_max_av,
+static void closeIEntry(NbnxnPairlistCpu* nbl,
+ int gmx_unused sp_max_av,
gmx_bool gmx_unused progBal,
- float gmx_unused nsp_tot_est,
- int gmx_unused thread,
- int gmx_unused nthread)
+ float gmx_unused nsp_tot_est,
+ int gmx_unused thread,
+ int gmx_unused nthread)
{
nbnxn_ci_t& ciEntry = nbl->ci.back();
int thread,
int nthread)
{
- int nsp_max;
+
+ int nsp_max = nsp_target_av;
if (progBal)
{
- float nsp_est;
-
/* Estimate the total numbers of ci's of the nblist combined
* over all threads using the target number of ci's.
*/
- nsp_est = (nsp_tot_est * thread) / nthread + nbl->nci_tot;
+ float nsp_est = (nsp_tot_est * thread) / nthread + nbl->nci_tot;
/* The first ci blocks should be larger, to avoid overhead.
* The last ci blocks should be smaller, to improve load balancing.
*/
nsp_max = static_cast<int>(nsp_target_av * (nsp_tot_est * 1.5 / (nsp_est + nsp_tot_est)));
}
- else
- {
- nsp_max = nsp_target_av;
- }
const int cj4_start = nbl->sci.back().cj4_ind_start;
const int cj4_end = nbl->sci.back().cj4_ind_end;
{
nl->nri = 0;
nl->nrj = 0;
- if (nl->jindex == nullptr)
+ if (nl->jindex.empty())
{
- snew(nl->jindex, 1);
+ nl->jindex.resize(1);
}
nl->jindex[0] = 0;
}
gmx_unused static void set_icell_bb(const Grid& iGrid, int ci, real shx, real shy, real shz, NbnxnPairlistGpuWork* work)
{
#if NBNXN_BBXXXX
- set_icell_bbxxxx_supersub(iGrid.packedBoundingBoxes(), ci, shx, shy, shz,
- work->iSuperClusterData.bbPacked.data());
+ set_icell_bbxxxx_supersub(
+ iGrid.packedBoundingBoxes(), ci, shx, shy, shz, work->iSuperClusterData.bbPacked.data());
#else
set_icell_bb_supersub(iGrid.iBoundingBoxes(), ci, shx, shy, shz, work->iSuperClusterData.bb.data());
#endif
}
/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
-static void icell_set_x(int ci,
- real shx,
- real shy,
- real shz,
- int stride,
- const real* x,
+static void icell_set_x(int ci,
+ real shx,
+ real shy,
+ real shz,
+ int stride,
+ const real* x,
ClusterDistanceKernelType gmx_unused kernelType,
NbnxnPairlistGpuWork* work)
{
/* Estimates the interaction volume^2 for non-local interactions */
static real nonlocal_vol2(const struct gmx_domdec_zones_t* zones, const rvec ls, real r)
{
- real cl, ca, za;
- real vold_est;
- real vol2_est_tot;
-
- vol2_est_tot = 0;
+ real vol2_est_tot = 0;
/* Here we simply add up the non-home interaction volume^2 of the 1, 2 or 3
* zones of the 1D decompositions. As these volumes are not additive,
{
if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
{
- cl = 0;
- ca = 1;
- za = 1;
+ real cl = 0;
+ real ca = 1;
+ real za = 1;
for (int d = 0; d < DIM; d++)
{
if (zones->shift[z][d] == 0)
}
/* 4 octants of a sphere */
- vold_est = 0.25 * M_PI * r * r * r * r;
+ real vold_est = 0.25 * M_PI * r * r * r * r;
/* 4 quarter pie slices on the edges */
vold_est += 4 * cl * M_PI / 6.0 * r * r * r;
/* One rectangular volume on a face */
* Maxwell is less sensitive to the exact value.
*/
const int nsubpair_target_min = 36;
- real r_eff_sup, vol_est, nsp_est, nsp_est_nl;
const Grid& grid = gridSet.grids()[0];
ls[ZZ] = numAtomsCluster / (dims.atomDensity * ls[XX] * ls[YY]);
/* The formulas below are a heuristic estimate of the average nsj per si */
- r_eff_sup = rlist + nbnxn_get_rlist_effective_inc(numAtomsCluster, ls);
+ const real r_eff_sup = rlist + nbnxn_get_rlist_effective_inc(numAtomsCluster, ls);
- if (!gridSet.domainSetup().haveMultipleDomains || gridSet.domainSetup().zones->n == 1)
- {
- nsp_est_nl = 0;
- }
- else
+ real nsp_est_nl = 0;
+ if (gridSet.domainSetup().haveMultipleDomains && gridSet.domainSetup().zones->n != 1)
{
nsp_est_nl = gmx::square(dims.atomDensity / numAtomsCluster)
* nonlocal_vol2(gridSet.domainSetup().zones, ls, r_eff_sup);
}
+ real nsp_est = nsp_est_nl;
if (iloc == InteractionLocality::Local)
{
/* Sub-cell interacts with itself */
- vol_est = ls[XX] * ls[YY] * ls[ZZ];
+ real vol_est = ls[XX] * ls[YY] * ls[ZZ];
/* 6/2 rectangular volume on the faces */
vol_est += (ls[XX] * ls[YY] + ls[XX] * ls[ZZ] + ls[YY] * ls[ZZ]) * r_eff_sup;
/* 12/2 quarter pie slices on the edges */
/* Subtract the non-local pair count */
nsp_est -= nsp_est_nl;
- /* For small cut-offs nsp_est will be an underesimate.
+ /* For small cut-offs nsp_est will be an underestimate.
* With DD nsp_est_nl is an overestimate so nsp_est can get negative.
* So to avoid too small or negative nsp_est we set a minimum of
* all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n", nsp_est, nsp_est_nl);
}
}
- else
- {
- nsp_est = nsp_est_nl;
- }
/* Thus the (average) maximum j-list size should be as follows.
* Since there is overhead, we shouldn't make the lists too small
{
for (const nbnxn_ci_t& ciEntry : nbl.ci)
{
- fprintf(fp, "ci %4d shift %2d ncj %3d\n", ciEntry.ci, ciEntry.shift,
- ciEntry.cj_ind_end - ciEntry.cj_ind_start);
+ fprintf(fp, "ci %4d shift %2d ncj %3d\n", ciEntry.ci, ciEntry.shift, ciEntry.cj_ind_end - ciEntry.cj_ind_start);
for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
{
}
}
}
- fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n", sci.sci, sci.shift,
- sci.numJClusterGroups(), ncp);
+ fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n", sci.sci, sci.shift, sci.numJClusterGroups(), ncp);
}
}
int nsci = nblc->sci.size();
int ncj4 = nblc->cj4.size();
int nexcl = nblc->excl.size();
- for (auto& nbl : nbls)
+ for (const auto& nbl : nbls)
{
nsci += nbl.sci.size();
ncj4 += nbl.cj4.size();
/* Each thread should copy its own data to the combined arrays,
* as otherwise data will go back and forth between different caches.
*/
- const int gmx_unused nthreads = gmx_omp_nthreads_get(emntPairsearch);
+ const int gmx_unused nthreads = gmx_omp_nthreads_get(ModuleMultiThread::Pairsearch);
#pragma omp parallel for num_threads(nthreads) schedule(static)
for (gmx::index n = 0; n < nbls.ssize(); n++)
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
- for (auto& nbl : nbls)
+ for (const auto& nbl : nbls)
{
nblc->nci_tot += nbl.nci_tot;
}
const int nrj_target = (nrj_tot + numLists - 1) / numLists;
- GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
+ GMX_ASSERT(gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded) == numLists,
"We should have as many work objects as FEP lists");
#pragma omp parallel for schedule(static) num_threads(numLists)
if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
{
nbl->maxnrj = over_alloc_small(nrj_tot);
- srenew(nbl->jjnr, nbl->maxnrj);
- srenew(nbl->excl_fep, nbl->maxnrj);
+ nbl->jjnr.resize(nbl->maxnrj);
+ nbl->excl_fep.resize(nbl->maxnrj);
}
clear_pairlist_fep(nbl);
for (int i = 0; i < nbls->nri; i++)
{
- int nrj;
-
/* The number of pairs in this i-entry */
- nrj = nbls->jindex[i + 1] - nbls->jindex[i];
+ const int nrj = nbls->jindex[i + 1] - nbls->jindex[i];
/* Decide if list th_dest is too large and we should proceed
* to the next destination list.
* is only performed when only 1 out of 8 sub-cells is within range,
* because the GPU is much faster than the CPU.
*/
- real bbx, bby;
- real rbb2;
- bbx = 0.5 * (iGridDims.cellSize[XX] + jGridDims.cellSize[XX]);
- bby = 0.5 * (iGridDims.cellSize[YY] + jGridDims.cellSize[YY]);
+ real bbx = 0.5 * (iGridDims.cellSize[XX] + jGridDims.cellSize[XX]);
+ real bby = 0.5 * (iGridDims.cellSize[YY] + jGridDims.cellSize[YY]);
if (!simple)
{
bbx /= c_gpuNumClusterPerCellX;
bby /= c_gpuNumClusterPerCellY;
}
- rbb2 = std::max(0.0, rlist - 0.5 * std::sqrt(bbx * bbx + bby * bby));
- rbb2 = rbb2 * rbb2;
+ real rbb2 = std::max(0.0, rlist - 0.5 * std::sqrt(bbx * bbx + bby * bby));
+ rbb2 = rbb2 * rbb2;
#if !GMX_DOUBLE
return rbb2;
#else
- return (float)((1 + GMX_FLOAT_EPS) * rbb2);
+ return static_cast<float>((1 + GMX_FLOAT_EPS) * rbb2);
#endif
}
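+/* Worked example (numbers assumed): with rlist = 1.0 nm and bbx = bby = 0.4 nm,
+ * rbb2 = max(0, 1.0 - 0.5*sqrt(0.32))^2, roughly 0.51 nm^2; the GMX_FLOAT_EPS
+ * factor compensates for float rounding of this cut-off in double builds.
+ */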
const int ci_block_enum = 5;
const int ci_block_denom = 11;
const int ci_block_min_atoms = 16;
- int ci_block;
/* Here we decide how to distribute the blocks over the threads.
* We use prime numbers to try to avoid the grid size becoming
*/
GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
GMX_ASSERT(numLists > 0, "We need at least one list");
- ci_block = (iGrid.numCells() * ci_block_enum)
- / (ci_block_denom * iGrid.dimensions().numCells[XX] * numLists);
+ int ci_block = (iGrid.numCells() * ci_block_enum)
+ / (ci_block_denom * iGrid.dimensions().numCells[XX] * numLists);
const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
switch (kernelType)
{
case ClusterDistanceKernelType::CpuPlainC:
- makeClusterListSimple(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal,
- nbat->x().data(), rlist2, rbb2, numDistanceChecks);
+ makeClusterListSimple(
+ jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, nbat->x().data(), rlist2, rbb2, numDistanceChecks);
break;
#ifdef GMX_NBNXN_SIMD_4XN
case ClusterDistanceKernelType::CpuSimd_4xM:
- makeClusterListSimd4xn(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal,
- nbat->x().data(), rlist2, rbb2, numDistanceChecks);
+ makeClusterListSimd4xn(
+ jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, nbat->x().data(), rlist2, rbb2, numDistanceChecks);
break;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
case ClusterDistanceKernelType::CpuSimd_2xMM:
- makeClusterListSimd2xnn(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal,
- nbat->x().data(), rlist2, rbb2, numDistanceChecks);
+ makeClusterListSimd2xnn(
+ jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, nbat->x().data(), rlist2, rbb2, numDistanceChecks);
break;
#endif
default: GMX_ASSERT(false, "Unhandled kernel type");
}
}
-static void makeClusterListWrapper(NbnxnPairlistGpu* nbl,
- const Grid& gmx_unused iGrid,
- const int ci,
- const Grid& jGrid,
- const int firstCell,
- const int lastCell,
- const bool excludeSubDiagonal,
- const nbnxn_atomdata_t* nbat,
- const real rlist2,
- const real rbb2,
+static void makeClusterListWrapper(NbnxnPairlistGpu* nbl,
+ const Grid& gmx_unused iGrid,
+ const int ci,
+ const Grid& jGrid,
+ const int firstCell,
+ const int lastCell,
+ const bool excludeSubDiagonal,
+ const nbnxn_atomdata_t* nbat,
+ const real rlist2,
+ const real rbb2,
ClusterDistanceKernelType gmx_unused kernelType,
int* numDistanceChecks)
{
for (int cj = firstCell; cj <= lastCell; cj++)
{
- make_cluster_list_supersub(iGrid, jGrid, nbl, ci, cj, excludeSubDiagonal, nbat->xstride,
- nbat->x().data(), rlist2, rbb2, numDistanceChecks);
+ make_cluster_list_supersub(
+ iGrid, jGrid, nbl, ci, cj, excludeSubDiagonal, nbat->xstride, nbat->x().data(), rlist2, rbb2, numDistanceChecks);
}
}
}
static void setBufferFlags(const NbnxnPairlistGpu gmx_unused& nbl,
- int gmx_unused ncj_old_j,
- int gmx_unused gridj_flag_shift,
+ int gmx_unused ncj_old_j,
+ int gmx_unused gridj_flag_shift,
gmx_bitmask_t gmx_unused* gridj_flag,
- int gmx_unused th)
+ int gmx_unused th)
{
GMX_ASSERT(false, "This function should never be called");
}
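+/* This GPU-list overload exists only so the templated search code compiles;
+ * buffer flags are a simple-(CPU-)list concept, hence the assertion above. */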
T* nbl,
t_nblist* nbl_fep)
{
- int na_cj_2log;
matrix box;
real rl_fep2 = 0;
- float rbb2;
- int ci_b, ci, ci_x, ci_y, ci_xy;
ivec shp;
- real bx0, bx1, by0, by1, bz0, bz1;
- real bz1_frac;
- real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
- int cxf, cxl, cyf, cyf_x, cyl;
- int numDistanceChecks;
int gridi_flag_shift = 0, gridj_flag_shift = 0;
gmx_bitmask_t* gridj_flag = nullptr;
- int ncj_old_i, ncj_old_j;
if (jGrid.geometry().isSimple != pairlistIsSimple(*nbl)
|| iGrid.geometry().isSimple != pairlistIsSimple(*nbl))
sync_work(nbl);
GMX_ASSERT(nbl->na_ci == jGrid.geometry().numAtomsICluster,
"The cluster sizes in the list and grid should match");
- nbl->na_cj = JClusterSizePerListType[pairlistType];
- na_cj_2log = get_2log(nbl->na_cj);
+ nbl->na_cj = JClusterSizePerListType[pairlistType];
+ const int na_cj_2log = get_2log(nbl->na_cj);
nbl->rlist = rlist;
gridi_flag_shift = getBufferFlagShift(nbl->na_ci);
gridj_flag_shift = getBufferFlagShift(nbl->na_cj);
- gridj_flag = work->buffer_flags.flag;
+ gridj_flag = work->buffer_flags.data();
}
gridSet.getBox(box);
* We should not simply use rlist, since then we would not have
* the small, effective buffering of the NxN lists.
* The buffer is an overestimate, but the resulting cost for pairs
- * beyond rlist is neglible compared to the FEP pairs within rlist.
+ * beyond rlist is negligible compared to the FEP pairs within rlist.
*/
rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(iGrid, jGrid);
const Grid::Dimensions& iGridDims = iGrid.dimensions();
const Grid::Dimensions& jGridDims = jGrid.dimensions();
- rbb2 = boundingbox_only_distance2(iGridDims, jGridDims, nbl->rlist, pairlistIsSimple(*nbl));
+ const float rbb2 =
+ boundingbox_only_distance2(iGridDims, jGridDims, nbl->rlist, pairlistIsSimple(*nbl));
if (debug)
{
if (debug)
{
- fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n", iGrid.numCells(),
- iGrid.numCells() / static_cast<double>(iGrid.numColumns()), ci_block);
+ fprintf(debug,
+ "nbl nc_i %d col.av. %.1f ci_block %d\n",
+ iGrid.numCells(),
+ iGrid.numCells() / static_cast<double>(iGrid.numColumns()),
+ ci_block);
}
- numDistanceChecks = 0;
+ int numDistanceChecks = 0;
const real listRangeBBToJCell2 =
gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid.dimensions()));
/* Initialize ci_b and ci to one before where we want them to start,
* as they will both be incremented in next_ci.
*/
- ci_b = -1;
- ci = th * ci_block - 1;
- ci_x = 0;
- ci_y = 0;
+ int ci_b = -1;
+ int ci = th * ci_block - 1;
+ int ci_x = 0;
+ int ci_y = 0;
while (next_ci(iGrid, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
{
if (bSimple && flags_i[ci] == 0)
{
continue;
}
- ncj_old_i = getNumSimpleJClustersInList(*nbl);
+ const int ncj_old_i = getNumSimpleJClustersInList(*nbl);
- d2cx = 0;
+ real d2cx = 0;
if (!isIntraGridList && shp[XX] == 0)
{
- if (bSimple)
- {
- bx1 = bb_i[ci].upper.x;
- }
- else
- {
- bx1 = iGridDims.lowerCorner[XX] + (real(ci_x) + 1) * iGridDims.cellSize[XX];
- }
+ const real bx1 =
+ bSimple ? bb_i[ci].upper.x
+ : iGridDims.lowerCorner[XX] + (real(ci_x) + 1) * iGridDims.cellSize[XX];
if (bx1 < jGridDims.lowerCorner[XX])
{
d2cx = gmx::square(jGridDims.lowerCorner[XX] - bx1);
}
}
- ci_xy = ci_x * iGridDims.numCells[YY] + ci_y;
+ int ci_xy = ci_x * iGridDims.numCells[YY] + ci_y;
/* Loop over shift vectors in three dimensions */
for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
{
const real shz = real(tz) * box[ZZ][ZZ];
- bz0 = bbcz_i[ci].lower + shz;
- bz1 = bbcz_i[ci].upper + shz;
+ real bz0 = bbcz_i[ci].lower + shz;
+ real bz1 = bbcz_i[ci].upper + shz;
- if (tz == 0)
- {
- d2z = 0;
- }
- else if (tz < 0)
+ real d2z = 0;
+ if (tz < 0)
{
d2z = gmx::square(bz1);
}
- else
+ else if (tz > 0)
{
d2z = gmx::square(bz0 - box[ZZ][ZZ]);
}
- d2z_cx = d2z + d2cx;
+ const real d2z_cx = d2z + d2cx;
if (d2z_cx >= rlist2)
{
continue;
}
- bz1_frac = bz1 / real(iGrid.numCellsInColumn(ci_xy));
+ real bz1_frac = bz1 / real(iGrid.numCellsInColumn(ci_xy));
if (bz1_frac < 0)
{
bz1_frac = 0;
{
const real shy = real(ty) * box[YY][YY] + real(tz) * box[ZZ][YY];
- if (bSimple)
- {
- by0 = bb_i[ci].lower.y + shy;
- by1 = bb_i[ci].upper.y + shy;
- }
- else
- {
- by0 = iGridDims.lowerCorner[YY] + (real(ci_y)) * iGridDims.cellSize[YY] + shy;
- by1 = iGridDims.lowerCorner[YY] + (real(ci_y) + 1) * iGridDims.cellSize[YY] + shy;
- }
+ const real by0 = bSimple ? bb_i[ci].lower.y + shy
+ : iGridDims.lowerCorner[YY]
+ + (real(ci_y)) * iGridDims.cellSize[YY] + shy;
+ const real by1 = bSimple ? bb_i[ci].upper.y + shy
+ : iGridDims.lowerCorner[YY]
+ + (real(ci_y) + 1) * iGridDims.cellSize[YY] + shy;
+ int cyf, cyl; //NOLINT(cppcoreguidelines-init-variables)
get_cell_range<YY>(by0, by1, jGridDims, d2z_cx, rlist, &cyf, &cyl);
if (cyf > cyl)
continue;
}
- d2z_cy = d2z;
+ real d2z_cy = d2z;
if (by1 < jGridDims.lowerCorner[YY])
{
d2z_cy += gmx::square(jGridDims.lowerCorner[YY] - by1);
for (int tx = -shp[XX]; tx <= shp[XX]; tx++)
{
- const int shift = XYZ2IS(tx, ty, tz);
+ const int shift = xyzToShiftIndex(tx, ty, tz);
- const bool excludeSubDiagonal = (isIntraGridList && shift == CENTRAL);
+ const bool excludeSubDiagonal = (isIntraGridList && shift == gmx::c_centralShiftIndex);
- if (c_pbcShiftBackward && isIntraGridList && shift > CENTRAL)
+ if (c_pbcShiftBackward && isIntraGridList && shift > gmx::c_centralShiftIndex)
{
continue;
}
const real shx =
real(tx) * box[XX][XX] + real(ty) * box[YY][XX] + real(tz) * box[ZZ][XX];
- if (bSimple)
- {
- bx0 = bb_i[ci].lower.x + shx;
- bx1 = bb_i[ci].upper.x + shx;
- }
- else
- {
- bx0 = iGridDims.lowerCorner[XX] + (real(ci_x)) * iGridDims.cellSize[XX] + shx;
- bx1 = iGridDims.lowerCorner[XX] + (real(ci_x) + 1) * iGridDims.cellSize[XX] + shx;
- }
+ const real bx0 = bSimple ? bb_i[ci].lower.x + shx
+ : iGridDims.lowerCorner[XX]
+ + (real(ci_x)) * iGridDims.cellSize[XX] + shx;
+ const real bx1 = bSimple ? bb_i[ci].upper.x + shx
+ : iGridDims.lowerCorner[XX]
+ + (real(ci_x) + 1) * iGridDims.cellSize[XX] + shx;
+ int cxf, cxl; //NOLINT(cppcoreguidelines-init-variables)
get_cell_range<XX>(bx0, bx1, jGridDims, d2z_cy, rlist, &cxf, &cxl);
if (cxf > cxl)
set_icell_bb(iGrid, ci, shx, shy, shz, nbl->work.get());
- icell_set_x(cell0_i + ci, shx, shy, shz, nbat->xstride, nbat->x().data(),
- kernelType, nbl->work.get());
+ icell_set_x(cell0_i + ci,
+ shx,
+ shy,
+ shz,
+ nbat->xstride,
+ nbat->x().data(),
+ kernelType,
+ nbl->work.get());
for (int cx = cxf; cx <= cxl; cx++)
{
const real cx_real = cx;
- d2zx = d2z;
+ real d2zx = d2z;
if (jGridDims.lowerCorner[XX] + cx_real * jGridDims.cellSize[XX] > bx1)
{
d2zx += gmx::square(jGridDims.lowerCorner[XX]
+ (cx_real + 1) * jGridDims.cellSize[XX] - bx0);
}
- if (isIntraGridList && cx == 0 && (!c_pbcShiftBackward || shift == CENTRAL)
- && cyf < ci_y)
- {
- /* Leave the pairs with i > j.
- * Skip half of y when i and j have the same x.
- */
- cyf_x = ci_y;
- }
- else
- {
- cyf_x = cyf;
- }
+ /* When true, leave the pairs with i > j.
+ * Skip half of y when i and j have the same x.
+ */
+ const bool skipHalfY = (isIntraGridList && cx == 0
+ && (!c_pbcShiftBackward || shift == gmx::c_centralShiftIndex)
+ && cyf < ci_y);
+ const int cyf_x = skipHalfY ? ci_y : cyf;
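+ /* Example (illustrative): in the central shift with cx == 0, an i-cell at
+ * ci_y = 5 must not revisit j-columns with cy < 5, as those pairs were
+ * already generated with i and j swapped.
+ */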
for (int cy = cyf_x; cy <= cyl; cy++)
{
jGrid.firstCellInColumn(cx * jGridDims.numCells[YY] + cy + 1);
const real cy_real = cy;
- d2zxy = d2zx;
+ real d2zxy = d2zx;
if (jGridDims.lowerCorner[YY] + cy_real * jGridDims.cellSize[YY] > by1)
{
d2zxy += gmx::square(jGridDims.lowerCorner[YY]
*/
int midCell =
columnStart
- + static_cast<int>(bz1_frac
- * static_cast<real>(columnEnd - columnStart));
+ + static_cast<int>(
+ bz1_frac * static_cast<real>(columnEnd - columnStart));
if (midCell >= columnEnd)
{
midCell = columnEnd - 1;
}
- d2xy = d2zxy - d2z;
+ const real d2xy = d2zxy - d2z;
/* Find the lowest cell that can possibly
* be within range.
/* We want each atom/cell pair only once,
* so we only use cj >= ci.
*/
- if (!c_pbcShiftBackward || shift == CENTRAL)
+ if (!c_pbcShiftBackward || shift == gmx::c_centralShiftIndex)
{
firstCell = std::max(firstCell, ci);
}
"column");
/* For f buffer flags with simple lists */
- ncj_old_j = getNumSimpleJClustersInList(*nbl);
-
- makeClusterListWrapper(nbl, iGrid, ci, jGrid, firstCell, lastCell,
- excludeSubDiagonal, nbat, rlist2, rbb2,
- kernelType, &numDistanceChecks);
+ const int ncj_old_j = getNumSimpleJClustersInList(*nbl);
+
+ makeClusterListWrapper(nbl,
+ iGrid,
+ ci,
+ jGrid,
+ firstCell,
+ lastCell,
+ excludeSubDiagonal,
+ nbat,
+ rlist2,
+ rbb2,
+ kernelType,
+ &numDistanceChecks);
if (bFBufferFlag)
{
}
}
- /* Set the exclusions for this ci list */
- setExclusionsForIEntry(gridSet, nbl, excludeSubDiagonal, na_cj_2log,
- *getOpenIEntry(nbl), exclusions);
+ if (!exclusions.empty())
+ {
+ /* Set the exclusions for this ci list */
+ setExclusionsForIEntry(
+ gridSet, nbl, excludeSubDiagonal, na_cj_2log, *getOpenIEntry(nbl), exclusions);
+ }
if (haveFep)
{
- make_fep_list(gridSet.atomIndices(), nbat, nbl, excludeSubDiagonal,
- getOpenIEntry(nbl), shx, shy, shz, rl_fep2, iGrid, jGrid, nbl_fep);
+ make_fep_list(gridSet.atomIndices(),
+ nbat,
+ nbl,
+ excludeSubDiagonal,
+ getOpenIEntry(nbl),
+ shx,
+ shy,
+ shz,
+ rl_fep2,
+ iGrid,
+ jGrid,
+ nbl_fep);
}
/* Close this ci list */
if (bFBufferFlag && getNumSimpleJClustersInList(*nbl) > ncj_old_i)
{
- bitmask_init_bit(&(work->buffer_flags.flag[(iGrid.cellOffset() + ci) >> gridi_flag_shift]), th);
+ bitmask_init_bit(&(work->buffer_flags[(iGrid.cellOffset() + ci) >> gridi_flag_shift]), th);
}
}
static void reduce_buffer_flags(gmx::ArrayRef<PairsearchWork> searchWork,
int nsrc,
- const nbnxn_buffer_flags_t* dest)
+ gmx::ArrayRef<gmx_bitmask_t> dest)
{
for (int s = 0; s < nsrc; s++)
{
- gmx_bitmask_t* flag = searchWork[s].buffer_flags.flag;
+ gmx::ArrayRef<gmx_bitmask_t> flags(searchWork[s].buffer_flags);
- for (int b = 0; b < dest->nflag; b++)
+ for (size_t b = 0; b < dest.size(); b++)
{
- bitmask_union(&(dest->flag[b]), flag[b]);
+ gmx_bitmask_t& flag = dest[b];
+ bitmask_union(&flag, flags[b]);
}
}
}
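+/* After this union, bit t of dest[b] is set when thread t touched flag block
+ * b, which lets the force-buffer reduction skip blocks that no list wrote to.
+ */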
-static void print_reduction_cost(const nbnxn_buffer_flags_t* flags, int nout)
+static void print_reduction_cost(gmx::ArrayRef<const gmx_bitmask_t> flags, int nout)
{
- int nelem, nkeep, ncopy, nred, out;
- gmx_bitmask_t mask_0;
+ int nelem = 0;
+ int nkeep = 0;
+ int ncopy = 0;
+ int nred = 0;
- nelem = 0;
- nkeep = 0;
- ncopy = 0;
- nred = 0;
+ gmx_bitmask_t mask_0; // NOLINT(cppcoreguidelines-init-variables)
bitmask_init_bit(&mask_0, 0);
- for (int b = 0; b < flags->nflag; b++)
+ for (const gmx_bitmask_t& flag_mask : flags)
{
- if (bitmask_is_equal(flags->flag[b], mask_0))
+ if (bitmask_is_equal(flag_mask, mask_0))
{
/* Only flag 0 is set, no copy of reduction required */
nelem++;
nkeep++;
}
- else if (!bitmask_is_zero(flags->flag[b]))
+ else if (!bitmask_is_zero(flag_mask))
{
int c = 0;
- for (out = 0; out < nout; out++)
+ for (int out = 0; out < nout; out++)
{
- if (bitmask_is_set(flags->flag[b], out))
+ if (bitmask_is_set(flag_mask, out))
{
c++;
}
}
}
}
-
+ const auto numFlags = static_cast<double>(flags.size());
fprintf(debug,
- "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
- flags->nflag, nout, nelem / static_cast<double>(flags->nflag),
- nkeep / static_cast<double>(flags->nflag), ncopy / static_cast<double>(flags->nflag),
- nred / static_cast<double>(flags->nflag));
+ "nbnxn reduction: #flag %zu #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
+ flags.size(),
+ nout,
+ nelem / numFlags,
+ nkeep / numFlags,
+ ncopy / numFlags,
+ nred / numFlags);
}
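+/* Reading the statistics above (descriptive): "keep" counts blocks touched by
+ * output 0 only, "copy" blocks written by a single output (a plain copy
+ * suffices), and "red" blocks needing a genuine multi-output reduction.
+ */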
/* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
* When setFlags==true, flag bit t is set in flag for all i and j clusters.
*/
template<bool setFlags>
-static void copySelectedListRange(const nbnxn_ci_t* gmx_restrict srcCi,
+static void copySelectedListRange(const nbnxn_ci_t* gmx_restrict srcCi,
const NbnxnPairlistCpu* gmx_restrict src,
- NbnxnPairlistCpu* gmx_restrict dest,
- gmx_bitmask_t* flag,
- int iFlagShift,
- int jFlagShift,
- int t)
+ NbnxnPairlistCpu* gmx_restrict dest,
+ gmx_bitmask_t* flag,
+ int iFlagShift,
+ int jFlagShift,
+ int t)
{
const int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;
}
}
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
+#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 7
/* Avoid gcc 7 avx512 loop vectorization bug (actually only needed with -mavx512f) */
# pragma GCC push_options
# pragma GCC optimize("no-tree-vectorize")
return ncjTotal;
}
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
+#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 7
# pragma GCC pop_options
#endif
/* Note that the flags in the work struct (still) contain flags
* for all entries that are present in srcSet->nbl[t].
*/
- gmx_bitmask_t* flag = searchWork[t].buffer_flags.flag;
+ gmx_bitmask_t* flag = &searchWork[t].buffer_flags[0];
int iFlagShift = getBufferFlagShift(dest.na_ci);
int jFlagShift = getBufferFlagShift(dest.na_cj);
}
else
{
- copySelectedListRange<false>(srcCi, src, &dest, flag, iFlagShift,
- jFlagShift, t);
+ copySelectedListRange<false>(
+ srcCi, src, &dest, flag, iFlagShift, jFlagShift, t);
}
}
cjGlobal += ncj;
}
/* Returns the j-zone range for pairlist construction for the given locality and i-zone */
-static Range<int> getJZoneRange(const gmx_domdec_zones_t& ddZones,
+static Range<int> getJZoneRange(const gmx_domdec_zones_t* ddZones,
const InteractionLocality locality,
const int iZone)
{
else if (iZone == 0)
{
/* Non-local: we need to avoid the local (zone 0 vs 0) interactions */
- return { 1, *ddZones.iZones[iZone].jZoneRange.end() };
+ return { 1, *ddZones->iZones[iZone].jZoneRange.end() };
}
else
{
/* Non-local with non-local i-zone: use all j-zones */
- return ddZones.iZones[iZone].jZoneRange;
+ return ddZones->iZones[iZone].jZoneRange;
}
}
//! Prepares CPU lists produced by the search for dynamic pruning
static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);
-void PairlistSet::constructPairlists(const Nbnxm::GridSet& gridSet,
+void PairlistSet::constructPairlists(gmx::InteractionLocality locality,
+ const Nbnxm::GridSet& gridSet,
gmx::ArrayRef<PairsearchWork> searchWork,
nbnxn_atomdata_t* nbat,
const ListOfLists<int>& exclusions,
{
const real rlist = params_.rlistOuter;
- int nsubpair_target;
- float nsubpair_tot_est;
- int ci_block;
- gmx_bool progBal;
- int np_tot, np_noq, np_hlj, nap;
-
const int numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());
if (debug)
nbat->bUseBufferFlags = (nbat->out.size() > 1);
/* We should re-init the flags before making the first list */
- if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
+ if (nbat->bUseBufferFlags && locality == InteractionLocality::Local)
{
- init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
+ resizeAndZeroBufferFlags(&nbat->buffer_flags, nbat->numAtoms());
}
+ int nsubpair_target = 0;
+ float nsubpair_tot_est = 0.0F;
if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
{
- get_nsubpair_target(gridSet, locality_, rlist, minimumIlistCountForGpuBalancing,
- &nsubpair_target, &nsubpair_tot_est);
- }
- else
- {
- nsubpair_target = 0;
- nsubpair_tot_est = 0;
+ get_nsubpair_target(
+ gridSet, locality, rlist, minimumIlistCountForGpuBalancing, &nsubpair_target, &nsubpair_tot_est);
}
/* Clear all pair-lists */
}
}
- const gmx_domdec_zones_t& ddZones = *gridSet.domainSetup().zones;
+ const gmx_domdec_zones_t* ddZones = gridSet.domainSetup().zones;
+ GMX_ASSERT(locality == InteractionLocality::Local || ddZones != nullptr,
+ "Nonlocal interaction locality with null ddZones.");
- const auto iZoneRange = getIZoneRange(gridSet.domainSetup(), locality_);
+ const auto iZoneRange = getIZoneRange(gridSet.domainSetup(), locality);
for (const int iZone : iZoneRange)
{
const Grid& iGrid = gridSet.grids()[iZone];
- const auto jZoneRange = getJZoneRange(ddZones, locality_, iZone);
+ const auto jZoneRange = getJZoneRange(ddZones, locality, iZone);
for (int jZone : jZoneRange)
{
searchCycleCounting->start(enbsCCsearch);
- ci_block = get_ci_block_size(iGrid, gridSet.domainSetup().haveMultipleDomains, numLists);
+ const int ci_block =
+ get_ci_block_size(iGrid, gridSet.domainSetup().haveMultipleDomains, numLists);
/* With GPU: generate progressively smaller lists for
* load balancing for local only or non-local with 2 zones.
*/
- progBal = (locality_ == InteractionLocality::Local || ddZones.n <= 2);
+ const bool progBal = (locality == InteractionLocality::Local || ddZones->n <= 2);
#pragma omp parallel for num_threads(numLists) schedule(static)
for (int th = 0; th < numLists; th++)
*/
if (nbat->bUseBufferFlags && (iZone == 0 && jZone == 0))
{
- init_buffer_flags(&searchWork[th].buffer_flags, nbat->numAtoms());
+ resizeAndZeroBufferFlags(&searchWork[th].buffer_flags, nbat->numAtoms());
}
if (combineLists_ && th > 0)
/* Divide the i cells equally over the pairlists */
if (isCpuType_)
{
- nbnxn_make_pairlist_part(gridSet, iGrid, jGrid, &work, nbat, exclusions, rlist,
- params_.pairlistType, ci_block, nbat->bUseBufferFlags,
- nsubpair_target, progBal, nsubpair_tot_est, th,
- numLists, &cpuLists_[th], fepListPtr);
+ nbnxn_make_pairlist_part(gridSet,
+ iGrid,
+ jGrid,
+ &work,
+ nbat,
+ exclusions,
+ rlist,
+ params_.pairlistType,
+ ci_block,
+ nbat->bUseBufferFlags,
+ nsubpair_target,
+ progBal,
+ nsubpair_tot_est,
+ th,
+ numLists,
+ &cpuLists_[th],
+ fepListPtr);
}
else
{
- nbnxn_make_pairlist_part(gridSet, iGrid, jGrid, &work, nbat, exclusions, rlist,
- params_.pairlistType, ci_block, nbat->bUseBufferFlags,
- nsubpair_target, progBal, nsubpair_tot_est, th,
- numLists, &gpuLists_[th], fepListPtr);
+ nbnxn_make_pairlist_part(gridSet,
+ iGrid,
+ jGrid,
+ &work,
+ nbat,
+ exclusions,
+ rlist,
+ params_.pairlistType,
+ ci_block,
+ nbat->bUseBufferFlags,
+ nsubpair_target,
+ progBal,
+ nsubpair_tot_est,
+ th,
+ numLists,
+ &gpuLists_[th],
+ fepListPtr);
}
work.cycleCounter.stop();
}
searchCycleCounting->stop(enbsCCsearch);
- np_tot = 0;
- np_noq = 0;
- np_hlj = 0;
+ int np_tot = 0;
+ int np_noq = 0;
+ int np_hlj = 0;
for (int th = 0; th < numLists; th++)
{
inc_nrnb(nrnb, eNR_NBNXN_DIST2, searchWork[th].ndistc);
np_tot += nbl.nci_tot;
}
}
- if (isCpuType_)
- {
- nap = cpuLists_[0].na_ci * cpuLists_[0].na_cj;
- }
- else
- {
- nap = gmx::square(gpuLists_[0].na_ci);
- }
+ const int nap = isCpuType_ ? cpuLists_[0].na_ci * cpuLists_[0].na_cj
+ : gmx::square(gpuLists_[0].na_ci);
+
natpair_ljq_ = (np_tot - np_noq) * nap - np_hlj * nap / 2;
natpair_lj_ = np_noq * nap;
natpair_q_ = np_hlj * nap / 2;
if (nbat->bUseBufferFlags)
{
- reduce_buffer_flags(searchWork, numLists, &nbat->buffer_flags);
+ reduce_buffer_flags(searchWork, numLists, nbat->buffer_flags);
}
if (gridSet.haveFep())
if (nbat->bUseBufferFlags)
{
- print_reduction_cost(&nbat->buffer_flags, numLists);
+ print_reduction_cost(nbat->buffer_flags, numLists);
}
}
const int64_t step,
t_nrnb* nrnb)
{
- pairlistSet(iLocality).constructPairlists(pairSearch->gridSet(), pairSearch->work(), nbat,
- exclusions, minimumIlistCountForGpuBalancing_, nrnb,
+ const auto& gridSet = pairSearch->gridSet();
+ const auto* ddZones = gridSet.domainSetup().zones;
+
+ /* When using DD, the Nbnxm code can also work with more exclusions than
+ * those in the i-zones only, but the equality check can catch more issues.
+ */
+ GMX_RELEASE_ASSERT(
+ exclusions.empty() || (!ddZones && exclusions.ssize() == gridSet.numRealAtomsTotal())
+ || (ddZones && exclusions.ssize() == ddZones->cg_range[ddZones->iZones.size()]),
+ "exclusions should either be empty or the number of lists should match the number of "
+ "local i-atoms");
+
+ pairlistSet(iLocality).constructPairlists(iLocality,
+ gridSet,
+ pairSearch->work(),
+ nbat,
+ exclusions,
+ minimumIlistCountForGpuBalancing_,
+ nrnb,
&pairSearch->cycleCounting_);
if (iLocality == InteractionLocality::Local)
void nonbonded_verlet_t::constructPairlist(const InteractionLocality iLocality,
const ListOfLists<int>& exclusions,
int64_t step,
- t_nrnb* nrnb)
+ t_nrnb* nrnb) const
{
pairlistSets_->construct(iLocality, pairSearch_.get(), nbat.get(), exclusions, step, nrnb);