Merge branch release-2018 into release-2019
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_search.cpp
index 10af3399bff5c050eb90a29d6766f5a0c557435a..8399236d1417af510247be52a68eaa6344f2a308 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016,2017,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "config.h"
 
-#include <assert.h>
-#include <string.h>
-
+#include <cassert>
 #include <cmath>
+#include <cstring>
 
 #include <algorithm>
 
@@ -66,6 +65,7 @@
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/simd/simd.h"
 #include "gromacs/simd/vector_operations.h"
+#include "gromacs/topology/block.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxomp.h"
@@ -92,10 +92,10 @@ static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 
 static double Mcyc_av(const nbnxn_cycle_t *cc)
 {
-    return (double)cc->c*1e-6/cc->count;
+    return static_cast<double>(cc->c)*1e-6/cc->count;
 }
 
-static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
+static void nbs_cycle_print(FILE *fp, const nbnxn_search *nbs)
 {
     fprintf(fp, "\n");
     fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
@@ -104,7 +104,7 @@ static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
             Mcyc_av(&nbs->cc[enbsCCsearch]),
             Mcyc_av(&nbs->cc[enbsCCreducef]));
 
-    if (nbs->nthread_max > 1)
+    if (nbs->work.size() > 1)
     {
         if (nbs->cc[enbsCCcombine].count > 0)
         {
@@ -112,10 +112,10 @@ static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
                     Mcyc_av(&nbs->cc[enbsCCcombine]));
         }
         fprintf(fp, " s. th");
-        for (int t = 0; t < nbs->nthread_max; t++)
+        for (const nbnxn_search_work_t &work : nbs->work)
         {
             fprintf(fp, " %4.1f",
-                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
+                    Mcyc_av(&work.cc[enbsCCsearch]));
         }
     }
     fprintf(fp, "\n");
@@ -130,19 +130,14 @@ enum class NbnxnLayout
     Gpu8x8x8   // i-cluster size 8, j-cluster size 8 + super-clustering
 };
 
+#if GMX_SIMD
 /* Returns the j-cluster size */
 template <NbnxnLayout layout>
 static constexpr int jClusterSize()
 {
-#if GMX_SIMD
     static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN || layout == NbnxnLayout::Simd2xNN, "Currently jClusterSize only supports CPU layouts");
 
     return layout == NbnxnLayout::Simd4xN ? GMX_SIMD_REAL_WIDTH : (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH/2 : NBNXN_CPU_CLUSTER_I_SIZE);
-#else
-    static_assert(layout == NbnxnLayout::NoSimd4x4, "Currently without SIMD, jClusterSize only supports NoSimd4x4");
-
-    return NBNXN_CPU_CLUSTER_I_SIZE;
-#endif
 }
 
 /* Returns the j-cluster index given the i-cluster index */
@@ -218,6 +213,7 @@ static inline int xIndexFromCj(int cj)
         return cj*STRIDE_P8;
     }
 }
+#endif //GMX_SIMD
 
 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 {
@@ -267,69 +263,79 @@ static void nbnxn_init_pairlist_fep(t_nblist *nl)
 
 }
 
-void nbnxn_init_search(nbnxn_search_t           * nbs_ptr,
-                       ivec                      *n_dd_cells,
-                       struct gmx_domdec_zones_t *zones,
-                       gmx_bool                   bFEP,
-                       int                        nthread_max)
+static void free_nblist(t_nblist *nl)
 {
-    nbnxn_search_t nbs;
-    int            ngrid;
+    sfree(nl->iinr);
+    sfree(nl->gid);
+    sfree(nl->shift);
+    sfree(nl->jindex);
+    sfree(nl->jjnr);
+    sfree(nl->excl_fep);
+}
 
-    snew(nbs, 1);
-    *nbs_ptr = nbs;
+nbnxn_search_work_t::nbnxn_search_work_t() :
+    cp0({{0}}
+        ),
+    buffer_flags({0, nullptr, 0}),
+    ndistc(0),
+    nbl_fep(new t_nblist),
+    cp1({{0}})
+{
+    nbnxn_init_pairlist_fep(nbl_fep.get());
 
-    nbs->bFEP   = bFEP;
+    nbs_cycle_clear(cc);
+}
 
-    nbs->DomDec = (n_dd_cells != nullptr);
+nbnxn_search_work_t::~nbnxn_search_work_t()
+{
+    sfree(buffer_flags.flag);
 
-    clear_ivec(nbs->dd_dim);
-    ngrid = 1;
-    if (nbs->DomDec)
-    {
-        nbs->zones = zones;
+    free_nblist(nbl_fep.get());
+}
 
+nbnxn_search::nbnxn_search(const ivec               *n_dd_cells,
+                           const gmx_domdec_zones_t *zones,
+                           gmx_bool                  bFEP,
+                           int                       nthread_max) :
+    bFEP(bFEP),
+    ePBC(epbcNONE), // The correct value will be set during the gridding
+    zones(zones),
+    natoms_local(0),
+    natoms_nonlocal(0),
+    search_count(0),
+    work(nthread_max)
+{
+    // The correct value will be set during the gridding
+    clear_mat(box);
+    clear_ivec(dd_dim);
+    int numGrids = 1;
+    DomDec = n_dd_cells != nullptr;
+    if (DomDec)
+    {
         for (int d = 0; d < DIM; d++)
         {
             if ((*n_dd_cells)[d] > 1)
             {
-                nbs->dd_dim[d] = 1;
+                dd_dim[d] = 1;
                 /* Each grid matches a DD zone */
-                ngrid *= 2;
+                numGrids *= 2;
             }
         }
     }
 
-    nbnxn_grids_init(nbs, ngrid);
-
-    nbs->cell        = nullptr;
-    nbs->cell_nalloc = 0;
-    nbs->a           = nullptr;
-    nbs->a_nalloc    = 0;
-
-    nbs->nthread_max = nthread_max;
-
-    /* Initialize the work data structures for each thread */
-    snew(nbs->work, nbs->nthread_max);
-    for (int t = 0; t < nbs->nthread_max; t++)
-    {
-        nbs->work[t].cxy_na           = nullptr;
-        nbs->work[t].cxy_na_nalloc    = 0;
-        nbs->work[t].sort_work        = nullptr;
-        nbs->work[t].sort_work_nalloc = 0;
-
-        snew(nbs->work[t].nbl_fep, 1);
-        nbnxn_init_pairlist_fep(nbs->work[t].nbl_fep);
-    }
+    grid.resize(numGrids);
 
     /* Initialize detailed nbsearch cycle counting */
-    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != nullptr);
-    nbs->search_count = 0;
-    nbs_cycle_clear(nbs->cc);
-    for (int t = 0; t < nbs->nthread_max; t++)
-    {
-        nbs_cycle_clear(nbs->work[t].cc);
-    }
+    print_cycles = (getenv("GMX_NBNXN_CYCLE") != nullptr);
+    nbs_cycle_clear(cc);
+}
+
+nbnxn_search *nbnxn_init_search(const ivec                *n_dd_cells,
+                                const gmx_domdec_zones_t  *zones,
+                                gmx_bool                   bFEP,
+                                int                        nthread_max)
+{
+    return new nbnxn_search(n_dd_cells, zones, bFEP, nthread_max);
 }
 
 static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
@@ -347,22 +353,56 @@ static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
     }
 }
 
+/* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
+ *
+ * Given a cutoff distance between atoms, this functions returns the cutoff
+ * distance2 between a bounding box of a group of atoms and a grid cell.
+ * Since atoms can be geometrically outside of the cell they have been
+ * assigned to (when atom groups instead of individual atoms are assigned
+ * to cells), this distance returned can be larger than the input.
+ */
+static real listRangeForBoundingBoxToGridCell(real                rlist,
+                                              const nbnxn_grid_t &grid)
+{
+    return rlist + grid.maxAtomGroupRadius;
+
+}
+/* Returns the pair-list cutoff between a grid cells given an atom-to-atom pair-list cutoff
+ *
+ * Given a cutoff distance between atoms, this functions returns the cutoff
+ * distance2 between two grid cells.
+ * Since atoms can be geometrically outside of the cell they have been
+ * assigned to (when atom groups instead of individual atoms are assigned
+ * to cells), this distance returned can be larger than the input.
+ */
+static real listRangeForGridCellToGridCell(real                rlist,
+                                           const nbnxn_grid_t &gridi,
+                                           const nbnxn_grid_t &gridj)
+{
+    return rlist + gridi.maxAtomGroupRadius + gridj.maxAtomGroupRadius;
+}
+
 /* Determines the cell range along one dimension that
  * the bounding box b0 - b1 sees.
  */
+template<int dim>
 static void get_cell_range(real b0, real b1,
-                           int nc, real c0, real s, real invs,
-                           real d2, real r2, int *cf, int *cl)
+                           const nbnxn_grid_t &gridj,
+                           real d2, real rlist, int *cf, int *cl)
 {
-    *cf = std::max(static_cast<int>((b0 - c0)*invs), 0);
+    real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, gridj));
+    real distanceInCells    = (b0 - gridj.c0[dim])*gridj.invCellSize[dim];
+    *cf                     = std::max(static_cast<int>(distanceInCells), 0);
 
-    while (*cf > 0 && d2 + gmx::square((b0 - c0) - (*cf-1+1)*s) < r2)
+    while (*cf > 0 &&
+           d2 + gmx::square((b0 - gridj.c0[dim]) - (*cf - 1 + 1)*gridj.cellSize[dim]) < listRangeBBToCell2)
     {
         (*cf)--;
     }
 
-    *cl = std::min(static_cast<int>((b1 - c0)*invs), nc-1);
-    while (*cl < nc-1 && d2 + gmx::square((*cl+1)*s - (b1 - c0)) < r2)
+    *cl = std::min(static_cast<int>((b1 - gridj.c0[dim])*gridj.invCellSize[dim]), gridj.numCells[dim] - 1);
+    while (*cl < gridj.numCells[dim] - 1 &&
+           d2 + gmx::square((*cl + 1)*gridj.cellSize[dim] - (b1 - gridj.c0[dim])) < listRangeBBToCell2)
     {
         (*cl)++;
     }
@@ -402,17 +442,16 @@ static void get_cell_range(real b0, real b1,
  */
 
 /* Plain C code calculating the distance^2 between two bounding boxes */
-static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
-                           int csj, const nbnxn_bb_t *bb_j_all)
+static float subc_bb_dist2(int                              si,
+                           const nbnxn_bb_t                *bb_i_ci,
+                           int                              csj,
+                           gmx::ArrayRef<const nbnxn_bb_t>  bb_j_all)
 {
-    const nbnxn_bb_t *bb_i, *bb_j;
-    float             d2;
-    float             dl, dh, dm, dm0;
+    const nbnxn_bb_t *bb_i = bb_i_ci         +  si;
+    const nbnxn_bb_t *bb_j = bb_j_all.data() + csj;
 
-    bb_i = bb_i_ci  +  si;
-    bb_j = bb_j_all + csj;
-
-    d2 = 0;
+    float             d2 = 0;
+    float             dl, dh, dm, dm0;
 
     dl  = bb_i->lower[BB_X] - bb_j->upper[BB_X];
     dh  = bb_j->lower[BB_X] - bb_i->upper[BB_X];
@@ -438,8 +477,10 @@ static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
 #if NBNXN_SEARCH_BB_SIMD4
 
 /* 4-wide SIMD code for bb distance for bb format xyz0 */
-static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
-                                 int csj, const nbnxn_bb_t *bb_j_all)
+static float subc_bb_dist2_simd4(int                              si,
+                                 const nbnxn_bb_t                *bb_i_ci,
+                                 int                              csj,
+                                 gmx::ArrayRef<const nbnxn_bb_t>  bb_j_all)
 {
     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
     using namespace gmx;
@@ -479,14 +520,14 @@ static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
         Simd4Float        d2x, d2y, d2z;                       \
         Simd4Float        d2s, d2t;                            \
                                                  \
-        shi = si*NNBSBB_D*DIM;                       \
+        shi = (si)*NNBSBB_D*DIM;                       \
                                                  \
-        xi_l = load4(bb_i+shi+0*STRIDE_PBB);   \
-        yi_l = load4(bb_i+shi+1*STRIDE_PBB);   \
-        zi_l = load4(bb_i+shi+2*STRIDE_PBB);   \
-        xi_h = load4(bb_i+shi+3*STRIDE_PBB);   \
-        yi_h = load4(bb_i+shi+4*STRIDE_PBB);   \
-        zi_h = load4(bb_i+shi+5*STRIDE_PBB);   \
+        xi_l = load4((bb_i)+shi+0*STRIDE_PBB);   \
+        yi_l = load4((bb_i)+shi+1*STRIDE_PBB);   \
+        zi_l = load4((bb_i)+shi+2*STRIDE_PBB);   \
+        xi_h = load4((bb_i)+shi+3*STRIDE_PBB);   \
+        yi_h = load4((bb_i)+shi+4*STRIDE_PBB);   \
+        zi_h = load4((bb_i)+shi+5*STRIDE_PBB);   \
                                                  \
         dx_0 = xi_l - xj_h;                 \
         dy_0 = yi_l - yj_h;                 \
@@ -511,7 +552,7 @@ static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
         d2s  = d2x + d2y;                   \
         d2t  = d2s + d2z;                   \
                                                  \
-        store4(d2+si, d2t);                      \
+        store4((d2)+(si), d2t);                      \
     }
 
 /* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
@@ -552,7 +593,7 @@ static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
 
 
 /* Returns if any atom pair from two clusters is within distance sqrt(rlist2) */
-static gmx_inline gmx_bool
+static inline gmx_bool
 clusterpair_in_range(const nbnxn_list_work_t *work,
                      int si,
                      int csj, int stride, const real *x_j,
@@ -587,11 +628,11 @@ clusterpair_in_range(const nbnxn_list_work_t *work,
 #else /* !GMX_SIMD4_HAVE_REAL */
 
     /* 4-wide SIMD version.
-     * A cluster is hard-coded to 8 atoms.
      * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
      * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
      */
-    assert(c_nbnxnGpuClusterSize == 8);
+    static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
+                  "A cluster is hard-coded to 4/8 atoms.");
 
     Simd4Real   rc2_S      = Simd4Real(rlist2);
 
@@ -601,10 +642,14 @@ clusterpair_in_range(const nbnxn_list_work_t *work,
     Simd4Real   ix_S0      = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
     Simd4Real   iy_S0      = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
     Simd4Real   iz_S0      = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);
-    Simd4Real   ix_S1      = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
-    Simd4Real   iy_S1      = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
-    Simd4Real   iz_S1      = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
 
+    Simd4Real   ix_S1, iy_S1, iz_S1;
+    if (c_nbnxnGpuClusterSize == 8)
+    {
+        ix_S1      = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
+        iy_S1      = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
+        iz_S1      = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
+    }
     /* We loop from the outer to the inner particles to maximize
      * the chance that we find a pair in range quickly and return.
      */
@@ -643,30 +688,45 @@ clusterpair_in_range(const nbnxn_list_work_t *work,
         dx_S0            = ix_S0 - jx0_S;
         dy_S0            = iy_S0 - jy0_S;
         dz_S0            = iz_S0 - jz0_S;
-        dx_S1            = ix_S1 - jx0_S;
-        dy_S1            = iy_S1 - jy0_S;
-        dz_S1            = iz_S1 - jz0_S;
         dx_S2            = ix_S0 - jx1_S;
         dy_S2            = iy_S0 - jy1_S;
         dz_S2            = iz_S0 - jz1_S;
-        dx_S3            = ix_S1 - jx1_S;
-        dy_S3            = iy_S1 - jy1_S;
-        dz_S3            = iz_S1 - jz1_S;
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            dx_S1            = ix_S1 - jx0_S;
+            dy_S1            = iy_S1 - jy0_S;
+            dz_S1            = iz_S1 - jz0_S;
+            dx_S3            = ix_S1 - jx1_S;
+            dy_S3            = iy_S1 - jy1_S;
+            dz_S3            = iz_S1 - jz1_S;
+        }
 
         /* rsq = dx*dx+dy*dy+dz*dz */
         rsq_S0           = norm2(dx_S0, dy_S0, dz_S0);
-        rsq_S1           = norm2(dx_S1, dy_S1, dz_S1);
         rsq_S2           = norm2(dx_S2, dy_S2, dz_S2);
-        rsq_S3           = norm2(dx_S3, dy_S3, dz_S3);
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            rsq_S1           = norm2(dx_S1, dy_S1, dz_S1);
+            rsq_S3           = norm2(dx_S3, dy_S3, dz_S3);
+        }
 
         wco_S0           = (rsq_S0 < rc2_S);
-        wco_S1           = (rsq_S1 < rc2_S);
         wco_S2           = (rsq_S2 < rc2_S);
-        wco_S3           = (rsq_S3 < rc2_S);
-
-        wco_any_S01      = wco_S0 || wco_S1;
-        wco_any_S23      = wco_S2 || wco_S3;
-        wco_any_S        = wco_any_S01 || wco_any_S23;
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            wco_S1           = (rsq_S1 < rc2_S);
+            wco_S3           = (rsq_S3 < rc2_S);
+        }
+        if (c_nbnxnGpuClusterSize == 8)
+        {
+            wco_any_S01      = wco_S0 || wco_S1;
+            wco_any_S23      = wco_S2 || wco_S3;
+            wco_any_S        = wco_any_S01 || wco_any_S23;
+        }
+        else
+        {
+            wco_any_S = wco_S0 || wco_S2;
+        }
 
         if (anyTrue(wco_any_S))
         {
@@ -706,7 +766,7 @@ static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
     if (nbl->nexcl+extra > nbl->excl_nalloc)
     {
         nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
-        nbnxn_realloc_void((void **)&nbl->excl,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->excl),
                            nbl->nexcl*sizeof(*nbl->excl),
                            nbl->excl_nalloc*sizeof(*nbl->excl),
                            nbl->alloc, nbl->free);
@@ -724,12 +784,12 @@ static void check_cell_list_space_simple(nbnxn_pairlist_t *nbl,
     if (cj_max > nbl->cj_nalloc)
     {
         nbl->cj_nalloc = over_alloc_small(cj_max);
-        nbnxn_realloc_void((void **)&nbl->cj,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->cj),
                            nbl->ncj*sizeof(*nbl->cj),
                            nbl->cj_nalloc*sizeof(*nbl->cj),
                            nbl->alloc, nbl->free);
 
-        nbnxn_realloc_void((void **)&nbl->cjOuter,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->cjOuter),
                            nbl->ncj*sizeof(*nbl->cjOuter),
                            nbl->cj_nalloc*sizeof(*nbl->cjOuter),
                            nbl->alloc, nbl->free);
@@ -751,7 +811,7 @@ static void check_cell_list_space_supersub(nbnxn_pairlist_t *nbl,
     if (ncj4_max > nbl->cj4_nalloc)
     {
         nbl->cj4_nalloc = over_alloc_small(ncj4_max);
-        nbnxn_realloc_void((void **)&nbl->cj4,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->cj4),
                            nbl->work->cj4_init*sizeof(*nbl->cj4),
                            nbl->cj4_nalloc*sizeof(*nbl->cj4),
                            nbl->alloc, nbl->free);
@@ -928,7 +988,7 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 
 /* Print statistics of a pair list, used for debug output */
 static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
-                                           const nbnxn_search_t nbs, real rl)
+                                           const nbnxn_search *nbs, real rl)
 {
     const nbnxn_grid_t *grid;
     int                 cs[SHIFTS];
@@ -939,12 +999,12 @@ static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl
     fprintf(fp, "nbl nci %d ncj %d\n",
             nbl->nci, nbl->ncjInUse);
     fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_sc, rl, nbl->ncjInUse, nbl->ncjInUse/(double)grid->nc,
-            nbl->ncjInUse/(double)grid->nc*grid->na_sc,
-            nbl->ncjInUse/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));
+            nbl->na_sc, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid->nc),
+            nbl->ncjInUse/static_cast<double>(grid->nc)*grid->na_sc,
+            nbl->ncjInUse/static_cast<double>(grid->nc)*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));
 
     fprintf(fp, "nbl average j cell list length %.1f\n",
-            0.25*nbl->ncjInUse/(double)std::max(nbl->nci, 1));
+            0.25*nbl->ncjInUse/static_cast<double>(std::max(nbl->nci, 1)));
 
     for (int s = 0; s < SHIFTS; s++)
     {
@@ -965,7 +1025,7 @@ static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl
         }
     }
     fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
-            nbl->ncj, npexcl, 100*npexcl/(double)std::max(nbl->ncj, 1));
+            nbl->ncj, npexcl, 100*npexcl/static_cast<double>(std::max(nbl->ncj, 1)));
     for (int s = 0; s < SHIFTS; s++)
     {
         if (cs[s] > 0)
@@ -977,7 +1037,7 @@ static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl
 
 /* Print statistics of a pair lists, used for debug output */
 static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
-                                             const nbnxn_search_t nbs, real rl)
+                                             const nbnxn_search *nbs, real rl)
 {
     const nbnxn_grid_t *grid;
     int                 b;
@@ -991,9 +1051,9 @@ static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *n
     fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
             nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
     fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
-            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
-            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));
+            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid->nsubc_tot),
+            nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c,
+            nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/(grid->size[XX]*grid->size[YY]*grid->size[ZZ])));
 
     sum_nsp  = 0;
     sum_nsp2 = 0;
@@ -1041,7 +1101,7 @@ static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *n
         {
             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
                     b, c[b],
-                    100.0*c[b]/(double)(nbl->ncj4*c_nbnxnGpuJgroupSize));
+                    100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize});
         }
     }
 }
@@ -1102,7 +1162,7 @@ static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 
     /* Here we only set the set self and double pair exclusions */
 
-    assert(c_nbnxnGpuClusterpairSplit == 2);
+    static_assert(c_nbnxnGpuClusterpairSplit == 2, "");
 
     get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
 
@@ -1351,7 +1411,7 @@ static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
 
 #if NBNXN_BBXXXX
         /* Determine all ci1 bb distances in one call with SIMD4 */
-        subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
+        subc_bb_dist2_simd4_xxxx(gridj->pbb.data() + (cj >> STRIDE_PBB_2LOG)*NNBSBB_XXXX + (cj & (STRIDE_PBB-1)),
                                  ci1, pbb_ci, d2l);
         *numDistanceChecks += c_nbnxnGpuClusterSize*2;
 #endif
@@ -1552,7 +1612,7 @@ static inline int findJClusterInJList(int                jCluster,
  * as masks in the pair-list for simple list entry iEntry.
  */
 static void
-setExclusionsForSimpleIentry(const nbnxn_search_t  nbs,
+setExclusionsForSimpleIentry(const nbnxn_search   *nbs,
                              nbnxn_pairlist_t     *nbl,
                              gmx_bool              diagRemoved,
                              int                   na_cj_2log,
@@ -1565,11 +1625,11 @@ setExclusionsForSimpleIentry(const nbnxn_search_t  nbs,
         return;
     }
 
-    const JListRanges  ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, nbl->cj);
+    const JListRanges        ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, nbl->cj);
 
-    const int          iCluster = iEntry.ci;
+    const int                iCluster = iEntry.ci;
 
-    const int         *cell = nbs->cell;
+    gmx::ArrayRef<const int> cell = nbs->cell;
 
     /* Loop over the atoms in the i-cluster */
     for (int i = 0; i < nbl->na_sc; i++)
@@ -1624,7 +1684,7 @@ setExclusionsForSimpleIentry(const nbnxn_search_t  nbs,
 }
 
 /* Add a new i-entry to the FEP list and copy the i-properties */
-static gmx_inline void fep_list_new_nri_copy(t_nblist *nlist)
+static inline void fep_list_new_nri_copy(t_nblist *nlist)
 {
     /* Add a new i-entry */
     nlist->nri++;
@@ -1652,7 +1712,7 @@ const int max_nrj_fep = 40;
  * LJ parameters have been zeroed in the nbnxn data structure.
  * Simultaneously make a group pair list for the perturbed pairs.
  */
-static void make_fep_list(const nbnxn_search_t    nbs,
+static void make_fep_list(const nbnxn_search     *nbs,
                           const nbnxn_atomdata_t *nbat,
                           nbnxn_pairlist_t       *nbl,
                           gmx_bool                bDiagRemoved,
@@ -1694,9 +1754,9 @@ static void make_fep_list(const nbnxn_search_t    nbs,
 
     ngid = nbat->nenergrp;
 
-    if (static_cast<std::size_t>(ngid*gridj->na_cj) > sizeof(gid_cj)*8)
+    if (ngid*gridj->na_cj > gmx::index(sizeof(gid_cj)*8))
     {
-        gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %d energy groups",
+        gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
                   gridi->na_c, gridj->na_cj, (sizeof(gid_cj)*8)/gridj->na_cj);
     }
 
@@ -1718,7 +1778,7 @@ static void make_fep_list(const nbnxn_search_t    nbs,
             nlist->gid[nri]      = 0;
             nlist->shift[nri]    = nbl_ci->shift & NBNXN_CI_SHIFT;
 
-            bFEP_i = gridi->fep[ci - gridi->cell0] & (1 << i);
+            bFEP_i = ((gridi->fep[ci - gridi->cell0] & (1 << i)) != 0u);
 
             bFEP_i_all = bFEP_i_all && bFEP_i;
 
@@ -1836,25 +1896,25 @@ static void make_fep_list(const nbnxn_search_t    nbs,
 }
 
 /* Return the index of atom a within a cluster */
-static gmx_inline int cj_mod_cj4(int cj)
+static inline int cj_mod_cj4(int cj)
 {
     return cj & (c_nbnxnGpuJgroupSize - 1);
 }
 
 /* Convert a j-cluster to a cj4 group */
-static gmx_inline int cj_to_cj4(int cj)
+static inline int cj_to_cj4(int cj)
 {
     return cj/c_nbnxnGpuJgroupSize;
 }
 
 /* Return the index of an j-atom within a warp */
-static gmx_inline int a_mod_wj(int a)
+static inline int a_mod_wj(int a)
 {
     return a & (c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit - 1);
 }
 
 /* As make_fep_list above, but for super/sub lists. */
-static void make_fep_list_supersub(const nbnxn_search_t    nbs,
+static void make_fep_list_supersub(const nbnxn_search     *nbs,
                                    const nbnxn_atomdata_t *nbat,
                                    nbnxn_pairlist_t       *nbl,
                                    gmx_bool                bDiagRemoved,
@@ -1919,7 +1979,7 @@ static void make_fep_list_supersub(const nbnxn_search_t    nbs,
                 nlist->gid[nri]      = 0;
                 nlist->shift[nri]    = nbl_sci->shift & NBNXN_CI_SHIFT;
 
-                bFEP_i = (gridi->fep[c_abs - gridi->cell0*c_gpuNumClusterPerCell] & (1 << i));
+                bFEP_i = ((gridi->fep[c_abs - gridi->cell0*c_gpuNumClusterPerCell] & (1 << i)) != 0u);
 
                 xi = nbat->x[ind_i*nbat->xstride+XX] + shx;
                 yi = nbat->x[ind_i*nbat->xstride+YY] + shy;
@@ -1966,7 +2026,8 @@ static void make_fep_list_supersub(const nbnxn_search_t    nbs,
                                     unsigned int  excl_bit;
                                     real          dx, dy, dz;
 
-                                    get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);
+                                    const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
+                                    get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl);
 
                                     excl_pair = a_mod_wj(j)*nbl->na_ci + i;
                                     excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));
@@ -2032,7 +2093,7 @@ static void make_fep_list_supersub(const nbnxn_search_t    nbs,
  * as masks in the pair-list for i-super-cluster list entry iEntry.
  */
 static void
-setExclusionsForGpuIentry(const nbnxn_search_t  nbs,
+setExclusionsForGpuIentry(const nbnxn_search   *nbs,
                           nbnxn_pairlist_t     *nbl,
                           gmx_bool              diagRemoved,
                           const nbnxn_sci_t    &iEntry,
@@ -2053,12 +2114,12 @@ setExclusionsForGpuIentry(const nbnxn_search_t  nbs,
                              nbl->cj4);
 
     GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
-    constexpr int  c_clusterSize      = c_nbnxnGpuClusterSize;
-    constexpr int  c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;
+    constexpr int            c_clusterSize      = c_nbnxnGpuClusterSize;
+    constexpr int            c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;
 
-    const int      iSuperCluster = iEntry.sci;
+    const int                iSuperCluster = iEntry.sci;
 
-    const int     *cell = nbs->cell;
+    gmx::ArrayRef<const int> cell = nbs->cell;
 
     /* Loop over the atoms in the i super-cluster */
     for (int i = 0; i < c_superClusterSize; i++)
@@ -2136,12 +2197,12 @@ setExclusionsForGpuIentry(const nbnxn_search_t  nbs,
 static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
 {
     nbl->ci_nalloc = over_alloc_small(n);
-    nbnxn_realloc_void((void **)&nbl->ci,
+    nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->ci),
                        nbl->nci*sizeof(*nbl->ci),
                        nbl->ci_nalloc*sizeof(*nbl->ci),
                        nbl->alloc, nbl->free);
 
-    nbnxn_realloc_void((void **)&nbl->ciOuter,
+    nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->ciOuter),
                        nbl->nci*sizeof(*nbl->ciOuter),
                        nbl->ci_nalloc*sizeof(*nbl->ciOuter),
                        nbl->alloc, nbl->free);
@@ -2151,7 +2212,7 @@ static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
 static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
 {
     nbl->sci_nalloc = over_alloc_small(n);
-    nbnxn_realloc_void((void **)&nbl->sci,
+    nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->sci),
                        nbl->nsci*sizeof(*nbl->sci),
                        nbl->sci_nalloc*sizeof(*nbl->sci),
                        nbl->alloc, nbl->free);
@@ -2428,9 +2489,10 @@ static void clear_pairlist_fep(t_nblist *nl)
 }
 
 /* Sets a simple list i-cell bounding box, including PBC shift */
-static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
-                                           real shx, real shy, real shz,
-                                           nbnxn_bb_t *bb_ci)
+static inline void set_icell_bb_simple(gmx::ArrayRef<const nbnxn_bb_t> bb,
+                                       int ci,
+                                       real shx, real shy, real shz,
+                                       nbnxn_bb_t *bb_ci)
 {
     bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
     bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
@@ -2442,7 +2504,8 @@ static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
 
 #if NBNXN_BBXXXX
 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
-static void set_icell_bbxxxx_supersub(const float *bb, int ci,
+static void set_icell_bbxxxx_supersub(gmx::ArrayRef<const float> bb,
+                                      int ci,
                                       real shx, real shy, real shz,
                                       float *bb_ci)
 {
@@ -2463,7 +2526,8 @@ static void set_icell_bbxxxx_supersub(const float *bb, int ci,
 #endif
 
 /* Sets a super-cell and sub cell bounding boxes, including PBC shift */
-gmx_unused static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
+gmx_unused static void set_icell_bb_supersub(gmx::ArrayRef<const nbnxn_bb_t> bb,
+                                             int ci,
                                              real shx, real shy, real shz,
                                              nbnxn_bb_t *bb_ci)
 {
@@ -2535,12 +2599,12 @@ static real minimum_subgrid_size_xy(const nbnxn_grid_t *grid)
 {
     if (grid->bSimple)
     {
-        return std::min(grid->sx, grid->sy);
+        return std::min(grid->cellSize[XX], grid->cellSize[YY]);
     }
     else
     {
-        return std::min(grid->sx/c_gpuNumClusterPerCellX,
-                        grid->sy/c_gpuNumClusterPerCellY);
+        return std::min(grid->cellSize[XX]/c_gpuNumClusterPerCellX,
+                        grid->cellSize[YY]/c_gpuNumClusterPerCellY);
     }
 }
 
@@ -2593,7 +2657,7 @@ real nbnxn_get_rlist_effective_inc(int cluster_size_j, real atom_density)
 }
 
 /* Estimates the interaction volume^2 for non-local interactions */
-static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, rvec ls, real r)
+static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, real r)
 {
     real cl, ca, za;
     real vold_est;
@@ -2640,7 +2704,7 @@ static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, rvec ls, real
 }
 
 /* Estimates the average size of a full j-list for super/sub setup */
-static void get_nsubpair_target(const nbnxn_search_t  nbs,
+static void get_nsubpair_target(const nbnxn_search   *nbs,
                                 int                   iloc,
                                 real                  rlist,
                                 int                   min_ci_balanced,
@@ -2672,8 +2736,8 @@ static void get_nsubpair_target(const nbnxn_search_t  nbs,
         return;
     }
 
-    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*c_gpuNumClusterPerCellX);
-    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*c_gpuNumClusterPerCellY);
+    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->numCells[XX]*c_gpuNumClusterPerCellX);
+    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->numCells[YY]*c_gpuNumClusterPerCellY);
     ls[ZZ] = grid->na_c/(grid->atom_density*ls[XX]*ls[YY]);
 
     /* The average length of the diagonal of a sub cell */
@@ -2721,7 +2785,7 @@ static void get_nsubpair_target(const nbnxn_search_t  nbs,
          * groups of atoms we'll anyhow be limited by nsubpair_target_min,
          * so this overestimation will not matter.
          */
-        nsp_est = std::max(nsp_est, grid->nsubc_tot*static_cast<real>(14));
+        nsp_est = std::max(nsp_est, grid->nsubc_tot*14._real);
 
         if (debug)
         {
@@ -2739,7 +2803,7 @@ static void get_nsubpair_target(const nbnxn_search_t  nbs,
      * (and we can't chop up j-groups) so we use a minimum target size of 36.
      */
     *nsubpair_target  = std::max(nsubpair_target_min,
-                                 static_cast<int>(nsp_est/min_ci_balanced + 0.5));
+                                 roundToInt(nsp_est/min_ci_balanced));
     *nsubpair_tot_est = static_cast<int>(nsp_est);
 
     if (debug)
@@ -2828,7 +2892,7 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
     if (ncj4 > nblc->cj4_nalloc)
     {
         nblc->cj4_nalloc = over_alloc_small(ncj4);
-        nbnxn_realloc_void((void **)&nblc->cj4,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&nblc->cj4),
                            nblc->ncj4*sizeof(*nblc->cj4),
                            nblc->cj4_nalloc*sizeof(*nblc->cj4),
                            nblc->alloc, nblc->free);
@@ -2836,7 +2900,7 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
     if (nexcl > nblc->excl_nalloc)
     {
         nblc->excl_nalloc = over_alloc_small(nexcl);
-        nbnxn_realloc_void((void **)&nblc->excl,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&nblc->excl),
                            nblc->nexcl*sizeof(*nblc->excl),
                            nblc->excl_nalloc*sizeof(*nblc->excl),
                            nblc->alloc, nblc->free);
@@ -2846,7 +2910,6 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
      * as otherwise data will go back and forth between different caches.
      */
 #if GMX_OPENMP && !(defined __clang_analyzer__)
-    // cppcheck-suppress unreadVariable
     int nthreads = gmx_omp_nthreads_get(emntPairsearch);
 #endif
 
@@ -2905,7 +2968,7 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
     }
 }
 
-static void balance_fep_lists(const nbnxn_search_t  nbs,
+static void balance_fep_lists(const nbnxn_search   *nbs,
                               nbnxn_pairlist_set_t *nbl_lists)
 {
     int       nnbl;
@@ -2939,9 +3002,7 @@ static void balance_fep_lists(const nbnxn_search_t  nbs,
     {
         try
         {
-            t_nblist *nbl;
-
-            nbl = nbs->work[th].nbl_fep;
+            t_nblist *nbl = nbs->work[th].nbl_fep.get();
 
             /* Note that here we allocate for the total size, instead of
              * a per-thread esimate (which is hard to obtain).
@@ -2965,7 +3026,7 @@ static void balance_fep_lists(const nbnxn_search_t  nbs,
 
     /* Loop over the source lists and assign and copy i-entries */
     th_dest = 0;
-    nbld    = nbs->work[th_dest].nbl_fep;
+    nbld    = nbs->work[th_dest].nbl_fep.get();
     for (int th = 0; th < nnbl; th++)
     {
         t_nblist *nbls;
@@ -2986,7 +3047,7 @@ static void balance_fep_lists(const nbnxn_search_t  nbs,
                 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
             {
                 th_dest++;
-                nbld = nbs->work[th_dest].nbl_fep;
+                nbld = nbs->work[th_dest].nbl_fep.get();
             }
 
             nbld->iinr[nbld->nri]  = nbls->iinr[i];
@@ -3007,11 +3068,9 @@ static void balance_fep_lists(const nbnxn_search_t  nbs,
     /* Swap the list pointers */
     for (int th = 0; th < nnbl; th++)
     {
-        t_nblist *nbl_tmp;
-
-        nbl_tmp                = nbl_lists->nbl_fep[th];
-        nbl_lists->nbl_fep[th] = nbs->work[th].nbl_fep;
-        nbs->work[th].nbl_fep  = nbl_tmp;
+        t_nblist *nbl_tmp      = nbs->work[th].nbl_fep.release();
+        nbs->work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
+        nbl_lists->nbl_fep[th] = nbl_tmp;
 
         if (debug)
         {
@@ -3044,10 +3103,10 @@ static gmx_bool next_ci(const nbnxn_grid_t *grid,
         return FALSE;
     }
 
-    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1])
+    while (*ci >= grid->cxy_ind[*ci_x*grid->numCells[YY] + *ci_y + 1])
     {
         *ci_y += 1;
-        if (*ci_y == grid->ncy)
+        if (*ci_y == grid->numCells[YY])
         {
             *ci_x += 1;
             *ci_y  = 0;
@@ -3081,8 +3140,8 @@ static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
     real bbx, bby;
     real rbb2;
 
-    bbx = 0.5*(gridi->sx + gridj->sx);
-    bby = 0.5*(gridi->sy + gridj->sy);
+    bbx = 0.5*(gridi->cellSize[XX] + gridj->cellSize[XX]);
+    bby = 0.5*(gridi->cellSize[YY] + gridj->cellSize[YY]);
     if (!simple)
     {
         bbx /= c_gpuNumClusterPerCellX;
@@ -3118,7 +3177,7 @@ static int get_ci_block_size(const nbnxn_grid_t *gridi,
      * zone boundaries with 3D domain decomposition. At the same time
      * the blocks will not become too small.
      */
-    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
+    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->numCells[XX]*nth);
 
     /* Ensure the blocks are not too small: avoids cache invalidation */
     if (ci_block*gridi->na_sc < ci_block_min_atoms)
@@ -3165,7 +3224,7 @@ static int getBufferFlagShift(int numAtomsPerCluster)
 }
 
 /* Generates the part of pair-list nbl assigned to our thread */
-static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
+static void nbnxn_make_pairlist_part(const nbnxn_search *nbs,
                                      const nbnxn_grid_t *gridi,
                                      const nbnxn_grid_t *gridj,
                                      nbnxn_search_work_t *work,
@@ -3190,13 +3249,6 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
     ivec              shp;
     int               shift;
     real              shx, shy, shz;
-    int               cell0_i;
-    const nbnxn_bb_t *bb_i = nullptr;
-#if NBNXN_BBXXXX
-    const float      *pbb_i = nullptr;
-#endif
-    const float      *bbcz_i, *bbcz_j;
-    const int        *flags_i;
     real              bx0, bx1, by0, by1, bz0, bz1;
     real              bz1_frac;
     real              d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
@@ -3208,7 +3260,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 
     nbs_cycle_start(&work->cc[enbsCCsearch]);
 
-    if (gridj->bSimple != nbl->bSimple)
+    if (gridj->bSimple != nbl->bSimple || gridi->bSimple != nbl->bSimple)
     {
         gmx_incons("Grid incompatible with pair-list");
     }
@@ -3270,8 +3322,9 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
         }
         else
         {
+            const real listRangeCellToCell = listRangeForGridCellToGridCell(rlist, *gridi, *gridj);
             if (d == XX &&
-                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < std::sqrt(rlist2))
+                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < listRangeCellToCell)
             {
                 shp[d] = 2;
             }
@@ -3281,9 +3334,11 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
             }
         }
     }
-
+    const bool bSimple = nbl->bSimple;
+    gmx::ArrayRef<const nbnxn_bb_t> bb_i;
 #if NBNXN_BBXXXX
-    if (gridi->bSimple)
+    gmx::ArrayRef<const float>      pbb_i;
+    if (bSimple)
     {
         bb_i  = gridi->bb;
     }
@@ -3295,20 +3350,21 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
     /* We use the normal bounding box format for both grid types */
     bb_i  = gridi->bb;
 #endif
-    bbcz_i  = gridi->bbcz;
-    flags_i = gridi->flags;
-    cell0_i = gridi->cell0;
-
-    bbcz_j = gridj->bbcz;
+    gmx::ArrayRef<const float> bbcz_i  = gridi->bbcz;
+    gmx::ArrayRef<const int>   flags_i = gridi->flags;
+    gmx::ArrayRef<const float> bbcz_j  = gridj->bbcz;
+    int                        cell0_i = gridi->cell0;
 
     if (debug)
     {
         fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
-                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
+                gridi->nc, gridi->nc/static_cast<double>(gridi->numCells[XX]*gridi->numCells[YY]), ci_block);
     }
 
     numDistanceChecks = 0;
 
+    const real listRangeBBToJCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, *gridj));
+
     /* Initially ci_b and ci to 1 before where we want them to start,
      * as they will both be incremented in next_ci.
      */
@@ -3318,7 +3374,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
     ci_y = 0;
     while (next_ci(gridi, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
     {
-        if (nbl->bSimple && flags_i[ci] == 0)
+        if (bSimple && flags_i[ci] == 0)
         {
             continue;
         }
@@ -3328,26 +3384,26 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
         d2cx = 0;
         if (gridj != gridi && shp[XX] == 0)
         {
-            if (nbl->bSimple)
+            if (bSimple)
             {
                 bx1 = bb_i[ci].upper[BB_X];
             }
             else
             {
-                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
+                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->cellSize[XX];
             }
             if (bx1 < gridj->c0[XX])
             {
                 d2cx = gmx::square(gridj->c0[XX] - bx1);
 
-                if (d2cx >= rlist2)
+                if (d2cx >= listRangeBBToJCell2)
                 {
                     continue;
                 }
             }
         }
 
-        ci_xy = ci_x*gridi->ncy + ci_y;
+        ci_xy = ci_x*gridi->numCells[YY] + ci_y;
 
         /* Loop over shift vectors in three dimensions */
         for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
@@ -3388,21 +3444,21 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
             {
                 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 
-                if (nbl->bSimple)
+                if (bSimple)
                 {
                     by0 = bb_i[ci].lower[BB_Y] + shy;
                     by1 = bb_i[ci].upper[BB_Y] + shy;
                 }
                 else
                 {
-                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
-                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
+                    by0 = gridi->c0[YY] + (ci_y  )*gridi->cellSize[YY] + shy;
+                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->cellSize[YY] + shy;
                 }
 
-                get_cell_range(by0, by1,
-                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
-                               d2z_cx, rlist2,
-                               &cyf, &cyl);
+                get_cell_range<YY>(by0, by1,
+                                   *gridj,
+                                   d2z_cx, rlist,
+                                   &cyf, &cyl);
 
                 if (cyf > cyl)
                 {
@@ -3430,28 +3486,28 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 
                     shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 
-                    if (nbl->bSimple)
+                    if (bSimple)
                     {
                         bx0 = bb_i[ci].lower[BB_X] + shx;
                         bx1 = bb_i[ci].upper[BB_X] + shx;
                     }
                     else
                     {
-                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
-                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
+                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->cellSize[XX] + shx;
+                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->cellSize[XX] + shx;
                     }
 
-                    get_cell_range(bx0, bx1,
-                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
-                                   d2z_cy, rlist2,
-                                   &cxf, &cxl);
+                    get_cell_range<XX>(bx0, bx1,
+                                       *gridj,
+                                       d2z_cy, rlist,
+                                       &cxf, &cxl);
 
                     if (cxf > cxl)
                     {
                         continue;
                     }
 
-                    if (nbl->bSimple)
+                    if (bSimple)
                     {
                         new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
                     }
@@ -3470,7 +3526,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                         cxf = ci_x;
                     }
 
-                    if (nbl->bSimple)
+                    if (bSimple)
                     {
                         set_icell_bb_simple(bb_i, ci, shx, shy, shz,
                                             nbl->work->bb_ci);
@@ -3493,13 +3549,13 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                     for (int cx = cxf; cx <= cxl; cx++)
                     {
                         d2zx = d2z;
-                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
+                        if (gridj->c0[XX] + cx*gridj->cellSize[XX] > bx1)
                         {
-                            d2zx += gmx::square(gridj->c0[XX] + cx*gridj->sx - bx1);
+                            d2zx += gmx::square(gridj->c0[XX] + cx*gridj->cellSize[XX] - bx1);
                         }
-                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
+                        else if (gridj->c0[XX] + (cx+1)*gridj->cellSize[XX] < bx0)
                         {
-                            d2zx += gmx::square(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
+                            d2zx += gmx::square(gridj->c0[XX] + (cx+1)*gridj->cellSize[XX] - bx0);
                         }
 
                         if (gridi == gridj &&
@@ -3519,19 +3575,19 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 
                         for (int cy = cyf_x; cy <= cyl; cy++)
                         {
-                            const int columnStart = gridj->cxy_ind[cx*gridj->ncy + cy];
-                            const int columnEnd   = gridj->cxy_ind[cx*gridj->ncy + cy + 1];
+                            const int columnStart = gridj->cxy_ind[cx*gridj->numCells[YY] + cy];
+                            const int columnEnd   = gridj->cxy_ind[cx*gridj->numCells[YY] + cy + 1];
 
                             d2zxy = d2zx;
-                            if (gridj->c0[YY] + cy*gridj->sy > by1)
+                            if (gridj->c0[YY] + cy*gridj->cellSize[YY] > by1)
                             {
-                                d2zxy += gmx::square(gridj->c0[YY] + cy*gridj->sy - by1);
+                                d2zxy += gmx::square(gridj->c0[YY] + cy*gridj->cellSize[YY] - by1);
                             }
-                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
+                            else if (gridj->c0[YY] + (cy+1)*gridj->cellSize[YY] < by0)
                             {
-                                d2zxy += gmx::square(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
+                                d2zxy += gmx::square(gridj->c0[YY] + (cy+1)*gridj->cellSize[YY] - by0);
                             }
-                            if (columnStart < columnEnd && d2zxy < rlist2)
+                            if (columnStart < columnEnd && d2zxy < listRangeBBToJCell2)
                             {
                                 /* To improve efficiency in the common case
                                  * of a homogeneous particle distribution,
@@ -3626,7 +3682,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                                     /* For f buffer flags with simple lists */
                                     ncj_old_j = nbl->ncj;
 
-                                    if (nbl->bSimple)
+                                    if (bSimple)
                                     {
                                         /* We have a maximum of 2 j-clusters
                                          * per i-cluster sized cell.
@@ -3699,7 +3755,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                     }
 
                     /* Set the exclusions for this ci list */
-                    if (nbl->bSimple)
+                    if (bSimple)
                     {
                         setExclusionsForSimpleIentry(nbs,
                                                      nbl,
@@ -3736,7 +3792,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                     }
 
                     /* Close this ci list */
-                    if (nbl->bSimple)
+                    if (bSimple)
                     {
                         close_ci_entry_simple(nbl);
                     }
@@ -3767,7 +3823,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
     {
         fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
 
-        if (nbl->bSimple)
+        if (bSimple)
         {
             print_nblist_statistics_simple(debug, nbl, nbs, rlist);
         }
@@ -3783,7 +3839,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
     }
 }
 
-static void reduce_buffer_flags(const nbnxn_search_t        nbs,
+static void reduce_buffer_flags(const nbnxn_search         *nbs,
                                 int                         nsrc,
                                 const nbnxn_buffer_flags_t *dest)
 {
@@ -3840,10 +3896,10 @@ static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
 
     fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
             flags->nflag, nout,
-            nelem/(double)(flags->nflag),
-            nkeep/(double)(flags->nflag),
-            ncopy/(double)(flags->nflag),
-            nred/(double)(flags->nflag));
+            nelem/static_cast<double>(flags->nflag),
+            nkeep/static_cast<double>(flags->nflag),
+            ncopy/static_cast<double>(flags->nflag),
+            nred/static_cast<double>(flags->nflag));
 }
 
 /* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
@@ -3926,10 +3982,10 @@ static int countClusterpairs(const int                        numLists,
  * to reduction of parts of the force buffer that could be avoided. But since
  * the original lists are quite balanced, this will only give minor overhead.
  */
-static void rebalanceSimpleLists(int                              numLists,
-                                 nbnxn_pairlist_t * const * const srcSet,
-                                 nbnxn_pairlist_t               **destSet,
-                                 nbnxn_search_work_t             *searchWork)
+static void rebalanceSimpleLists(int                                  numLists,
+                                 nbnxn_pairlist_t * const * const     srcSet,
+                                 nbnxn_pairlist_t                   **destSet,
+                                 gmx::ArrayRef<nbnxn_search_work_t>   searchWork)
 {
     int ncjTotal  = countClusterpairs(numLists, srcSet);
     int ncjTarget = (ncjTotal + numLists - 1)/numLists;
@@ -4067,7 +4123,7 @@ static void sort_sci(nbnxn_pairlist_t *nbl)
     if (work->sci_sort_nalloc != nbl->sci_nalloc)
     {
         work->sci_sort_nalloc = nbl->sci_nalloc;
-        nbnxn_realloc_void((void **)&work->sci_sort,
+        nbnxn_realloc_void(reinterpret_cast<void **>(&work->sci_sort),
                            0,
                            work->sci_sort_nalloc*sizeof(*work->sci_sort),
                            nbl->alloc, nbl->free);
@@ -4107,7 +4163,7 @@ static void sort_sci(nbnxn_pairlist_t *nbl)
 }
 
 /* Make a local or non-local pair-list, depending on iloc */
-void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
+void nbnxn_make_pairlist(nbnxn_search         *nbs,
                          nbnxn_atomdata_t     *nbat,
                          const t_blocka       *excl,
                          real                  rlist,