Convert nbnxn_pairlist_set_t to class PairlistSet
[alexxy/gromacs.git] / src / gromacs / nbnxm / pairlist.cpp
index 8469c409bad92a82f97dcee530f45bec10b19ca2..e1e165839c01fcf83504cdb9a7c8d7bd0429a29a 100644 (file)
@@ -648,19 +648,14 @@ static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
     return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
 }
 
-/* Initializes a single NbnxnPairlistCpu data structure */
-static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
+NbnxnPairlistCpu::NbnxnPairlistCpu() :
+    na_ci(c_nbnxnCpuIClusterSize),
+    na_cj(0),
+    rlist(0),
+    ncjInUse(0),
+    nci_tot(0),
+    work(std::make_unique<NbnxnPairlistCpuWork>())
 {
-    nbl->na_ci       = c_nbnxnCpuIClusterSize;
-    nbl->na_cj       = 0;
-    nbl->ci.clear();
-    nbl->ciOuter.clear();
-    nbl->ncjInUse    = 0;
-    nbl->cj.clear();
-    nbl->cjOuter.clear();
-    nbl->nci_tot     = 0;
-
-    nbl->work        = new NbnxnPairlistCpuWork();
 }
 
 NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
@@ -671,7 +666,8 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
     sci({}, {pinningPolicy}),
     cj4({}, {pinningPolicy}),
     excl({}, {pinningPolicy}),
-    nci_tot(0)
+    nci_tot(0),
+    work(std::make_unique<NbnxnPairlistGpuWork>())
 {
     static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
                   "The search code assumes that the a super-cluster matches a search grid cell");
@@ -683,79 +679,76 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
 
     // We always want a first entry without any exclusions
     excl.resize(1);
-
-    work = new NbnxnPairlistGpuWork();
 }
 
-void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list)
+// TODO: Move to pairlistset.cpp
+PairlistSet::PairlistSet(const Nbnxm::InteractionLocality  locality,
+                         const NbnxnListParameters        &listParams) :
+    locality_(locality),
+    params_(listParams)
 {
-    nbl_list->bSimple   =
-        (nbl_list->params.pairlistType == PairlistType::Simple4x2 ||
-         nbl_list->params.pairlistType == PairlistType::Simple4x4 ||
-         nbl_list->params.pairlistType == PairlistType::Simple4x8);
+    isCpuType_ =
+        (params_.pairlistType == PairlistType::Simple4x2 ||
+         params_.pairlistType == PairlistType::Simple4x4 ||
+         params_.pairlistType == PairlistType::Simple4x8);
     // Currently GPU lists are always combined
-    nbl_list->bCombined = !nbl_list->bSimple;
+    combineLists_ = !isCpuType_;
 
-    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
+    const int numLists = gmx_omp_nthreads_get(emntNonbonded);
 
-    if (!nbl_list->bCombined &&
-        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
+    if (!combineLists_ &&
+        numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
     {
         gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
-                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+                  numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
-        snew(nbl_list->nbl, nbl_list->nnbl);
-        if (nbl_list->nnbl > 1)
+        cpuLists_.resize(numLists);
+        if (numLists > 1)
         {
-            snew(nbl_list->nbl_work, nbl_list->nnbl);
+            cpuListsWork_.resize(numLists);
         }
     }
     else
     {
-        snew(nbl_list->nblGpu, nbl_list->nnbl);
+        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+        gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
+        /* Lists 0 to numLists-1 are used for constructing lists in parallel
+         * on the CPU using numLists threads (and then merged into list 0).
+         */
+        for (int i = 1; i < numLists; i++)
+        {
+            gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
+        }
     }
-    nbl_list->nbl_fep.resize(nbl_list->nnbl);
-    /* Execute in order to avoid memory interleaving between threads */
-#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
-    for (int i = 0; i < nbl_list->nnbl; i++)
+    if (params_.haveFep)
     {
-        try
-        {
-            /* Allocate the nblist data structure locally on each thread
-             * to optimize memory access for NUMA architectures.
-             */
-            if (nbl_list->bSimple)
-            {
-                nbl_list->nbl[i] = new NbnxnPairlistCpu();
+        fepLists_.resize(numLists);
 
-                nbnxn_init_pairlist(nbl_list->nbl[i]);
-                if (nbl_list->nnbl > 1)
-                {
-                    nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
-                    nbnxn_init_pairlist(nbl_list->nbl_work[i]);
-                }
-            }
-            else
+        /* Execute in order to avoid memory interleaving between threads */
+#pragma omp parallel for num_threads(numLists) schedule(static)
+        for (int i = 0; i < numLists; i++)
+        {
+            try
             {
-                /* Only list 0 is used on the GPU, use normal allocation for i>0 */
-                auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
-
-                nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
+                /* We used to allocate all normal lists locally on each thread
+                 * as well. The question is whether allocating the object on the
+                 * master thread (but all contained list memory thread local)
+                 * impacts performance.
+                 */
+                snew(fepLists_[i], 1);
+                nbnxn_init_pairlist_fep(fepLists_[i]);
             }
-
-            snew(nbl_list->nbl_fep[i], 1);
-            nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }
 
 /* Print statistics of a pair list, used for debug output */
 static void print_nblist_statistics(FILE                   *fp,
-                                    const NbnxnPairlistCpu *nbl,
+                                    const NbnxnPairlistCpu &nbl,
                                     const PairSearch       &pairSearch,
                                     const real              rl)
 {
@@ -763,34 +756,34 @@ static void print_nblist_statistics(FILE                   *fp,
     const Grid::Dimensions &dims = grid.dimensions();
 
     fprintf(fp, "nbl nci %zu ncj %d\n",
-            nbl->ci.size(), nbl->ncjInUse);
+            nbl.ci.size(), nbl.ncjInUse);
     const int    numAtomsJCluster = grid.geometry().numAtomsJCluster;
-    const double numAtomsPerCell  = nbl->ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
+    const double numAtomsPerCell  = nbl.ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
     fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid.numCells()),
+            nbl.na_cj, rl, nbl.ncjInUse, nbl.ncjInUse/static_cast<double>(grid.numCells()),
             numAtomsPerCell,
             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 
     fprintf(fp, "nbl average j cell list length %.1f\n",
-            0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));
+            0.25*nbl.ncjInUse/std::max(static_cast<double>(nbl.ci.size()), 1.0));
 
     int cs[SHIFTS] = { 0 };
     int npexcl     = 0;
-    for (const nbnxn_ci_t &ciEntry : nbl->ci)
+    for (const nbnxn_ci_t &ciEntry : nbl.ci)
     {
         cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
             ciEntry.cj_ind_end - ciEntry.cj_ind_start;
 
         int j = ciEntry.cj_ind_start;
         while (j < ciEntry.cj_ind_end &&
-               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+               nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
         {
             npexcl++;
             j++;
         }
     }
     fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
-            nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
+            nbl.cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl.cj.size()), 1.0));
     for (int s = 0; s < SHIFTS; s++)
     {
         if (cs[s] > 0)
@@ -802,7 +795,7 @@ static void print_nblist_statistics(FILE                   *fp,
 
 /* Print statistics of a pair lists, used for debug output */
 static void print_nblist_statistics(FILE                   *fp,
-                                    const NbnxnPairlistGpu *nbl,
+                                    const NbnxnPairlistGpu &nbl,
                                     const PairSearch       &pairSearch,
                                     const real              rl)
 {
@@ -810,11 +803,11 @@ static void print_nblist_statistics(FILE                   *fp,
     const Grid::Dimensions &dims = grid.dimensions();
 
     fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
-            nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
+            nbl.sci.size(), nbl.cj4.size(), nbl.nci_tot, nbl.excl.size());
     const int    numAtomsCluster = grid.geometry().numAtomsICluster;
-    const double numAtomsPerCell = nbl->nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
+    const double numAtomsPerCell = nbl.nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
     fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid.numClusters()),
+            nbl.na_ci, rl, nbl.nci_tot, nbl.nci_tot/static_cast<double>(grid.numClusters()),
             numAtomsPerCell,
             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 
@@ -822,7 +815,7 @@ static void print_nblist_statistics(FILE                   *fp,
     double sum_nsp2 = 0;
     int    nsp_max  = 0;
     int    c[c_gpuNumClusterPerCell + 1] = { 0 };
-    for (const nbnxn_sci_t &sci : nbl->sci)
+    for (const nbnxn_sci_t &sci : nbl.sci)
     {
         int nsp = 0;
         for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
@@ -832,7 +825,7 @@ static void print_nblist_statistics(FILE                   *fp,
                 int b = 0;
                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                 {
-                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                     {
                         b++;
                     }
@@ -845,20 +838,20 @@ static void print_nblist_statistics(FILE                   *fp,
         sum_nsp2 += nsp*nsp;
         nsp_max   = std::max(nsp_max, nsp);
     }
-    if (!nbl->sci.empty())
+    if (!nbl.sci.empty())
     {
-        sum_nsp  /= nbl->sci.size();
-        sum_nsp2 /= nbl->sci.size();
+        sum_nsp  /= nbl.sci.size();
+        sum_nsp2 /= nbl.sci.size();
     }
     fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
             sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
 
-    if (!nbl->cj4.empty())
+    if (!nbl.cj4.empty())
     {
         for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
         {
             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
-                    b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
+                    b, c[b], 100.0*c[b]/size_t {nbl.cj4.size()*c_nbnxnGpuJgroupSize});
         }
     }
 }
@@ -2045,7 +2038,7 @@ static void closeIEntry(NbnxnPairlistCpu    *nbl,
     const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
     if (jlen > 0)
     {
-        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);
+        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());
 
         /* The counts below are used for non-bonded pair/flop counts
          * and should therefore match the available kernel setups.
@@ -2605,9 +2598,10 @@ static void get_nsubpair_target(const PairSearch          &pairSearch,
 }
 
 /* Debug list print function */
-static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
+static void print_nblist_ci_cj(FILE                   *fp,
+                               const NbnxnPairlistCpu &nbl)
 {
-    for (const nbnxn_ci_t &ciEntry : nbl->ci)
+    for (const nbnxn_ci_t &ciEntry : nbl.ci)
     {
         fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
                 ciEntry.ci, ciEntry.shift,
@@ -2616,16 +2610,17 @@ static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
         for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
         {
             fprintf(fp, "  cj %5d  imask %x\n",
-                    nbl->cj[j].cj,
-                    nbl->cj[j].excl);
+                    nbl.cj[j].cj,
+                    nbl.cj[j].excl);
         }
     }
 }
 
 /* Debug list print function */
-static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
+static void print_nblist_sci_cj(FILE                   *fp,
+                                const NbnxnPairlistGpu &nbl)
 {
-    for (const nbnxn_sci_t &sci : nbl->sci)
+    for (const nbnxn_sci_t &sci : nbl.sci)
     {
         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
                 sci.sci, sci.shift,
@@ -2637,11 +2632,11 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
             {
                 fprintf(fp, "  sj %5d  imask %x\n",
-                        nbl->cj4[j4].cj[j],
-                        nbl->cj4[j4].imei[0].imask);
+                        nbl.cj4[j4].cj[j],
+                        nbl.cj4[j4].imei[0].imask);
                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                 {
-                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                     {
                         ncp++;
                     }
@@ -2656,17 +2651,17 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
 }
 
 /* Combine pair lists *nbl generated on multiple threads nblc */
-static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
-                            NbnxnPairlistGpu *nblc)
+static void combine_nblists(gmx::ArrayRef<const NbnxnPairlistGpu>  nbls,
+                            NbnxnPairlistGpu                      *nblc)
 {
     int nsci  = nblc->sci.size();
     int ncj4  = nblc->cj4.size();
     int nexcl = nblc->excl.size();
-    for (int i = 0; i < nnbl; i++)
+    for (auto &nbl : nbls)
     {
-        nsci  += nbl[i]->sci.size();
-        ncj4  += nbl[i]->cj4.size();
-        nexcl += nbl[i]->excl.size();
+        nsci  += nbl.sci.size();
+        ncj4  += nbl.cj4.size();
+        nexcl += nbl.excl.size();
     }
 
     /* Resize with the final, combined size, so we can fill in parallel */
@@ -2683,7 +2678,7 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
 #endif
 
 #pragma omp parallel for num_threads(nthreads) schedule(static)
-    for (int n = 0; n < nnbl; n++)
+    for (int n = 0; n < nbls.ssize(); n++)
     {
         try
         {
@@ -2694,14 +2689,14 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
             int cj4_offset  = ncj4;
             int excl_offset = nexcl;
 
-            for (int i = n; i < nnbl; i++)
+            for (int i = n; i < nbls.ssize(); i++)
             {
-                sci_offset  -= nbl[i]->sci.size();
-                cj4_offset  -= nbl[i]->cj4.size();
-                excl_offset -= nbl[i]->excl.size();
+                sci_offset  -= nbls[i].sci.size();
+                cj4_offset  -= nbls[i].cj4.size();
+                excl_offset -= nbls[i].excl.size();
             }
 
-            const NbnxnPairlistGpu &nbli = *nbl[n];
+            const NbnxnPairlistGpu &nbli = nbls[n];
 
             for (size_t i = 0; i < nbli.sci.size(); i++)
             {
@@ -2725,43 +2720,39 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 
-    for (int n = 0; n < nnbl; n++)
+    for (auto &nbl : nbls)
     {
-        nblc->nci_tot += nbl[n]->nci_tot;
+        nblc->nci_tot += nbl.nci_tot;
     }
 }
 
-static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
-                              nbnxn_pairlist_set_t               *nbl_lists)
+static void balance_fep_lists(gmx::ArrayRef<t_nblist *>      fepLists,
+                              gmx::ArrayRef<PairsearchWork>  work)
 {
-    int       nnbl;
-    int       nri_tot, nrj_tot, nrj_target;
-    int       th_dest;
-    t_nblist *nbld;
+    const int numLists = fepLists.ssize();
 
-    nnbl = nbl_lists->nnbl;
-
-    if (nnbl == 1)
+    if (numLists == 1)
     {
         /* Nothing to balance */
         return;
     }
 
     /* Count the total i-lists and pairs */
-    nri_tot = 0;
-    nrj_tot = 0;
-    for (int th = 0; th < nnbl; th++)
+    int nri_tot = 0;
+    int nrj_tot = 0;
+    for (auto list : fepLists)
     {
-        nri_tot += nbl_lists->nbl_fep[th]->nri;
-        nrj_tot += nbl_lists->nbl_fep[th]->nrj;
+        nri_tot += list->nri;
+        nrj_tot += list->nrj;
     }
 
-    nrj_target = (nrj_tot + nnbl - 1)/nnbl;
+    const int nrj_target = (nrj_tot + numLists - 1)/numLists;
 
-    assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
+    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
+               "We should have as many work objects as FEP lists");
 
-#pragma omp parallel for schedule(static) num_threads(nnbl)
-    for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for schedule(static) num_threads(numLists)
+    for (int th = 0; th < numLists; th++)
     {
         try
         {
@@ -2788,13 +2779,11 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
     }
 
     /* Loop over the source lists and assign and copy i-entries */
-    th_dest = 0;
-    nbld    = work[th_dest].nbl_fep.get();
-    for (int th = 0; th < nnbl; th++)
+    int       th_dest = 0;
+    t_nblist *nbld    = work[th_dest].nbl_fep.get();
+    for (int th = 0; th < numLists; th++)
     {
-        t_nblist *nbls;
-
-        nbls = nbl_lists->nbl_fep[th];
+        t_nblist *nbls = fepLists[th];
 
         for (int i = 0; i < nbls->nri; i++)
         {
@@ -2806,7 +2795,7 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
             /* Decide if list th_dest is too large and we should procede
              * to the next destination list.
              */
-            if (th_dest+1 < nnbl && nbld->nrj > 0 &&
+            if (th_dest + 1 < numLists && nbld->nrj > 0 &&
                 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
             {
                 th_dest++;
@@ -2829,18 +2818,18 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
     }
 
     /* Swap the list pointers */
-    for (int th = 0; th < nnbl; th++)
+    for (int th = 0; th < numLists; th++)
     {
-        t_nblist *nbl_tmp      = work[th].nbl_fep.release();
-        work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
-        nbl_lists->nbl_fep[th] = nbl_tmp;
+        t_nblist *nbl_tmp = work[th].nbl_fep.release();
+        work[th].nbl_fep.reset(fepLists[th]);
+        fepLists[th]      = nbl_tmp;
 
         if (debug)
         {
             fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
                     th,
-                    nbl_lists->nbl_fep[th]->nri,
-                    nbl_lists->nbl_fep[th]->nrj);
+                    fepLists[th]->nri,
+                    fepLists[th]->nrj);
         }
     }
 }
@@ -2922,7 +2911,8 @@ static float boundingbox_only_distance2(const Grid::Dimensions &iGridDims,
 }
 
 static int get_ci_block_size(const Grid &iGrid,
-                             gmx_bool bDomDec, int nth)
+                             const bool  haveDomDec,
+                             const int   numLists)
 {
     const int ci_block_enum      = 5;
     const int ci_block_denom     = 11;
@@ -2940,7 +2930,9 @@ static int get_ci_block_size(const Grid &iGrid,
      * zone boundaries with 3D domain decomposition. At the same time
      * the blocks will not become too small.
      */
-    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*nth);
+    GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
+    GMX_ASSERT(numLists > 0, "We need at least one list");
+    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*numLists);
 
     const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
 
@@ -2953,12 +2945,12 @@ static int get_ci_block_size(const Grid &iGrid,
     /* Without domain decomposition
      * or with less than 3 blocks per task, divide in nth blocks.
      */
-    if (!bDomDec || nth*3*ci_block > iGrid.numCells())
+    if (!haveDomDec || numLists*3*ci_block > iGrid.numCells())
     {
-        ci_block = (iGrid.numCells() + nth - 1)/nth;
+        ci_block = (iGrid.numCells() + numLists - 1)/numLists;
     }
 
-    if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.numCells())
+    if (ci_block > 1 && (numLists - 1)*ci_block >= iGrid.numCells())
     {
         /* Some threads have no work. Although reducing the block size
          * does not decrease the block count on the first few threads,
@@ -3440,12 +3432,12 @@ static void nbnxn_make_pairlist_part(const PairSearch &pairSearch,
                     }
 
                     set_icell_bb(iGrid, ci, shx, shy, shz,
-                                 nbl->work);
+                                 nbl->work.get());
 
                     icell_set_x(cell0_i+ci, shx, shy, shz,
                                 nbat->xstride, nbat->x().data(),
                                 kernelType,
-                                nbl->work);
+                                nbl->work.get());
 
                     for (int cx = cxf; cx <= cxl; cx++)
                     {
@@ -3645,7 +3637,7 @@ static void nbnxn_make_pairlist_part(const PairSearch &pairSearch,
     {
         fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
 
-        print_nblist_statistics(debug, nbl, pairSearch, rlist);
+        print_nblist_statistics(debug, *nbl, pairSearch, rlist);
 
         if (haveFep)
         {
@@ -3763,17 +3755,17 @@ static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict srcCi,
  * to reduction of parts of the force buffer that could be avoided. But since
  * the original lists are quite balanced, this will only give minor overhead.
  */
-static void rebalanceSimpleLists(int                                  numLists,
-                                 NbnxnPairlistCpu * const * const     srcSet,
-                                 NbnxnPairlistCpu                   **destSet,
-                                 gmx::ArrayRef<PairsearchWork>        searchWork)
+static void rebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> srcSet,
+                                 gmx::ArrayRef<NbnxnPairlistCpu>       destSet,
+                                 gmx::ArrayRef<PairsearchWork>         searchWork)
 {
     int ncjTotal = 0;
-    for (int s = 0; s < numLists; s++)
+    for (auto &src : srcSet)
     {
-        ncjTotal += srcSet[s]->ncjInUse;
+        ncjTotal += src.ncjInUse;
     }
-    int ncjTarget = (ncjTotal + numLists - 1)/numLists;
+    const int numLists  = srcSet.ssize();
+    const int ncjTarget = (ncjTotal + numLists - 1)/numLists;
 
 #pragma omp parallel num_threads(numLists)
     {
@@ -3783,23 +3775,23 @@ static void rebalanceSimpleLists(int                                  numLists,
         int cjEnd   = ncjTarget*(t + 1);
 
         /* The destination pair-list for task/thread t */
-        NbnxnPairlistCpu *dest = destSet[t];
+        NbnxnPairlistCpu &dest = destSet[t];
 
-        clear_pairlist(dest);
-        dest->na_cj   = srcSet[0]->na_cj;
+        clear_pairlist(&dest);
+        dest.na_cj = srcSet[0].na_cj;
 
         /* Note that the flags in the work struct (still) contain flags
          * for all entries that are present in srcSet->nbl[t].
          */
         gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;
 
-        int            iFlagShift = getBufferFlagShift(dest->na_ci);
-        int            jFlagShift = getBufferFlagShift(dest->na_cj);
+        int            iFlagShift = getBufferFlagShift(dest.na_ci);
+        int            jFlagShift = getBufferFlagShift(dest.na_cj);
 
         int            cjGlobal   = 0;
         for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
         {
-            const NbnxnPairlistCpu *src = srcSet[s];
+            const NbnxnPairlistCpu *src = &srcSet[s];
 
             if (cjGlobal + src->ncjInUse > cjStart)
             {
@@ -3816,7 +3808,7 @@ static void rebalanceSimpleLists(int                                  numLists,
                         {
                             copySelectedListRange
                             <true>
-                                (srcCi, src, dest,
+                                (srcCi, src, &dest,
                                 flag, iFlagShift, jFlagShift, t);
                         }
                         else
@@ -3824,7 +3816,7 @@ static void rebalanceSimpleLists(int                                  numLists,
                             copySelectedListRange
                             <false>
                                 (srcCi, src,
-                                dest, flag, iFlagShift, jFlagShift, t);
+                                &dest, flag, iFlagShift, jFlagShift, t);
                         }
                     }
                     cjGlobal += ncj;
@@ -3836,29 +3828,29 @@ static void rebalanceSimpleLists(int                                  numLists,
             }
         }
 
-        dest->ncjInUse = dest->cj.size();
+        dest.ncjInUse = dest.cj.size();
     }
 
 #ifndef NDEBUG
     int ncjTotalNew = 0;
-    for (int s = 0; s < numLists; s++)
+    for (auto &dest : destSet)
     {
-        ncjTotalNew += destSet[s]->ncjInUse;
+        ncjTotalNew += dest.ncjInUse;
     }
     GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
 #endif
 }
 
 /* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
-static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
+static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
 {
-    int numLists = listSet->nnbl;
+    int numLists = lists.ssize();
     int ncjMax   = 0;
     int ncjTotal = 0;
     for (int s = 0; s < numLists; s++)
     {
-        ncjMax    = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
-        ncjTotal += listSet->nbl[s]->ncjInUse;
+        ncjMax    = std::max(ncjMax, lists[s].ncjInUse);
+        ncjTotal += lists[s].ncjInUse;
     }
     if (debug)
     {
@@ -3932,44 +3924,41 @@ static void sort_sci(NbnxnPairlistGpu *nbl)
     std::swap(nbl->sci, work.sci_sort);
 }
 
+//! Prepares CPU lists produced by the search for dynamic pruning
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);
+
 void
-nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
-                                            PairSearch                *pairSearch,
-                                            nbnxn_atomdata_t          *nbat,
-                                            const t_blocka            *excl,
-                                            const Nbnxm::KernelType    kernelType,
-                                            const int64_t              step,
-                                            t_nrnb                    *nrnb)
+PairlistSet::constructPairlists(PairSearch                *pairSearch,
+                                nbnxn_atomdata_t          *nbat,
+                                const t_blocka            *excl,
+                                const Nbnxm::KernelType    kernelType,
+                                const int                  minimumIlistCountForGpuBalancing,
+                                t_nrnb                    *nrnb)
 {
-    nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality);
-
-    const real            rlist    = nbl_list->params.rlistOuter;
+    const real         rlist    = params_.rlistOuter;
 
     int                nsubpair_target;
     float              nsubpair_tot_est;
-    int                nnbl;
     int                ci_block;
-    gmx_bool           CombineNBLists;
     gmx_bool           progBal;
     int                np_tot, np_noq, np_hlj, nap;
 
-    nnbl            = nbl_list->nnbl;
-    CombineNBLists  = nbl_list->bCombined;
+    const int          numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());
 
     if (debug)
     {
-        fprintf(debug, "ns making %d nblists\n", nnbl);
+        fprintf(debug, "ns making %d nblists\n", numLists);
     }
 
     nbat->bUseBufferFlags = (nbat->out.size() > 1);
     /* We should re-init the flags before making the first list */
-    if (nbat->bUseBufferFlags && iLocality == InteractionLocality::Local)
+    if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
     {
         init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
     }
 
     int nzi;
-    if (iLocality == InteractionLocality::Local)
+    if (locality_ == InteractionLocality::Local)
     {
         /* Only zone (grid) 0 vs 0 */
         nzi = 1;
@@ -3979,9 +3968,9 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
         nzi = pairSearch->domainSetup().zones->nizone;
     }
 
-    if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0)
+    if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
     {
-        get_nsubpair_target(*pairSearch, iLocality, rlist, minimumIlistCountForGpuBalancing_,
+        get_nsubpair_target(*pairSearch, locality_, rlist, minimumIlistCountForGpuBalancing,
                             &nsubpair_target, &nsubpair_tot_est);
     }
     else
@@ -3991,20 +3980,20 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
     }
 
     /* Clear all pair-lists */
-    for (int th = 0; th < nnbl; th++)
+    for (int th = 0; th < numLists; th++)
     {
-        if (nbl_list->bSimple)
+        if (isCpuType_)
         {
-            clear_pairlist(nbl_list->nbl[th]);
+            clear_pairlist(&cpuLists_[th]);
         }
         else
         {
-            clear_pairlist(nbl_list->nblGpu[th]);
+            clear_pairlist(&gpuLists_[th]);
         }
 
-        if (pairSearch->gridSet().haveFep())
+        if (params_.haveFep)
         {
-            clear_pairlist_fep(nbl_list->nbl_fep[th]);
+            clear_pairlist_fep(fepLists_[th]);
         }
     }
 
@@ -4016,7 +4005,7 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
         int                 zj0;
         int                 zj1;
-        if (iLocality == InteractionLocality::Local)
+        if (locality_ == InteractionLocality::Local)
         {
             zj0 = 0;
             zj1 = 1;
@@ -4041,15 +4030,15 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
             pairSearch->cycleCounting_.start(PairSearch::enbsCCsearch);
 
-            ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, nnbl);
+            ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, numLists);
 
             /* With GPU: generate progressively smaller lists for
              * load balancing for local only or non-local with 2 zones.
              */
-            progBal = (iLocality == InteractionLocality::Local || ddZones->n <= 2);
+            progBal = (locality_ == InteractionLocality::Local || ddZones->n <= 2);
 
-#pragma omp parallel for num_threads(nnbl) schedule(static)
-            for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+            for (int th = 0; th < numLists; th++)
             {
                 try
                 {
@@ -4061,48 +4050,50 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
                         init_buffer_flags(&pairSearch->work()[th].buffer_flags, nbat->numAtoms());
                     }
 
-                    if (CombineNBLists && th > 0)
+                    if (combineLists_ && th > 0)
                     {
-                        GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
+                        GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
 
-                        clear_pairlist(nbl_list->nblGpu[th]);
+                        clear_pairlist(&gpuLists_[th]);
                     }
 
-                    auto &searchWork = pairSearch->work()[th];
+                    PairsearchWork *searchWork = &pairSearch->work()[th];
+
+                    searchWork->cycleCounter.start();
 
-                    searchWork.cycleCounter.start();
+                    t_nblist *fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th]);
 
-                    /* Divide the i super cell equally over the nblists */
-                    if (nbl_list->bSimple)
+                    /* Divide the i cells equally over the pairlists */
+                    if (isCpuType_)
                     {
                         nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
-                                                 &searchWork, nbat, *excl,
+                                                 searchWork, nbat, *excl,
                                                  rlist,
                                                  kernelType,
                                                  ci_block,
                                                  nbat->bUseBufferFlags,
                                                  nsubpair_target,
                                                  progBal, nsubpair_tot_est,
-                                                 th, nnbl,
-                                                 nbl_list->nbl[th],
-                                                 nbl_list->nbl_fep[th]);
+                                                 th, numLists,
+                                                 &cpuLists_[th],
+                                                 fepListPtr);
                     }
                     else
                     {
                         nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
-                                                 &searchWork, nbat, *excl,
+                                                 searchWork, nbat, *excl,
                                                  rlist,
                                                  kernelType,
                                                  ci_block,
                                                  nbat->bUseBufferFlags,
                                                  nsubpair_target,
                                                  progBal, nsubpair_tot_est,
-                                                 th, nnbl,
-                                                 nbl_list->nblGpu[th],
-                                                 nbl_list->nbl_fep[th]);
+                                                 th, numLists,
+                                                 &gpuLists_[th],
+                                                 fepListPtr);
                     }
 
-                    searchWork.cycleCounter.stop();
+                    searchWork->cycleCounter.stop();
                 }
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
@@ -4111,77 +4102,75 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
             np_tot = 0;
             np_noq = 0;
             np_hlj = 0;
-            for (int th = 0; th < nnbl; th++)
+            for (int th = 0; th < numLists; th++)
             {
                 inc_nrnb(nrnb, eNR_NBNXN_DIST2, pairSearch->work()[th].ndistc);
 
-                if (nbl_list->bSimple)
+                if (isCpuType_)
                 {
-                    NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
-                    np_tot += nbl->cj.size();
-                    np_noq += nbl->work->ncj_noq;
-                    np_hlj += nbl->work->ncj_hlj;
+                    const NbnxnPairlistCpu &nbl = cpuLists_[th];
+                    np_tot += nbl.cj.size();
+                    np_noq += nbl.work->ncj_noq;
+                    np_hlj += nbl.work->ncj_hlj;
                 }
                 else
                 {
-                    NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
+                    const NbnxnPairlistGpu &nbl = gpuLists_[th];
                     /* This count ignores potential subsequent pair pruning */
-                    np_tot += nbl->nci_tot;
+                    np_tot += nbl.nci_tot;
                 }
             }
-            if (nbl_list->bSimple)
+            if (isCpuType_)
             {
-                nap               = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
+                nap      = cpuLists_[0].na_ci*cpuLists_[0].na_cj;
             }
             else
             {
-                nap               = gmx::square(nbl_list->nblGpu[0]->na_ci);
+                nap      = gmx::square(gpuLists_[0].na_ci);
             }
-            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
-            nbl_list->natpair_lj  = np_noq*nap;
-            nbl_list->natpair_q   = np_hlj*nap/2;
+            natpair_ljq_ = (np_tot - np_noq)*nap - np_hlj*nap/2;
+            natpair_lj_  = np_noq*nap;
+            natpair_q_   = np_hlj*nap/2;
 
-            if (CombineNBLists && nnbl > 1)
+            if (combineLists_ && numLists > 1)
             {
-                GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
-                NbnxnPairlistGpu **nbl = nbl_list->nblGpu;
+                GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
 
                 pairSearch->cycleCounting_.start(PairSearch::enbsCCcombine);
 
-                combine_nblists(nnbl-1, nbl+1, nbl[0]);
+                combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1),
+                                &gpuLists_[0]);
 
                 pairSearch->cycleCounting_.stop(PairSearch::enbsCCcombine);
             }
         }
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
-        if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
+        if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
         {
-            rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, pairSearch->work());
+            rebalanceSimpleLists(cpuLists_, cpuListsWork_, pairSearch->work());
 
-            /* Swap the pointer of the sets of pair lists */
-            NbnxnPairlistCpu **tmp = nbl_list->nbl;
-            nbl_list->nbl          = nbl_list->nbl_work;
-            nbl_list->nbl_work     = tmp;
+            /* Swap the sets of pair lists */
+            cpuLists_.swap(cpuListsWork_);
         }
     }
     else
     {
         /* Sort the entries on size, large ones first */
-        if (CombineNBLists || nnbl == 1)
+        if (combineLists_ || gpuLists_.size() == 1)
         {
-            sort_sci(nbl_list->nblGpu[0]);
+            sort_sci(&gpuLists_[0]);
         }
         else
         {
-#pragma omp parallel for num_threads(nnbl) schedule(static)
-            for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+            for (int th = 0; th < numLists; th++)
             {
                 try
                 {
-                    sort_sci(nbl_list->nblGpu[th]);
+                    sort_sci(&gpuLists_[th]);
                 }
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
@@ -4190,60 +4179,38 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
     if (nbat->bUseBufferFlags)
     {
-        reduce_buffer_flags(*pairSearch, nbl_list->nnbl, &nbat->buffer_flags);
+        reduce_buffer_flags(*pairSearch, numLists, &nbat->buffer_flags);
     }
 
     if (pairSearch->gridSet().haveFep())
     {
         /* Balance the free-energy lists over all the threads */
-        balance_fep_lists(pairSearch->work(), nbl_list);
+        balance_fep_lists(fepLists_, pairSearch->work());
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
         /* This is a fresh list, so not pruned, stored using ci.
          * ciOuter is invalid at this point.
          */
-        GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
-    }
-
-    if (iLocality == Nbnxm::InteractionLocality::Local)
-    {
-        outerListCreationStep_ = step;
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
-                           "Outer list should be created at the same step as the inner list");
-    }
-
-    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
-    if (iLocality == InteractionLocality::Local)
-    {
-        pairSearch->cycleCounting_.searchCount_++;
-    }
-    if (pairSearch->cycleCounting_.recordCycles_ &&
-        (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
-        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
-    {
-        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
+        GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
     }
 
     /* If we have more than one list, they either got rebalancing (CPU)
      * or combined (GPU), so we should dump the final result to debug.
      */
-    if (debug && nbl_list->nnbl > 1)
+    if (debug)
     {
-        if (nbl_list->bSimple)
+        if (isCpuType_ && cpuLists_.size() > 1)
         {
-            for (int t = 0; t < nbl_list->nnbl; t++)
+            for (auto &cpuList : cpuLists_)
             {
-                print_nblist_statistics(debug, nbl_list->nbl[t], *pairSearch, rlist);
+                print_nblist_statistics(debug, cpuList, *pairSearch, rlist);
             }
         }
-        else
+        else if (!isCpuType_ && gpuLists_.size() > 1)
         {
-            print_nblist_statistics(debug, nbl_list->nblGpu[0], *pairSearch, rlist);
+            print_nblist_statistics(debug, gpuLists_[0], *pairSearch, rlist);
         }
     }
 
@@ -4251,28 +4218,62 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
     {
         if (gmx_debug_at)
         {
-            if (nbl_list->bSimple)
+            if (isCpuType_)
             {
-                for (int t = 0; t < nbl_list->nnbl; t++)
+                for (auto &cpuList : cpuLists_)
                 {
-                    print_nblist_ci_cj(debug, nbl_list->nbl[t]);
+                    print_nblist_ci_cj(debug, cpuList);
                 }
             }
             else
             {
-                print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
+                print_nblist_sci_cj(debug, gpuLists_[0]);
             }
         }
 
         if (nbat->bUseBufferFlags)
         {
-            print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
+            print_reduction_cost(&nbat->buffer_flags, numLists);
         }
     }
 
-    if (params_.useDynamicPruning && nbl_list->bSimple)
+    if (params_.useDynamicPruning && isCpuType_)
     {
-        nbnxnPrepareListForDynamicPruning(nbl_list);
+        prepareListsForDynamicPruning(cpuLists_);
+    }
+}
+
+void
+nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
+                                            PairSearch                *pairSearch,
+                                            nbnxn_atomdata_t          *nbat,
+                                            const t_blocka            *excl,
+                                            const Nbnxm::KernelType    kernelType,
+                                            const int64_t              step,
+                                            t_nrnb                    *nrnb)
+{
+    pairlistSet(iLocality).constructPairlists(pairSearch, nbat, excl, kernelType, minimumIlistCountForGpuBalancing_, nrnb);
+
+    if (iLocality == Nbnxm::InteractionLocality::Local)
+    {
+        outerListCreationStep_ = step;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
+                           "Outer list should be created at the same step as the inner list");
+    }
+
+    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+    if (iLocality == InteractionLocality::Local)
+    {
+        pairSearch->cycleCounting_.searchCount_++;
+    }
+    if (pairSearch->cycleCounting_.recordCycles_ &&
+        (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
+        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
+    {
+        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
     }
 }
 
@@ -4293,24 +4294,20 @@ nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality  iLocalit
          * NOTE: The launch overhead is currently not timed separately
          */
         Nbnxm::gpu_init_pairlist(gpu_nbv,
-                                 pairlistSets().pairlistSet(iLocality).nblGpu[0],
+                                 pairlistSets().pairlistSet(iLocality).gpuList(),
                                  iLocality);
     }
 }
 
-void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
 {
-    GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");
-
     /* TODO: Restructure the lists so we have actual outer and inner
      *       list objects so we can set a single pointer instead of
      *       swapping several pointers.
      */
 
-    for (int i = 0; i < listSet->nnbl; i++)
+    for (auto &list : lists)
     {
-        NbnxnPairlistCpu &list = *listSet->nbl[i];
-
         /* The search produced a list in ci/cj.
          * Swap the list pointers so we get the outer list is ciOuter,cjOuter
          * and we can prune that to get an inner list in ci/cj.