Convert nbnxn_pairlist_set_t to class PairlistSet
[alexxy/gromacs.git] / src / gromacs / nbnxm / pairlist.cpp
index 8469c409bad92a82f97dcee530f45bec10b19ca2..e1e165839c01fcf83504cdb9a7c8d7bd0429a29a 100644 (file)
@@ -648,19 +648,14 @@ static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
     return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
 }
 
-/* Initializes a single NbnxnPairlistCpu data structure */
-static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
+NbnxnPairlistCpu::NbnxnPairlistCpu() :
+    na_ci(c_nbnxnCpuIClusterSize),
+    na_cj(0),
+    rlist(0),
+    ncjInUse(0),
+    nci_tot(0),
+    work(std::make_unique<NbnxnPairlistCpuWork>())
 {
-    nbl->na_ci       = c_nbnxnCpuIClusterSize;
-    nbl->na_cj       = 0;
-    nbl->ci.clear();
-    nbl->ciOuter.clear();
-    nbl->ncjInUse    = 0;
-    nbl->cj.clear();
-    nbl->cjOuter.clear();
-    nbl->nci_tot     = 0;
-
-    nbl->work        = new NbnxnPairlistCpuWork();
 }
 
 NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
@@ -671,7 +666,8 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
     sci({}, {pinningPolicy}),
     cj4({}, {pinningPolicy}),
     excl({}, {pinningPolicy}),
-    nci_tot(0)
+    nci_tot(0),
+    work(std::make_unique<NbnxnPairlistGpuWork>())
 {
     static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
                   "The search code assumes that the a super-cluster matches a search grid cell");
@@ -683,79 +679,76 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
 
     // We always want a first entry without any exclusions
     excl.resize(1);
-
-    work = new NbnxnPairlistGpuWork();
 }
 
-void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list)
+// TODO: Move to pairlistset.cpp
+PairlistSet::PairlistSet(const Nbnxm::InteractionLocality  locality,
+                         const NbnxnListParameters        &listParams) :
+    locality_(locality),
+    params_(listParams)
 {
-    nbl_list->bSimple   =
-        (nbl_list->params.pairlistType == PairlistType::Simple4x2 ||
-         nbl_list->params.pairlistType == PairlistType::Simple4x4 ||
-         nbl_list->params.pairlistType == PairlistType::Simple4x8);
+    isCpuType_ =
+        (params_.pairlistType == PairlistType::Simple4x2 ||
+         params_.pairlistType == PairlistType::Simple4x4 ||
+         params_.pairlistType == PairlistType::Simple4x8);
     // Currently GPU lists are always combined
-    nbl_list->bCombined = !nbl_list->bSimple;
+    combineLists_ = !isCpuType_;
 
-    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
+    const int numLists = gmx_omp_nthreads_get(emntNonbonded);
 
-    if (!nbl_list->bCombined &&
-        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
+    if (!combineLists_ &&
+        numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
     {
         gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
-                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+                  numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
-        snew(nbl_list->nbl, nbl_list->nnbl);
-        if (nbl_list->nnbl > 1)
+        cpuLists_.resize(numLists);
+        if (numLists > 1)
         {
-            snew(nbl_list->nbl_work, nbl_list->nnbl);
+            cpuListsWork_.resize(numLists);
         }
     }
     else
     {
-        snew(nbl_list->nblGpu, nbl_list->nnbl);
+        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+        gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
+        /* Lists 0 to numLists-1 are used for constructing lists in parallel
+         * on the CPU using numLists threads (and then merged into list 0).
+         */
+        for (int i = 1; i < numLists; i++)
+        {
+            gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
+        }
     }
-    nbl_list->nbl_fep.resize(nbl_list->nnbl);
-    /* Execute in order to avoid memory interleaving between threads */
-#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
-    for (int i = 0; i < nbl_list->nnbl; i++)
+    if (params_.haveFep)
     {
-        try
-        {
-            /* Allocate the nblist data structure locally on each thread
-             * to optimize memory access for NUMA architectures.
-             */
-            if (nbl_list->bSimple)
-            {
-                nbl_list->nbl[i] = new NbnxnPairlistCpu();
+        fepLists_.resize(numLists);
 
-                nbnxn_init_pairlist(nbl_list->nbl[i]);
-                if (nbl_list->nnbl > 1)
-                {
-                    nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
-                    nbnxn_init_pairlist(nbl_list->nbl_work[i]);
-                }
-            }
-            else
+        /* Execute in order to avoid memory interleaving between threads */
+#pragma omp parallel for num_threads(numLists) schedule(static)
+        for (int i = 0; i < numLists; i++)
+        {
+            try
             {
-                /* Only list 0 is used on the GPU, use normal allocation for i>0 */
-                auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
-
-                nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
+                /* We used to allocate all normal lists locally on each thread
+                 * as well. The question is whether allocating the object on the
+                 * master thread (but all contained list memory thread local)
+                 * impacts performance.
+                 */
+                snew(fepLists_[i], 1);
+                nbnxn_init_pairlist_fep(fepLists_[i]);
             }
-
-            snew(nbl_list->nbl_fep[i], 1);
-            nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }
 
 /* Print statistics of a pair list, used for debug output */
 static void print_nblist_statistics(FILE                   *fp,
-                                    const NbnxnPairlistCpu *nbl,
+                                    const NbnxnPairlistCpu &nbl,
                                     const PairSearch       &pairSearch,
                                     const real              rl)
 {
@@ -763,34 +756,34 @@ static void print_nblist_statistics(FILE                   *fp,
     const Grid::Dimensions &dims = grid.dimensions();
 
     fprintf(fp, "nbl nci %zu ncj %d\n",
-            nbl->ci.size(), nbl->ncjInUse);
+            nbl.ci.size(), nbl.ncjInUse);
     const int    numAtomsJCluster = grid.geometry().numAtomsJCluster;
-    const double numAtomsPerCell  = nbl->ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
+    const double numAtomsPerCell  = nbl.ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
     fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid.numCells()),
+            nbl.na_cj, rl, nbl.ncjInUse, nbl.ncjInUse/static_cast<double>(grid.numCells()),
             numAtomsPerCell,
             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 
     fprintf(fp, "nbl average j cell list length %.1f\n",
-            0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));
+            0.25*nbl.ncjInUse/std::max(static_cast<double>(nbl.ci.size()), 1.0));
 
     int cs[SHIFTS] = { 0 };
     int npexcl     = 0;
-    for (const nbnxn_ci_t &ciEntry : nbl->ci)
+    for (const nbnxn_ci_t &ciEntry : nbl.ci)
     {
         cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
             ciEntry.cj_ind_end - ciEntry.cj_ind_start;
 
         int j = ciEntry.cj_ind_start;
         while (j < ciEntry.cj_ind_end &&
-               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+               nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
         {
             npexcl++;
             j++;
         }
     }
     fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
-            nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
+            nbl.cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl.cj.size()), 1.0));
     for (int s = 0; s < SHIFTS; s++)
     {
         if (cs[s] > 0)
@@ -802,7 +795,7 @@ static void print_nblist_statistics(FILE                   *fp,
 
 /* Print statistics of a pair lists, used for debug output */
 static void print_nblist_statistics(FILE                   *fp,
-                                    const NbnxnPairlistGpu *nbl,
+                                    const NbnxnPairlistGpu &nbl,
                                     const PairSearch       &pairSearch,
                                     const real              rl)
 {
@@ -810,11 +803,11 @@ static void print_nblist_statistics(FILE                   *fp,
     const Grid::Dimensions &dims = grid.dimensions();
 
     fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
-            nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
+            nbl.sci.size(), nbl.cj4.size(), nbl.nci_tot, nbl.excl.size());
     const int    numAtomsCluster = grid.geometry().numAtomsICluster;
-    const double numAtomsPerCell = nbl->nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
+    const double numAtomsPerCell = nbl.nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
     fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid.numClusters()),
+            nbl.na_ci, rl, nbl.nci_tot, nbl.nci_tot/static_cast<double>(grid.numClusters()),
             numAtomsPerCell,
             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 
@@ -822,7 +815,7 @@ static void print_nblist_statistics(FILE                   *fp,
     double sum_nsp2 = 0;
     int    nsp_max  = 0;
     int    c[c_gpuNumClusterPerCell + 1] = { 0 };
-    for (const nbnxn_sci_t &sci : nbl->sci)
+    for (const nbnxn_sci_t &sci : nbl.sci)
     {
         int nsp = 0;
         for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
@@ -832,7 +825,7 @@ static void print_nblist_statistics(FILE                   *fp,
                 int b = 0;
                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                 {
-                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                     {
                         b++;
                     }
@@ -845,20 +838,20 @@ static void print_nblist_statistics(FILE                   *fp,
         sum_nsp2 += nsp*nsp;
         nsp_max   = std::max(nsp_max, nsp);
     }
-    if (!nbl->sci.empty())
+    if (!nbl.sci.empty())
     {
-        sum_nsp  /= nbl->sci.size();
-        sum_nsp2 /= nbl->sci.size();
+        sum_nsp  /= nbl.sci.size();
+        sum_nsp2 /= nbl.sci.size();
     }
     fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
             sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
 
-    if (!nbl->cj4.empty())
+    if (!nbl.cj4.empty())
     {
         for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
         {
             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
-                    b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
+                    b, c[b], 100.0*c[b]/size_t {nbl.cj4.size()*c_nbnxnGpuJgroupSize});
         }
     }
 }
@@ -2045,7 +2038,7 @@ static void closeIEntry(NbnxnPairlistCpu    *nbl,
     const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
     if (jlen > 0)
     {
-        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);
+        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());
 
         /* The counts below are used for non-bonded pair/flop counts
          * and should therefore match the available kernel setups.
@@ -2605,9 +2598,10 @@ static void get_nsubpair_target(const PairSearch          &pairSearch,
 }
 
 /* Debug list print function */
-static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
+static void print_nblist_ci_cj(FILE                   *fp,
+                               const NbnxnPairlistCpu &nbl)
 {
-    for (const nbnxn_ci_t &ciEntry : nbl->ci)
+    for (const nbnxn_ci_t &ciEntry : nbl.ci)
     {
         fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
                 ciEntry.ci, ciEntry.shift,
@@ -2616,16 +2610,17 @@ static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
         for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
         {
             fprintf(fp, "  cj %5d  imask %x\n",
-                    nbl->cj[j].cj,
-                    nbl->cj[j].excl);
+                    nbl.cj[j].cj,
+                    nbl.cj[j].excl);
         }
     }
 }
 
 /* Debug list print function */
-static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
+static void print_nblist_sci_cj(FILE                   *fp,
+                                const NbnxnPairlistGpu &nbl)
 {
-    for (const nbnxn_sci_t &sci : nbl->sci)
+    for (const nbnxn_sci_t &sci : nbl.sci)
     {
         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
                 sci.sci, sci.shift,
@@ -2637,11 +2632,11 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
             {
                 fprintf(fp, "  sj %5d  imask %x\n",
-                        nbl->cj4[j4].cj[j],
-                        nbl->cj4[j4].imei[0].imask);
+                        nbl.cj4[j4].cj[j],
+                        nbl.cj4[j4].imei[0].imask);
                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                 {
-                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                     {
                         ncp++;
                     }
@@ -2656,17 +2651,17 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
 }
 
 /* Combine pair lists *nbl generated on multiple threads nblc */
-static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
-                            NbnxnPairlistGpu *nblc)
+static void combine_nblists(gmx::ArrayRef<const NbnxnPairlistGpu>  nbls,
+                            NbnxnPairlistGpu                      *nblc)
 {
     int nsci  = nblc->sci.size();
     int ncj4  = nblc->cj4.size();
     int nexcl = nblc->excl.size();
-    for (int i = 0; i < nnbl; i++)
+    for (auto &nbl : nbls)
     {
-        nsci  += nbl[i]->sci.size();
-        ncj4  += nbl[i]->cj4.size();
-        nexcl += nbl[i]->excl.size();
+        nsci  += nbl.sci.size();
+        ncj4  += nbl.cj4.size();
+        nexcl += nbl.excl.size();
     }
 
     /* Resize with the final, combined size, so we can fill in parallel */
@@ -2683,7 +2678,7 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
 #endif
 
 #pragma omp parallel for num_threads(nthreads) schedule(static)
-    for (int n = 0; n < nnbl; n++)
+    for (int n = 0; n < nbls.ssize(); n++)
     {
         try
         {
@@ -2694,14 +2689,14 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
             int cj4_offset  = ncj4;
             int excl_offset = nexcl;
 
-            for (int i = n; i < nnbl; i++)
+            for (int i = n; i < nbls.ssize(); i++)
             {
-                sci_offset  -= nbl[i]->sci.size();
-                cj4_offset  -= nbl[i]->cj4.size();
-                excl_offset -= nbl[i]->excl.size();
+                sci_offset  -= nbls[i].sci.size();
+                cj4_offset  -= nbls[i].cj4.size();
+                excl_offset -= nbls[i].excl.size();
             }
 
-            const NbnxnPairlistGpu &nbli = *nbl[n];
+            const NbnxnPairlistGpu &nbli = nbls[n];
 
             for (size_t i = 0; i < nbli.sci.size(); i++)
             {
@@ -2725,43 +2720,39 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 
-    for (int n = 0; n < nnbl; n++)
+    for (auto &nbl : nbls)
     {
-        nblc->nci_tot += nbl[n]->nci_tot;
+        nblc->nci_tot += nbl.nci_tot;
     }
 }
 
-static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
-                              nbnxn_pairlist_set_t               *nbl_lists)
+static void balance_fep_lists(gmx::ArrayRef<t_nblist *>      fepLists,
+                              gmx::ArrayRef<PairsearchWork>  work)
 {
-    int       nnbl;
-    int       nri_tot, nrj_tot, nrj_target;
-    int       th_dest;
-    t_nblist *nbld;
+    const int numLists = fepLists.ssize();
 
-    nnbl = nbl_lists->nnbl;
-
-    if (nnbl == 1)
+    if (numLists == 1)
     {
         /* Nothing to balance */
         return;
     }
 
     /* Count the total i-lists and pairs */
-    nri_tot = 0;
-    nrj_tot = 0;
-    for (int th = 0; th < nnbl; th++)
+    int nri_tot = 0;
+    int nrj_tot = 0;
+    for (auto list : fepLists)
     {
-        nri_tot += nbl_lists->nbl_fep[th]->nri;
-        nrj_tot += nbl_lists->nbl_fep[th]->nrj;
+        nri_tot += list->nri;
+        nrj_tot += list->nrj;
     }
 
-    nrj_target = (nrj_tot + nnbl - 1)/nnbl;
+    const int nrj_target = (nrj_tot + numLists - 1)/numLists;
 
-    assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
+    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
+               "We should have as many work objects as FEP lists");
 
-#pragma omp parallel for schedule(static) num_threads(nnbl)
-    for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for schedule(static) num_threads(numLists)
+    for (int th = 0; th < numLists; th++)
     {
         try
         {
@@ -2788,13 +2779,11 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
     }
 
     /* Loop over the source lists and assign and copy i-entries */
-    th_dest = 0;
-    nbld    = work[th_dest].nbl_fep.get();
-    for (int th = 0; th < nnbl; th++)
+    int       th_dest = 0;
+    t_nblist *nbld    = work[th_dest].nbl_fep.get();
+    for (int th = 0; th < numLists; th++)
     {
-        t_nblist *nbls;
-
-        nbls = nbl_lists->nbl_fep[th];
+        t_nblist *nbls = fepLists[th];
 
         for (int i = 0; i < nbls->nri; i++)
         {
@@ -2806,7 +2795,7 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
             /* Decide if list th_dest is too large and we should procede
              * to the next destination list.
              */
-            if (th_dest+1 < nnbl && nbld->nrj > 0 &&
+            if (th_dest + 1 < numLists && nbld->nrj > 0 &&
                 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
             {
                 th_dest++;
@@ -2829,18 +2818,18 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
     }
 
     /* Swap the list pointers */
-    for (int th = 0; th < nnbl; th++)
+    for (int th = 0; th < numLists; th++)
     {
-        t_nblist *nbl_tmp      = work[th].nbl_fep.release();
-        work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
-        nbl_lists->nbl_fep[th] = nbl_tmp;
+        t_nblist *nbl_tmp = work[th].nbl_fep.release();
+        work[th].nbl_fep.reset(fepLists[th]);
+        fepLists[th]      = nbl_tmp;
 
         if (debug)
         {
             fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
                     th,
-                    nbl_lists->nbl_fep[th]->nri,
-                    nbl_lists->nbl_fep[th]->nrj);
+                    fepLists[th]->nri,
+                    fepLists[th]->nrj);
         }
     }
 }
@@ -2922,7 +2911,8 @@ static float boundingbox_only_distance2(const Grid::Dimensions &iGridDims,
 }
 
 static int get_ci_block_size(const Grid &iGrid,
-                             gmx_bool bDomDec, int nth)
+                             const bool  haveDomDec,
+                             const int   numLists)
 {
     const int ci_block_enum      = 5;
     const int ci_block_denom     = 11;
@@ -2940,7 +2930,9 @@ static int get_ci_block_size(const Grid &iGrid,
      * zone boundaries with 3D domain decomposition. At the same time
      * the blocks will not become too small.
      */
-    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*nth);
+    GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
+    GMX_ASSERT(numLists > 0, "We need at least one list");
+    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*numLists);
 
     const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
 
@@ -2953,12 +2945,12 @@ static int get_ci_block_size(const Grid &iGrid,
     /* Without domain decomposition
      * or with less than 3 blocks per task, divide in nth blocks.
      */
-    if (!bDomDec || nth*3*ci_block > iGrid.numCells())
+    if (!haveDomDec || numLists*3*ci_block > iGrid.numCells())
     {
-        ci_block = (iGrid.numCells() + nth - 1)/nth;
+        ci_block = (iGrid.numCells() + numLists - 1)/numLists;
     }
 
-    if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.numCells())
+    if (ci_block > 1 && (numLists - 1)*ci_block >= iGrid.numCells())
     {
         /* Some threads have no work. Although reducing the block size
          * does not decrease the block count on the first few threads,
@@ -3440,12 +3432,12 @@ static void nbnxn_make_pairlist_part(const PairSearch &pairSearch,
                     }
 
                     set_icell_bb(iGrid, ci, shx, shy, shz,
-                                 nbl->work);
+                                 nbl->work.get());
 
                     icell_set_x(cell0_i+ci, shx, shy, shz,
                                 nbat->xstride, nbat->x().data(),
                                 kernelType,
-                                nbl->work);
+                                nbl->work.get());
 
                     for (int cx = cxf; cx <= cxl; cx++)
                     {
@@ -3645,7 +3637,7 @@ static void nbnxn_make_pairlist_part(const PairSearch &pairSearch,
     {
         fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
 
-        print_nblist_statistics(debug, nbl, pairSearch, rlist);
+        print_nblist_statistics(debug, *nbl, pairSearch, rlist);
 
         if (haveFep)
         {
@@ -3763,17 +3755,17 @@ static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict srcCi,
  * to reduction of parts of the force buffer that could be avoided. But since
  * the original lists are quite balanced, this will only give minor overhead.
  */
-static void rebalanceSimpleLists(int                                  numLists,
-                                 NbnxnPairlistCpu * const * const     srcSet,
-                                 NbnxnPairlistCpu                   **destSet,
-                                 gmx::ArrayRef<PairsearchWork>        searchWork)
+static void rebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> srcSet,
+                                 gmx::ArrayRef<NbnxnPairlistCpu>       destSet,
+                                 gmx::ArrayRef<PairsearchWork>         searchWork)
 {
     int ncjTotal = 0;
-    for (int s = 0; s < numLists; s++)
+    for (auto &src : srcSet)
     {
-        ncjTotal += srcSet[s]->ncjInUse;
+        ncjTotal += src.ncjInUse;
     }
-    int ncjTarget = (ncjTotal + numLists - 1)/numLists;
+    const int numLists  = srcSet.ssize();
+    const int ncjTarget = (ncjTotal + numLists - 1)/numLists;
 
 #pragma omp parallel num_threads(numLists)
     {
@@ -3783,23 +3775,23 @@ static void rebalanceSimpleLists(int                                  numLists,
         int cjEnd   = ncjTarget*(t + 1);
 
         /* The destination pair-list for task/thread t */
-        NbnxnPairlistCpu *dest = destSet[t];
+        NbnxnPairlistCpu &dest = destSet[t];
 
-        clear_pairlist(dest);
-        dest->na_cj   = srcSet[0]->na_cj;
+        clear_pairlist(&dest);
+        dest.na_cj = srcSet[0].na_cj;
 
         /* Note that the flags in the work struct (still) contain flags
          * for all entries that are present in srcSet->nbl[t].
          */
         gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;
 
-        int            iFlagShift = getBufferFlagShift(dest->na_ci);
-        int            jFlagShift = getBufferFlagShift(dest->na_cj);
+        int            iFlagShift = getBufferFlagShift(dest.na_ci);
+        int            jFlagShift = getBufferFlagShift(dest.na_cj);
 
         int            cjGlobal   = 0;
         for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
         {
-            const NbnxnPairlistCpu *src = srcSet[s];
+            const NbnxnPairlistCpu *src = &srcSet[s];
 
             if (cjGlobal + src->ncjInUse > cjStart)
             {
@@ -3816,7 +3808,7 @@ static void rebalanceSimpleLists(int                                  numLists,
                         {
                             copySelectedListRange
                             <true>
-                                (srcCi, src, dest,
+                                (srcCi, src, &dest,
                                 flag, iFlagShift, jFlagShift, t);
                         }
                         else
@@ -3824,7 +3816,7 @@ static void rebalanceSimpleLists(int                                  numLists,
                             copySelectedListRange
                             <false>
                                 (srcCi, src,
-                                dest, flag, iFlagShift, jFlagShift, t);
+                                &dest, flag, iFlagShift, jFlagShift, t);
                         }
                     }
                     cjGlobal += ncj;
@@ -3836,29 +3828,29 @@ static void rebalanceSimpleLists(int                                  numLists,
             }
         }
 
-        dest->ncjInUse = dest->cj.size();
+        dest.ncjInUse = dest.cj.size();
     }
 
 #ifndef NDEBUG
     int ncjTotalNew = 0;
-    for (int s = 0; s < numLists; s++)
+    for (auto &dest : destSet)
     {
-        ncjTotalNew += destSet[s]->ncjInUse;
+        ncjTotalNew += dest.ncjInUse;
     }
     GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
 #endif
 }
 
 /* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
-static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
+static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
 {
-    int numLists = listSet->nnbl;
+    int numLists = lists.ssize();
     int ncjMax   = 0;
     int ncjTotal = 0;
     for (int s = 0; s < numLists; s++)
     {
-        ncjMax    = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
-        ncjTotal += listSet->nbl[s]->ncjInUse;
+        ncjMax    = std::max(ncjMax, lists[s].ncjInUse);
+        ncjTotal += lists[s].ncjInUse;
     }
     if (debug)
     {
@@ -3932,44 +3924,41 @@ static void sort_sci(NbnxnPairlistGpu *nbl)
     std::swap(nbl->sci, work.sci_sort);
 }
 
+//! Prepares CPU lists produced by the search for dynamic pruning
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);
+
 void
-nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
-                                            PairSearch                *pairSearch,
-                                            nbnxn_atomdata_t          *nbat,
-                                            const t_blocka            *excl,
-                                            const Nbnxm::KernelType    kernelType,
-                                            const int64_t              step,
-                                            t_nrnb                    *nrnb)
+PairlistSet::constructPairlists(PairSearch                *pairSearch,
+                                nbnxn_atomdata_t          *nbat,
+                                const t_blocka            *excl,
+                                const Nbnxm::KernelType    kernelType,
+                                const int                  minimumIlistCountForGpuBalancing,
+                                t_nrnb                    *nrnb)
 {
-    nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality);
-
-    const real            rlist    = nbl_list->params.rlistOuter;
+    const real         rlist    = params_.rlistOuter;
 
     int                nsubpair_target;
     float              nsubpair_tot_est;
-    int                nnbl;
     int                ci_block;
-    gmx_bool           CombineNBLists;
     gmx_bool           progBal;
     int                np_tot, np_noq, np_hlj, nap;
 
-    nnbl            = nbl_list->nnbl;
-    CombineNBLists  = nbl_list->bCombined;
+    const int          numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());
 
     if (debug)
     {
-        fprintf(debug, "ns making %d nblists\n", nnbl);
+        fprintf(debug, "ns making %d nblists\n", numLists);
     }
 
     nbat->bUseBufferFlags = (nbat->out.size() > 1);
     /* We should re-init the flags before making the first list */
-    if (nbat->bUseBufferFlags && iLocality == InteractionLocality::Local)
+    if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
     {
         init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
     }
 
     int nzi;
-    if (iLocality == InteractionLocality::Local)
+    if (locality_ == InteractionLocality::Local)
     {
         /* Only zone (grid) 0 vs 0 */
         nzi = 1;
@@ -3979,9 +3968,9 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
         nzi = pairSearch->domainSetup().zones->nizone;
     }
 
-    if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0)
+    if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
     {
-        get_nsubpair_target(*pairSearch, iLocality, rlist, minimumIlistCountForGpuBalancing_,
+        get_nsubpair_target(*pairSearch, locality_, rlist, minimumIlistCountForGpuBalancing,
                             &nsubpair_target, &nsubpair_tot_est);
     }
     else
@@ -3991,20 +3980,20 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
     }
 
     /* Clear all pair-lists */
-    for (int th = 0; th < nnbl; th++)
+    for (int th = 0; th < numLists; th++)
     {
-        if (nbl_list->bSimple)
+        if (isCpuType_)
         {
-            clear_pairlist(nbl_list->nbl[th]);
+            clear_pairlist(&cpuLists_[th]);
         }
         else
         {
-            clear_pairlist(nbl_list->nblGpu[th]);
+            clear_pairlist(&gpuLists_[th]);
         }
 
-        if (pairSearch->gridSet().haveFep())
+        if (params_.haveFep)
         {
-            clear_pairlist_fep(nbl_list->nbl_fep[th]);
+            clear_pairlist_fep(fepLists_[th]);
         }
     }
 
@@ -4016,7 +4005,7 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
         int                 zj0;
         int                 zj1;
-        if (iLocality == InteractionLocality::Local)
+        if (locality_ == InteractionLocality::Local)
         {
             zj0 = 0;
             zj1 = 1;
@@ -4041,15 +4030,15 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
             pairSearch->cycleCounting_.start(PairSearch::enbsCCsearch);
 
-            ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, nnbl);
+            ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, numLists);
 
             /* With GPU: generate progressively smaller lists for
              * load balancing for local only or non-local with 2 zones.
              */
-            progBal = (iLocality == InteractionLocality::Local || ddZones->n <= 2);
+            progBal = (locality_ == InteractionLocality::Local || ddZones->n <= 2);
 
-#pragma omp parallel for num_threads(nnbl) schedule(static)
-            for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+            for (int th = 0; th < numLists; th++)
             {
                 try
                 {
@@ -4061,48 +4050,50 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
                         init_buffer_flags(&pairSearch->work()[th].buffer_flags, nbat->numAtoms());
                     }
 
-                    if (CombineNBLists && th > 0)
+                    if (combineLists_ && th > 0)
                     {
-                        GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
+                        GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
 
-                        clear_pairlist(nbl_list->nblGpu[th]);
+                        clear_pairlist(&gpuLists_[th]);
                     }
 
-                    auto &searchWork = pairSearch->work()[th];
+                    PairsearchWork *searchWork = &pairSearch->work()[th];
+
+                    searchWork->cycleCounter.start();
 
-                    searchWork.cycleCounter.start();
+                    t_nblist *fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th]);
 
-                    /* Divide the i super cell equally over the nblists */
-                    if (nbl_list->bSimple)
+                    /* Divide the i cells equally over the pairlists */
+                    if (isCpuType_)
                     {
                         nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
-                                                 &searchWork, nbat, *excl,
+                                                 searchWork, nbat, *excl,
                                                  rlist,
                                                  kernelType,
                                                  ci_block,
                                                  nbat->bUseBufferFlags,
                                                  nsubpair_target,
                                                  progBal, nsubpair_tot_est,
-                                                 th, nnbl,
-                                                 nbl_list->nbl[th],
-                                                 nbl_list->nbl_fep[th]);
+                                                 th, numLists,
+                                                 &cpuLists_[th],
+                                                 fepListPtr);
                     }
                     else
                     {
                         nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
-                                                 &searchWork, nbat, *excl,
+                                                 searchWork, nbat, *excl,
                                                  rlist,
                                                  kernelType,
                                                  ci_block,
                                                  nbat->bUseBufferFlags,
                                                  nsubpair_target,
                                                  progBal, nsubpair_tot_est,
-                                                 th, nnbl,
-                                                 nbl_list->nblGpu[th],
-                                                 nbl_list->nbl_fep[th]);
+                                                 th, numLists,
+                                                 &gpuLists_[th],
+                                                 fepListPtr);
                     }
 
-                    searchWork.cycleCounter.stop();
+                    searchWork->cycleCounter.stop();
                 }
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
@@ -4111,77 +4102,75 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
             np_tot = 0;
             np_noq = 0;
             np_hlj = 0;
-            for (int th = 0; th < nnbl; th++)
+            for (int th = 0; th < numLists; th++)
             {
                 inc_nrnb(nrnb, eNR_NBNXN_DIST2, pairSearch->work()[th].ndistc);
 
-                if (nbl_list->bSimple)
+                if (isCpuType_)
                 {
-                    NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
-                    np_tot += nbl->cj.size();
-                    np_noq += nbl->work->ncj_noq;
-                    np_hlj += nbl->work->ncj_hlj;
+                    const NbnxnPairlistCpu &nbl = cpuLists_[th];
+                    np_tot += nbl.cj.size();
+                    np_noq += nbl.work->ncj_noq;
+                    np_hlj += nbl.work->ncj_hlj;
                 }
                 else
                 {
-                    NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
+                    const NbnxnPairlistGpu &nbl = gpuLists_[th];
                     /* This count ignores potential subsequent pair pruning */
-                    np_tot += nbl->nci_tot;
+                    np_tot += nbl.nci_tot;
                 }
             }
-            if (nbl_list->bSimple)
+            if (isCpuType_)
             {
-                nap               = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
+                nap      = cpuLists_[0].na_ci*cpuLists_[0].na_cj;
             }
             else
             {
-                nap               = gmx::square(nbl_list->nblGpu[0]->na_ci);
+                nap      = gmx::square(gpuLists_[0].na_ci);
             }
-            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
-            nbl_list->natpair_lj  = np_noq*nap;
-            nbl_list->natpair_q   = np_hlj*nap/2;
+            natpair_ljq_ = (np_tot - np_noq)*nap - np_hlj*nap/2;
+            natpair_lj_  = np_noq*nap;
+            natpair_q_   = np_hlj*nap/2;
 
-            if (CombineNBLists && nnbl > 1)
+            if (combineLists_ && numLists > 1)
             {
-                GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
-                NbnxnPairlistGpu **nbl = nbl_list->nblGpu;
+                GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
 
                 pairSearch->cycleCounting_.start(PairSearch::enbsCCcombine);
 
-                combine_nblists(nnbl-1, nbl+1, nbl[0]);
+                combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1),
+                                &gpuLists_[0]);
 
                 pairSearch->cycleCounting_.stop(PairSearch::enbsCCcombine);
             }
         }
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
-        if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
+        if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
         {
-            rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, pairSearch->work());
+            rebalanceSimpleLists(cpuLists_, cpuListsWork_, pairSearch->work());
 
-            /* Swap the pointer of the sets of pair lists */
-            NbnxnPairlistCpu **tmp = nbl_list->nbl;
-            nbl_list->nbl          = nbl_list->nbl_work;
-            nbl_list->nbl_work     = tmp;
+            /* Swap the sets of pair lists */
+            cpuLists_.swap(cpuListsWork_);
         }
     }
     else
     {
         /* Sort the entries on size, large ones first */
-        if (CombineNBLists || nnbl == 1)
+        if (combineLists_ || gpuLists_.size() == 1)
         {
-            sort_sci(nbl_list->nblGpu[0]);
+            sort_sci(&gpuLists_[0]);
         }
         else
         {
-#pragma omp parallel for num_threads(nnbl) schedule(static)
-            for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+            for (int th = 0; th < numLists; th++)
             {
                 try
                 {
-                    sort_sci(nbl_list->nblGpu[th]);
+                    sort_sci(&gpuLists_[th]);
                 }
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
@@ -4190,60 +4179,38 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
     if (nbat->bUseBufferFlags)
     {
-        reduce_buffer_flags(*pairSearch, nbl_list->nnbl, &nbat->buffer_flags);
+        reduce_buffer_flags(*pairSearch, numLists, &nbat->buffer_flags);
     }
 
     if (pairSearch->gridSet().haveFep())
     {
         /* Balance the free-energy lists over all the threads */
-        balance_fep_lists(pairSearch->work(), nbl_list);
+        balance_fep_lists(fepLists_, pairSearch->work());
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
         /* This is a fresh list, so not pruned, stored using ci.
          * ciOuter is invalid at this point.
          */
-        GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
-    }
-
-    if (iLocality == Nbnxm::InteractionLocality::Local)
-    {
-        outerListCreationStep_ = step;
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
-                           "Outer list should be created at the same step as the inner list");
-    }
-
-    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
-    if (iLocality == InteractionLocality::Local)
-    {
-        pairSearch->cycleCounting_.searchCount_++;
-    }
-    if (pairSearch->cycleCounting_.recordCycles_ &&
-        (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
-        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
-    {
-        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
+        GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
     }
 
     /* If we have more than one list, they either got rebalancing (CPU)
      * or combined (GPU), so we should dump the final result to debug.
      */
-    if (debug && nbl_list->nnbl > 1)
+    if (debug)
     {
-        if (nbl_list->bSimple)
+        if (isCpuType_ && cpuLists_.size() > 1)
         {
-            for (int t = 0; t < nbl_list->nnbl; t++)
+            for (auto &cpuList : cpuLists_)
             {
-                print_nblist_statistics(debug, nbl_list->nbl[t], *pairSearch, rlist);
+                print_nblist_statistics(debug, cpuList, *pairSearch, rlist);
             }
         }
-        else
+        else if (!isCpuType_ && gpuLists_.size() > 1)
         {
-            print_nblist_statistics(debug, nbl_list->nblGpu[0], *pairSearch, rlist);
+            print_nblist_statistics(debug, gpuLists_[0], *pairSearch, rlist);
         }
     }
 
@@ -4251,28 +4218,62 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
     {
         if (gmx_debug_at)
         {
-            if (nbl_list->bSimple)
+            if (isCpuType_)
             {
-                for (int t = 0; t < nbl_list->nnbl; t++)
+                for (auto &cpuList : cpuLists_)
                 {
-                    print_nblist_ci_cj(debug, nbl_list->nbl[t]);
+                    print_nblist_ci_cj(debug, cpuList);
                 }
             }
             else
             {
-                print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
+                print_nblist_sci_cj(debug, gpuLists_[0]);
             }
         }
 
         if (nbat->bUseBufferFlags)
         {
-            print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
+            print_reduction_cost(&nbat->buffer_flags, numLists);
         }
     }
 
-    if (params_.useDynamicPruning && nbl_list->bSimple)
+    if (params_.useDynamicPruning && isCpuType_)
     {
-        nbnxnPrepareListForDynamicPruning(nbl_list);
+        prepareListsForDynamicPruning(cpuLists_);
+    }
+}
+
+void
+nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
+                                            PairSearch                *pairSearch,
+                                            nbnxn_atomdata_t          *nbat,
+                                            const t_blocka            *excl,
+                                            const Nbnxm::KernelType    kernelType,
+                                            const int64_t              step,
+                                            t_nrnb                    *nrnb)
+{
+    pairlistSet(iLocality).constructPairlists(pairSearch, nbat, excl, kernelType, minimumIlistCountForGpuBalancing_, nrnb);
+
+    if (iLocality == Nbnxm::InteractionLocality::Local)
+    {
+        outerListCreationStep_ = step;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
+                           "Outer list should be created at the same step as the inner list");
+    }
+
+    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+    if (iLocality == InteractionLocality::Local)
+    {
+        pairSearch->cycleCounting_.searchCount_++;
+    }
+    if (pairSearch->cycleCounting_.recordCycles_ &&
+        (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
+        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
+    {
+        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
     }
 }
 
@@ -4293,24 +4294,20 @@ nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality  iLocalit
          * NOTE: The launch overhead is currently not timed separately
          */
         Nbnxm::gpu_init_pairlist(gpu_nbv,
-                                 pairlistSets().pairlistSet(iLocality).nblGpu[0],
+                                 pairlistSets().pairlistSet(iLocality).gpuList(),
                                  iLocality);
     }
 }
 
-void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
 {
-    GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");
-
     /* TODO: Restructure the lists so we have actual outer and inner
      *       list objects so we can set a single pointer instead of
      *       swapping several pointers.
      */
 
-    for (int i = 0; i < listSet->nnbl; i++)
+    for (auto &list : lists)
     {
-        NbnxnPairlistCpu &list = *listSet->nbl[i];
-
         /* The search produced a list in ci/cj.
          * Swap the list pointers so we get the outer list is ciOuter,cjOuter
          * and we can prune that to get an inner list in ci/cj.