#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_simd.h"
+#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/gmxassert.h"
 * \param[out] vVdw      Output buffer for van der Waals energies
*/
static void
-nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet,
+nbnxn_kernel_cpu(const PairlistSet &pairlistSet,
const Nbnxm::KernelSetup &kernelSetup,
nbnxn_atomdata_t *nbat,
const interaction_const_t &ic,
GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
}
- int nnbl = pairlistSet.nnbl;
- NbnxnPairlistCpu * const * nbl = pairlistSet.nbl;
+ gmx::ArrayRef<const NbnxnPairlistCpu> pairlists = pairlistSet.cpuLists();
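+    /* cpuLists() returns a non-owning gmx::ArrayRef view of the CPU
+     * pairlists, replacing the former raw pointer plus count pair.
+     */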
- int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
+ int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
#pragma omp parallel for schedule(static) num_threads(nthreads)
- for (int nb = 0; nb < nnbl; nb++)
+ for (int nb = 0; nb < pairlists.ssize(); nb++)
{
// Presently, the kernels do not call C++ code that can throw,
// so no need for a try/catch pair in this OpenMP region.
}
real *fshift_p;
- if ((forceFlags & GMX_FORCE_VIRIAL) && nnbl == 1)
+ if ((forceFlags & GMX_FORCE_VIRIAL) && pairlists.ssize() == 1)
{
fshift_p = fshift;
}
}
}
+ // TODO: Change to reference
+ const NbnxnPairlistCpu *pairlist = &pairlists[nb];
+
if (!(forceFlags & GMX_FORCE_ENERGY))
{
/* Don't calculate energies */
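+        /* The kernel tables below are 2D arrays of function pointers,
+         * indexed by the Coulomb (coulkt) and VdW (vdwkt) interaction
+         * types selected earlier in this function.
+         */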
switch (kernelSetup.kernelType)
{
case Nbnxm::KernelType::Cpu4x4_PlainC:
- nbnxn_kernel_noener_ref[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxn_kernel_noener_ref[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
break;
#ifdef GMX_NBNXN_SIMD_2XNN
case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
- nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
#endif
#ifdef GMX_NBNXN_SIMD_4XN
case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
- nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
switch (kernelSetup.kernelType)
{
case Nbnxm::KernelType::Cpu4x4_PlainC:
- nbnxn_kernel_ener_ref[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxn_kernel_ener_ref[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
break;
#ifdef GMX_NBNXN_SIMD_2XNN
case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
- nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
#endif
#ifdef GMX_NBNXN_SIMD_4XN
case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
- nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
{
case Nbnxm::KernelType::Cpu4x4_PlainC:
unrollj = c_nbnxnCpuIClusterSize;
- nbnxn_kernel_energrp_ref[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxn_kernel_energrp_ref[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
#ifdef GMX_NBNXN_SIMD_2XNN
case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
unrollj = GMX_SIMD_REAL_WIDTH/2;
- nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
#ifdef GMX_NBNXN_SIMD_4XN
case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
unrollj = GMX_SIMD_REAL_WIDTH;
- nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
+ nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](pairlist, nbat,
&ic,
shiftVectors,
out->f.data(),
if (forceFlags & GMX_FORCE_ENERGY)
{
- reduce_energies_over_lists(nbat, nnbl, vVdw, vCoulomb);
+ reduce_energies_over_lists(nbat, pairlists.ssize(), vVdw, vCoulomb);
}
}
static void accountFlops(t_nrnb *nrnb,
- const nbnxn_pairlist_set_t &pairlistSet,
+ const PairlistSet &pairlistSet,
const nonbonded_verlet_t &nbv,
const interaction_const_t &ic,
const int forceFlags)
}
inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
- pairlistSet.natpair_ljq);
+ pairlistSet.natpair_ljq_);
inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
- pairlistSet.natpair_lj);
+ pairlistSet.natpair_lj_);
/* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
- pairlistSet.natpair_q);
+ pairlistSet.natpair_q_);
const bool calcEnergy = ((forceFlags & GMX_FORCE_ENERGY) != 0);
if (ic.vdw_modifier == eintmodFORCESWITCH)
{
/* We add up the switch cost separately */
inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW + (calcEnergy ? 1 : 0),
- pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
+ pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
}
if (ic.vdw_modifier == eintmodPOTSWITCH)
{
/* We add up the switch cost separately */
inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW + (calcEnergy ? 1 : 0),
- pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
+ pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
}
if (ic.vdwtype == evdwPME)
{
/* We add up the LJ Ewald cost separately */
inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD + (calcEnergy ? 1 : 0),
- pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
+ pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
}
}
gmx_enerdata_t *enerd,
t_nrnb *nrnb)
{
- const nbnxn_pairlist_set_t &pairlistSet = pairlistSets().pairlistSet(iLocality);
+ const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
switch (kernelSetup().kernelType)
{
break;
case Nbnxm::KernelType::Cpu8x8x8_PlainC:
- nbnxn_kernel_gpu_ref(pairlistSet.nblGpu[0],
+ nbnxn_kernel_gpu_ref(pairlistSet.gpuList(),
nbat.get(), &ic,
fr->shift_vec,
forceFlags,
const int forceFlags,
t_nrnb *nrnb)
{
- const gmx::ArrayRef<t_nblist const * const > nbl_fep = pairlistSets().pairlistSet(iLocality).nbl_fep;
+ const gmx::ArrayRef<t_nblist const * const > nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
    /* Without FEP interactions, or when the first list is empty, all lists are empty and there is nothing to do */
- if (nbl_fep[0]->nrj == 0)
+ if (!pairlistSets().params().haveFep || nbl_fep[0]->nrj == 0)
{
return;
}
return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
}
-/* Initializes a single NbnxnPairlistCpu data structure */
-static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
+NbnxnPairlistCpu::NbnxnPairlistCpu() :
+ na_ci(c_nbnxnCpuIClusterSize),
+ na_cj(0),
+ rlist(0),
+ ncjInUse(0),
+ nci_tot(0),
+ work(std::make_unique<NbnxnPairlistCpuWork>())
{
- nbl->na_ci = c_nbnxnCpuIClusterSize;
- nbl->na_cj = 0;
- nbl->ci.clear();
- nbl->ciOuter.clear();
- nbl->ncjInUse = 0;
- nbl->cj.clear();
- nbl->cjOuter.clear();
- nbl->nci_tot = 0;
-
- nbl->work = new NbnxnPairlistCpuWork();
}
NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
sci({}, {pinningPolicy}),
cj4({}, {pinningPolicy}),
excl({}, {pinningPolicy}),
- nci_tot(0)
+ nci_tot(0),
+ work(std::make_unique<NbnxnPairlistGpuWork>())
{
static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
"The search code assumes that the a super-cluster matches a search grid cell");
// We always want a first entry without any exclusions
excl.resize(1);
-
- work = new NbnxnPairlistGpuWork();
}
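+/* Note: the pinning policy passed to this constructor controls whether
+ * the sci/cj4/excl buffers use pinned (page-locked) host memory; only
+ * list 0, which is the one transferred to the GPU, needs pinning.
+ */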
-void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list)
+// TODO: Move to pairlistset.cpp
+PairlistSet::PairlistSet(const Nbnxm::InteractionLocality locality,
+ const NbnxnListParameters &listParams) :
+ locality_(locality),
+ params_(listParams)
{
- nbl_list->bSimple =
- (nbl_list->params.pairlistType == PairlistType::Simple4x2 ||
- nbl_list->params.pairlistType == PairlistType::Simple4x4 ||
- nbl_list->params.pairlistType == PairlistType::Simple4x8);
+ isCpuType_ =
+ (params_.pairlistType == PairlistType::Simple4x2 ||
+ params_.pairlistType == PairlistType::Simple4x4 ||
+ params_.pairlistType == PairlistType::Simple4x8);
// Currently GPU lists are always combined
- nbl_list->bCombined = !nbl_list->bSimple;
+ combineLists_ = !isCpuType_;
- nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
+ const int numLists = gmx_omp_nthreads_get(emntNonbonded);
- if (!nbl_list->bCombined &&
- nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
+ if (!combineLists_ &&
+ numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
{
gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
- nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+ numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
}
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
- snew(nbl_list->nbl, nbl_list->nnbl);
- if (nbl_list->nnbl > 1)
+ cpuLists_.resize(numLists);
+ if (numLists > 1)
{
- snew(nbl_list->nbl_work, nbl_list->nnbl);
+ cpuListsWork_.resize(numLists);
}
}
else
{
- snew(nbl_list->nblGpu, nbl_list->nnbl);
+        /* Only list 0 is used on the GPU; use normal allocation for i > 0 */
+ gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
+        /* Lists 0 to numLists - 1 are used for constructing lists in
+         * parallel on the CPU using numLists threads (the results are
+         * then merged into list 0).
+         */
+ for (int i = 1; i < numLists; i++)
+ {
+ gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
+ }
}
- nbl_list->nbl_fep.resize(nbl_list->nnbl);
- /* Execute in order to avoid memory interleaving between threads */
-#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
- for (int i = 0; i < nbl_list->nnbl; i++)
+ if (params_.haveFep)
{
- try
- {
- /* Allocate the nblist data structure locally on each thread
- * to optimize memory access for NUMA architectures.
- */
- if (nbl_list->bSimple)
- {
- nbl_list->nbl[i] = new NbnxnPairlistCpu();
+ fepLists_.resize(numLists);
- nbnxn_init_pairlist(nbl_list->nbl[i]);
- if (nbl_list->nnbl > 1)
- {
- nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
- nbnxn_init_pairlist(nbl_list->nbl_work[i]);
- }
- }
- else
+ /* Execute in order to avoid memory interleaving between threads */
+#pragma omp parallel for num_threads(numLists) schedule(static)
+ for (int i = 0; i < numLists; i++)
+ {
+ try
{
- /* Only list 0 is used on the GPU, use normal allocation for i>0 */
- auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
-
- nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
+            /* We used to allocate all normal lists locally on each thread
+             * as well. It is an open question whether allocating the object
+             * on the master thread (but with all contained list memory
+             * thread-local) impacts performance.
+             */
+ snew(fepLists_[i], 1);
+ nbnxn_init_pairlist_fep(fepLists_[i]);
}
-
- snew(nbl_list->nbl_fep[i], 1);
- nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
}
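+/* A minimal usage sketch (hypothetical caller): with the non-bonded
+ * OpenMP thread count set up, the constructor allocates one list per
+ * thread, e.g.:
+ *
+ *   NbnxnListParameters listParams = ...; // pairlistType, haveFep, ...
+ *   PairlistSet localSet(Nbnxm::InteractionLocality::Local, listParams);
+ */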
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE *fp,
- const NbnxnPairlistCpu *nbl,
+ const NbnxnPairlistCpu &nbl,
const PairSearch &pairSearch,
const real rl)
{
const Grid::Dimensions &dims = grid.dimensions();
fprintf(fp, "nbl nci %zu ncj %d\n",
- nbl->ci.size(), nbl->ncjInUse);
+ nbl.ci.size(), nbl.ncjInUse);
const int numAtomsJCluster = grid.geometry().numAtomsJCluster;
- const double numAtomsPerCell = nbl->ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
+ const double numAtomsPerCell = nbl.ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
- nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid.numCells()),
+ nbl.na_cj, rl, nbl.ncjInUse, nbl.ncjInUse/static_cast<double>(grid.numCells()),
numAtomsPerCell,
numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
fprintf(fp, "nbl average j cell list length %.1f\n",
- 0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));
+ 0.25*nbl.ncjInUse/std::max(static_cast<double>(nbl.ci.size()), 1.0));
int cs[SHIFTS] = { 0 };
int npexcl = 0;
- for (const nbnxn_ci_t &ciEntry : nbl->ci)
+ for (const nbnxn_ci_t &ciEntry : nbl.ci)
{
cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
ciEntry.cj_ind_end - ciEntry.cj_ind_start;
int j = ciEntry.cj_ind_start;
while (j < ciEntry.cj_ind_end &&
- nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
{
npexcl++;
j++;
}
}
fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
- nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
+ nbl.cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl.cj.size()), 1.0));
for (int s = 0; s < SHIFTS; s++)
{
if (cs[s] > 0)
/* Print statistics of a pair list, used for debug output */
static void print_nblist_statistics(FILE *fp,
- const NbnxnPairlistGpu *nbl,
+ const NbnxnPairlistGpu &nbl,
const PairSearch &pairSearch,
const real rl)
{
const Grid::Dimensions &dims = grid.dimensions();
fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
- nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
+ nbl.sci.size(), nbl.cj4.size(), nbl.nci_tot, nbl.excl.size());
const int numAtomsCluster = grid.geometry().numAtomsICluster;
- const double numAtomsPerCell = nbl->nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
+ const double numAtomsPerCell = nbl.nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
- nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid.numClusters()),
+ nbl.na_ci, rl, nbl.nci_tot, nbl.nci_tot/static_cast<double>(grid.numClusters()),
numAtomsPerCell,
numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
double sum_nsp2 = 0;
int nsp_max = 0;
int c[c_gpuNumClusterPerCell + 1] = { 0 };
- for (const nbnxn_sci_t &sci : nbl->sci)
+ for (const nbnxn_sci_t &sci : nbl.sci)
{
int nsp = 0;
for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
int b = 0;
for (int si = 0; si < c_gpuNumClusterPerCell; si++)
{
- if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+ if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
{
b++;
}
sum_nsp2 += nsp*nsp;
nsp_max = std::max(nsp_max, nsp);
}
- if (!nbl->sci.empty())
+ if (!nbl.sci.empty())
{
- sum_nsp /= nbl->sci.size();
- sum_nsp2 /= nbl->sci.size();
+ sum_nsp /= nbl.sci.size();
+ sum_nsp2 /= nbl.sci.size();
}
fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
- if (!nbl->cj4.empty())
+ if (!nbl.cj4.empty())
{
for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
{
fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
- b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
+ b, c[b], 100.0*c[b]/size_t {nbl.cj4.size()*c_nbnxnGpuJgroupSize});
}
}
}
const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
if (jlen > 0)
{
- sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);
+ sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());
/* The counts below are used for non-bonded pair/flop counts
* and should therefore match the available kernel setups.
}
/* Debug list print function */
-static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
+static void print_nblist_ci_cj(FILE *fp,
+ const NbnxnPairlistCpu &nbl)
{
- for (const nbnxn_ci_t &ciEntry : nbl->ci)
+ for (const nbnxn_ci_t &ciEntry : nbl.ci)
{
fprintf(fp, "ci %4d shift %2d ncj %3d\n",
ciEntry.ci, ciEntry.shift,
for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
{
fprintf(fp, " cj %5d imask %x\n",
- nbl->cj[j].cj,
- nbl->cj[j].excl);
+ nbl.cj[j].cj,
+ nbl.cj[j].excl);
}
}
}
/* Debug list print function */
-static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
+static void print_nblist_sci_cj(FILE *fp,
+ const NbnxnPairlistGpu &nbl)
{
- for (const nbnxn_sci_t &sci : nbl->sci)
+ for (const nbnxn_sci_t &sci : nbl.sci)
{
fprintf(fp, "ci %4d shift %2d ncj4 %2d\n",
sci.sci, sci.shift,
for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
{
fprintf(fp, " sj %5d imask %x\n",
- nbl->cj4[j4].cj[j],
- nbl->cj4[j4].imei[0].imask);
+ nbl.cj4[j4].cj[j],
+ nbl.cj4[j4].imei[0].imask);
for (int si = 0; si < c_gpuNumClusterPerCell; si++)
{
- if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+ if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
{
ncp++;
}
}
/* Combine the pair lists nbls, generated on multiple threads, into the single list nblc */
-static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
- NbnxnPairlistGpu *nblc)
+static void combine_nblists(gmx::ArrayRef<const NbnxnPairlistGpu> nbls,
+ NbnxnPairlistGpu *nblc)
{
int nsci = nblc->sci.size();
int ncj4 = nblc->cj4.size();
int nexcl = nblc->excl.size();
- for (int i = 0; i < nnbl; i++)
+ for (auto &nbl : nbls)
{
- nsci += nbl[i]->sci.size();
- ncj4 += nbl[i]->cj4.size();
- nexcl += nbl[i]->excl.size();
+ nsci += nbl.sci.size();
+ ncj4 += nbl.cj4.size();
+ nexcl += nbl.excl.size();
}
/* Resize with the final, combined size, so we can fill in parallel */
#endif
#pragma omp parallel for num_threads(nthreads) schedule(static)
- for (int n = 0; n < nnbl; n++)
+ for (int n = 0; n < nbls.ssize(); n++)
{
try
{
int cj4_offset = ncj4;
int excl_offset = nexcl;
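+            /* Compute this thread's output offsets by subtracting, from
+             * the combined totals, the sizes of this and all later input
+             * lists.
+             */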
- for (int i = n; i < nnbl; i++)
+ for (int i = n; i < nbls.ssize(); i++)
{
- sci_offset -= nbl[i]->sci.size();
- cj4_offset -= nbl[i]->cj4.size();
- excl_offset -= nbl[i]->excl.size();
+ sci_offset -= nbls[i].sci.size();
+ cj4_offset -= nbls[i].cj4.size();
+ excl_offset -= nbls[i].excl.size();
}
- const NbnxnPairlistGpu &nbli = *nbl[n];
+ const NbnxnPairlistGpu &nbli = nbls[n];
for (size_t i = 0; i < nbli.sci.size(); i++)
{
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
- for (int n = 0; n < nnbl; n++)
+ for (auto &nbl : nbls)
{
- nblc->nci_tot += nbl[n]->nci_tot;
+ nblc->nci_tot += nbl.nci_tot;
}
}
-static void balance_fep_lists(gmx::ArrayRef<PairsearchWork> work,
- nbnxn_pairlist_set_t *nbl_lists)
+static void balance_fep_lists(gmx::ArrayRef<t_nblist *> fepLists,
+ gmx::ArrayRef<PairsearchWork> work)
{
- int nnbl;
- int nri_tot, nrj_tot, nrj_target;
- int th_dest;
- t_nblist *nbld;
+ const int numLists = fepLists.ssize();
- nnbl = nbl_lists->nnbl;
-
- if (nnbl == 1)
+ if (numLists == 1)
{
/* Nothing to balance */
return;
}
    /* Count the total number of i-entries and j-pairs over all lists */
- nri_tot = 0;
- nrj_tot = 0;
- for (int th = 0; th < nnbl; th++)
+ int nri_tot = 0;
+ int nrj_tot = 0;
+ for (auto list : fepLists)
{
- nri_tot += nbl_lists->nbl_fep[th]->nri;
- nrj_tot += nbl_lists->nbl_fep[th]->nrj;
+ nri_tot += list->nri;
+ nrj_tot += list->nrj;
}
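+    /* The per-list target is the ceiling of the average, e.g. 10 j-pairs
+     * over 4 lists gives a target of 3 pairs per list.
+     */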
- nrj_target = (nrj_tot + nnbl - 1)/nnbl;
+ const int nrj_target = (nrj_tot + numLists - 1)/numLists;
- assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
+ GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
+ "We should have as many work objects as FEP lists");
-#pragma omp parallel for schedule(static) num_threads(nnbl)
- for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for schedule(static) num_threads(numLists)
+ for (int th = 0; th < numLists; th++)
{
try
{
}
/* Loop over the source lists and assign and copy i-entries */
- th_dest = 0;
- nbld = work[th_dest].nbl_fep.get();
- for (int th = 0; th < nnbl; th++)
+ int th_dest = 0;
+ t_nblist *nbld = work[th_dest].nbl_fep.get();
+ for (int th = 0; th < numLists; th++)
{
- t_nblist *nbls;
-
- nbls = nbl_lists->nbl_fep[th];
+ t_nblist *nbls = fepLists[th];
for (int i = 0; i < nbls->nri; i++)
{
            /* Decide if list th_dest is too large and we should proceed
             * to the next destination list: we move on when adding this
             * i-entry would overshoot the target by more than the
             * current shortfall.
             */
- if (th_dest+1 < nnbl && nbld->nrj > 0 &&
+ if (th_dest + 1 < numLists && nbld->nrj > 0 &&
nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
{
th_dest++;
}
/* Swap the list pointers */
- for (int th = 0; th < nnbl; th++)
+ for (int th = 0; th < numLists; th++)
{
- t_nblist *nbl_tmp = work[th].nbl_fep.release();
- work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
- nbl_lists->nbl_fep[th] = nbl_tmp;
+ t_nblist *nbl_tmp = work[th].nbl_fep.release();
+ work[th].nbl_fep.reset(fepLists[th]);
+ fepLists[th] = nbl_tmp;
if (debug)
{
fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
th,
- nbl_lists->nbl_fep[th]->nri,
- nbl_lists->nbl_fep[th]->nrj);
+ fepLists[th]->nri,
+ fepLists[th]->nrj);
}
}
}
}
static int get_ci_block_size(const Grid &iGrid,
- gmx_bool bDomDec, int nth)
+ const bool haveDomDec,
+ const int numLists)
{
const int ci_block_enum = 5;
const int ci_block_denom = 11;
* zone boundaries with 3D domain decomposition. At the same time
* the blocks will not become too small.
*/
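+    /* Example: a grid of 1000 cells with numCells[XX] = 10 and
+     * numLists = 8 gives ci_block = 1000*5/(11*10*8) = 5.
+     */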
- ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*nth);
+ GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
+ GMX_ASSERT(numLists > 0, "We need at least one list");
+ ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*numLists);
const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
    /* Without domain decomposition
     * or with fewer than 3 blocks per task, divide into numLists blocks.
     */
- if (!bDomDec || nth*3*ci_block > iGrid.numCells())
+ if (!haveDomDec || numLists*3*ci_block > iGrid.numCells())
{
- ci_block = (iGrid.numCells() + nth - 1)/nth;
+ ci_block = (iGrid.numCells() + numLists - 1)/numLists;
}
- if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.numCells())
+ if (ci_block > 1 && (numLists - 1)*ci_block >= iGrid.numCells())
{
/* Some threads have no work. Although reducing the block size
* does not decrease the block count on the first few threads,
}
set_icell_bb(iGrid, ci, shx, shy, shz,
- nbl->work);
+ nbl->work.get());
icell_set_x(cell0_i+ci, shx, shy, shz,
nbat->xstride, nbat->x().data(),
kernelType,
- nbl->work);
+ nbl->work.get());
for (int cx = cxf; cx <= cxl; cx++)
{
{
fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
- print_nblist_statistics(debug, nbl, pairSearch, rlist);
+ print_nblist_statistics(debug, *nbl, pairSearch, rlist);
if (haveFep)
{
* to reduction of parts of the force buffer that could be avoided. But since
* the original lists are quite balanced, this will only give minor overhead.
*/
-static void rebalanceSimpleLists(int numLists,
- NbnxnPairlistCpu * const * const srcSet,
- NbnxnPairlistCpu **destSet,
- gmx::ArrayRef<PairsearchWork> searchWork)
+static void rebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> srcSet,
+ gmx::ArrayRef<NbnxnPairlistCpu> destSet,
+ gmx::ArrayRef<PairsearchWork> searchWork)
{
int ncjTotal = 0;
- for (int s = 0; s < numLists; s++)
+ for (auto &src : srcSet)
{
- ncjTotal += srcSet[s]->ncjInUse;
+ ncjTotal += src.ncjInUse;
}
- int ncjTarget = (ncjTotal + numLists - 1)/numLists;
+ const int numLists = srcSet.ssize();
+ const int ncjTarget = (ncjTotal + numLists - 1)/numLists;
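+    /* Each thread t copies the contiguous global j-cluster range
+     * [ncjTarget*t, ncjTarget*(t + 1)) into its destination list.
+     */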
#pragma omp parallel num_threads(numLists)
{
int cjEnd = ncjTarget*(t + 1);
/* The destination pair-list for task/thread t */
- NbnxnPairlistCpu *dest = destSet[t];
+ NbnxnPairlistCpu &dest = destSet[t];
- clear_pairlist(dest);
- dest->na_cj = srcSet[0]->na_cj;
+ clear_pairlist(&dest);
+ dest.na_cj = srcSet[0].na_cj;
        /* Note that the flags in the work struct (still) contain flags
         * for all entries that are present in srcSet[t].
         */
gmx_bitmask_t *flag = searchWork[t].buffer_flags.flag;
- int iFlagShift = getBufferFlagShift(dest->na_ci);
- int jFlagShift = getBufferFlagShift(dest->na_cj);
+ int iFlagShift = getBufferFlagShift(dest.na_ci);
+ int jFlagShift = getBufferFlagShift(dest.na_cj);
int cjGlobal = 0;
for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
{
- const NbnxnPairlistCpu *src = srcSet[s];
+ const NbnxnPairlistCpu *src = &srcSet[s];
if (cjGlobal + src->ncjInUse > cjStart)
{
{
copySelectedListRange
<true>
- (srcCi, src, dest,
+ (srcCi, src, &dest,
flag, iFlagShift, jFlagShift, t);
}
else
copySelectedListRange
<false>
(srcCi, src,
- dest, flag, iFlagShift, jFlagShift, t);
+ &dest, flag, iFlagShift, jFlagShift, t);
}
}
cjGlobal += ncj;
}
}
- dest->ncjInUse = dest->cj.size();
+ dest.ncjInUse = dest.cj.size();
}
#ifndef NDEBUG
int ncjTotalNew = 0;
- for (int s = 0; s < numLists; s++)
+ for (auto &dest : destSet)
{
- ncjTotalNew += destSet[s]->ncjInUse;
+ ncjTotalNew += dest.ncjInUse;
}
GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
#endif
}
/* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
-static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
+static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
{
- int numLists = listSet->nnbl;
+ int numLists = lists.ssize();
int ncjMax = 0;
int ncjTotal = 0;
for (int s = 0; s < numLists; s++)
{
- ncjMax = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
- ncjTotal += listSet->nbl[s]->ncjInUse;
+ ncjMax = std::max(ncjMax, lists[s].ncjInUse);
+ ncjTotal += lists[s].ncjInUse;
}
if (debug)
{
std::swap(nbl->sci, work.sci_sort);
}
+//! Prepares CPU lists produced by the search for dynamic pruning
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);
+
void
-nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality iLocality,
- PairSearch *pairSearch,
- nbnxn_atomdata_t *nbat,
- const t_blocka *excl,
- const Nbnxm::KernelType kernelType,
- const int64_t step,
- t_nrnb *nrnb)
+PairlistSet::constructPairlists(PairSearch *pairSearch,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ const Nbnxm::KernelType kernelType,
+ const int minimumIlistCountForGpuBalancing,
+ t_nrnb *nrnb)
{
- nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality);
-
- const real rlist = nbl_list->params.rlistOuter;
+ const real rlist = params_.rlistOuter;
int nsubpair_target;
float nsubpair_tot_est;
- int nnbl;
int ci_block;
- gmx_bool CombineNBLists;
gmx_bool progBal;
int np_tot, np_noq, np_hlj, nap;
- nnbl = nbl_list->nnbl;
- CombineNBLists = nbl_list->bCombined;
+ const int numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());
if (debug)
{
- fprintf(debug, "ns making %d nblists\n", nnbl);
+ fprintf(debug, "ns making %d nblists\n", numLists);
}
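+    /* With more than one output buffer, buffer flags track which force
+     * buffer blocks each thread writes, so the final force reduction can
+     * skip untouched blocks.
+     */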
nbat->bUseBufferFlags = (nbat->out.size() > 1);
/* We should re-init the flags before making the first list */
- if (nbat->bUseBufferFlags && iLocality == InteractionLocality::Local)
+ if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
{
init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
}
int nzi;
- if (iLocality == InteractionLocality::Local)
+ if (locality_ == InteractionLocality::Local)
{
/* Only zone (grid) 0 vs 0 */
nzi = 1;
nzi = pairSearch->domainSetup().zones->nizone;
}
- if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0)
+ if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
{
- get_nsubpair_target(*pairSearch, iLocality, rlist, minimumIlistCountForGpuBalancing_,
+ get_nsubpair_target(*pairSearch, locality_, rlist, minimumIlistCountForGpuBalancing,
&nsubpair_target, &nsubpair_tot_est);
}
else
}
/* Clear all pair-lists */
- for (int th = 0; th < nnbl; th++)
+ for (int th = 0; th < numLists; th++)
{
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
- clear_pairlist(nbl_list->nbl[th]);
+ clear_pairlist(&cpuLists_[th]);
}
else
{
- clear_pairlist(nbl_list->nblGpu[th]);
+ clear_pairlist(&gpuLists_[th]);
}
- if (pairSearch->gridSet().haveFep())
+ if (params_.haveFep)
{
- clear_pairlist_fep(nbl_list->nbl_fep[th]);
+ clear_pairlist_fep(fepLists_[th]);
}
}
int zj0;
int zj1;
- if (iLocality == InteractionLocality::Local)
+ if (locality_ == InteractionLocality::Local)
{
zj0 = 0;
zj1 = 1;
pairSearch->cycleCounting_.start(PairSearch::enbsCCsearch);
- ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, nnbl);
+ ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, numLists);
/* With GPU: generate progressively smaller lists for
* load balancing for local only or non-local with 2 zones.
*/
- progBal = (iLocality == InteractionLocality::Local || ddZones->n <= 2);
+ progBal = (locality_ == InteractionLocality::Local || ddZones->n <= 2);
-#pragma omp parallel for num_threads(nnbl) schedule(static)
- for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+ for (int th = 0; th < numLists; th++)
{
try
{
init_buffer_flags(&pairSearch->work()[th].buffer_flags, nbat->numAtoms());
}
- if (CombineNBLists && th > 0)
+ if (combineLists_ && th > 0)
{
- GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
+ GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
- clear_pairlist(nbl_list->nblGpu[th]);
+ clear_pairlist(&gpuLists_[th]);
}
- auto &searchWork = pairSearch->work()[th];
+ PairsearchWork *searchWork = &pairSearch->work()[th];
+
+ searchWork->cycleCounter.start();
- searchWork.cycleCounter.start();
+ t_nblist *fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th]);
- /* Divide the i super cell equally over the nblists */
- if (nbl_list->bSimple)
+ /* Divide the i cells equally over the pairlists */
+ if (isCpuType_)
{
nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
- &searchWork, nbat, *excl,
+ searchWork, nbat, *excl,
rlist,
kernelType,
ci_block,
nbat->bUseBufferFlags,
nsubpair_target,
progBal, nsubpair_tot_est,
- th, nnbl,
- nbl_list->nbl[th],
- nbl_list->nbl_fep[th]);
+ th, numLists,
+ &cpuLists_[th],
+ fepListPtr);
}
else
{
nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
- &searchWork, nbat, *excl,
+ searchWork, nbat, *excl,
rlist,
kernelType,
ci_block,
nbat->bUseBufferFlags,
nsubpair_target,
progBal, nsubpair_tot_est,
- th, nnbl,
- nbl_list->nblGpu[th],
- nbl_list->nbl_fep[th]);
+ th, numLists,
+ &gpuLists_[th],
+ fepListPtr);
}
- searchWork.cycleCounter.stop();
+ searchWork->cycleCounter.stop();
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
np_tot = 0;
np_noq = 0;
np_hlj = 0;
- for (int th = 0; th < nnbl; th++)
+ for (int th = 0; th < numLists; th++)
{
inc_nrnb(nrnb, eNR_NBNXN_DIST2, pairSearch->work()[th].ndistc);
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
- NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
- np_tot += nbl->cj.size();
- np_noq += nbl->work->ncj_noq;
- np_hlj += nbl->work->ncj_hlj;
+ const NbnxnPairlistCpu &nbl = cpuLists_[th];
+ np_tot += nbl.cj.size();
+ np_noq += nbl.work->ncj_noq;
+ np_hlj += nbl.work->ncj_hlj;
}
else
{
- NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
+ const NbnxnPairlistGpu &nbl = gpuLists_[th];
/* This count ignores potential subsequent pair pruning */
- np_tot += nbl->nci_tot;
+ np_tot += nbl.nci_tot;
}
}
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
- nap = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
+ nap = cpuLists_[0].na_ci*cpuLists_[0].na_cj;
}
else
{
- nap = gmx::square(nbl_list->nblGpu[0]->na_ci);
+ nap = gmx::square(gpuLists_[0].na_ci);
}
- nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
- nbl_list->natpair_lj = np_noq*nap;
- nbl_list->natpair_q = np_hlj*nap/2;
+ natpair_ljq_ = (np_tot - np_noq)*nap - np_hlj*nap/2;
+ natpair_lj_ = np_noq*nap;
+ natpair_q_ = np_hlj*nap/2;
- if (CombineNBLists && nnbl > 1)
+ if (combineLists_ && numLists > 1)
{
- GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
- NbnxnPairlistGpu **nbl = nbl_list->nblGpu;
+ GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
pairSearch->cycleCounting_.start(PairSearch::enbsCCcombine);
- combine_nblists(nnbl-1, nbl+1, nbl[0]);
+ combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1),
+ &gpuLists_[0]);
pairSearch->cycleCounting_.stop(PairSearch::enbsCCcombine);
}
}
}
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
- if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
+ if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
{
- rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, pairSearch->work());
+ rebalanceSimpleLists(cpuLists_, cpuListsWork_, pairSearch->work());
- /* Swap the pointer of the sets of pair lists */
- NbnxnPairlistCpu **tmp = nbl_list->nbl;
- nbl_list->nbl = nbl_list->nbl_work;
- nbl_list->nbl_work = tmp;
+ /* Swap the sets of pair lists */
+ cpuLists_.swap(cpuListsWork_);
}
}
else
{
/* Sort the entries on size, large ones first */
- if (CombineNBLists || nnbl == 1)
+ if (combineLists_ || gpuLists_.size() == 1)
{
- sort_sci(nbl_list->nblGpu[0]);
+ sort_sci(&gpuLists_[0]);
}
else
{
-#pragma omp parallel for num_threads(nnbl) schedule(static)
- for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+ for (int th = 0; th < numLists; th++)
{
try
{
- sort_sci(nbl_list->nblGpu[th]);
+ sort_sci(&gpuLists_[th]);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
if (nbat->bUseBufferFlags)
{
- reduce_buffer_flags(*pairSearch, nbl_list->nnbl, &nbat->buffer_flags);
+ reduce_buffer_flags(*pairSearch, numLists, &nbat->buffer_flags);
}
if (pairSearch->gridSet().haveFep())
{
/* Balance the free-energy lists over all the threads */
- balance_fep_lists(pairSearch->work(), nbl_list);
+ balance_fep_lists(fepLists_, pairSearch->work());
}
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
/* This is a fresh list, so not pruned, stored using ci.
* ciOuter is invalid at this point.
*/
- GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
- }
-
- if (iLocality == Nbnxm::InteractionLocality::Local)
- {
- outerListCreationStep_ = step;
- }
- else
- {
- GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
- "Outer list should be created at the same step as the inner list");
- }
-
- /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
- if (iLocality == InteractionLocality::Local)
- {
- pairSearch->cycleCounting_.searchCount_++;
- }
- if (pairSearch->cycleCounting_.recordCycles_ &&
- (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
- pairSearch->cycleCounting_.searchCount_ % 100 == 0)
- {
- pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
+ GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
}
    /* If we have more than one list, they either got rebalanced (CPU)
     * or combined (GPU), so we should dump the final result to debug.
     */
- if (debug && nbl_list->nnbl > 1)
+ if (debug)
{
- if (nbl_list->bSimple)
+ if (isCpuType_ && cpuLists_.size() > 1)
{
- for (int t = 0; t < nbl_list->nnbl; t++)
+ for (auto &cpuList : cpuLists_)
{
- print_nblist_statistics(debug, nbl_list->nbl[t], *pairSearch, rlist);
+ print_nblist_statistics(debug, cpuList, *pairSearch, rlist);
}
}
- else
+ else if (!isCpuType_ && gpuLists_.size() > 1)
{
- print_nblist_statistics(debug, nbl_list->nblGpu[0], *pairSearch, rlist);
+ print_nblist_statistics(debug, gpuLists_[0], *pairSearch, rlist);
}
}
{
if (gmx_debug_at)
{
- if (nbl_list->bSimple)
+ if (isCpuType_)
{
- for (int t = 0; t < nbl_list->nnbl; t++)
+ for (auto &cpuList : cpuLists_)
{
- print_nblist_ci_cj(debug, nbl_list->nbl[t]);
+ print_nblist_ci_cj(debug, cpuList);
}
}
else
{
- print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
+ print_nblist_sci_cj(debug, gpuLists_[0]);
}
}
if (nbat->bUseBufferFlags)
{
- print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
+ print_reduction_cost(&nbat->buffer_flags, numLists);
}
}
- if (params_.useDynamicPruning && nbl_list->bSimple)
+ if (params_.useDynamicPruning && isCpuType_)
{
- nbnxnPrepareListForDynamicPruning(nbl_list);
+ prepareListsForDynamicPruning(cpuLists_);
+ }
+}
+
+void
+nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality iLocality,
+ PairSearch *pairSearch,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ const Nbnxm::KernelType kernelType,
+ const int64_t step,
+ t_nrnb *nrnb)
+{
+ pairlistSet(iLocality).constructPairlists(pairSearch, nbat, excl, kernelType, minimumIlistCountForGpuBalancing_, nrnb);
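+    /* The per-locality PairlistSet builds its own lists; the step
+     * bookkeeping and cycle accounting below remain at this level.
+     */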
+
+ if (iLocality == Nbnxm::InteractionLocality::Local)
+ {
+ outerListCreationStep_ = step;
+ }
+ else
+ {
+ GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
+ "Outer list should be created at the same step as the inner list");
+ }
+
+ /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+ if (iLocality == InteractionLocality::Local)
+ {
+ pairSearch->cycleCounting_.searchCount_++;
+ }
+ if (pairSearch->cycleCounting_.recordCycles_ &&
+ (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
+ pairSearch->cycleCounting_.searchCount_ % 100 == 0)
+ {
+ pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
}
}
* NOTE: The launch overhead is currently not timed separately
*/
Nbnxm::gpu_init_pairlist(gpu_nbv,
- pairlistSets().pairlistSet(iLocality).nblGpu[0],
+ pairlistSets().pairlistSet(iLocality).gpuList(),
iLocality);
}
}
-void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
{
- GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");
-
/* TODO: Restructure the lists so we have actual outer and inner
* list objects so we can set a single pointer instead of
* swapping several pointers.
*/
- for (int i = 0; i < listSet->nnbl; i++)
+ for (auto &list : lists)
{
- NbnxnPairlistCpu &list = *listSet->nbl[i];
-
        /* The search produced a list in ci/cj.
         * Swap the list pointers so that the outer list ends up in
         * ciOuter/cjOuter, which we can then prune to produce an inner
         * list in ci/cj.