Convert nbnxn_pairlist_set_t to class PairlistSet
author    Berk Hess <hess@kth.se>
Fri, 15 Mar 2019 21:10:08 +0000 (22:10 +0100)
committer Berk Hess <hess@kth.se>
Mon, 25 Mar 2019 20:48:37 +0000 (21:48 +0100)
This change is refactoring only.
Two implementation details have changed:
The CPU and GPU pairlist objects are now stored directly in lists
instead of as arrays of pointers to lists. This means that the pairlist
objects are no longer allocated on their respective threads. But since
the lists have buffers to avoid cache pollution and the actual i- and
j-lists are allocated thread-locally, this should not affect performance
significantly.
The free-energy lists are now only allocated when there are perturbed
atoms.
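
For reference, a rough sketch of the new class layout; the member and
accessor names follow the diff below, but the exact declaration lives
in pairlistset.h and the signatures shown here are illustrative only:

    // Illustrative sketch only; the referenced types (NbnxnPairlistCpu,
    // NbnxnPairlistGpu, t_nblist, ...) come from the nbnxm headers.
    class PairlistSet
    {
        public:
            PairlistSet(Nbnxm::InteractionLocality  locality,
                        const NbnxnListParameters  &listParams);

            //! Builds the pair lists for this locality
            void constructPairlists(PairSearch        *pairSearch,
                                    nbnxn_atomdata_t  *nbat,
                                    const t_blocka    *excl,
                                    Nbnxm::KernelType  kernelType,
                                    int                minimumIlistCountForGpuBalancing,
                                    t_nrnb            *nrnb);

            //! CPU lists, stored by value, one per OpenMP thread
            gmx::ArrayRef<const NbnxnPairlistCpu> cpuLists() const;
            //! The GPU list; only list 0 is used on the GPU
            const NbnxnPairlistGpu               *gpuList()  const;
            //! Free-energy lists; empty unless params_.haveFep is set
            gmx::ArrayRef<t_nblist const * const> fepLists() const;

            // Pair counts for flop accounting, still accessed directly for now
            int natpair_ljq_ = 0;
            int natpair_lj_  = 0;
            int natpair_q_   = 0;

        private:
            Nbnxm::InteractionLocality    locality_;
            const NbnxnListParameters    &params_;
            std::vector<NbnxnPairlistCpu> cpuLists_;     // straight list, no pointers
            std::vector<NbnxnPairlistCpu> cpuListsWork_; // work lists for rebalancing
            std::vector<NbnxnPairlistGpu> gpuLists_;     // list 0 pinned, the others only
                                                         // used for parallel construction
            std::vector<t_nblist *>       fepLists_;     // allocated only with perturbed atoms
    };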

Change-Id: Ifc76608215518edfc61c0ca8eb71ea2a928cf57c

src/gromacs/nbnxm/kerneldispatch.cpp
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_setup.cpp
src/gromacs/nbnxm/pairlist.cpp
src/gromacs/nbnxm/pairlist.h
src/gromacs/nbnxm/pairlistset.cpp
src/gromacs/nbnxm/pairlistset.h
src/gromacs/nbnxm/prunekerneldispatch.cpp

src/gromacs/nbnxm/kerneldispatch.cpp
index 31cf946e201a7f84a5fbde6650f2a1f3c4b6a8dc..b676f0c0775154a4397d2a2d63fbb7e602a7a27b 100644 (file)
@@ -51,6 +51,7 @@
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/nbnxm_simd.h"
+#include "gromacs/nbnxm/pairlist.h"
 #include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
 #include "gromacs/simd/simd.h"
 #include "gromacs/utility/gmxassert.h"
@@ -145,7 +146,7 @@ reduceGroupEnergySimdBuffers(int                       numGroups,
  * \param[out]    vVdw          Output buffer for Van der Waals energies
  */
 static void
-nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
+nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
                  const Nbnxm::KernelSetup       &kernelSetup,
                  nbnxn_atomdata_t               *nbat,
                  const interaction_const_t      &ic,
@@ -234,12 +235,11 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
         GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
     }
 
-    int                        nnbl = pairlistSet.nnbl;
-    NbnxnPairlistCpu * const * nbl  = pairlistSet.nbl;
+    gmx::ArrayRef<const NbnxnPairlistCpu> pairlists = pairlistSet.cpuLists();
 
-    int gmx_unused             nthreads = gmx_omp_nthreads_get(emntNonbonded);
+    int gmx_unused                        nthreads = gmx_omp_nthreads_get(emntNonbonded);
 #pragma omp parallel for schedule(static) num_threads(nthreads)
-    for (int nb = 0; nb < nnbl; nb++)
+    for (int nb = 0; nb < pairlists.ssize(); nb++)
     {
         // Presently, the kernels do not call C++ code that can throw,
         // so no need for a try/catch pair in this OpenMP region.
@@ -251,7 +251,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
         }
 
         real *fshift_p;
-        if ((forceFlags & GMX_FORCE_VIRIAL) && nnbl == 1)
+        if ((forceFlags & GMX_FORCE_VIRIAL) && pairlists.ssize() == 1)
         {
             fshift_p = fshift;
         }
@@ -265,13 +265,16 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
             }
         }
 
+        // TODO: Change to reference
+        const NbnxnPairlistCpu *pairlist = &pairlists[nb];
+
         if (!(forceFlags & GMX_FORCE_ENERGY))
         {
             /* Don't calculate energies */
             switch (kernelSetup.kernelType)
             {
                 case Nbnxm::KernelType::Cpu4x4_PlainC:
-                    nbnxn_kernel_noener_ref[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxn_kernel_noener_ref[coulkt][vdwkt](pairlist, nbat,
                                                            &ic,
                                                            shiftVectors,
                                                            out->f.data(),
@@ -279,7 +282,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
                     break;
 #ifdef GMX_NBNXN_SIMD_2XNN
                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
-                    nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
                                                                  &ic,
                                                                  shiftVectors,
                                                                  out->f.data(),
@@ -288,7 +291,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
 #endif
 #ifdef GMX_NBNXN_SIMD_4XN
                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
-                    nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
                                                                 &ic,
                                                                 shiftVectors,
                                                                 out->f.data(),
@@ -308,7 +311,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
             switch (kernelSetup.kernelType)
             {
                 case Nbnxm::KernelType::Cpu4x4_PlainC:
-                    nbnxn_kernel_ener_ref[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxn_kernel_ener_ref[coulkt][vdwkt](pairlist, nbat,
                                                          &ic,
                                                          shiftVectors,
                                                          out->f.data(),
@@ -318,7 +321,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
                     break;
 #ifdef GMX_NBNXN_SIMD_2XNN
                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
-                    nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
                                                                &ic,
                                                                shiftVectors,
                                                                out->f.data(),
@@ -329,7 +332,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
 #endif
 #ifdef GMX_NBNXN_SIMD_4XN
                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
-                    nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
                                                               &ic,
                                                               shiftVectors,
                                                               out->f.data(),
@@ -353,7 +356,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
             {
                 case Nbnxm::KernelType::Cpu4x4_PlainC:
                     unrollj = c_nbnxnCpuIClusterSize;
-                    nbnxn_kernel_energrp_ref[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxn_kernel_energrp_ref[coulkt][vdwkt](pairlist, nbat,
                                                             &ic,
                                                             shiftVectors,
                                                             out->f.data(),
@@ -364,7 +367,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
 #ifdef GMX_NBNXN_SIMD_2XNN
                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
                     unrollj = GMX_SIMD_REAL_WIDTH/2;
-                    nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
                                                                   &ic,
                                                                   shiftVectors,
                                                                   out->f.data(),
@@ -376,7 +379,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
 #ifdef GMX_NBNXN_SIMD_4XN
                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
                     unrollj = GMX_SIMD_REAL_WIDTH;
-                    nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
+                    nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](pairlist, nbat,
                                                                  &ic,
                                                                  shiftVectors,
                                                                  out->f.data(),
@@ -417,12 +420,12 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
 
     if (forceFlags & GMX_FORCE_ENERGY)
     {
-        reduce_energies_over_lists(nbat, nnbl, vVdw, vCoulomb);
+        reduce_energies_over_lists(nbat, pairlists.ssize(), vVdw, vCoulomb);
     }
 }
 
 static void accountFlops(t_nrnb                           *nrnb,
-                         const nbnxn_pairlist_set_t       &pairlistSet,
+                         const PairlistSet                &pairlistSet,
                          const nonbonded_verlet_t         &nbv,
                          const interaction_const_t        &ic,
                          const int                         forceFlags)
@@ -452,31 +455,31 @@ static void accountFlops(t_nrnb                           *nrnb,
     }
 
     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
-             pairlistSet.natpair_ljq);
+             pairlistSet.natpair_ljq_);
     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
-             pairlistSet.natpair_lj);
+             pairlistSet.natpair_lj_);
     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
-             pairlistSet.natpair_q);
+             pairlistSet.natpair_q_);
 
     const bool calcEnergy = ((forceFlags & GMX_FORCE_ENERGY) != 0);
     if (ic.vdw_modifier == eintmodFORCESWITCH)
     {
         /* We add up the switch cost separately */
         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW + (calcEnergy ? 1 : 0),
-                 pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
+                 pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
     }
     if (ic.vdw_modifier == eintmodPOTSWITCH)
     {
         /* We add up the switch cost separately */
         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW + (calcEnergy ? 1 : 0),
-                 pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
+                 pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
     }
     if (ic.vdwtype == evdwPME)
     {
         /* We add up the LJ Ewald cost separately */
         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD + (calcEnergy ? 1 : 0),
-                 pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
+                 pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
     }
 }
 
@@ -489,7 +492,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
                                             gmx_enerdata_t            *enerd,
                                             t_nrnb                    *nrnb)
 {
-    const nbnxn_pairlist_set_t &pairlistSet = pairlistSets().pairlistSet(iLocality);
+    const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
 
     switch (kernelSetup().kernelType)
     {
@@ -515,7 +518,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
             break;
 
         case Nbnxm::KernelType::Cpu8x8x8_PlainC:
-            nbnxn_kernel_gpu_ref(pairlistSet.nblGpu[0],
+            nbnxn_kernel_gpu_ref(pairlistSet.gpuList(),
                                  nbat.get(), &ic,
                                  fr->shift_vec,
                                  forceFlags,
@@ -548,10 +551,10 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocali
                                              const int                   forceFlags,
                                              t_nrnb                     *nrnb)
 {
-    const gmx::ArrayRef<t_nblist const * const > nbl_fep = pairlistSets().pairlistSet(iLocality).nbl_fep;
+    const gmx::ArrayRef<t_nblist const * const > nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
 
     /* When the first list is empty, all are empty and there is nothing to do */
-    if (nbl_fep[0]->nrj == 0)
+    if (!pairlistSets().params().haveFep || nbl_fep[0]->nrj == 0)
     {
         return;
     }
src/gromacs/nbnxm/nbnxm.cpp
index 0aeb3d8d63164f07f1f99c0f4d8eb157b36300cc..270512b266978c9a54e05bf0b4e23207afd8dc64 100644 (file)
@@ -154,8 +154,9 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality  locality
                                              gmx_wallcycle             *wcycle)
 {
     /* Skip the non-local reduction if there was no non-local work to do */
-    if (locality == Nbnxm::AtomLocality::NonLocal &&
-        pairlistSets().pairlistSet(Nbnxm::InteractionLocality::NonLocal).nblGpu[0]->sci.empty())
+    if (!pairlistIsSimple() &&
+        locality == Nbnxm::AtomLocality::NonLocal &&
+        pairlistSets().pairlistSet(Nbnxm::InteractionLocality::NonLocal).gpuList()->sci.empty())
     {
         return;
     }
src/gromacs/nbnxm/nbnxm.h
index ad9a79077ebc9c1c90b1825d35ec34c89c2ec872..4c84070ddedc6d14684bc464137e0bbd8451853c 100644 (file)
@@ -117,10 +117,10 @@ struct gmx_hw_info_t;
 struct gmx_mtop_t;
 struct gmx_wallcycle;
 struct interaction_const_t;
-struct nbnxn_pairlist_set_t;
 struct nonbonded_verlet_t;
 enum class PairlistType;
 class PairSearch;
+class PairlistSet;
 struct t_blocka;
 struct t_commrec;
 struct t_lambda;
@@ -150,10 +150,12 @@ struct NbnxnListParameters
     /*! \brief Constructor producing a struct with dynamic pruning disabled
      */
     NbnxnListParameters(Nbnxm::KernelType kernelType,
+                        bool              haveFep,
                         real              rlist,
                         bool              haveMultipleDomains);
 
     PairlistType pairlistType;           //!< The type of cluster-pair list
+    bool         haveFep;                //!< Tells whether we have perturbed interactions
     real         rlistOuter;             //!< Cut-off of the larger, outer pair-list
     real         rlistInner;             //!< Cut-off of the smaller, inner pair-list
     bool         haveMultipleDomains;    //!< True when using DD with multiple domains
@@ -224,7 +226,7 @@ enum {
  */
 void nbnxn_make_pairlist(nonbonded_verlet_t         *nbv,
                          Nbnxm::InteractionLocality  iLocality,
-                         nbnxn_pairlist_set_t       *pairlistSet,
+                         PairlistSet                *pairlistSet,
                          const t_blocka             *excl,
                          int64_t                     step,
                          t_nrnb                     *nrnb);
@@ -235,7 +237,7 @@ void nbnxn_make_pairlist(nonbonded_verlet_t         *nbv,
  * pairs beyond the pairlist inner radius and writes the result to a list that is
  * to be consumed by the non-bonded kernel.
  */
-void NbnxnDispatchPruneKernel(nbnxn_pairlist_set_t   *pairlistSet,
+void NbnxnDispatchPruneKernel(PairlistSet            *pairlistSet,
                               Nbnxm::KernelType       kernelType,
                               const nbnxn_atomdata_t *nbat,
                               const rvec             *shift_vec);
@@ -306,7 +308,7 @@ struct nonbonded_verlet_t
                 }
 
                 //! Returns the pair-list set for the given locality
-                const nbnxn_pairlist_set_t &pairlistSet(Nbnxm::InteractionLocality iLocality) const
+                const PairlistSet &pairlistSet(Nbnxm::InteractionLocality iLocality) const
                 {
                     if (iLocality == Nbnxm::InteractionLocality::Local)
                     {
@@ -321,7 +323,7 @@ struct nonbonded_verlet_t
 
             private:
                 //! Returns the pair-list set for the given locality
-                nbnxn_pairlist_set_t &pairlistSet(Nbnxm::InteractionLocality iLocality)
+                PairlistSet &pairlistSet(Nbnxm::InteractionLocality iLocality)
                 {
                     if (iLocality == Nbnxm::InteractionLocality::Local)
                     {
@@ -339,9 +341,9 @@ struct nonbonded_verlet_t
                 //! Pair list balancing parameter for use with GPU
                 int                                   minimumIlistCountForGpuBalancing_;
                 //! Local pairlist set
-                std::unique_ptr<nbnxn_pairlist_set_t> localSet_;
+                std::unique_ptr<PairlistSet>          localSet_;
                 //! Non-local pairlist set
-                std::unique_ptr<nbnxn_pairlist_set_t> nonlocalSet_;
+                std::unique_ptr<PairlistSet>          nonlocalSet_;
                 //! MD step at with the outer lists in pairlistSets_ were created
                 int64_t                               outerListCreationStep_;
         };
src/gromacs/nbnxm/nbnxm_setup.cpp
index 023f78c6852614ac3af89177907eb45d53144cea..d88ca578724b15cc2ee2efa1cd5a05b439024ba6 100644 (file)
@@ -299,11 +299,13 @@ nonbonded_verlet_t::PairlistSets::PairlistSets(const NbnxnListParameters &listPa
     params_(listParams),
     minimumIlistCountForGpuBalancing_(minimumIlistCountForGpuBalancing)
 {
-    localSet_ = std::make_unique<nbnxn_pairlist_set_t>(params_);
+    localSet_ = std::make_unique<PairlistSet>(Nbnxm::InteractionLocality::Local,
+                                              params_);
 
     if (haveMultipleDomains)
     {
-        nonlocalSet_ = std::make_unique<nbnxn_pairlist_set_t>(params_);
+        nonlocalSet_ = std::make_unique<PairlistSet>(Nbnxm::InteractionLocality::NonLocal,
+                                                     params_);
     }
 }
 
@@ -382,6 +384,7 @@ init_nb_verlet(const gmx::MDLogger     &mdlog,
     const bool          haveMultipleDomains = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1);
 
     NbnxnListParameters listParams(kernelSetup.kernelType,
+                                   bFEP_NonBonded,
                                    ir->rlist,
                                    havePPDomainDecomposition(cr));
 
src/gromacs/nbnxm/pairlist.cpp
index 8469c409bad92a82f97dcee530f45bec10b19ca2..e1e165839c01fcf83504cdb9a7c8d7bd0429a29a 100644 (file)
@@ -648,19 +648,14 @@ static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
     return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
 }
 
-/* Initializes a single NbnxnPairlistCpu data structure */
-static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
+NbnxnPairlistCpu::NbnxnPairlistCpu() :
+    na_ci(c_nbnxnCpuIClusterSize),
+    na_cj(0),
+    rlist(0),
+    ncjInUse(0),
+    nci_tot(0),
+    work(std::make_unique<NbnxnPairlistCpuWork>())
 {
-    nbl->na_ci       = c_nbnxnCpuIClusterSize;
-    nbl->na_cj       = 0;
-    nbl->ci.clear();
-    nbl->ciOuter.clear();
-    nbl->ncjInUse    = 0;
-    nbl->cj.clear();
-    nbl->cjOuter.clear();
-    nbl->nci_tot     = 0;
-
-    nbl->work        = new NbnxnPairlistCpuWork();
 }
 
 NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
@@ -671,7 +666,8 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
     sci({}, {pinningPolicy}),
     cj4({}, {pinningPolicy}),
     excl({}, {pinningPolicy}),
-    nci_tot(0)
+    nci_tot(0),
+    work(std::make_unique<NbnxnPairlistGpuWork>())
 {
     static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
                   "The search code assumes that the a super-cluster matches a search grid cell");
@@ -683,79 +679,76 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
 
     // We always want a first entry without any exclusions
     excl.resize(1);
-
-    work = new NbnxnPairlistGpuWork();
 }
 
-void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list)
+// TODO: Move to pairlistset.cpp
+PairlistSet::PairlistSet(const Nbnxm::InteractionLocality  locality,
+                         const NbnxnListParameters        &listParams) :
+    locality_(locality),
+    params_(listParams)
 {
-    nbl_list->bSimple   =
-        (nbl_list->params.pairlistType == PairlistType::Simple4x2 ||
-         nbl_list->params.pairlistType == PairlistType::Simple4x4 ||
-         nbl_list->params.pairlistType == PairlistType::Simple4x8);
+    isCpuType_ =
+        (params_.pairlistType == PairlistType::Simple4x2 ||
+         params_.pairlistType == PairlistType::Simple4x4 ||
+         params_.pairlistType == PairlistType::Simple4x8);
     // Currently GPU lists are always combined
-    nbl_list->bCombined = !nbl_list->bSimple;
+    combineLists_ = !isCpuType_;
 
-    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
+    const int numLists = gmx_omp_nthreads_get(emntNonbonded);
 
-    if (!nbl_list->bCombined &&
-        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
+    if (!combineLists_ &&
+        numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
     {
         gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
-                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+                  numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
-        snew(nbl_list->nbl, nbl_list->nnbl);
-        if (nbl_list->nnbl > 1)
+        cpuLists_.resize(numLists);
+        if (numLists > 1)
         {
-            snew(nbl_list->nbl_work, nbl_list->nnbl);
+            cpuListsWork_.resize(numLists);
         }
     }
     else
     {
-        snew(nbl_list->nblGpu, nbl_list->nnbl);
+        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+        gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
+        /* Lists 0 to numLists are used for constructing lists in parallel
+         * on the CPU using numLists threads (and then merged into list 0).
+         */
+        for (int i = 1; i < numLists; i++)
+        {
+            gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
+        }
     }
-    nbl_list->nbl_fep.resize(nbl_list->nnbl);
-    /* Execute in order to avoid memory interleaving between threads */
-#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
-    for (int i = 0; i < nbl_list->nnbl; i++)
+    if (params_.haveFep)
     {
-        try
-        {
-            /* Allocate the nblist data structure locally on each thread
-             * to optimize memory access for NUMA architectures.
-             */
-            if (nbl_list->bSimple)
-            {
-                nbl_list->nbl[i] = new NbnxnPairlistCpu();
+        fepLists_.resize(numLists);
 
-                nbnxn_init_pairlist(nbl_list->nbl[i]);
-                if (nbl_list->nnbl > 1)
-                {
-                    nbl_list->nbl_work[i] = new NbnxnPairlistCpu();
-                    nbnxn_init_pairlist(nbl_list->nbl_work[i]);
-                }
-            }
-            else
+        /* Execute in order to avoid memory interleaving between threads */
+#pragma omp parallel for num_threads(numLists) schedule(static)
+        for (int i = 0; i < numLists; i++)
+        {
+            try
             {
-                /* Only list 0 is used on the GPU, use normal allocation for i>0 */
-                auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
-
-                nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
+                /* We used to allocate all normal lists locally on each thread
+                 * as well. The question is if allocating the object on the
+                 * master thread (but all contained list memory thread local)
+                 * impacts performance.
+                 */
+                snew(fepLists_[i], 1);
+                nbnxn_init_pairlist_fep(fepLists_[i]);
             }
-
-            snew(nbl_list->nbl_fep[i], 1);
-            nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }
 
 /* Print statistics of a pair list, used for debug output */
 static void print_nblist_statistics(FILE                   *fp,
-                                    const NbnxnPairlistCpu *nbl,
+                                    const NbnxnPairlistCpu &nbl,
                                     const PairSearch       &pairSearch,
                                     const real              rl)
 {
@@ -763,34 +756,34 @@ static void print_nblist_statistics(FILE                   *fp,
     const Grid::Dimensions &dims = grid.dimensions();
 
     fprintf(fp, "nbl nci %zu ncj %d\n",
-            nbl->ci.size(), nbl->ncjInUse);
+            nbl.ci.size(), nbl.ncjInUse);
     const int    numAtomsJCluster = grid.geometry().numAtomsJCluster;
-    const double numAtomsPerCell  = nbl->ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
+    const double numAtomsPerCell  = nbl.ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
     fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_cj, rl, nbl->ncjInUse, nbl->ncjInUse/static_cast<double>(grid.numCells()),
+            nbl.na_cj, rl, nbl.ncjInUse, nbl.ncjInUse/static_cast<double>(grid.numCells()),
             numAtomsPerCell,
             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 
     fprintf(fp, "nbl average j cell list length %.1f\n",
-            0.25*nbl->ncjInUse/std::max(static_cast<double>(nbl->ci.size()), 1.0));
+            0.25*nbl.ncjInUse/std::max(static_cast<double>(nbl.ci.size()), 1.0));
 
     int cs[SHIFTS] = { 0 };
     int npexcl     = 0;
-    for (const nbnxn_ci_t &ciEntry : nbl->ci)
+    for (const nbnxn_ci_t &ciEntry : nbl.ci)
     {
         cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
             ciEntry.cj_ind_end - ciEntry.cj_ind_start;
 
         int j = ciEntry.cj_ind_start;
         while (j < ciEntry.cj_ind_end &&
-               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+               nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
         {
             npexcl++;
             j++;
         }
     }
     fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
-            nbl->cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl->cj.size()), 1.0));
+            nbl.cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl.cj.size()), 1.0));
     for (int s = 0; s < SHIFTS; s++)
     {
         if (cs[s] > 0)
@@ -802,7 +795,7 @@ static void print_nblist_statistics(FILE                   *fp,
 
 /* Print statistics of a pair lists, used for debug output */
 static void print_nblist_statistics(FILE                   *fp,
-                                    const NbnxnPairlistGpu *nbl,
+                                    const NbnxnPairlistGpu &nbl,
                                     const PairSearch       &pairSearch,
                                     const real              rl)
 {
@@ -810,11 +803,11 @@ static void print_nblist_statistics(FILE                   *fp,
     const Grid::Dimensions &dims = grid.dimensions();
 
     fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
-            nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
+            nbl.sci.size(), nbl.cj4.size(), nbl.nci_tot, nbl.excl.size());
     const int    numAtomsCluster = grid.geometry().numAtomsICluster;
-    const double numAtomsPerCell = nbl->nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
+    const double numAtomsPerCell = nbl.nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
     fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
-            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid.numClusters()),
+            nbl.na_ci, rl, nbl.nci_tot, nbl.nci_tot/static_cast<double>(grid.numClusters()),
             numAtomsPerCell,
             numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
 
@@ -822,7 +815,7 @@ static void print_nblist_statistics(FILE                   *fp,
     double sum_nsp2 = 0;
     int    nsp_max  = 0;
     int    c[c_gpuNumClusterPerCell + 1] = { 0 };
-    for (const nbnxn_sci_t &sci : nbl->sci)
+    for (const nbnxn_sci_t &sci : nbl.sci)
     {
         int nsp = 0;
         for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
@@ -832,7 +825,7 @@ static void print_nblist_statistics(FILE                   *fp,
                 int b = 0;
                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                 {
-                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                     {
                         b++;
                     }
@@ -845,20 +838,20 @@ static void print_nblist_statistics(FILE                   *fp,
         sum_nsp2 += nsp*nsp;
         nsp_max   = std::max(nsp_max, nsp);
     }
-    if (!nbl->sci.empty())
+    if (!nbl.sci.empty())
     {
-        sum_nsp  /= nbl->sci.size();
-        sum_nsp2 /= nbl->sci.size();
+        sum_nsp  /= nbl.sci.size();
+        sum_nsp2 /= nbl.sci.size();
     }
     fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
             sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
 
-    if (!nbl->cj4.empty())
+    if (!nbl.cj4.empty())
     {
         for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
         {
             fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
-                    b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
+                    b, c[b], 100.0*c[b]/size_t {nbl.cj4.size()*c_nbnxnGpuJgroupSize});
         }
     }
 }
@@ -2045,7 +2038,7 @@ static void closeIEntry(NbnxnPairlistCpu    *nbl,
     const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
     if (jlen > 0)
     {
-        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work);
+        sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());
 
         /* The counts below are used for non-bonded pair/flop counts
          * and should therefore match the available kernel setups.
@@ -2605,9 +2598,10 @@ static void get_nsubpair_target(const PairSearch          &pairSearch,
 }
 
 /* Debug list print function */
-static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
+static void print_nblist_ci_cj(FILE                   *fp,
+                               const NbnxnPairlistCpu &nbl)
 {
-    for (const nbnxn_ci_t &ciEntry : nbl->ci)
+    for (const nbnxn_ci_t &ciEntry : nbl.ci)
     {
         fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
                 ciEntry.ci, ciEntry.shift,
@@ -2616,16 +2610,17 @@ static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
         for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
         {
             fprintf(fp, "  cj %5d  imask %x\n",
-                    nbl->cj[j].cj,
-                    nbl->cj[j].excl);
+                    nbl.cj[j].cj,
+                    nbl.cj[j].excl);
         }
     }
 }
 
 /* Debug list print function */
-static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
+static void print_nblist_sci_cj(FILE                   *fp,
+                                const NbnxnPairlistGpu &nbl)
 {
-    for (const nbnxn_sci_t &sci : nbl->sci)
+    for (const nbnxn_sci_t &sci : nbl.sci)
     {
         fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
                 sci.sci, sci.shift,
@@ -2637,11 +2632,11 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
             for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
             {
                 fprintf(fp, "  sj %5d  imask %x\n",
-                        nbl->cj4[j4].cj[j],
-                        nbl->cj4[j4].imei[0].imask);
+                        nbl.cj4[j4].cj[j],
+                        nbl.cj4[j4].imei[0].imask);
                 for (int si = 0; si < c_gpuNumClusterPerCell; si++)
                 {
-                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+                    if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
                     {
                         ncp++;
                     }
@@ -2656,17 +2651,17 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
 }
 
 /* Combine pair lists *nbl generated on multiple threads nblc */
-static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
-                            NbnxnPairlistGpu *nblc)
+static void combine_nblists(gmx::ArrayRef<const NbnxnPairlistGpu>  nbls,
+                            NbnxnPairlistGpu                      *nblc)
 {
     int nsci  = nblc->sci.size();
     int ncj4  = nblc->cj4.size();
     int nexcl = nblc->excl.size();
-    for (int i = 0; i < nnbl; i++)
+    for (auto &nbl : nbls)
     {
-        nsci  += nbl[i]->sci.size();
-        ncj4  += nbl[i]->cj4.size();
-        nexcl += nbl[i]->excl.size();
+        nsci  += nbl.sci.size();
+        ncj4  += nbl.cj4.size();
+        nexcl += nbl.excl.size();
     }
 
     /* Resize with the final, combined size, so we can fill in parallel */
@@ -2683,7 +2678,7 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
 #endif
 
 #pragma omp parallel for num_threads(nthreads) schedule(static)
-    for (int n = 0; n < nnbl; n++)
+    for (int n = 0; n < nbls.ssize(); n++)
     {
         try
         {
@@ -2694,14 +2689,14 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
             int cj4_offset  = ncj4;
             int excl_offset = nexcl;
 
-            for (int i = n; i < nnbl; i++)
+            for (int i = n; i < nbls.ssize(); i++)
             {
-                sci_offset  -= nbl[i]->sci.size();
-                cj4_offset  -= nbl[i]->cj4.size();
-                excl_offset -= nbl[i]->excl.size();
+                sci_offset  -= nbls[i].sci.size();
+                cj4_offset  -= nbls[i].cj4.size();
+                excl_offset -= nbls[i].excl.size();
             }
 
-            const NbnxnPairlistGpu &nbli = *nbl[n];
+            const NbnxnPairlistGpu &nbli = nbls[n];
 
             for (size_t i = 0; i < nbli.sci.size(); i++)
             {
@@ -2725,43 +2720,39 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 
-    for (int n = 0; n < nnbl; n++)
+    for (auto &nbl : nbls)
     {
-        nblc->nci_tot += nbl[n]->nci_tot;
+        nblc->nci_tot += nbl.nci_tot;
     }
 }
 
-static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
-                              nbnxn_pairlist_set_t               *nbl_lists)
+static void balance_fep_lists(gmx::ArrayRef<t_nblist *>      fepLists,
+                              gmx::ArrayRef<PairsearchWork>  work)
 {
-    int       nnbl;
-    int       nri_tot, nrj_tot, nrj_target;
-    int       th_dest;
-    t_nblist *nbld;
+    const int numLists = fepLists.ssize();
 
-    nnbl = nbl_lists->nnbl;
-
-    if (nnbl == 1)
+    if (numLists == 1)
     {
         /* Nothing to balance */
         return;
     }
 
     /* Count the total i-lists and pairs */
-    nri_tot = 0;
-    nrj_tot = 0;
-    for (int th = 0; th < nnbl; th++)
+    int nri_tot = 0;
+    int nrj_tot = 0;
+    for (auto list : fepLists)
     {
-        nri_tot += nbl_lists->nbl_fep[th]->nri;
-        nrj_tot += nbl_lists->nbl_fep[th]->nrj;
+        nri_tot += list->nri;
+        nrj_tot += list->nrj;
     }
 
-    nrj_target = (nrj_tot + nnbl - 1)/nnbl;
+    const int nrj_target = (nrj_tot + numLists - 1)/numLists;
 
-    assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
+    GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
+               "We should have as many work objects as FEP lists");
 
-#pragma omp parallel for schedule(static) num_threads(nnbl)
-    for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for schedule(static) num_threads(numLists)
+    for (int th = 0; th < numLists; th++)
     {
         try
         {
@@ -2788,13 +2779,11 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
     }
 
     /* Loop over the source lists and assign and copy i-entries */
-    th_dest = 0;
-    nbld    = work[th_dest].nbl_fep.get();
-    for (int th = 0; th < nnbl; th++)
+    int       th_dest = 0;
+    t_nblist *nbld    = work[th_dest].nbl_fep.get();
+    for (int th = 0; th < numLists; th++)
     {
-        t_nblist *nbls;
-
-        nbls = nbl_lists->nbl_fep[th];
+        t_nblist *nbls = fepLists[th];
 
         for (int i = 0; i < nbls->nri; i++)
         {
@@ -2806,7 +2795,7 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
             /* Decide if list th_dest is too large and we should procede
              * to the next destination list.
              */
-            if (th_dest+1 < nnbl && nbld->nrj > 0 &&
+            if (th_dest + 1 < numLists && nbld->nrj > 0 &&
                 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
             {
                 th_dest++;
@@ -2829,18 +2818,18 @@ static void balance_fep_lists(gmx::ArrayRef<PairsearchWork>       work,
     }
 
     /* Swap the list pointers */
-    for (int th = 0; th < nnbl; th++)
+    for (int th = 0; th < numLists; th++)
     {
-        t_nblist *nbl_tmp      = work[th].nbl_fep.release();
-        work[th].nbl_fep.reset(nbl_lists->nbl_fep[th]);
-        nbl_lists->nbl_fep[th] = nbl_tmp;
+        t_nblist *nbl_tmp = work[th].nbl_fep.release();
+        work[th].nbl_fep.reset(fepLists[th]);
+        fepLists[th]      = nbl_tmp;
 
         if (debug)
         {
             fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
                     th,
-                    nbl_lists->nbl_fep[th]->nri,
-                    nbl_lists->nbl_fep[th]->nrj);
+                    fepLists[th]->nri,
+                    fepLists[th]->nrj);
         }
     }
 }
@@ -2922,7 +2911,8 @@ static float boundingbox_only_distance2(const Grid::Dimensions &iGridDims,
 }
 
 static int get_ci_block_size(const Grid &iGrid,
-                             gmx_bool bDomDec, int nth)
+                             const bool  haveDomDec,
+                             const int   numLists)
 {
     const int ci_block_enum      = 5;
     const int ci_block_denom     = 11;
@@ -2940,7 +2930,9 @@ static int get_ci_block_size(const Grid &iGrid,
      * zone boundaries with 3D domain decomposition. At the same time
      * the blocks will not become too small.
      */
-    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*nth);
+    GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
+    GMX_ASSERT(numLists > 0, "We need at least one list");
+    ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*numLists);
 
     const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
 
@@ -2953,12 +2945,12 @@ static int get_ci_block_size(const Grid &iGrid,
     /* Without domain decomposition
      * or with less than 3 blocks per task, divide in nth blocks.
      */
-    if (!bDomDec || nth*3*ci_block > iGrid.numCells())
+    if (!haveDomDec || numLists*3*ci_block > iGrid.numCells())
     {
-        ci_block = (iGrid.numCells() + nth - 1)/nth;
+        ci_block = (iGrid.numCells() + numLists - 1)/numLists;
     }
 
-    if (ci_block > 1 && (nth - 1)*ci_block >= iGrid.numCells())
+    if (ci_block > 1 && (numLists - 1)*ci_block >= iGrid.numCells())
     {
         /* Some threads have no work. Although reducing the block size
          * does not decrease the block count on the first few threads,
@@ -3440,12 +3432,12 @@ static void nbnxn_make_pairlist_part(const PairSearch &pairSearch,
                     }
 
                     set_icell_bb(iGrid, ci, shx, shy, shz,
-                                 nbl->work);
+                                 nbl->work.get());
 
                     icell_set_x(cell0_i+ci, shx, shy, shz,
                                 nbat->xstride, nbat->x().data(),
                                 kernelType,
-                                nbl->work);
+                                nbl->work.get());
 
                     for (int cx = cxf; cx <= cxl; cx++)
                     {
@@ -3645,7 +3637,7 @@ static void nbnxn_make_pairlist_part(const PairSearch &pairSearch,
     {
         fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
 
-        print_nblist_statistics(debug, nbl, pairSearch, rlist);
+        print_nblist_statistics(debug, *nbl, pairSearch, rlist);
 
         if (haveFep)
         {
@@ -3763,17 +3755,17 @@ static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict srcCi,
  * to reduction of parts of the force buffer that could be avoided. But since
  * the original lists are quite balanced, this will only give minor overhead.
  */
-static void rebalanceSimpleLists(int                                  numLists,
-                                 NbnxnPairlistCpu * const * const     srcSet,
-                                 NbnxnPairlistCpu                   **destSet,
-                                 gmx::ArrayRef<PairsearchWork>        searchWork)
+static void rebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> srcSet,
+                                 gmx::ArrayRef<NbnxnPairlistCpu>       destSet,
+                                 gmx::ArrayRef<PairsearchWork>         searchWork)
 {
     int ncjTotal = 0;
-    for (int s = 0; s < numLists; s++)
+    for (auto &src : srcSet)
     {
-        ncjTotal += srcSet[s]->ncjInUse;
+        ncjTotal += src.ncjInUse;
     }
-    int ncjTarget = (ncjTotal + numLists - 1)/numLists;
+    const int numLists  = srcSet.ssize();
+    const int ncjTarget = (ncjTotal + numLists - 1)/numLists;
 
 #pragma omp parallel num_threads(numLists)
     {
@@ -3783,23 +3775,23 @@ static void rebalanceSimpleLists(int                                  numLists,
         int cjEnd   = ncjTarget*(t + 1);
 
         /* The destination pair-list for task/thread t */
-        NbnxnPairlistCpu *dest = destSet[t];
+        NbnxnPairlistCpu &dest = destSet[t];
 
-        clear_pairlist(dest);
-        dest->na_cj   = srcSet[0]->na_cj;
+        clear_pairlist(&dest);
+        dest.na_cj = srcSet[0].na_cj;
 
         /* Note that the flags in the work struct (still) contain flags
          * for all entries that are present in srcSet->nbl[t].
          */
         gmx_bitmask_t *flag       = searchWork[t].buffer_flags.flag;
 
-        int            iFlagShift = getBufferFlagShift(dest->na_ci);
-        int            jFlagShift = getBufferFlagShift(dest->na_cj);
+        int            iFlagShift = getBufferFlagShift(dest.na_ci);
+        int            jFlagShift = getBufferFlagShift(dest.na_cj);
 
         int            cjGlobal   = 0;
         for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
         {
-            const NbnxnPairlistCpu *src = srcSet[s];
+            const NbnxnPairlistCpu *src = &srcSet[s];
 
             if (cjGlobal + src->ncjInUse > cjStart)
             {
@@ -3816,7 +3808,7 @@ static void rebalanceSimpleLists(int                                  numLists,
                         {
                             copySelectedListRange
                             <true>
-                                (srcCi, src, dest,
+                                (srcCi, src, &dest,
                                 flag, iFlagShift, jFlagShift, t);
                         }
                         else
@@ -3824,7 +3816,7 @@ static void rebalanceSimpleLists(int                                  numLists,
                             copySelectedListRange
                             <false>
                                 (srcCi, src,
-                                dest, flag, iFlagShift, jFlagShift, t);
+                                &dest, flag, iFlagShift, jFlagShift, t);
                         }
                     }
                     cjGlobal += ncj;
@@ -3836,29 +3828,29 @@ static void rebalanceSimpleLists(int                                  numLists,
             }
         }
 
-        dest->ncjInUse = dest->cj.size();
+        dest.ncjInUse = dest.cj.size();
     }
 
 #ifndef NDEBUG
     int ncjTotalNew = 0;
-    for (int s = 0; s < numLists; s++)
+    for (auto &dest : destSet)
     {
-        ncjTotalNew += destSet[s]->ncjInUse;
+        ncjTotalNew += dest.ncjInUse;
     }
     GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
 #endif
 }
 
 /* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
-static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
+static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
 {
-    int numLists = listSet->nnbl;
+    int numLists = lists.ssize();
     int ncjMax   = 0;
     int ncjTotal = 0;
     for (int s = 0; s < numLists; s++)
     {
-        ncjMax    = std::max(ncjMax, listSet->nbl[s]->ncjInUse);
-        ncjTotal += listSet->nbl[s]->ncjInUse;
+        ncjMax    = std::max(ncjMax, lists[s].ncjInUse);
+        ncjTotal += lists[s].ncjInUse;
     }
     if (debug)
     {
@@ -3932,44 +3924,41 @@ static void sort_sci(NbnxnPairlistGpu *nbl)
     std::swap(nbl->sci, work.sci_sort);
 }
 
+//! Prepares CPU lists produced by the search for dynamic pruning
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);
+
 void
-nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
-                                            PairSearch                *pairSearch,
-                                            nbnxn_atomdata_t          *nbat,
-                                            const t_blocka            *excl,
-                                            const Nbnxm::KernelType    kernelType,
-                                            const int64_t              step,
-                                            t_nrnb                    *nrnb)
+PairlistSet::constructPairlists(PairSearch                *pairSearch,
+                                nbnxn_atomdata_t          *nbat,
+                                const t_blocka            *excl,
+                                const Nbnxm::KernelType    kernelType,
+                                const int                  minimumIlistCountForGpuBalancing,
+                                t_nrnb                    *nrnb)
 {
-    nbnxn_pairlist_set_t *nbl_list = &pairlistSet(iLocality);
-
-    const real            rlist    = nbl_list->params.rlistOuter;
+    const real         rlist    = params_.rlistOuter;
 
     int                nsubpair_target;
     float              nsubpair_tot_est;
-    int                nnbl;
     int                ci_block;
-    gmx_bool           CombineNBLists;
     gmx_bool           progBal;
     int                np_tot, np_noq, np_hlj, nap;
 
-    nnbl            = nbl_list->nnbl;
-    CombineNBLists  = nbl_list->bCombined;
+    const int          numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());
 
     if (debug)
     {
-        fprintf(debug, "ns making %d nblists\n", nnbl);
+        fprintf(debug, "ns making %d nblists\n", numLists);
     }
 
     nbat->bUseBufferFlags = (nbat->out.size() > 1);
     /* We should re-init the flags before making the first list */
-    if (nbat->bUseBufferFlags && iLocality == InteractionLocality::Local)
+    if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
     {
         init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
     }
 
     int nzi;
-    if (iLocality == InteractionLocality::Local)
+    if (locality_ == InteractionLocality::Local)
     {
         /* Only zone (grid) 0 vs 0 */
         nzi = 1;
@@ -3979,9 +3968,9 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
         nzi = pairSearch->domainSetup().zones->nizone;
     }
 
-    if (!nbl_list->bSimple && minimumIlistCountForGpuBalancing_ > 0)
+    if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
     {
-        get_nsubpair_target(*pairSearch, iLocality, rlist, minimumIlistCountForGpuBalancing_,
+        get_nsubpair_target(*pairSearch, locality_, rlist, minimumIlistCountForGpuBalancing,
                             &nsubpair_target, &nsubpair_tot_est);
     }
     else
@@ -3991,20 +3980,20 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
     }
 
     /* Clear all pair-lists */
-    for (int th = 0; th < nnbl; th++)
+    for (int th = 0; th < numLists; th++)
     {
-        if (nbl_list->bSimple)
+        if (isCpuType_)
         {
-            clear_pairlist(nbl_list->nbl[th]);
+            clear_pairlist(&cpuLists_[th]);
         }
         else
         {
-            clear_pairlist(nbl_list->nblGpu[th]);
+            clear_pairlist(&gpuLists_[th]);
         }
 
-        if (pairSearch->gridSet().haveFep())
+        if (params_.haveFep)
         {
-            clear_pairlist_fep(nbl_list->nbl_fep[th]);
+            clear_pairlist_fep(fepLists_[th]);
         }
     }
 
@@ -4016,7 +4005,7 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
         int                 zj0;
         int                 zj1;
-        if (iLocality == InteractionLocality::Local)
+        if (locality_ == InteractionLocality::Local)
         {
             zj0 = 0;
             zj1 = 1;
@@ -4041,15 +4030,15 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
             pairSearch->cycleCounting_.start(PairSearch::enbsCCsearch);
 
-            ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, nnbl);
+            ci_block = get_ci_block_size(iGrid, pairSearch->domainSetup().haveDomDec, numLists);
 
             /* With GPU: generate progressively smaller lists for
              * load balancing for local only or non-local with 2 zones.
              */
-            progBal = (iLocality == InteractionLocality::Local || ddZones->n <= 2);
+            progBal = (locality_ == InteractionLocality::Local || ddZones->n <= 2);
 
-#pragma omp parallel for num_threads(nnbl) schedule(static)
-            for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+            for (int th = 0; th < numLists; th++)
             {
                 try
                 {
@@ -4061,48 +4050,50 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
                         init_buffer_flags(&pairSearch->work()[th].buffer_flags, nbat->numAtoms());
                     }
 
-                    if (CombineNBLists && th > 0)
+                    if (combineLists_ && th > 0)
                     {
-                        GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
+                        GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
 
-                        clear_pairlist(nbl_list->nblGpu[th]);
+                        clear_pairlist(&gpuLists_[th]);
                     }
 
-                    auto &searchWork = pairSearch->work()[th];
+                    PairsearchWork *searchWork = &pairSearch->work()[th];
+
+                    searchWork->cycleCounter.start();
 
-                    searchWork.cycleCounter.start();
+                    t_nblist *fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th]);
 
-                    /* Divide the i super cell equally over the nblists */
-                    if (nbl_list->bSimple)
+                    /* Divide the i cells equally over the pairlists */
+                    if (isCpuType_)
                     {
                         nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
-                                                 &searchWork, nbat, *excl,
+                                                 searchWork, nbat, *excl,
                                                  rlist,
                                                  kernelType,
                                                  ci_block,
                                                  nbat->bUseBufferFlags,
                                                  nsubpair_target,
                                                  progBal, nsubpair_tot_est,
-                                                 th, nnbl,
-                                                 nbl_list->nbl[th],
-                                                 nbl_list->nbl_fep[th]);
+                                                 th, numLists,
+                                                 &cpuLists_[th],
+                                                 fepListPtr);
                     }
                     else
                     {
                         nbnxn_make_pairlist_part(*pairSearch, iGrid, jGrid,
-                                                 &searchWork, nbat, *excl,
+                                                 searchWork, nbat, *excl,
                                                  rlist,
                                                  kernelType,
                                                  ci_block,
                                                  nbat->bUseBufferFlags,
                                                  nsubpair_target,
                                                  progBal, nsubpair_tot_est,
-                                                 th, nnbl,
-                                                 nbl_list->nblGpu[th],
-                                                 nbl_list->nbl_fep[th]);
+                                                 th, numLists,
+                                                 &gpuLists_[th],
+                                                 fepListPtr);
                     }
 
-                    searchWork.cycleCounter.stop();
+                    searchWork->cycleCounter.stop();
                 }
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
@@ -4111,77 +4102,75 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
             np_tot = 0;
             np_noq = 0;
             np_hlj = 0;
-            for (int th = 0; th < nnbl; th++)
+            for (int th = 0; th < numLists; th++)
             {
                 inc_nrnb(nrnb, eNR_NBNXN_DIST2, pairSearch->work()[th].ndistc);
 
-                if (nbl_list->bSimple)
+                if (isCpuType_)
                 {
-                    NbnxnPairlistCpu *nbl = nbl_list->nbl[th];
-                    np_tot += nbl->cj.size();
-                    np_noq += nbl->work->ncj_noq;
-                    np_hlj += nbl->work->ncj_hlj;
+                    const NbnxnPairlistCpu &nbl = cpuLists_[th];
+                    np_tot += nbl.cj.size();
+                    np_noq += nbl.work->ncj_noq;
+                    np_hlj += nbl.work->ncj_hlj;
                 }
                 else
                 {
-                    NbnxnPairlistGpu *nbl = nbl_list->nblGpu[th];
+                    const NbnxnPairlistGpu &nbl = gpuLists_[th];
                     /* This count ignores potential subsequent pair pruning */
-                    np_tot += nbl->nci_tot;
+                    np_tot += nbl.nci_tot;
                 }
             }
-            if (nbl_list->bSimple)
+            if (isCpuType_)
             {
-                nap               = nbl_list->nbl[0]->na_ci*nbl_list->nbl[0]->na_cj;
+                nap      = cpuLists_[0].na_ci*cpuLists_[0].na_cj;
             }
             else
             {
-                nap               = gmx::square(nbl_list->nblGpu[0]->na_ci);
+                nap      = gmx::square(gpuLists_[0].na_ci);
             }
-            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
-            nbl_list->natpair_lj  = np_noq*nap;
-            nbl_list->natpair_q   = np_hlj*nap/2;
+            natpair_ljq_ = (np_tot - np_noq)*nap - np_hlj*nap/2;
+            natpair_lj_  = np_noq*nap;
+            natpair_q_   = np_hlj*nap/2;
 
-            if (CombineNBLists && nnbl > 1)
+            if (combineLists_ && numLists > 1)
             {
-                GMX_ASSERT(!nbl_list->bSimple, "Can only combine GPU lists");
-                NbnxnPairlistGpu **nbl = nbl_list->nblGpu;
+                GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
 
                 pairSearch->cycleCounting_.start(PairSearch::enbsCCcombine);
 
-                combine_nblists(nnbl-1, nbl+1, nbl[0]);
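+                /* Merge pairlists 1..numLists-1 into the first pairlist */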
+                combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1),
+                                &gpuLists_[0]);
 
                 pairSearch->cycleCounting_.stop(PairSearch::enbsCCcombine);
             }
         }
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
-        if (nnbl > 1 && checkRebalanceSimpleLists(nbl_list))
+        if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
         {
-            rebalanceSimpleLists(nbl_list->nnbl, nbl_list->nbl, nbl_list->nbl_work, pairSearch->work());
+            rebalanceSimpleLists(cpuLists_, cpuListsWork_, pairSearch->work());
 
-            /* Swap the pointer of the sets of pair lists */
-            NbnxnPairlistCpu **tmp = nbl_list->nbl;
-            nbl_list->nbl          = nbl_list->nbl_work;
-            nbl_list->nbl_work     = tmp;
+            /* Swap the sets of pair lists */
+            cpuLists_.swap(cpuListsWork_);
         }
     }
     else
     {
         /* Sort the entries on size, large ones first */
-        if (CombineNBLists || nnbl == 1)
+        if (combineLists_ || gpuLists_.size() == 1)
         {
-            sort_sci(nbl_list->nblGpu[0]);
+            sort_sci(&gpuLists_[0]);
         }
         else
         {
-#pragma omp parallel for num_threads(nnbl) schedule(static)
-            for (int th = 0; th < nnbl; th++)
+#pragma omp parallel for num_threads(numLists) schedule(static)
+            for (int th = 0; th < numLists; th++)
             {
                 try
                 {
-                    sort_sci(nbl_list->nblGpu[th]);
+                    sort_sci(&gpuLists_[th]);
                 }
                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
             }
@@ -4190,60 +4179,38 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
 
     if (nbat->bUseBufferFlags)
     {
-        reduce_buffer_flags(*pairSearch, nbl_list->nnbl, &nbat->buffer_flags);
+        reduce_buffer_flags(*pairSearch, numLists, &nbat->buffer_flags);
     }
 
     if (pairSearch->gridSet().haveFep())
     {
         /* Balance the free-energy lists over all the threads */
-        balance_fep_lists(pairSearch->work(), nbl_list);
+        balance_fep_lists(fepLists_, pairSearch->work());
     }
 
-    if (nbl_list->bSimple)
+    if (isCpuType_)
     {
         /* This is a fresh list, so not pruned, stored using ci.
          * ciOuter is invalid at this point.
          */
-        GMX_ASSERT(nbl_list->nbl[0]->ciOuter.empty(), "ciOuter is invalid so it should be empty");
-    }
-
-    if (iLocality == Nbnxm::InteractionLocality::Local)
-    {
-        outerListCreationStep_ = step;
-    }
-    else
-    {
-        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
-                           "Outer list should be created at the same step as the inner list");
-    }
-
-    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
-    if (iLocality == InteractionLocality::Local)
-    {
-        pairSearch->cycleCounting_.searchCount_++;
-    }
-    if (pairSearch->cycleCounting_.recordCycles_ &&
-        (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
-        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
-    {
-        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
+        GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
     }
 
     /* If we have more than one list, they either got rebalancing (CPU)
      * or combined (GPU), so we should dump the final result to debug.
      */
-    if (debug && nbl_list->nnbl > 1)
+    if (debug)
     {
-        if (nbl_list->bSimple)
+        if (isCpuType_ && cpuLists_.size() > 1)
         {
-            for (int t = 0; t < nbl_list->nnbl; t++)
+            for (auto &cpuList : cpuLists_)
             {
-                print_nblist_statistics(debug, nbl_list->nbl[t], *pairSearch, rlist);
+                print_nblist_statistics(debug, cpuList, *pairSearch, rlist);
             }
         }
-        else
+        else if (!isCpuType_ && gpuLists_.size() > 1)
         {
-            print_nblist_statistics(debug, nbl_list->nblGpu[0], *pairSearch, rlist);
+            print_nblist_statistics(debug, gpuLists_[0], *pairSearch, rlist);
         }
     }
 
@@ -4251,28 +4218,62 @@ nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality
     {
         if (gmx_debug_at)
         {
-            if (nbl_list->bSimple)
+            if (isCpuType_)
             {
-                for (int t = 0; t < nbl_list->nnbl; t++)
+                for (auto &cpuList : cpuLists_)
                 {
-                    print_nblist_ci_cj(debug, nbl_list->nbl[t]);
+                    print_nblist_ci_cj(debug, cpuList);
                 }
             }
             else
             {
-                print_nblist_sci_cj(debug, nbl_list->nblGpu[0]);
+                print_nblist_sci_cj(debug, gpuLists_[0]);
             }
         }
 
         if (nbat->bUseBufferFlags)
         {
-            print_reduction_cost(&nbat->buffer_flags, nbl_list->nnbl);
+            print_reduction_cost(&nbat->buffer_flags, numLists);
         }
     }
 
-    if (params_.useDynamicPruning && nbl_list->bSimple)
+    if (params_.useDynamicPruning && isCpuType_)
     {
-        nbnxnPrepareListForDynamicPruning(nbl_list);
+        prepareListsForDynamicPruning(cpuLists_);
+    }
+}
+
+void
+nonbonded_verlet_t::PairlistSets::construct(const InteractionLocality  iLocality,
+                                            PairSearch                *pairSearch,
+                                            nbnxn_atomdata_t          *nbat,
+                                            const t_blocka            *excl,
+                                            const Nbnxm::KernelType    kernelType,
+                                            const int64_t              step,
+                                            t_nrnb                    *nrnb)
+{
+    pairlistSet(iLocality).constructPairlists(pairSearch, nbat, excl, kernelType,
+                                              minimumIlistCountForGpuBalancing_, nrnb);
+
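+    /* Record the search step for the local list; the non-local list must be constructed at the same step */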
+    if (iLocality == Nbnxm::InteractionLocality::Local)
+    {
+        outerListCreationStep_ = step;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
+                           "Outer list should be created at the same step as the inner list");
+    }
+
+    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+    if (iLocality == InteractionLocality::Local)
+    {
+        pairSearch->cycleCounting_.searchCount_++;
+    }
+    if (pairSearch->cycleCounting_.recordCycles_ &&
+        (!pairSearch->domainSetup().haveDomDec || iLocality == InteractionLocality::NonLocal) &&
+        pairSearch->cycleCounting_.searchCount_ % 100 == 0)
+    {
+        pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
     }
 }
 
@@ -4293,24 +4294,20 @@ nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality  iLocalit
          * NOTE: The launch overhead is currently not timed separately
          */
         Nbnxm::gpu_init_pairlist(gpu_nbv,
-                                 pairlistSets().pairlistSet(iLocality).nblGpu[0],
+                                 pairlistSets().pairlistSet(iLocality).gpuList(),
                                  iLocality);
     }
 }
 
-void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
 {
-    GMX_RELEASE_ASSERT(listSet->bSimple, "Should only be called for simple lists");
-
     /* TODO: Restructure the lists so we have actual outer and inner
      *       list objects so we can set a single pointer instead of
      *       swapping several pointers.
      */
 
-    for (int i = 0; i < listSet->nnbl; i++)
+    for (auto &list : lists)
     {
-        NbnxnPairlistCpu &list = *listSet->nbl[i];
-
         /* The search produced a list in ci/cj.
          * Swap the list pointers so we get the outer list is ciOuter,cjOuter
          * and we can prune that to get an inner list in ci/cj.
index b4bbc52c92c3ba894b450ef6e6040896311a9adf..62f928e67f68ddd07610889bc7c78648e2989dcf 100644 (file)
 // to include it during OpenCL jitting without including config.h
 #include "gromacs/nbnxm/constants.h"
 
+#include "locality.h"
+
 struct NbnxnListParameters;
 struct NbnxnPairlistCpuWork;
 struct NbnxnPairlistGpuWork;
+struct nbnxn_atomdata_t;
+class PairSearch;
+struct t_blocka;
+struct t_nrnb;
 
 namespace Nbnxm
 {
@@ -212,6 +218,8 @@ struct nbnxn_excl_t
 /* Cluster pairlist type for use on CPUs */
 struct NbnxnPairlistCpu
 {
+    NbnxnPairlistCpu();
+
     gmx_cache_protect_t     cp0;
 
     int                     na_ci;       /* The number of atoms per i-cluster        */
@@ -226,9 +234,10 @@ struct NbnxnPairlistCpu
 
     int                     nci_tot;     /* The total number of i clusters           */
 
-    NbnxnPairlistCpuWork   *work;
+    /* Working data storage for list construction */
+    std::unique_ptr<NbnxnPairlistCpuWork> work;
 
-    gmx_cache_protect_t     cp1;
+    gmx_cache_protect_t                   cp1;
 };
 
 /* Cluster pairlist type, with extra hierarchies, for on the GPU
@@ -260,29 +269,94 @@ struct NbnxnPairlistGpu
     // The total number of i-clusters
     int                            nci_tot;
 
-    NbnxnPairlistGpuWork          *work;
+    /* Working data storage for list construction */
+    std::unique_ptr<NbnxnPairlistGpuWork> work;
 
-    gmx_cache_protect_t            cp1;
+    gmx_cache_protect_t                   cp1;
 };
 
-struct nbnxn_pairlist_set_t
+/*! \internal
+ * \brief An object that holds the local or non-local pairlists
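+ *
+ * A PairlistSet holds a set of CPU- or GPU-type pairlists and, when
+ * nonbonded interactions are perturbed, free-energy pairlists in a
+ * simple atom-atom layout.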
+ */
+class PairlistSet
 {
-    nbnxn_pairlist_set_t(const NbnxnListParameters &listParams);
-
-    int                         nnbl;         /* number of lists */
-    NbnxnPairlistCpu          **nbl;          /* lists for CPU */
-    NbnxnPairlistCpu          **nbl_work;     /* work space for rebalancing lists */
-    NbnxnPairlistGpu          **nblGpu;       /* lists for GPU */
-    const NbnxnListParameters  &params;       /* Pairlist parameters desribing setup and ranges */
-    gmx_bool                    bCombined;    /* TRUE if lists get combined into one (the 1st) */
-    gmx_bool                    bSimple;      /* TRUE if the list of of type "simple"
-                                                 (na_sc=na_s, no super-clusters used) */
-
-    /* Counts for debug printing */
-    int                     natpair_ljq;           /* Total number of atom pairs for LJ+Q kernel */
-    int                     natpair_lj;            /* Total number of atom pairs for LJ kernel   */
-    int                     natpair_q;             /* Total number of atom pairs for Q kernel    */
-    std::vector<t_nblist *> nbl_fep;               /* List of free-energy atom pair interactions */
+    public:
+        //! Constructor: initializes the pairlist set as empty
+        PairlistSet(Nbnxm::InteractionLocality  locality,
+                    const NbnxnListParameters  &listParams);
+
+        ~PairlistSet();
+
+        //! Constructs the pairlists in the set using the coordinates in \p nbat
+        void constructPairlists(PairSearch        *pairSearch,
+                                nbnxn_atomdata_t  *nbat,
+                                const t_blocka    *excl,
+                                Nbnxm::KernelType  kernelType,
+                                int                minimumIlistCountForGpuBalancing,
+                                t_nrnb            *nrnb);
+
+        //! Dispatches the kernel for dynamic pairlist pruning
+        void dispatchPruneKernel(const nbnxn_atomdata_t *nbat,
+                                 const rvec             *shift_vec,
+                                 Nbnxm::KernelType       kernelType);
+
+        //! Returns the locality
+        Nbnxm::InteractionLocality locality() const
+        {
+            return locality_;
+        }
+
+        //! Returns the CPU pairlists
+        gmx::ArrayRef<const NbnxnPairlistCpu> cpuLists() const
+        {
+            return cpuLists_;
+        }
+
+        //! Returns a pointer to the GPU pairlist, nullptr when not present
+        const NbnxnPairlistGpu *gpuList() const
+        {
+            if (!gpuLists_.empty())
+            {
+                return &gpuLists_[0];
+            }
+            else
+            {
+                return nullptr;
+            }
+        }
+
+        //! Returns the free-energy pairlists, empty when no nonbonded interactions are perturbed
+        gmx::ArrayRef<t_nblist const * const> fepLists() const
+        {
+            return fepLists_;
+        }
+
+    private:
+        //! The locality of the pairlist set
+        Nbnxm::InteractionLocality     locality_;
+        //! List of pairlists in CPU layout
+        std::vector<NbnxnPairlistCpu>  cpuLists_;
+        //! Working pairlists used when rebalancing the CPU lists
+        std::vector<NbnxnPairlistCpu>  cpuListsWork_;
+        //! List of pairlists in GPU layout
+        std::vector<NbnxnPairlistGpu>  gpuLists_;
+        //! Pairlist parameters describing setup and ranges
+        const NbnxnListParameters     &params_;
+        //! Tells whether multiple lists get merged into one (the first) after creation
+        bool                           combineLists_;
+        //! Tells whether the lists are of CPU type, otherwise GPU type
+        bool                           isCpuType_;
+        //! Lists for perturbed interactions in simple atom-atom layout
+        std::vector<t_nblist *>        fepLists_;
+
+    public:
+        /* Pair counts for flop counting */
+        //! Total number of atom pairs for LJ+Q kernel
+        int natpair_ljq_;
+        //! Total number of atom pairs for LJ kernel
+        int natpair_lj_;
+        //! Total number of atom pairs for Q kernel
+        int natpair_q_;
 };
 
 //! Initializes a free-energy pair-list
index de04304177a0439f8f4480a2f2171fc9849173d1..57f90d380f280309fb1506bfaf3d027b143faed4 100644 (file)
 #include "gromacs/nbnxm/pairlist.h"
 #include "gromacs/utility/gmxassert.h"
 
+#include "pairlistwork.h"
+
 /*! \cond INTERNAL */
 
 NbnxnListParameters::NbnxnListParameters(const Nbnxm::KernelType kernelType,
+                                         const bool              haveFep,
                                          const real              rlist,
                                          const bool              haveMultipleDomains) :
+    haveFep(haveFep),
     rlistOuter(rlist),
     rlistInner(rlist),
     haveMultipleDomains(haveMultipleDomains),
@@ -86,11 +90,6 @@ NbnxnListParameters::NbnxnListParameters(const Nbnxm::KernelType kernelType,
     }
 }
 
-nbnxn_pairlist_set_t::nbnxn_pairlist_set_t(const NbnxnListParameters &listParams) :
-    params(listParams)
-{
-    // TODO move this into this constructor
-    nbnxn_init_pairlist_set(this);
-}
+PairlistSet::~PairlistSet() = default;
 
 /*! \endcond */
index 26e54c99a32313c8cb9a42a6529891814e732807..a35db0f83598be98f409f3a2bc81418ded29d31f 100644 (file)
 
 #include "locality.h"
 
-/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t
- *
- * TODO: Merge into the constructor
- */
-typedef void nbnxn_free_t (void *ptr);
-
 /* Tells if the pair-list corresponding to nb_kernel_type is simple.
  * Returns FALSE for super-sub type pair-list.
  */
 gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type);
 
-/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */
-void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list);
-
-/*! \brief Prepare the list-set produced by the search for dynamic pruning
- *
- * \param[in,out] listSet  The list-set to prepare for dynamic pruning.
- */
-void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet);
-
 #endif
index 15f84465b20a5bd2732307f4ec07865fa3d226f4..a1beaf2ac9d177b667c46f041165b6e937b68651 100644 (file)
 #include "kernels_simd_2xmm/kernel_prune.h"
 #include "kernels_simd_4xm/kernel_prune.h"
 
-
 void
 nonbonded_verlet_t::PairlistSets::dispatchPruneKernel(const Nbnxm::InteractionLocality  iLocality,
                                                       const nbnxn_atomdata_t           *nbat,
                                                       const rvec                       *shift_vec,
                                                       const Nbnxm::KernelType           kernelType)
 {
-    nbnxn_pairlist_set_t *nbl_lists  = &pairlistSet(iLocality);
+    pairlistSet(iLocality).dispatchPruneKernel(nbat, shift_vec, kernelType);
+}
 
-    const real            rlistInner = nbl_lists->params.rlistInner;
+void
+PairlistSet::dispatchPruneKernel(const nbnxn_atomdata_t  *nbat,
+                                 const rvec              *shift_vec,
+                                 const Nbnxm::KernelType  kernelType)
+{
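+    /* The prune kernels produce a new inner list in ci/cj from the outer list stored in ciOuter/cjOuter */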
+    const real rlistInner = params_.rlistInner;
 
-    GMX_ASSERT(nbl_lists->nbl[0]->ciOuter.size() >= nbl_lists->nbl[0]->ci.size(),
+    GMX_ASSERT(cpuLists_[0].ciOuter.size() >= cpuLists_[0].ci.size(),
                "Here we should either have an empty ci list or ciOuter should be >= ci");
 
     int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
+    GMX_ASSERT(nthreads == static_cast<gmx::index>(cpuLists_.size()),
+               "The number of threads should match the number of lists");
 #pragma omp parallel for schedule(static) num_threads(nthreads)
-    for (int i = 0; i < nbl_lists->nnbl; i++)
+    for (int i = 0; i < nthreads; i++)
     {
-        NbnxnPairlistCpu *nbl = nbl_lists->nbl[i];
+        NbnxnPairlistCpu *nbl = &cpuLists_[i];
 
         switch (kernelType)
         {