Convert NbnxnPairlistGpu to C++

author Berk Hess <hess@kth.se>

Fri, 4 Jan 2019 17:17:01 +0000 (18:17 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 22 Jan 2019 13:51:51 +0000 (14:51 +0100)
author Berk Hess <hess@kth.se>
Fri, 4 Jan 2019 17:17:01 +0000 (18:17 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 22 Jan 2019 13:51:51 +0000 (14:51 +0100)
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 8255b7b0f8015b337e60bb367bda191c7438401b..f3a44de6659d7bd9dcd1c4df6fb744c8f74e1109 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2113,9 +2113,6 @@ static void init_nb_verlet(const gmx::MDLogger     &mdlog,
      nonbonded_verlet_t *nbv;
      char               *env;
  
-    nbnxn_alloc_t      *nb_alloc;
-    nbnxn_free_t       *nb_free;
-
      nbv = new nonbonded_verlet_t();
  
      nbv->emulateGpu = ((getenv("GMX_EMULATE_GPU") != nullptr) ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
@@ -2157,16 +2154,12 @@ static void init_nb_verlet(const gmx::MDLogger     &mdlog,
                                                        bFEP_NonBonded,
                                                        gmx_omp_nthreads_get(emntPairsearch));
  
-    gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
-                                 &nb_alloc, &nb_free);
-
      for (int i = 0; i < nbv->ngrp; i++)
      {
          nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
                                  nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
                                  /* 8x8x8 "non-simple" lists are ATM always combined */
-                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
-                                nb_alloc, nb_free);
+                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type));
      }
  
      int      enbnxninitcombrule;
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.cpp b/src/gromacs/mdlib/nbnxn_atomdata.cpp

index 34edde97c8725e3fc46a8988d946132e64a77c56..9238c07b234ad2a5ac294d52d5fbdcbc5df530db 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.cpp
+++ b/src/gromacs/mdlib/nbnxn_atomdata.cpp
@@ -70,47 +70,6 @@
  
  using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
  
-/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
-void nbnxn_alloc_aligned(void **ptr, size_t nbytes)
-{
-    *ptr = save_malloc_aligned("ptr", __FILE__, __LINE__, nbytes, 1, NBNXN_MEM_ALIGN);
-}
-
-/* Free function for memory allocated with nbnxn_alloc_aligned */
-void nbnxn_free_aligned(void *ptr)
-{
-    sfree_aligned(ptr);
-}
-
-/* Reallocation wrapper function for nbnxn data structures */
-void nbnxn_realloc_void(void **ptr,
-                        int nbytes_copy, int nbytes_new,
-                        nbnxn_alloc_t *ma,
-                        nbnxn_free_t  *mf)
-{
-    void *ptr_new;
-
-    ma(&ptr_new, nbytes_new);
-
-    if (nbytes_new > 0 && ptr_new == nullptr)
-    {
-        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
-    }
-
-    if (nbytes_copy > 0)
-    {
-        if (nbytes_new < nbytes_copy)
-        {
-            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
-        }
-        memcpy(ptr_new, *ptr, nbytes_copy);
-    }
-    if (*ptr != nullptr)
-    {
-        mf(*ptr);
-    }
-    *ptr = ptr_new;
-}
  
  void nbnxn_atomdata_t::resizeCoordinateBuffer(int numAtoms)
  {
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.h b/src/gromacs/mdlib/nbnxn_atomdata.h

index f5a8c1a291033bfddff42354f5f86960bfe0a640..09c1042efa3b96486f3f9c3908ecb6485eaa354c 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.h
+++ b/src/gromacs/mdlib/nbnxn_atomdata.h
@@ -51,20 +51,6 @@ class MDLogger;
  struct t_mdatoms;
  struct gmx_wallcycle;
  
-/* Default nbnxn allocation routine, allocates 32 byte aligned,
- * which works for plain C and aligned SSE and AVX loads/stores.
- */
-void nbnxn_alloc_aligned(void **ptr, size_t nbytes);
-
-/* Free function for memory allocated with nbnxn_alloc_aligned */
-void nbnxn_free_aligned(void *ptr);
-
-/* Reallocation wrapper function for nbnxn data structures */
-void nbnxn_realloc_void(void **ptr,
-                        int nbytes_copy, int nbytes_new,
-                        nbnxn_alloc_t *ma,
-                        nbnxn_free_t  *mf);
-
  /* Reallocate the nbnxn_atomdata_t for a size of n atoms */
  void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n);
  
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h

index 5bf3fa0e1bbe5d7fa8caca3072dc3141148810e1..c2475e826f322894d6022f6ea8adbc0e41947308 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -51,20 +51,6 @@
  #endif
  
  
-/* Cluster-pair Interaction masks for 4xN and 2xNN kernels.
- * Bit i*CJ_SIZE + j tells if atom i and j interact.
- */
-/* All interaction mask is the same for all kernels */
-#define NBNXN_INTERACTION_MASK_ALL        0xffffffffU
-/* 4x4 kernel diagonal mask */
-#define NBNXN_INTERACTION_MASK_DIAG       0x08ceU
-/* 4x2 kernel diagonal masks */
-#define NBNXN_INTERACTION_MASK_DIAG_J2_0  0x0002U
-#define NBNXN_INTERACTION_MASK_DIAG_J2_1  0x002fU
-/* 4x8 kernel diagonal masks */
-#define NBNXN_INTERACTION_MASK_DIAG_J8_0  0xf0f8fcfeU
-#define NBNXN_INTERACTION_MASK_DIAG_J8_1  0x0080c0e0U
-
  /* The number of clusters in a super-cluster, used for GPU */
  #define c_nbnxnGpuNumClusterPerSupercluster  8
  
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index d9c26a08130eab01ef9d842b1d8c666f7bd6ac99..d91efb1e7721b46f9a8c86fb96195724c16cb115 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -519,7 +519,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                               int                     iloc)
  {
      char          sbuf[STRLEN];
-    bool          bDoTime    =  (nb->bDoTime && h_plist->nsci > 0);
+    bool          bDoTime    =  (nb->bDoTime && !h_plist->sci.empty());
      cudaStream_t  stream     = nb->stream[iloc];
      cu_plist_t   *d_plist    = nb->plist[iloc];
  
@@ -545,24 +545,24 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
  
      Context context = nullptr;
  
-    reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
                             &d_plist->nsci, &d_plist->sci_nalloc, context);
-    copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
                         stream, GpuApiCallBehavior::Async,
                         bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
-    reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
                             &d_plist->ncj4, &d_plist->cj4_nalloc, context);
-    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
                         stream, GpuApiCallBehavior::Async,
                         bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
-    reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
+    reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
                             &d_plist->nimask, &d_plist->imask_nalloc, context);
  
-    reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
+    reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
                             &d_plist->nexcl, &d_plist->excl_nalloc, context);
-    copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
                         stream, GpuApiCallBehavior::Async,
                         bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h

index b4c5820b7aaa8f2a16b813283a6842d59f1a83d3..7ea0ef061de7c89e9bda8e249058a3b34c622b92 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_internal.h
+++ b/src/gromacs/mdlib/nbnxn_internal.h
@@ -264,7 +264,8 @@ struct NbnxnPairlistGpuWork
      };
  
      NbnxnPairlistGpuWork() :
-        distanceBuffer(c_gpuNumClusterPerCell)
+        distanceBuffer(c_gpuNumClusterPerCell),
+        sci_sort({}, {gmx::PinningPolicy::PinnedIfSupported})
      {
      }
  
@@ -275,17 +276,14 @@ struct NbnxnPairlistGpuWork
      ISuperClusterData         iSuperClusterData;
      // The current j-cluster index for the current list
      int                       cj_ind;
-    // The first unitialized cj4 block
-    int                       cj4_init;
      // Bounding box distance work array
      AlignedVector<float>      distanceBuffer;
  
      // Buffer for sorting list entries
      std::vector<int>          sortBuffer;
  
-    // TODO: Replace by std::vector with default initialization when converting NbnxnPairlistGpu to C++
-    nbnxn_sci_t              *sci_sort        = nullptr; /* Second sci array, for sorting */
-    int                       sci_sort_nalloc = 0;       /* Allocation size of sci_sort   */
+    // Second sci array, for sorting
+    gmx::HostVector<nbnxn_sci_t> sci_sort;
  
      // Protect data from cache pollution between threads
      gmx_cache_protect_t       cp1;
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp

index fcbf9d01957ef566c6623da7e176c33fcb909281..43313db602f0667ab7229619de8adc5cf469c268 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp
@@ -69,14 +69,12 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
                       real  *                     Vc,
                       real  *                     Vvdw)
  {
-    const nbnxn_sci_t  *nbln;
      gmx_bool            bEner;
      gmx_bool            bEwald;
      const real         *Ftab = nullptr;
      real                rcut2, rvdw2, rlist2;
      int                 ntype;
      real                facel;
-    int                 n;
      int                 ish3;
      int                 sci;
      int                 cj4_ind0, cj4_ind1, cj4_ind;
@@ -141,21 +139,19 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
      nhwu        = 0;
      nhwu_pruned = 0;
  
-    for (n = 0; n < nbl->nsci; n++)
+    for (const nbnxn_sci_t &nbln : nbl->sci)
      {
-        nbln = &nbl->sci[n];
-
-        ish3             = 3*nbln->shift;
+        ish3             = 3*nbln.shift;
          shX              = shiftvec[ish3];
          shY              = shiftvec[ish3+1];
          shZ              = shiftvec[ish3+2];
-        cj4_ind0         = nbln->cj4_ind_start;
-        cj4_ind1         = nbln->cj4_ind_end;
-        sci              = nbln->sci;
+        cj4_ind0         = nbln.cj4_ind_start;
+        cj4_ind1         = nbln.cj4_ind_end;
+        sci              = nbln.sci;
          vctot            = 0;
          Vvdwtot          = 0;
  
-        if (nbln->shift == CENTRAL &&
+        if (nbln.shift == CENTRAL &&
              nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
          {
              /* we have the diagonal:
@@ -224,7 +220,7 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
                              {
                                  ja               = cj*c_clSize + jc;
  
-                                if (nbln->shift == CENTRAL &&
+                                if (nbln.shift == CENTRAL &&
                                      ci == cj && ja <= ia)
                                  {
                                      continue;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

index 4e1b973d672f9b71778f5a5906d5e9e8bfeab9a1..d164d268316118220b50bd871a99aaa57f86fad1 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -780,7 +780,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
      // Timing accumulation should happen only if there was work to do
      // because getLastRangeTime() gets skipped with empty lists later
      // which leads to the counter not being reset.
-    bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && h_plist->nsci > 0);
+    bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && !h_plist->sci.empty());
      cl_command_queue stream     = nb->stream[iloc];
      cl_plist_t      *d_plist    = nb->plist[iloc];
  
@@ -807,24 +807,24 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
      // TODO most of this function is same in CUDA and OpenCL, move into the header
      Context context = nb->dev_rundata->context;
  
-    reallocateDeviceBuffer(&d_plist->sci, h_plist->nsci,
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
                             &d_plist->nsci, &d_plist->sci_nalloc, context);
-    copyToDeviceBuffer(&d_plist->sci, h_plist->sci, 0, h_plist->nsci,
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
                         stream, GpuApiCallBehavior::Async,
                         bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
-    reallocateDeviceBuffer(&d_plist->cj4, h_plist->ncj4,
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
                             &d_plist->ncj4, &d_plist->cj4_nalloc, context);
-    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4, 0, h_plist->ncj4,
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
                         stream, GpuApiCallBehavior::Async,
                         bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
-    reallocateDeviceBuffer(&d_plist->imask, h_plist->ncj4*c_nbnxnGpuClusterpairSplit,
+    reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
                             &d_plist->nimask, &d_plist->imask_nalloc, context);
  
-    reallocateDeviceBuffer(&d_plist->excl, h_plist->nexcl,
+    reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
                             &d_plist->nexcl, &d_plist->excl_nalloc, context);
-    copyToDeviceBuffer(&d_plist->excl, h_plist->excl, 0, h_plist->nexcl,
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
                         stream, GpuApiCallBehavior::Async,
                         bDoTime ? nb->timers->pl_h2d[iloc].fetchNextEvent() : nullptr);
  
diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h

index 1ac278db916cd84b71740b0d4e97eda65c92d9ed..b86011ccaaf55f189ce1fb26065e257d1cb76eb3 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -162,6 +162,21 @@ struct nbnxn_cj_t
  #define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
  #define NBNXN_CI_DO_COUL(subc)  (1<<(9+3*(subc)))
  
+/* Cluster-pair Interaction masks
+ * Bit i*j-cluster-size + j tells if atom i and j interact.
+ */
+// TODO: Rename according to convention when moving into Nbnxn namespace
+/* All interaction mask is the same for all kernels */
+constexpr unsigned int NBNXN_INTERACTION_MASK_ALL       = 0xffffffffU;
+/* 4x4 kernel diagonal mask */
+constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG      = 0x08ceU;
+/* 4x2 kernel diagonal masks */
+constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_0 = 0x0002U;
+constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_1 = 0x002fU;
+/* 4x8 kernel diagonal masks */
+constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_0 = 0xf0f8fcfeU;
+constexpr unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_1 = 0x0080c0e0U;
+
  /* Simple pair-list i-unit */
  struct nbnxn_ci_t
  {
@@ -179,22 +194,37 @@ typedef struct {
      int cj4_ind_end;    /* End index into cj4    */
  } nbnxn_sci_t;
  
-typedef struct {
-    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
-    int          excl_ind; /* Index into the exclusion array for 1 warp   */
-} nbnxn_im_ei_t;
+/* Interaction data for a j-group for one warp */
+struct nbnxn_im_ei_t
+{
+    // The i-cluster interactions mask for 1 warp
+    unsigned int imask    = 0U;
+    // Index into the exclusion array for 1 warp, default index 0 which means no exclusions
+    int          excl_ind = 0;
+};
  
  typedef struct {
      int           cj[c_nbnxnGpuJgroupSize];         /* The 4 j-clusters */
      nbnxn_im_ei_t imei[c_nbnxnGpuClusterpairSplit]; /* The i-cluster mask data       for 2 warps   */
  } nbnxn_cj4_t;
  
-typedef struct {
-    unsigned int pair[c_nbnxnGpuExclSize]; /* Topology exclusion interaction bits for one warp,
-                                            * each unsigned has bitS for 4*8 i clusters
-                                            */
-} nbnxn_excl_t;
+/* Struct for storing the atom-pair interaction bits for a cluster pair in a GPU pairlist */
+struct nbnxn_excl_t
+{
+    /* Constructor, sets no exclusions, so all atom pairs interacting */
+    nbnxn_excl_t()
+    {
+        for (unsigned int &pairEntry : pair)
+        {
+            pairEntry = NBNXN_INTERACTION_MASK_ALL;
+        }
+    }
+
+    /* Topology exclusion interaction bits per warp */
+    unsigned int pair[c_nbnxnGpuExclSize];
+};
  
+/* Cluster pairlist type for use on CPUs */
  struct NbnxnPairlistCpu
  {
      gmx_cache_protect_t     cp0;
@@ -216,32 +246,38 @@ struct NbnxnPairlistCpu
      gmx_cache_protect_t     cp1;
  };
  
+/* Cluster pairlist type, with extra hierarchies, for on the GPU
+ *
+ * NOTE: for better performance when combining lists over threads,
+ *       all vectors should use default initialization. But when
+ *       changing this, excl should be intialized when adding entries.
+ */
  struct NbnxnPairlistGpu
  {
-    gmx_cache_protect_t     cp0;
-
-    nbnxn_alloc_t          *alloc;
-    nbnxn_free_t           *free;
-
-    int                     na_ci;       /* The number of atoms per i-cluster        */
-    int                     na_cj;       /* The number of atoms per j-cluster        */
-    int                     na_sc;       /* The number of atoms per super cluster    */
-    real                    rlist;       /* The radius for constructing the list     */
-    int                     nsci;        /* The number of i-super-clusters in the list */
-    nbnxn_sci_t            *sci;         /* The i-super-cluster list                 */
-    int                     sci_nalloc;  /* The allocation size of sci               */
-
-    int                     ncj4;        /* The total number of 4*j clusters         */
-    nbnxn_cj4_t            *cj4;         /* The 4*j cluster list, size ncj4          */
-    int                     cj4_nalloc;  /* The allocation size of cj4               */
-    int                     nexcl;       /* The count for excl                       */
-    nbnxn_excl_t           *excl;        /* Atom interaction bits (non-exclusions)   */
-    int                     excl_nalloc; /* The allocation size for excl             */
-    int                     nci_tot;     /* The total number of i clusters           */
-
-    NbnxnPairlistGpuWork   *work;
-
-    gmx_cache_protect_t     cp1;
+    /* Constructor
+     *
+     * \param[in] pinningPolicy  Sets the pinning policy for all buffers used on the GPU
+     */
+    NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy);
+
+    gmx_cache_protect_t            cp0;
+
+    int                            na_ci; /* The number of atoms per i-cluster        */
+    int                            na_cj; /* The number of atoms per j-cluster        */
+    int                            na_sc; /* The number of atoms per super cluster    */
+    real                           rlist; /* The radius for constructing the list     */
+    // The i-super-cluster list, indexes into cj4;
+    gmx::HostVector<nbnxn_sci_t>   sci;
+    // The list of 4*j-cluster groups
+    gmx::HostVector<nbnxn_cj4_t>   cj4;
+    // Atom interaction bits (non-exclusions)
+    gmx::HostVector<nbnxn_excl_t>  excl;
+    // The total number of i-clusters
+    int                            nci_tot;
+
+    NbnxnPairlistGpuWork          *work;
+
+    gmx_cache_protect_t            cp1;
  };
  
  typedef struct {
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp

index 4a5d6da7bcaca92bd6146ab451772c3662065c35..f49d3c04858ae51edf597bf2de780c4123dbed8f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -763,13 +763,15 @@ clusterpair_in_range(const NbnxnPairlistGpuWork &work,
  }
  
  /* Returns the j-cluster index for index cjIndex in a cj list */
-static inline int nblCj(const nbnxn_cj_t *cjList, int cjIndex)
+static inline int nblCj(gmx::ArrayRef<const nbnxn_cj_t> cjList,
+                        int                             cjIndex)
  {
      return cjList[cjIndex].cj;
  }
  
  /* Returns the j-cluster index for index cjIndex in a cj4 list */
-static inline int nblCj(const nbnxn_cj4_t *cj4List, int cjIndex)
+static inline int nblCj(gmx::ArrayRef<const nbnxn_cj4_t> cj4List,
+                        int                              cjIndex)
  {
      return cj4List[cjIndex/c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
  }
@@ -780,72 +782,6 @@ static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
      return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
  }
  
-/* Ensures there is enough space for extra extra exclusion masks */
-static void check_excl_space(NbnxnPairlistGpu *nbl, int extra)
-{
-    if (nbl->nexcl+extra > nbl->excl_nalloc)
-    {
-        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
-        nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->excl),
-                           nbl->nexcl*sizeof(*nbl->excl),
-                           nbl->excl_nalloc*sizeof(*nbl->excl),
-                           nbl->alloc, nbl->free);
-    }
-}
-
-/* Ensures there is enough space for ncell extra j-cells in the list */
-static void check_cell_list_space(NbnxnPairlistCpu gmx_unused *nbl,
-                                  int gmx_unused               ncell)
-{
-}
-
-/* Ensures there is enough space for ncell extra j-cells in the list */
-static void check_cell_list_space(NbnxnPairlistGpu *nbl,
-                                  int               ncell)
-{
-    int ncj4_max, w;
-
-    /* We can have maximally nsupercell*c_gpuNumClusterPerCell sj lists */
-    /* We can store 4 j-subcell - i-supercell pairs in one struct.
-     * since we round down, we need one extra entry.
-     */
-    ncj4_max = ((nbl->work->cj_ind + ncell*c_gpuNumClusterPerCell + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize);
-
-    if (ncj4_max > nbl->cj4_nalloc)
-    {
-        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
-        nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->cj4),
-                           nbl->work->cj4_init*sizeof(*nbl->cj4),
-                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
-                           nbl->alloc, nbl->free);
-    }
-
-    if (ncj4_max > nbl->work->cj4_init)
-    {
-        for (int j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
-        {
-            /* No i-subcells and no excl's in the list initially */
-            for (w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
-            {
-                nbl->cj4[j4].imei[w].imask    = 0U;
-                nbl->cj4[j4].imei[w].excl_ind = 0;
-
-            }
-        }
-        nbl->work->cj4_init = ncj4_max;
-    }
-}
-
-/* Set all excl masks for one GPU warp no exclusions */
-static void set_no_excls(nbnxn_excl_t *excl)
-{
-    for (int t = 0; t < c_nbnxnGpuExclSize; t++)
-    {
-        /* Turn all interaction bits on */
-        excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
-    }
-}
-
  /* Initializes a single NbnxnPairlistCpu data structure */
  static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
  {
@@ -861,59 +797,32 @@ static void nbnxn_init_pairlist(NbnxnPairlistCpu *nbl)
      nbl->work        = new NbnxnPairlistCpuWork();
  }
  
-/* Initializes a single NbnxnPairlistGpu data structure */
-static void nbnxn_init_pairlist(NbnxnPairlistGpu *nbl,
-                                nbnxn_alloc_t    *alloc,
-                                nbnxn_free_t     *free)
+NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
+    na_ci(c_nbnxnGpuClusterSize),
+    na_cj(c_nbnxnGpuClusterSize),
+    na_sc(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize),
+    rlist(0),
+    sci({}, {pinningPolicy}),
+    cj4({}, {pinningPolicy}),
+    excl({}, {pinningPolicy}),
+    nci_tot(0)
  {
-    if (alloc == nullptr)
-    {
-        nbl->alloc = nbnxn_alloc_aligned;
-    }
-    else
-    {
-        nbl->alloc = alloc;
-    }
-    if (free == nullptr)
-    {
-        nbl->free = nbnxn_free_aligned;
-    }
-    else
-    {
-        nbl->free = free;
-    }
-
-    nbl->na_ci       = c_nbnxnGpuClusterSize;
-    nbl->na_cj       = c_nbnxnGpuClusterSize;
-    nbl->na_sc       = c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
-    nbl->nsci        = 0;
-    nbl->sci         = nullptr;
-    nbl->sci_nalloc  = 0;
-    nbl->ncj4        = 0;
-    /* We need one element extra in sj, so alloc initially with 1 */
-    nbl->cj4_nalloc  = 0;
-    nbl->cj4         = nullptr;
-    nbl->nci_tot     = 0;
+    static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
+                  "The search code assumes that the a super-cluster matches a search grid cell");
  
-    GMX_ASSERT(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell, "The search code assumes that the a super-cluster matches a search grid cell");
+    static_assert(sizeof(cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell,
+                  "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");
  
-    GMX_ASSERT(sizeof(nbl->cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");
-    GMX_ASSERT(sizeof(nbl->excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The GPU exclusion mask does not contain a sufficient number of bits");
+    static_assert(sizeof(excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The GPU exclusion mask does not contain a sufficient number of bits");
  
-    nbl->excl        = nullptr;
-    nbl->excl_nalloc = 0;
-    nbl->nexcl       = 0;
-    check_excl_space(nbl, 1);
-    nbl->nexcl       = 1;
-    set_no_excls(&nbl->excl[0]);
+    // We always want a first entry without any exclusions
+    excl.resize(1);
  
-    nbl->work        = new NbnxnPairlistGpuWork();
+    work = new NbnxnPairlistGpuWork();
  }
  
  void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
-                             gmx_bool bSimple, gmx_bool bCombined,
-                             nbnxn_alloc_t *alloc,
-                             nbnxn_free_t  *free)
+                             gmx_bool bSimple, gmx_bool bCombined)
  {
      GMX_RELEASE_ASSERT(!bSimple || !bCombined, "Can only combine non-simple lists");
  
@@ -964,17 +873,10 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
              }
              else
              {
-                snew(nbl_list->nblGpu[i], 1);
-
                  /* Only list 0 is used on the GPU, use normal allocation for i>0 */
-                if (i == 0)
-                {
-                    nbnxn_init_pairlist(nbl_list->nblGpu[i], alloc, free);
-                }
-                else
-                {
-                    nbnxn_init_pairlist(nbl_list->nblGpu[i], nullptr, nullptr);
-                }
+                auto pinningPolicy = (i == 0 ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
+
+                nbl_list->nblGpu[i] = new NbnxnPairlistGpu(pinningPolicy);
              }
  
              snew(nbl_list->nbl_fep[i], 1);
@@ -1046,8 +948,8 @@ static void print_nblist_statistics(FILE *fp, const NbnxnPairlistGpu *nbl,
      /* This code only produces correct statistics with domain decomposition */
      grid = &nbs->grid[0];
  
-    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
-            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
+    fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
+            nbl->sci.size(), nbl->cj4.size(), nbl->nci_tot, nbl->excl.size());
      fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
              nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/static_cast<double>(grid->nsubc_tot),
              nbl->nci_tot/static_cast<double>(grid->nsubc_tot)*grid->na_c,
@@ -1060,12 +962,10 @@ static void print_nblist_statistics(FILE *fp, const NbnxnPairlistGpu *nbl,
      {
          c[si] = 0;
      }
-    for (int i = 0; i < nbl->nsci; i++)
+    for (const nbnxn_sci_t &sci : nbl->sci)
      {
-        int nsp;
-
-        nsp = 0;
-        for (int j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
+        int nsp = 0;
+        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
          {
              for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
              {
@@ -1085,73 +985,46 @@ static void print_nblist_statistics(FILE *fp, const NbnxnPairlistGpu *nbl,
          sum_nsp2 += nsp*nsp;
          nsp_max   = std::max(nsp_max, nsp);
      }
-    if (nbl->nsci > 0)
+    if (!nbl->sci.empty())
      {
-        sum_nsp  /= nbl->nsci;
-        sum_nsp2 /= nbl->nsci;
+        sum_nsp  /= nbl->sci.size();
+        sum_nsp2 /= nbl->sci.size();
      }
      fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
              sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
  
-    if (nbl->ncj4 > 0)
+    if (!nbl->cj4.empty())
      {
          for (b = 0; b <= c_gpuNumClusterPerCell; b++)
          {
              fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
-                    b, c[b],
-                    100.0*c[b]/int{nbl->ncj4*c_nbnxnGpuJgroupSize});
+                    b, c[b], 100.0*c[b]/size_t {nbl->cj4.size()*c_nbnxnGpuJgroupSize});
          }
      }
  }
  
-/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
-static void low_get_nbl_exclusions(NbnxnPairlistGpu *nbl, int cj4,
-                                   int warp, nbnxn_excl_t **excl)
-{
-    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
-    {
-        /* No exclusions set, make a new list entry */
-        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
-        nbl->nexcl++;
-        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
-        set_no_excls(*excl);
-    }
-    else
-    {
-        /* We already have some exclusions, new ones can be added to the list */
-        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
-    }
-}
-
-/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
- * generates a new element and allocates extra memory, if necessary.
+/* Returns a pointer to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
+ * Generates a new exclusion entry when the j-cluster-group uses
+ * the default all-interaction mask at call time, so the returned mask
+ * can be modified when needed.
   */
-static void get_nbl_exclusions_1(NbnxnPairlistGpu *nbl, int cj4,
-                                 int warp, nbnxn_excl_t **excl)
+static nbnxn_excl_t *get_exclusion_mask(NbnxnPairlistGpu *nbl,
+                                        int               cj4,
+                                        int               warp)
  {
      if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
      {
-        /* We need to make a new list entry, check if we have space */
-        check_excl_space(nbl, 1);
+        /* No exclusions set, make a new list entry */
+        const size_t oldSize = nbl->excl.size();
+        GMX_ASSERT(oldSize >= 1, "We should always have entry [0]");
+        /* Add entry with default values: no exclusions */
+        nbl->excl.resize(oldSize + 1);
+        nbl->cj4[cj4].imei[warp].excl_ind = oldSize;
      }
-    low_get_nbl_exclusions(nbl, cj4, warp, excl);
-}
  
-/* Returns pointers to the exclusion masks for cj4-unit cj4 for both warps,
- * generates a new element and allocates extra memory, if necessary.
- */
-static void get_nbl_exclusions_2(NbnxnPairlistGpu *nbl, int cj4,
-                                 nbnxn_excl_t **excl_w0,
-                                 nbnxn_excl_t **excl_w1)
-{
-    /* Check for space we might need */
-    check_excl_space(nbl, 2);
-
-    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
-    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
+    return &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
  }
  
-/* Sets the self exclusions i=j and pair exclusions i>j */
  static void set_self_and_newton_excls_supersub(NbnxnPairlistGpu *nbl,
                                                 int cj4_ind, int sj_offset,
                                                 int i_cluster_in_cell)
@@ -1160,9 +1033,14 @@ static void set_self_and_newton_excls_supersub(NbnxnPairlistGpu *nbl,
  
      /* Here we only set the set self and double pair exclusions */
  
-    static_assert(c_nbnxnGpuClusterpairSplit == 2, "");
-
-    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
+    /* Reserve extra elements, so the resize() in get_exclusion_mask()
+     * will not invalidate excl entries in the loop below
+     */
+    nbl->excl.reserve(nbl->excl.size() + c_nbnxnGpuClusterpairSplit);
+    for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
+    {
+        excl[w] = get_exclusion_mask(nbl, cj4_ind, w);
+    }
  
      /* Only minor < major bits set */
      for (int ej = 0; ej < nbl->na_ci; ej++)
@@ -1392,16 +1270,11 @@ static void make_cluster_list_supersub(const nbnxn_grid_t &iGrid,
      {
          const int    cj4_ind   = work.cj_ind/c_nbnxnGpuJgroupSize;
          const int    cj_offset = work.cj_ind - cj4_ind*c_nbnxnGpuJgroupSize;
-        nbnxn_cj4_t *cj4       = &nbl->cj4[cj4_ind];
-
          const int    cj        = scj*c_gpuNumClusterPerCell + subc;
  
          const int    cj_gl     = jGrid.cell0*c_gpuNumClusterPerCell + cj;
  
-        /* Initialize this j-subcell i-subcell list */
-        cj4->cj[cj_offset] = cj_gl;
-
-        int ci1;
+        int          ci1;
          if (excludeSubDiagonal && sci == scj)
          {
              ci1 = subc + 1;
@@ -1477,7 +1350,14 @@ static void make_cluster_list_supersub(const nbnxn_grid_t &iGrid,
  
          if (npair > 0)
          {
-            /* We have a useful sj entry, close it now */
+            /* We have at least one cluster pair: add a j-entry */
+            if (static_cast<size_t>(cj4_ind) == nbl->cj4.size())
+            {
+                nbl->cj4.resize(nbl->cj4.size() + 1);
+            }
+            nbnxn_cj4_t *cj4   = &nbl->cj4[cj4_ind];
+
+            cj4->cj[cj_offset] = cj_gl;
  
              /* Set the exclusions for the ci==sj entry.
               * Here we don't bother to check if this entry is actually flagged,
@@ -1500,7 +1380,7 @@ static void make_cluster_list_supersub(const nbnxn_grid_t &iGrid,
              nbl->nci_tot += npair;
  
              /* Increase the closing index in i super-cell list */
-            nbl->sci[nbl->nsci - 1].cj4_ind_end =
+            nbl->sci.back().cj4_ind_end =
                  (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
          }
      }
@@ -1508,9 +1388,9 @@ static void make_cluster_list_supersub(const nbnxn_grid_t &iGrid,
  
  /* Returns how many contiguous j-clusters we have starting in the i-list */
  template <typename CjListType>
-static int numContiguousJClusters(const int         cjIndexStart,
-                                  const int         cjIndexEnd,
-                                  const CjListType *cjList)
+static int numContiguousJClusters(const int                       cjIndexStart,
+                                  const int                       cjIndexEnd,
+                                  gmx::ArrayRef<const CjListType> cjList)
  {
      const int firstJCluster = nblCj(cjList, cjIndexStart);
  
@@ -1525,26 +1405,29 @@ static int numContiguousJClusters(const int         cjIndexStart,
      return numContiguous;
  }
  
-/* Helper struct for efficient searching for excluded atoms in a j-list */
+/*! \internal
+ * \brief Helper struct for efficient searching for excluded atoms in a j-list
+ */
  struct JListRanges
  {
-    /* Constructor */
+    /*! \brief Constructs a j-list range from \p cjList with the given index range */
      template <typename CjListType>
-    JListRanges(int               cjIndexStart,
-                int               cjIndexEnd,
-                const CjListType &cjList);
-
-    int cjIndexStart; // The start index in the j-list
-    int cjIndexEnd;   // The end index in the j-list
-    int cjFirst;      // The j-cluster with index cjIndexStart
-    int cjLast;       // The j-cluster with index cjIndexEnd-1
-    int numDirect;    // Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
+    JListRanges(int                             cjIndexStart,
+                int                             cjIndexEnd,
+                gmx::ArrayRef<const CjListType> cjList);
+
+    int cjIndexStart; //!< The start index in the j-list
+    int cjIndexEnd;   //!< The end index in the j-list
+    int cjFirst;      //!< The j-cluster with index cjIndexStart
+    int cjLast;       //!< The j-cluster with index cjIndexEnd-1
+    int numDirect;    //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
  };
  
+#ifndef DOXYGEN
  template <typename CjListType>
-JListRanges::JListRanges(int               cjIndexStart,
-                         int               cjIndexEnd,
-                         const CjListType &cjList) :
+JListRanges::JListRanges(int                             cjIndexStart,
+                         int                             cjIndexEnd,
+                         gmx::ArrayRef<const CjListType> cjList) :
      cjIndexStart(cjIndexStart),
      cjIndexEnd(cjIndexEnd)
  {
@@ -1559,6 +1442,7 @@ JListRanges::JListRanges(int               cjIndexStart,
       */
      numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
  }
+#endif // !DOXYGEN
  
  /* Return the index of \p jCluster in the given range or -1 when not present
   *
@@ -1566,9 +1450,10 @@ JListRanges::JListRanges(int               cjIndexStart,
   *       important. It should be inlined and fully optimized.
   */
  template <typename CjListType>
-static inline int findJClusterInJList(int                jCluster,
-                                      const JListRanges &ranges,
-                                      const CjListType  &cjList)
+static inline int
+findJClusterInJList(int                              jCluster,
+                    const JListRanges               &ranges,
+                    gmx::ArrayRef<const CjListType>  cjList)
  {
      int index;
  
@@ -1619,7 +1504,7 @@ static nbnxn_ci_t *getOpenIEntry(NbnxnPairlistCpu *nbl)
  /* Return the i-entry in the list we are currently operating on */
  static nbnxn_sci_t *getOpenIEntry(NbnxnPairlistGpu *nbl)
  {
-    return &nbl->sci[nbl->nsci - 1];
+    return &nbl->sci.back();
  }
  
  /* Set all atom-pair exclusions for a simple type list i-entry
@@ -1641,7 +1526,7 @@ setExclusionsForIEntry(const nbnxn_search   *nbs,
          return;
      }
  
-    const JListRanges        ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, nbl->cj.data());
+    const JListRanges        ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, gmx::makeConstArrayRef(nbl->cj));
  
      const int                iCluster = iEntry.ci;
  
@@ -1682,7 +1567,8 @@ setExclusionsForIEntry(const nbnxn_search   *nbs,
                  if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                  {
                      const int index =
-                        findJClusterInJList(jCluster, ranges, nbl->cj.data());
+                        findJClusterInJList(jCluster, ranges,
+                                            gmx::makeConstArrayRef(nbl->cj));
  
                      if (index >= 0)
                      {
@@ -2043,13 +1929,13 @@ static void make_fep_list(const nbnxn_search     *nbs,
                                      (bFEP_i || (fep_cj & (1 << j))) &&
                                      (!bDiagRemoved || ind_j >= ind_i))
                                  {
-                                    nbnxn_excl_t *excl;
                                      int           excl_pair;
                                      unsigned int  excl_bit;
                                      real          dx, dy, dz;
  
                                      const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
-                                    get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl);
+                                    nbnxn_excl_t *excl  =
+                                        get_exclusion_mask(nbl, cj4_ind, jHalf);
  
                                      excl_pair = a_mod_wj(j)*nbl->na_ci + i;
                                      excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));
@@ -2134,7 +2020,7 @@ setExclusionsForIEntry(const nbnxn_search   *nbs,
       */
      const JListRanges ranges(iEntry.cj4_ind_start*c_nbnxnGpuJgroupSize,
                               nbl->work->cj_ind,
-                             nbl->cj4);
+                             gmx::makeConstArrayRef(nbl->cj4));
  
      GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
      constexpr int            c_clusterSize      = c_nbnxnGpuClusterSize;
@@ -2187,7 +2073,8 @@ setExclusionsForIEntry(const nbnxn_search   *nbs,
                  if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
                  {
                      const int index =
-                        findJClusterInJList(jCluster, ranges, nbl->cj4);
+                        findJClusterInJList(jCluster, ranges,
+                                            gmx::makeConstArrayRef(nbl->cj4));
  
                      if (index >= 0)
                      {
@@ -2204,8 +2091,8 @@ setExclusionsForIEntry(const nbnxn_search   *nbs,
                              /* Determine which j-half (CUDA warp) we are in */
                              const int     jHalf = innerJ/(c_clusterSize/c_nbnxnGpuClusterpairSplit);
  
-                            nbnxn_excl_t *interactionMask;
-                            get_nbl_exclusions_1(nbl, cj_to_cj4(index), jHalf, &interactionMask);
+                            nbnxn_excl_t *interactionMask =
+                                get_exclusion_mask(nbl, cj_to_cj4(index), jHalf);
  
                              interactionMask->pair[a_mod_wj(innerJ)*c_clusterSize + innerI] &= ~pairMask;
                          }
@@ -2216,16 +2103,6 @@ setExclusionsForIEntry(const nbnxn_search   *nbs,
      }
  }
  
-/* Reallocate the super-cell sci list for at least n entries */
-static void nb_realloc_sci(NbnxnPairlistGpu *nbl, int n)
-{
-    nbl->sci_nalloc = over_alloc_small(n);
-    nbnxn_realloc_void(reinterpret_cast<void **>(&nbl->sci),
-                       nbl->nsci*sizeof(*nbl->sci),
-                       nbl->sci_nalloc*sizeof(*nbl->sci),
-                       nbl->alloc, nbl->free);
-}
-
  /* Make a new ci entry at the back of nbl->ci */
  static void addNewIEntry(NbnxnPairlistCpu *nbl, int ci, int shift, int flags)
  {
@@ -2242,17 +2119,13 @@ static void addNewIEntry(NbnxnPairlistCpu *nbl, int ci, int shift, int flags)
  /* Make a new sci entry at index nbl->nsci */
  static void addNewIEntry(NbnxnPairlistGpu *nbl, int sci, int shift, int gmx_unused flags)
  {
-    if (nbl->nsci + 1 > nbl->sci_nalloc)
-    {
-        nb_realloc_sci(nbl, nbl->nsci + 1);
-    }
-    nbl->nsci++;
-    nbnxn_sci_t &sciEntry = nbl->sci[nbl->nsci - 1];
-
+    nbnxn_sci_t sciEntry;
      sciEntry.sci           = sci;
      sciEntry.shift         = shift;
-    sciEntry.cj4_ind_start = nbl->ncj4;
-    sciEntry.cj4_ind_end   = nbl->ncj4;
+    sciEntry.cj4_ind_start = nbl->cj4.size();
+    sciEntry.cj4_ind_end   = nbl->cj4.size();
+
+    nbl->sci.push_back(sciEntry);
  }
  
  /* Sort the simple j-list cj on exclusions.
@@ -2342,9 +2215,6 @@ static void split_sci_entry(NbnxnPairlistGpu *nbl,
                              int thread, int nthread)
  {
      int nsp_max;
-    int cj4_start, cj4_end, j4len;
-    int sci;
-    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
  
      if (progBal)
      {
@@ -2368,23 +2238,21 @@ static void split_sci_entry(NbnxnPairlistGpu *nbl,
          nsp_max = nsp_target_av;
      }
  
-    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
-    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
-    j4len     = cj4_end - cj4_start;
+    const int cj4_start = nbl->sci.back().cj4_ind_start;
+    const int cj4_end   = nbl->sci.back().cj4_ind_end;
+    const int j4len     = cj4_end - cj4_start;
  
      if (j4len > 1 && j4len*c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize > nsp_max)
      {
-        /* Remove the last ci entry and process the cj4's again */
-        nbl->nsci -= 1;
+        /* Modify the last ci entry and process the cj4's again */
  
-        sci        = nbl->nsci;
-        nsp        = 0;
-        nsp_sci    = 0;
-        nsp_cj4_e  = 0;
-        nsp_cj4    = 0;
+        int nsp        = 0;
+        int nsp_sci    = 0;
+        int nsp_cj4_e  = 0;
+        int nsp_cj4    = 0;
          for (int cj4 = cj4_start; cj4 < cj4_end; cj4++)
          {
-            nsp_cj4_p = nsp_cj4;
+            int nsp_cj4_p = nsp_cj4;
              /* Count the number of cluster pairs in this cj4 group */
              nsp_cj4   = 0;
              for (int p = 0; p < c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize; p++)
@@ -2398,37 +2266,33 @@ static void split_sci_entry(NbnxnPairlistGpu *nbl,
              if (nsp > 0 && nsp_max - nsp < nsp + nsp_cj4 - nsp_max)
              {
                  /* Split the list at cj4 */
-                nbl->sci[sci].cj4_ind_end = cj4;
+                nbl->sci.back().cj4_ind_end = cj4;
                  /* Create a new sci entry */
-                sci++;
-                nbl->nsci++;
-                if (nbl->nsci+1 > nbl->sci_nalloc)
-                {
-                    nb_realloc_sci(nbl, nbl->nsci+1);
-                }
-                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
-                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
-                nbl->sci[sci].cj4_ind_start = cj4;
-                nsp_sci                     = nsp;
-                nsp_cj4_e                   = nsp_cj4_p;
-                nsp                         = 0;
+                nbnxn_sci_t sciNew;
+                sciNew.sci           = nbl->sci.back().sci;
+                sciNew.shift         = nbl->sci.back().shift;
+                sciNew.cj4_ind_start = cj4;
+                nbl->sci.push_back(sciNew);
+
+                nsp_sci              = nsp;
+                nsp_cj4_e            = nsp_cj4_p;
+                nsp                  = 0;
              }
              nsp += nsp_cj4;
          }
  
          /* Put the remaining cj4's in the last sci entry */
-        nbl->sci[sci].cj4_ind_end = cj4_end;
+        nbl->sci.back().cj4_ind_end = cj4_end;
  
          /* Possibly balance out the last two sci's
           * by moving the last cj4 of the second last sci.
           */
          if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
          {
-            nbl->sci[sci-1].cj4_ind_end--;
-            nbl->sci[sci].cj4_ind_start--;
+            GMX_ASSERT(nbl->sci.size() >= 2, "We expect at least two elements");
+            nbl->sci[nbl->sci.size() - 2].cj4_ind_end--;
+            nbl->sci[nbl->sci.size() - 1].cj4_ind_start--;
          }
-
-        nbl->nsci++;
      }
  }
  
@@ -2438,7 +2302,7 @@ static void closeIEntry(NbnxnPairlistGpu *nbl,
                          gmx_bool progBal, float nsp_tot_est,
                          int thread, int nthread)
  {
-    nbnxn_sci_t &sciEntry = nbl->sci[nbl->nsci - 1];
+    nbnxn_sci_t &sciEntry = *getOpenIEntry(nbl);
  
      /* All content of the new ci entry have already been filled correctly,
       * we only need to, potentially, split or remove the entry when empty.
@@ -2449,8 +2313,8 @@ static void closeIEntry(NbnxnPairlistGpu *nbl,
          /* We can only have complete blocks of 4 j-entries in a list,
           * so round the count up before closing.
           */
-        nbl->ncj4         = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
-        nbl->work->cj_ind = nbl->ncj4*c_nbnxnGpuJgroupSize;
+        int ncj4          = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
+        nbl->work->cj_ind = ncj4*c_nbnxnGpuJgroupSize;
  
          if (nsp_max_av > 0)
          {
@@ -2462,7 +2326,7 @@ static void closeIEntry(NbnxnPairlistGpu *nbl,
      else
      {
          /* Entry is empty: remove it  */
-        nbl->nsci--;
+        nbl->sci.pop_back();
      }
  }
  
@@ -2474,8 +2338,7 @@ static void sync_work(NbnxnPairlistCpu gmx_unused *nbl)
  /* Syncs the working array before adding another grid pair to the GPU list */
  static void sync_work(NbnxnPairlistGpu *nbl)
  {
-    nbl->work->cj_ind   = nbl->ncj4*c_nbnxnGpuJgroupSize;
-    nbl->work->cj4_init = nbl->ncj4;
+    nbl->work->cj_ind   = nbl->cj4.size()*c_nbnxnGpuJgroupSize;
  }
  
  /* Clears an NbnxnPairlistCpu data structure */
@@ -2495,10 +2358,10 @@ static void clear_pairlist(NbnxnPairlistCpu *nbl)
  /* Clears an NbnxnPairlistGpu data structure */
  static void clear_pairlist(NbnxnPairlistGpu *nbl)
  {
-    nbl->nsci          = 0;
-    nbl->ncj4          = 0;
-    nbl->nci_tot       = 0;
-    nbl->nexcl         = 1;
+    nbl->sci.clear();
+    nbl->cj4.clear();
+    nbl->excl.resize(1);
+    nbl->nci_tot = 0;
  }
  
  /* Clears a group scheme pair list */
@@ -2912,14 +2775,14 @@ static void print_nblist_ci_cj(FILE *fp, const NbnxnPairlistCpu *nbl)
  /* Debug list print function */
  static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
  {
-    for (int i = 0; i < nbl->nsci; i++)
+    for (const nbnxn_sci_t &sci : nbl->sci)
      {
          fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
-                nbl->sci[i].sci, nbl->sci[i].shift,
-                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
+                sci.sci, sci.shift,
+                sci.cj4_ind_end - sci.cj4_ind_start);
  
          int ncp = 0;
-        for (int j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
+        for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
          {
              for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
              {
@@ -2936,8 +2799,8 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
              }
          }
          fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
-                nbl->sci[i].sci, nbl->sci[i].shift,
-                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
+                sci.sci, sci.shift,
+                sci.cj4_ind_end - sci.cj4_ind_start,
                  ncp);
      }
  }
@@ -2946,38 +2809,21 @@ static void print_nblist_sci_cj(FILE *fp, const NbnxnPairlistGpu *nbl)
  static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
                              NbnxnPairlistGpu *nblc)
  {
-    int nsci, ncj4, nexcl;
-
-    nsci  = nblc->nsci;
-    ncj4  = nblc->ncj4;
-    nexcl = nblc->nexcl;
+    int nsci  = nblc->sci.size();
+    int ncj4  = nblc->cj4.size();
+    int nexcl = nblc->excl.size();
      for (int i = 0; i < nnbl; i++)
      {
-        nsci  += nbl[i]->nsci;
-        ncj4  += nbl[i]->ncj4;
-        nexcl += nbl[i]->nexcl;
+        nsci  += nbl[i]->sci.size();
+        ncj4  += nbl[i]->cj4.size();
+        nexcl += nbl[i]->excl.size();
      }
  
-    if (nsci > nblc->sci_nalloc)
-    {
-        nb_realloc_sci(nblc, nsci);
-    }
-    if (ncj4 > nblc->cj4_nalloc)
-    {
-        nblc->cj4_nalloc = over_alloc_small(ncj4);
-        nbnxn_realloc_void(reinterpret_cast<void **>(&nblc->cj4),
-                           nblc->ncj4*sizeof(*nblc->cj4),
-                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
-                           nblc->alloc, nblc->free);
-    }
-    if (nexcl > nblc->excl_nalloc)
-    {
-        nblc->excl_nalloc = over_alloc_small(nexcl);
-        nbnxn_realloc_void(reinterpret_cast<void **>(&nblc->excl),
-                           nblc->nexcl*sizeof(*nblc->excl),
-                           nblc->excl_nalloc*sizeof(*nblc->excl),
-                           nblc->alloc, nblc->free);
-    }
+    /* Resize with the final, combined size, so we can fill in parallel */
+    /* NOTE: For better performance we should use default initialization */
+    nblc->sci.resize(nsci);
+    nblc->cj4.resize(ncj4);
+    nblc->excl.resize(nexcl);
  
      /* Each thread should copy its own data to the combined arrays,
       * as otherwise data will go back and forth between different caches.
@@ -2991,42 +2837,39 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
      {
          try
          {
-            int                     sci_offset;
-            int                     cj4_offset;
-            int                     excl_offset;
-            const NbnxnPairlistGpu *nbli;
-
-            /* Determine the offset in the combined data for our thread */
-            sci_offset  = nblc->nsci;
-            cj4_offset  = nblc->ncj4;
-            excl_offset = nblc->nexcl;
+            /* Determine the offset in the combined data for our thread.
+             * Note that the original sizes in nblc are lost.
+             */
+            int sci_offset  = nsci;
+            int cj4_offset  = ncj4;
+            int excl_offset = nexcl;
  
-            for (int i = 0; i < n; i++)
+            for (int i = n; i < nnbl; i++)
              {
-                sci_offset  += nbl[i]->nsci;
-                cj4_offset  += nbl[i]->ncj4;
-                excl_offset += nbl[i]->nexcl;
+                sci_offset  -= nbl[i]->sci.size();
+                cj4_offset  -= nbl[i]->cj4.size();
+                excl_offset -= nbl[i]->excl.size();
              }
  
-            nbli = nbl[n];
+            const NbnxnPairlistGpu &nbli = *nbl[n];
  
-            for (int i = 0; i < nbli->nsci; i++)
+            for (size_t i = 0; i < nbli.sci.size(); i++)
              {
-                nblc->sci[sci_offset+i]                = nbli->sci[i];
-                nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
-                nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
+                nblc->sci[sci_offset + i]                = nbli.sci[i];
+                nblc->sci[sci_offset + i].cj4_ind_start += cj4_offset;
+                nblc->sci[sci_offset + i].cj4_ind_end   += cj4_offset;
              }
  
-            for (int j4 = 0; j4 < nbli->ncj4; j4++)
+            for (size_t j4 = 0; j4 < nbli.cj4.size(); j4++)
              {
-                nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
-                nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
-                nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
+                nblc->cj4[cj4_offset + j4]                   = nbli.cj4[j4];
+                nblc->cj4[cj4_offset + j4].imei[0].excl_ind += excl_offset;
+                nblc->cj4[cj4_offset + j4].imei[1].excl_ind += excl_offset;
              }
  
-            for (int j4 = 0; j4 < nbli->nexcl; j4++)
+            for (size_t j4 = 0; j4 < nbli.excl.size(); j4++)
              {
-                nblc->excl[excl_offset+j4] = nbli->excl[j4];
+                nblc->excl[excl_offset + j4] = nbli.excl[j4];
              }
          }
          GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
@@ -3034,10 +2877,7 @@ static void combine_nblists(int nnbl, NbnxnPairlistGpu **nbl,
  
      for (int n = 0; n < nnbl; n++)
      {
-        nblc->nsci    += nbl[n]->nsci;
-        nblc->ncj4    += nbl[n]->ncj4;
          nblc->nci_tot += nbl[n]->nci_tot;
-        nblc->nexcl   += nbl[n]->nexcl;
      }
  }
  
@@ -3881,8 +3721,6 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs,
                                      /* For f buffer flags with simple lists */
                                      ncj_old_j = getNumSimpleJClustersInList(*nbl);
  
-                                    check_cell_list_space(nbl, lastCell - firstCell + 1);
-
                                      makeClusterListWrapper(nbl,
                                                             iGrid, ci,
                                                             jGrid, firstCell, lastCell,
@@ -4188,7 +4026,7 @@ static bool checkRebalanceSimpleLists(const nbnxn_pairlist_set_t *listSet)
   */
  static void sort_sci(NbnxnPairlistGpu *nbl)
  {
-    if (nbl->ncj4 <= nbl->nsci)
+    if (nbl->cj4.size() <= nbl->sci.size())
      {
          /* nsci = 0 or all sci have size 1, sorting won't change the order */
          return;
@@ -4197,25 +4035,19 @@ static void sort_sci(NbnxnPairlistGpu *nbl)
      NbnxnPairlistGpuWork &work = *nbl->work;
  
      /* We will distinguish differences up to double the average */
-    const int m = (2*nbl->ncj4)/nbl->nsci;
+    const int m = (2*nbl->cj4.size())/nbl->sci.size();
  
-    if (work.sci_sort_nalloc != nbl->sci_nalloc)
-    {
-        work.sci_sort_nalloc = nbl->sci_nalloc;
-        nbnxn_realloc_void(reinterpret_cast<void **>(&work.sci_sort),
-                           0,
-                           work.sci_sort_nalloc*sizeof(*work.sci_sort),
-                           nbl->alloc, nbl->free);
-    }
+    /* Resize work.sci_sort so we can sort into it */
+    work.sci_sort.resize(nbl->sci.size());
  
      std::vector<int> &sort = work.sortBuffer;
      /* Set up m + 1 entries in sort, initialized at 0 */
      sort.clear();
      sort.resize(m + 1, 0);
      /* Count the entries of each size */
-    for (int s = 0; s < nbl->nsci; s++)
+    for (const nbnxn_sci_t &sci : nbl->sci)
      {
-        int i = std::min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
+        int i = std::min(m, sci.cj4_ind_end - sci.cj4_ind_start);
          sort[i]++;
      }
      /* Calculate the offset for each count */
@@ -4229,11 +4061,11 @@ static void sort_sci(NbnxnPairlistGpu *nbl)
      }
  
      /* Sort entries directly into place */
-    nbnxn_sci_t *sci_sort = work.sci_sort;
-    for (int s = 0; s < nbl->nsci; s++)
+    gmx::ArrayRef<nbnxn_sci_t> sci_sort = work.sci_sort;
+    for (const nbnxn_sci_t &sci : nbl->sci)
      {
-        int i = std::min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
-        sci_sort[sort[i]++] = nbl->sci[s];
+        int i = std::min(m, sci.cj4_ind_end - sci.cj4_ind_start);
+        sci_sort[sort[i]++] = sci;
      }
  
      /* Swap the sci pointers so we use the new, sorted list */
diff --git a/src/gromacs/mdlib/nbnxn_search.h b/src/gromacs/mdlib/nbnxn_search.h

index 2ff9f90726e03f2a75f6a865001413c3ddb192b5..dc043c2898bbfc554a42c59d9e6e81355b562d91 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.h
+++ b/src/gromacs/mdlib/nbnxn_search.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -61,9 +61,7 @@ nbnxn_search_t nbnxn_init_search(const ivec                *n_dd_cells,
  
  /* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */
  void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
-                             gmx_bool simple, gmx_bool combined,
-                             nbnxn_alloc_t *alloc,
-                             nbnxn_free_t  *free);
+                             gmx_bool simple, gmx_bool combined);
  
  /* Make a apir-list with radius rlist, store it in nbl.
   * The parameter min_ci_balanced sets the minimum required
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 23557fde00bf6dbd47c4bf5358e4f68525d53c84..8e7af55869e7b0531c45ebb992f1da137f955376 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1601,7 +1601,7 @@ static void do_force_cutsVERLET(FILE *fplog,
              }
  
              /* skip the reduction if there was no non-local work to do */
-            if (nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->nsci > 0)
+            if (!nbv->grp[eintNonlocal].nbl_lists.nblGpu[0]->sci.empty())
              {
                  nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), eatNonlocal,
                                                 nbv->nbat, f, wcycle);
author	Berk Hess <hess@kth.se>
	Fri, 4 Jan 2019 17:17:01 +0000 (18:17 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 22 Jan 2019 13:51:51 +0000 (14:51 +0100)
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_consts.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_internal.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_pairlist.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.h		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history