Add dynamic pair-list pruning framework
authorBerk Hess <hess@kth.se>
Fri, 24 Mar 2017 13:22:21 +0000 (14:22 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 16 Aug 2017 20:14:52 +0000 (22:14 +0200)
This change adds the logic for setting up dynamic pruning of the
nbnxn pair-lists. Dynamic pruning allows nstlist and rlist to be
increased while fewer pair interactions are computed in the
non-bonded kernel. This comes at the cost of running an extra
pruning kernel every few steps and of added communication due to
the larger buffers.

The kernels, documentation and heuristic for choosing nstlist
will be added in separate changes.

Change-Id: Id8040b95f812df60f117279267bf551ff4ac8d79

25 files changed:
docs/user-guide/environment-variables.rst
src/gromacs/ewald/pme-load-balancing.cpp
src/gromacs/ewald/pme-load-balancing.h
src/gromacs/gmxpreprocess/grompp.cpp
src/gromacs/mdlib/calc_verletbuf.cpp
src/gromacs/mdlib/calc_verletbuf.h
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/nb_verlet.h
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/gromacs/mdlib/nbnxn_gpu.h
src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
src/gromacs/mdlib/nbnxn_pairlist.h
src/gromacs/mdlib/nbnxn_search.cpp
src/gromacs/mdlib/nbnxn_search.h
src/gromacs/mdlib/nbnxn_tuning.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_tuning.h [new file with mode: 0644]
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/interaction_const.h
src/gromacs/timing/gpu_timing.h
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h
src/programs/mdrun/md.cpp
src/programs/mdrun/runner.cpp

index 43016daf1a941d9a2d70dc4d77ea5ba0b4fe7e99..15059694788bd1bb853497b9a1fc5df7eab0982b 100644 (file)
@@ -351,9 +351,14 @@ Performance and Run Control
 ``MDRUN``
         the :ref:`gmx mdrun` command used by :ref:`gmx tune_pme`.
 
-``GMX_NSTLIST``
-        sets the default value for :mdp:`nstlist`, preventing it from being tuned during
-        :ref:`gmx mdrun` startup when using the Verlet cutoff scheme.
+``GMX_DISABLE_DYNAMICPRUNING``
+        disables dynamic pair-list pruning. Note that :ref:`gmx mdrun` will
+        still tune nstlist to the value that is optimal with dynamic pruning
+        enabled, so for good performance the ``-nstlist`` option should be used.
+
+``GMX_NSTLIST_DYNAMICPRUNING``
+        overrides the dynamic pair-list pruning interval chosen heuristically
+        by mdrun. Values should be between 1 and :mdp:`nstlist` - 1.
 
 ``GMX_USE_TREEREDUCE``
         use tree reduction for nbnxn force reduction. Potentially faster for large number of
index c61e0d640d5ccbb4a45952897ed3b2c0155c786b..384ed8daaafa23be4723d3acaa7e9b5de7d4b6ae 100644 (file)
@@ -62,7 +62,9 @@
 #include "gromacs/math/functions.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 #include "gromacs/mdlib/sim_util.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/inputrec.h"
@@ -81,7 +83,8 @@
 /*! \brief Parameters and settings for one PP-PME setup */
 struct pme_setup_t {
     real              rcut_coulomb;    /**< Coulomb cut-off                              */
-    real              rlist;           /**< pair-list cut-off                            */
+    real              rlistOuter;      /**< cut-off for the outer pair-list              */
+    real              rlistInner;      /**< cut-off for the inner pair-list              */
     real              spacing;         /**< (largest) PME grid spacing                   */
     ivec              grid;            /**< the PME grid dimensions                      */
     real              grid_efficiency; /**< inefficiency factor for non-uniform grids <= 1 */
@@ -129,8 +132,10 @@ struct pme_load_balancing_t {
     real         cut_spacing;        /**< the minimum cutoff / PME grid spacing ratio */
     real         rcut_vdw;           /**< Vdw cutoff (does not change) */
     real         rcut_coulomb_start; /**< Initial electrostatics cutoff */
-    real         rbuf_coulomb;       /**< the pairlist buffer size */
-    real         rbuf_vdw;           /**< the pairlist buffer size */
+    real         rbufOuter_coulomb;  /**< the outer pairlist buffer size */
+    real         rbufOuter_vdw;      /**< the outer pairlist buffer size */
+    real         rbufInner_coulomb;  /**< the inner pairlist buffer size */
+    real         rbufInner_vdw;      /**< the inner pairlist buffer size */
     matrix       box_start;          /**< the initial simulation box */
     int          n;                  /**< the count of setup as well as the allocation size */
     pme_setup_t *setup;              /**< the PME+cutoff setups */
@@ -161,6 +166,7 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
                       const t_inputrec          *ir,
                       matrix                     box,
                       const interaction_const_t *ic,
+                      const NbnxnListParameters *listParams,
                       gmx_pme_t                 *pmedata,
                       gmx_bool                   bUseGPU,
                       gmx_bool                  *bPrinting)
@@ -177,18 +183,20 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     snew(pme_lb, 1);
 
-    pme_lb->bSepPMERanks  = !(cr->duty & DUTY_PME);
+    pme_lb->bSepPMERanks      = !(cr->duty & DUTY_PME);
 
     /* Initially we turn balancing on directly, based on PP/PME imbalance */
-    pme_lb->bTriggerOnDLB = FALSE;
+    pme_lb->bTriggerOnDLB     = FALSE;
 
     /* Any number of stages >= 2 is supported */
-    pme_lb->nstage        = 2;
+    pme_lb->nstage            = 2;
 
-    pme_lb->cutoff_scheme = ir->cutoff_scheme;
+    pme_lb->cutoff_scheme     = ir->cutoff_scheme;
 
-    pme_lb->rbuf_coulomb  = ic->rlist - ic->rcoulomb;
-    pme_lb->rbuf_vdw      = ic->rlist - ic->rvdw;
+    pme_lb->rbufOuter_coulomb = listParams->rlistOuter - ic->rcoulomb;
+    pme_lb->rbufOuter_vdw     = listParams->rlistOuter - ic->rvdw;
+    pme_lb->rbufInner_coulomb = listParams->rlistInner - ic->rcoulomb;
+    pme_lb->rbufInner_vdw     = listParams->rlistInner - ic->rvdw;
 
     copy_mat(box, pme_lb->box_start);
     if (ir->ePBC == epbcXY && ir->nwall == 2)
@@ -204,7 +212,8 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     pme_lb->cur                      = 0;
     pme_lb->setup[0].rcut_coulomb    = ic->rcoulomb;
-    pme_lb->setup[0].rlist           = ic->rlist;
+    pme_lb->setup[0].rlistOuter      = listParams->rlistOuter;
+    pme_lb->setup[0].rlistInner      = listParams->rlistInner;
     pme_lb->setup[0].grid[XX]        = ir->nkx;
     pme_lb->setup[0].grid[YY]        = ir->nky;
     pme_lb->setup[0].grid[ZZ]        = ir->nkz;
@@ -356,17 +365,20 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
     if (pme_lb->cutoff_scheme == ecutsVERLET)
     {
         /* Never decrease the Coulomb and VdW list buffers */
-        set->rlist        = std::max(set->rcut_coulomb + pme_lb->rbuf_coulomb,
-                                     pme_lb->rcut_vdw + pme_lb->rbuf_vdw);
+        set->rlistOuter  = std::max(set->rcut_coulomb + pme_lb->rbufOuter_coulomb,
+                                    pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
+        set->rlistInner  = std::max(set->rcut_coulomb + pme_lb->rbufInner_coulomb,
+                                    pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
     }
     else
     {
-        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
-        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
-        set->rlist            = std::min(tmpr_coulomb, tmpr_vdw);
+        tmpr_coulomb     = set->rcut_coulomb + pme_lb->rbufOuter_coulomb;
+        tmpr_vdw         = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+        set->rlistOuter  = std::min(tmpr_coulomb, tmpr_vdw);
+        set->rlistInner  = set->rlistOuter;
     }
 
-    set->spacing      = sp;
+    set->spacing         = sp;
     /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
     set->grid_efficiency = 1;
     for (d = 0; d < DIM; d++)
@@ -602,7 +614,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
              * better overall performance can be obtained with a slightly
              * shorter cut-off and better DD load balancing.
              */
-            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlist);
+            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistOuter);
         }
     }
     cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
@@ -644,7 +656,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
 
             if (OK && ir->ePBC != epbcNONE)
             {
-                OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlist)
+                OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlistOuter)
                       <= max_cutoff2(ir->ePBC, state->box));
                 if (!OK)
                 {
@@ -659,7 +671,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
                 if (DOMAINDECOMP(cr))
                 {
                     OK = change_dd_cutoff(cr, state, ir,
-                                          pme_lb->setup[pme_lb->cur].rlist);
+                                          pme_lb->setup[pme_lb->cur].rlistOuter);
                     if (!OK)
                     {
                         /* Failed: do not use this setup */
@@ -731,7 +743,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
 
     if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
     {
-        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlist);
+        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistOuter);
         if (!OK)
         {
             /* For some reason the chosen cut-off is incompatible with DD.
@@ -766,9 +778,12 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
 
     set = &pme_lb->setup[pme_lb->cur];
 
-    ic->rcoulomb     = set->rcut_coulomb;
-    ic->rlist        = set->rlist;
-    ic->ewaldcoeff_q = set->ewaldcoeff_q;
+    NbnxnListParameters *listParams = nbv->listParams.get();
+
+    ic->rcoulomb           = set->rcut_coulomb;
+    listParams->rlistOuter = set->rlistOuter;
+    listParams->rlistInner = set->rlistInner;
+    ic->ewaldcoeff_q       = set->ewaldcoeff_q;
     /* TODO: centralize the code that sets the potentials shifts */
     if (ic->coulomb_modifier == eintmodPOTSHIFT)
     {
@@ -794,7 +809,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
     /* We always re-initialize the tables whether they are used or not */
     init_interaction_const_tables(nullptr, ic, rtab);
 
-    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
+    nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams);
 
     /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
      * also sharing texture references. To keep the code simple, we don't
@@ -978,7 +993,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
              * This also ensures that we won't disable the currently
              * optimal setting during a second round of PME balancing.
              */
-            set_dd_dlb_max_cutoff(cr, fr->ic->rlist);
+            set_dd_dlb_max_cutoff(cr, fr->nbv->listParams->rlistOuter);
         }
     }
 
@@ -997,7 +1012,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
         /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
         fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
         fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-        fr->rlist         = fr->ic->rlist;
+        fr->rlist         = fr->nbv->listParams->rlistOuter;
         fr->rcoulomb      = fr->ic->rcoulomb;
         fr->rvdw          = fr->ic->rvdw;
 
@@ -1040,7 +1055,7 @@ static void print_pme_loadbal_setting(FILE              *fplog,
     fprintf(fplog,
             "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
             name,
-            setup->rcut_coulomb, setup->rlist,
+            setup->rcut_coulomb, setup->rlistInner,
             setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
             setup->spacing, 1/setup->ewaldcoeff_q);
 }
@@ -1054,7 +1069,7 @@ static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
     double     pp_ratio, grid_ratio;
     real       pp_ratio_temporary;
 
-    pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlist / pme_lb->setup[0].rlist;
+    pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlistInner / pme_lb->setup[0].rlistInner;
     pp_ratio           = gmx::power3(pp_ratio_temporary);
     grid_ratio         = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
         (double)pme_grid_points(&pme_lb->setup[0]);
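
The buffer bookkeeping above is the core of the load-balancer change: the outer and inner buffer sizes are recorded once at initialization (rbufOuter_* = rlistOuter - rcut, rbufInner_* = rlistInner - rcut) and then held fixed while the Coulomb cut-off is scaled. A minimal sketch of that arithmetic, assuming <algorithm> for std::max (the standalone helper is illustrative only; the member names follow the diff):

    // For a new Coulomb cut-off, recompute both list radii from the
    // recorded buffers; neither buffer is ever allowed to shrink.
    static void updateListRadii(const pme_load_balancing_t &lb, pme_setup_t *set)
    {
        set->rlistOuter = std::max(set->rcut_coulomb + lb.rbufOuter_coulomb,
                                   lb.rcut_vdw + lb.rbufOuter_vdw);
        set->rlistInner = std::max(set->rcut_coulomb + lb.rbufInner_coulomb,
                                   lb.rcut_vdw + lb.rbufInner_vdw);
    }
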
index 3052ac05a5df619b72ff5365da21e2731675ae2f..3edce7e7182df4ba417eae0440475eb043d9e629 100644 (file)
@@ -50,6 +50,7 @@
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/timing/wallcycle.h"
 
+struct NbnxnListParameters;
 struct t_commrec;
 struct t_inputrec;
 class t_state;
@@ -79,6 +80,7 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
                       const t_inputrec          *ir,
                       matrix                     box,
                       const interaction_const_t *ic,
+                      const NbnxnListParameters *listParams,
                       gmx_pme_t                 *pmedata,
                       gmx_bool                   bUseGPU,
                       gmx_bool                  *bPrinting);
index 951290fc4cc1e51e438a2c7d26168982c3e70f7c..59e4f7374313dd7d887865bb22ecbb01037570a9 100644 (file)
@@ -1637,13 +1637,13 @@ static void set_verlet_buffer(const gmx_mtop_t *mtop,
     /* Calculate the buffer size for simple atom vs atoms list */
     ls.cluster_size_i = 1;
     ls.cluster_size_j = 1;
-    calc_verlet_buffer_size(mtop, det(box), ir, buffer_temp,
-                            &ls, &n_nonlin_vsite, &rlist_1x1);
+    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+                            buffer_temp, &ls, &n_nonlin_vsite, &rlist_1x1);
 
     /* Set the pair-list buffer size in ir */
     verletbuf_get_list_setup(FALSE, FALSE, &ls);
-    calc_verlet_buffer_size(mtop, det(box), ir, buffer_temp,
-                            &ls, &n_nonlin_vsite, &ir->rlist);
+    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+                            buffer_temp, &ls, &n_nonlin_vsite, &ir->rlist);
 
     if (n_nonlin_vsite > 0)
     {
index 2b5c938127646fe6a09c8461f0ceece59623e025..5a5e8be919f43c1eaa54873ea28185d28863184a 100644 (file)
@@ -796,6 +796,8 @@ static real md3_force_switch(real p, real rswitch, real rc)
 
 void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
                              const t_inputrec *ir,
+                             int               nstlist,
+                             int               list_lifetime,
                              real reference_temperature,
                              const verletbuf_list_setup_t *list_setup,
                              int *n_nonlin_vsite,
@@ -999,7 +1001,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
     }
 
     /* Determine the variance of the atomic displacement
-     * over nstlist-1 steps: kT_fac
+     * over list_lifetime steps: kT_fac
      * For inertial dynamics (not Brownian dynamics) the mass factor
      * is not included in kT_fac, it is added later.
      */
@@ -1010,7 +1012,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
          * should be negligible (unless nstlist is extremely large, which
          * you wouldn't do anyhow).
          */
-        kT_fac = 2*BOLTZ*reference_temperature*(ir->nstlist-1)*ir->delta_t;
+        kT_fac = 2*BOLTZ*reference_temperature*list_lifetime*ir->delta_t;
         if (ir->bd_fric > 0)
         {
             /* This is directly sigma^2 of the displacement */
@@ -1041,7 +1043,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
     }
     else
     {
-        kT_fac = BOLTZ*reference_temperature*gmx::square((ir->nstlist-1)*ir->delta_t);
+        kT_fac = BOLTZ*reference_temperature*gmx::square(list_lifetime*ir->delta_t);
     }
 
     mass_min = att[0].prop.mass;
@@ -1091,7 +1093,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
         drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
 
         /* Convert the drift to drift per unit time per atom */
-        drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+        drift /= nstlist*ir->delta_t*mtop->natoms;
 
         if (debug)
         {
index 1f073ff758578baeac9b962df31000a64bac9437..c494e137ec90a0efd78d9801edf0d2185f50f8d9 100644 (file)
@@ -78,6 +78,9 @@ void verletbuf_get_list_setup(bool                    makeSimdPairList,
 /* Calculate the non-bonded pair-list buffer size for the Verlet list
  * based on the particle masses, temperature, LJ types, charges
  * and constraints as well as the non-bonded force behavior at the cut-off.
+ * The pair list update frequency and the list lifetime, which is nstlist-1
+ * for normal pair-list buffering, are passed separately, as in some cases
+ * we want an estimate for different values than the ones set in the inputrec.
  * If reference_temperature < 0, the maximum coupling temperature will be used.
  * The target is a maximum energy drift of ir->verletbuf_tol.
  * Returns the number of non-linear virtual sites. For these it's difficult
@@ -86,6 +89,8 @@ void verletbuf_get_list_setup(bool                    makeSimdPairList,
  */
 void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
                              const t_inputrec *ir,
+                             int               nstlist,
+                             int               list_lifetime,
                              real reference_temperature,
                              const verletbuf_list_setup_t *list_setup,
                              int *n_nonlin_vsite,
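
A usage sketch of the extended interface (hedged: nstlistPrune and useGpu are hypothetical locals here, and mtop, box, ir and ls are assumed from the surrounding grompp/tuning context; the CPU/GPU lifetime distinction follows nbnxn_tuning.cpp below):

    real rlistOuter, rlistInner;
    int  n_nonlin_vsite;

    /* Regular (outer) list: created every nstlist steps and used for
     * nstlist-1 steps after the creation step. */
    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
                            -1, &ls, &n_nonlin_vsite, &rlistOuter);

    /* Inner list for dynamic pruning: refreshed every nstlistPrune steps.
     * On a GPU the pruned list is used for the next step's coordinates,
     * so the lifetime is nstlistPrune; on a CPU it is nstlistPrune - 1. */
    calc_verlet_buffer_size(mtop, det(box), ir, nstlistPrune,
                            nstlistPrune - (useGpu ? 0 : 1),
                            -1, &ls, &n_nonlin_vsite, &rlistInner);
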
index 7b97d744ec638e24059fbd2d0a3e2d05fabaafb6..ce3af7933735234e306f62cb83b8dc6dbfa6ee64 100644 (file)
@@ -99,6 +99,7 @@
 #include "gromacs/utility/strconvert.h"
 
 #include "nbnxn_gpu_jit_support.h"
+#include "nbnxn_tuning.h"
 
 const char *egrp_nm[egNR+1] = {
     "Coul-SR", "LJ-SR", "Buck-SR",
@@ -1911,8 +1912,6 @@ init_interaction_const(FILE                       *fp,
     snew_aligned(ic->tabq_coul_F, 16, 32);
     snew_aligned(ic->tabq_coul_V, 16, 32);
 
-    ic->rlist           = fr->rlist;
-
     /* Lennard-Jones */
     ic->vdwtype         = fr->vdwtype;
     ic->vdw_modifier    = fr->vdw_modifier;
@@ -2037,7 +2036,9 @@ static void init_nb_verlet(FILE                *fp,
                            const t_forcerec    *fr,
                            const t_commrec     *cr,
                            const char          *nbpu_opt,
-                           gmx_device_info_t   *deviceInfo)
+                           gmx_device_info_t   *deviceInfo,
+                           const gmx_mtop_t    *mtop,
+                           matrix               box)
 {
     nonbonded_verlet_t *nbv;
     int                 i;
@@ -2047,7 +2048,7 @@ static void init_nb_verlet(FILE                *fp,
     nbnxn_alloc_t      *nb_alloc;
     nbnxn_free_t       *nb_free;
 
-    snew(nbv, 1);
+    nbv = new nonbonded_verlet_t();
 
     nbv->emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
     nbv->bUseGPU    = deviceInfo != nullptr;
@@ -2100,6 +2101,10 @@ static void init_nb_verlet(FILE                *fp,
         }
     }
 
+    nbv->listParams = std::unique_ptr<NbnxnListParameters>(new NbnxnListParameters(ir->rlist));
+    setupDynamicPairlistPruning(fp, ir, mtop, box, nbv->bUseGPU, fr->ic,
+                                nbv->listParams.get());
+
     nbnxn_init_search(&nbv->nbs,
                       DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr,
                       DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : nullptr,
@@ -2175,6 +2180,7 @@ static void init_nb_verlet(FILE                *fp,
         nbnxn_gpu_init(&nbv->gpu_nbv,
                        deviceInfo,
                        fr->ic,
+                       nbv->listParams.get(),
                        nbv->grp,
                        cr->nodeid,
                        (nbv->ngrp > 1) && !bHybridGPURun);
@@ -3134,7 +3140,9 @@ void init_forcerec(FILE                *fp,
             GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
         }
 
-        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt, deviceInfo);
+        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
+                       cr, nbpu_opt, deviceInfo,
+                       mtop, box);
     }
 
     if (ir->eDispCorr != edispcNO)
index 886fbd4d4ef5606e904cf393cd0182dc74f6ec00..0b692333b60ffb714191e5fbaf2328833d0fa57e 100644 (file)
  * which in the code is referred to as the NxN algorithms ("nbnxn_" prefix);
  * for details of the algorithm see DOI:10.1016/j.cpc.2013.06.003.
  *
+ * Algorithmically, the non-bonded computation has two different modes:
+ * A "classical" mode: generate a list every nstlist steps containing at least
+ * all atom pairs up to a distance of rlistOuter and compute pair interactions
+ * for all pairs that are within the interaction cut-off.
+ * A "dynamic pruning" mode: generate an "outer-list" up to cut-off rlistOuter
+ * every nstlist steps and prune the outer-list using a cut-off of rlistInner
+ * every nstlistPrune steps to obtain a smaller "inner-list". This
+ * results in fewer interaction computations and allows for a larger nstlist.
+ * On a GPU, this dynamic pruning is performed in a rolling fashion, pruning
+ * only a sub-part of the list each (second) step. This way it can often
+ * overlap with integration and constraints on the CPU.
+ * Currently a simple heuristic determines which mode will be used.
+ *
  * TODO: add a summary list and brief descriptions of the different submodules:
  * search, CPU kernels, GPU glue code + kernels.
  *
@@ -85,6 +98,8 @@
 #ifndef NB_VERLET_H
 #define NB_VERLET_H
 
+#include <memory>
+
 #include "gromacs/mdlib/nbnxn_gpu_types.h"
 #include "gromacs/mdlib/nbnxn_pairlist.h"
 
@@ -166,15 +181,16 @@ typedef struct nonbonded_verlet_group_t {
 /*! \libinternal
  *  \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
 typedef struct nonbonded_verlet_t {
-    nbnxn_search_t           nbs;             /**< n vs n atom pair searching data       */
-    int                      ngrp;            /**< number of interaction groups          */
-    nonbonded_verlet_group_t grp[2];          /**< local and non-local interaction group */
-
-    gmx_bool                 bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
-    bool                     emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
-    gmx_nbnxn_gpu_t         *gpu_nbv;         /**< pointer to GPU nb verlet data     */
-    int                      min_ci_balanced; /**< pair list balancing parameter
-                                                   used for the 8x8x8 GPU kernels    */
+    std::unique_ptr<NbnxnListParameters> listParams;      /**< Parameters for the search and list pruning setup */
+    nbnxn_search_t                       nbs;             /**< n vs n atom pair searching data       */
+    int                                  ngrp;            /**< number of interaction groups          */
+    nonbonded_verlet_group_t             grp[2];          /**< local and non-local interaction group */
+
+    gmx_bool                             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
+    bool                                 emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+    gmx_nbnxn_gpu_t                     *gpu_nbv;         /**< pointer to GPU nb verlet data     */
+    int                                  min_ci_balanced; /**< pair list balancing parameter
+                                                               used for the 8x8x8 GPU kernels    */
 } nonbonded_verlet_t;
 
 /*! \brief Getter for bUseGPU */
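
The two modes described in the header comment above can be summarized with a conceptual step loop (not GROMACS code; buildPairList and prunePairList are hypothetical stand-ins for the pair search and the prune pass):

    for (gmx_int64_t step = 0; step < numSteps; step++)
    {
        if (step % nstlist == 0)
        {
            buildPairList(rlistOuter);      // outer list, built in both modes
        }
        else if (useDynamicPruning && step % nstlistPrune == 0)
        {
            prunePairList(rlistInner);      // inner list, dynamic mode only
        }
        computeNonbondedForces();           // reads the most recently pruned list
    }
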
index 52e255de560dc788b5f363beb5f303b8e9e93251..55c80da775082134ee3e0780e2cd606f4302f5a9 100644 (file)
@@ -298,7 +298,8 @@ static int pick_ewald_kernel_type(bool                     bTwinCut,
 
 /*! Copies all parameters related to the cut-off from ic to nbp */
 static void set_cutoff_parameters(cu_nbparam_t              *nbp,
-                                  const interaction_const_t *ic)
+                                  const interaction_const_t *ic,
+                                  const NbnxnListParameters *listParams)
 {
     nbp->ewald_beta       = ic->ewaldcoeff_q;
     nbp->sh_ewald         = ic->sh_ewald;
@@ -307,7 +308,7 @@ static void set_cutoff_parameters(cu_nbparam_t              *nbp,
     nbp->c_rf             = ic->c_rf;
     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq         = ic->rlist * ic->rlist;
+    nbp->rlist_sq         = listParams->rlistOuter * listParams->rlistOuter;
 
     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
@@ -362,6 +363,7 @@ static void initParamLookupTable(float                    * &devPtr,
 /*! Initializes the nonbonded parameter data structure. */
 static void init_nbparam(cu_nbparam_t              *nbp,
                          const interaction_const_t *ic,
+                         const NbnxnListParameters *listParams,
                          const nbnxn_atomdata_t    *nbat,
                          const gmx_device_info_t   *dev_info)
 {
@@ -369,7 +371,7 @@ static void init_nbparam(cu_nbparam_t              *nbp,
 
     ntypes  = nbat->ntype;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     /* The kernel code supports LJ combination rules (geometric and LB) for
      * all kernel types, but we only generate useful combination rule kernels.
@@ -477,7 +479,8 @@ static void init_nbparam(cu_nbparam_t              *nbp,
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed. */
 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                        const interaction_const_t   *ic)
+                                        const interaction_const_t   *ic,
+                                        const NbnxnListParameters   *listParams)
 {
     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
     {
@@ -486,7 +489,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
     gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
     cu_nbparam_t     *nbp   = nb->nbparam;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                  nb->dev_info);
@@ -574,10 +577,11 @@ static void init_timings(gmx_wallclock_gpu_t *t)
 /*! Initializes simulation constant data. */
 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
                                   const interaction_const_t      *ic,
+                                  const NbnxnListParameters      *listParams,
                                   const nonbonded_verlet_group_t *nbv_group)
 {
     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+    init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_info);
 
     /* clear energy and shift force outputs */
     nbnxn_cuda_clear_e_fshift(nb);
@@ -586,6 +590,7 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
 void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
                     const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
+                    const NbnxnListParameters *listParams,
                     nonbonded_verlet_group_t  *nbv_grp,
                     int                        /*rank*/,
                     gmx_bool                   bLocalAndNonlocal)
@@ -667,7 +672,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
     /* pick L1 cache configuration */
     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 
-    nbnxn_cuda_init_const(nb, ic, nbv_grp);
+    nbnxn_cuda_init_const(nb, ic, listParams, nbv_grp);
 
     *p_nb = nb;
 
index 318824de45ca1e19ead172e60d3241e795f9c445..1b903f13366e7c14a4025e078c981c3df2c9ea0b 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -71,6 +71,46 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused               *nb,
                              int gmx_unused                            flags,
                              int gmx_unused                            iloc) GPU_FUNC_TERM
 
+/*! \brief
+ * Launch asynchronously the nonbonded prune-only kernel.
+ *
+ *  The local and non-local list pruning are launched in their separate streams.
+ *
+ *  Notes for future scheduling tuning:
+ *  Currently we schedule the dynamic pruning between two MD steps *after* both local and
+ *  nonlocal force D2H transfers have completed. We could launch it already after
+ *  the copy-back is issued, but we want to avoid prune kernels (especially in the
+ *  non-local high-priority stream) competing with nonbonded work.
+ *
+ *  However, this is not ideal as this schedule does not expose the available
+ *  concurrency. The dynamic pruning kernel:
+ *    - should be allowed to overlap with any task other than force compute, including
+ *      transfers (F D2H and the next step's x H2D as well as force clearing).
+ *    - we'd prefer to avoid competition with non-bonded force kernels belonging
+ *      to the same rank and ideally other ranks too.
+ *
+ *  In the most general case, the former would require scheduling pruning in a separate
+ *  stream and adding additional event sync points to ensure that force kernels read
+ *  consistent pair list data. This would lead to some overhead (due to extra
+ *  cudaStreamWaitEvent calls, 3-5 us/call) which we might be able to live with.
+ *  The gains from additional overlap might not be significant as long as
+ *  update+constraints anyway takes longer than pruning, but there will still
+ *  be use-cases where more overlap may help (e.g. multiple ranks per GPU,
+ *  no constraints or h-bond-only constraints).
+ *  The second point above is harder to address given that multiple ranks will often
+ *  share a GPU. Ranks that complete their nonbondeds sooner can schedule pruning earlier
+ *  and without a third priority level it is difficult to avoid some interference of
+ *  prune kernels with force tasks (in particular preemption of low-prio local force task).
+ *
+ * \param[in,out] nb        GPU nonbonded data.
+ * \param[in]     iloc      Interaction locality flag.
+ * \param[in]     numParts  Number of parts the pair list is split into in the rolling kernel.
+ */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
+                                       int gmx_unused              iloc,
+                                       int gmx_unused              numParts) GPU_FUNC_TERM
+
 /*! \brief
  * Launch asynchronously the download of nonbonded forces from the GPU
  * (and energies/shift forces if required).
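
Given the scheduling notes above, a hedged sketch of how a caller might dispatch the rolling prune kernel between MD steps (which part of the list gets pruned is assumed to be tracked inside the GPU module; stepsSinceSearch is a hypothetical counter):

    /* With domain decomposition, local and non-local pruning alternate at
     * even/odd steps, giving the period of 2 noted in nbnxn_tuning.cpp. */
    if (stepsSinceSearch % 2 == (iloc == eintLocal ? 0 : 1))
    {
        nbnxn_gpu_launch_kernel_pruneonly(nb, iloc, numParts);
    }
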
index cddb0f05a496cbba19f74386cbf24eeeb04c9788..ebc1d5171bdf9e74c3e7ea7f5bf1d4cd8ceb35cf 100644 (file)
@@ -54,6 +54,7 @@ extern "C" {
 struct nonbonded_verlet_group_t;
 struct nbnxn_pairlist_t;
 struct nbnxn_atomdata_t;
+struct NbnxnListParameters;
 struct gmx_wallclock_gpu_t;
 struct gmx_gpu_info_t;
 
@@ -62,6 +63,7 @@ GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused            **p_nb,
                     const gmx_device_info_t gmx_unused     *deviceInfo,
                     const interaction_const_t gmx_unused   *ic,
+                    const NbnxnListParameters gmx_unused   *listParams,
                     nonbonded_verlet_group_t gmx_unused    *nbv_grp,
                     int gmx_unused                          rank,
                     /* true if both local and non-local are done on GPU */
@@ -83,7 +85,8 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused               *nb,
  */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
-                                        const interaction_const_t gmx_unused       *ic) GPU_FUNC_TERM
+                                        const interaction_const_t gmx_unused       *ic,
+                                        const NbnxnListParameters gmx_unused       *listParams) GPU_FUNC_TERM
 
 /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
 GPU_FUNC_QUALIFIER
index 4cdd8576c16a5500da9d183927b1319d1b74765f..cd9fbce337d01e7a751b5261745d7f333dfcdbfe 100644 (file)
@@ -97,10 +97,12 @@ static void clearGroupEnergies(nbnxn_atomdata_output_t *out)
  * \param[in,out] vCoulomb        Coulomb energy output buffer
  */
 template <int unrollj> static void
-reduceGroupEnergySimdBuffers(int numGroups, int numGroups_2log,
-                             const real *vVdwSimd, const real *vCoulombSimd,
-                             real * gmx_restrict vVdw,
-                             real * gmx_restrict vCoulomb)
+reduceGroupEnergySimdBuffers(int                       numGroups,
+                             int                       numGroups_2log,
+                             const real * gmx_restrict vVdwSimd,
+                             const real * gmx_restrict vCoulombSimd,
+                             real * gmx_restrict       vVdw,
+                             real * gmx_restrict       vCoulomb)
 {
     // cppcheck-suppress duplicateExpression
     const int unrollj_half     = unrollj/2;
@@ -220,6 +222,8 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t  *nbvg,
     int                nnbl = nbvg->nbl_lists.nnbl;
     nbnxn_pairlist_t **nbl  = nbvg->nbl_lists.nbl;
 
+    GMX_ASSERT(nbl[0]->nci >= 0, "nci<0, which signals an invalid pair-list");
+
     // cppcheck-suppress unreadVariable
     int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
 #pragma omp parallel for schedule(static) num_threads(nthreads)
index a7b95e49a98b03f2275caa5baa2fb0978117ff0a..5be02872c6cbca90edf1f05fe66902fd833ddd10 100644 (file)
@@ -284,7 +284,8 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtim
 /*! \brief Copies all parameters related to the cut-off from ic to nbp
  */
 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
-                                  const interaction_const_t *ic)
+                                  const interaction_const_t *ic,
+                                  const NbnxnListParameters *listParams)
 {
     nbp->ewald_beta       = ic->ewaldcoeff_q;
     nbp->sh_ewald         = ic->sh_ewald;
@@ -293,7 +294,7 @@ static void set_cutoff_parameters(cl_nbparam_t              *nbp,
     nbp->c_rf             = ic->c_rf;
     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq         = ic->rlist * ic->rlist;
+    nbp->rlist_sq         = listParams->rlistOuter * listParams->rlistOuter;
 
     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
@@ -388,6 +389,7 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
  */
 static void init_nbparam(cl_nbparam_t                    *nbp,
                          const interaction_const_t       *ic,
+                         const NbnxnListParameters       *listParams,
                          const nbnxn_atomdata_t          *nbat,
                          const gmx_device_runtime_data_t *runData)
 {
@@ -397,7 +399,7 @@ static void init_nbparam(cl_nbparam_t                    *nbp,
 
     ntypes = nbat->ntype;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     map_interaction_types_to_gpu_kernel_flavors(ic,
                                                 nbat->comb_rule,
@@ -494,7 +496,8 @@ static void init_nbparam(cl_nbparam_t                    *nbp,
 
 //! This function is documented in the header file
 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                        const interaction_const_t   *ic)
+                                        const interaction_const_t   *ic,
+                                        const NbnxnListParameters   *listParams)
 {
     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
     {
@@ -503,7 +506,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
     cl_nbparam_t       *nbp = nb->nbparam;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
 
@@ -678,10 +681,11 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
  */
 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
                                  const interaction_const_t      *ic,
+                                 const NbnxnListParameters      *listParams,
                                  const nonbonded_verlet_group_t *nbv_group)
 {
     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_rundata);
+    init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_rundata);
 }
 
 
@@ -689,6 +693,7 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
                     const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
+                    const NbnxnListParameters *listParams,
                     nonbonded_verlet_group_t  *nbv_grp,
                     int                        rank,
                     gmx_bool                   bLocalAndNonlocal)
@@ -781,7 +786,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
         init_timings(nb->timings);
     }
 
-    nbnxn_ocl_init_const(nb, ic, nbv_grp);
+    nbnxn_ocl_init_const(nb, ic, listParams, nbv_grp);
 
     /* Enable LJ param manual prefetch for AMD or if we request through env. var.
      * TODO: decide about NVIDIA
index 7576b68f067f32e265e653949dc8ff630e66a30c..8ccb2224cedd69185442f936577c55464cf05417 100644 (file)
 
 struct tMPI_Atomic;
 
+/*! \cond INTERNAL */
+
+/*! \brief The setup for generating and pruning the nbnxn pair list.
+ *
+ * Without dynamic pruning rlistOuter=rlistInner.
+ */
+struct NbnxnListParameters
+{
+    /*! \brief Constructor producing a struct with dynamic pruning disabled
+     */
+    NbnxnListParameters(real rlist) :
+        useDynamicPruning(false),
+        nstlistPrune(-1),
+        rlistOuter(rlist),
+        rlistInner(rlist),
+        numRollingParts(1)
+    {
+    }
+
+    bool useDynamicPruning; //!< Are we using dynamic pair-list pruning
+    int  nstlistPrune;      //!< Pair-list dynamic pruning interval
+    real rlistOuter;        //!< Cut-off of the larger, outer pair-list
+    real rlistInner;        //!< Cut-off of the smaller, inner pair-list
+    int  numRollingParts;   //!< The number of parts to divide the pair-list into for rolling pruning; a value of 1 gives no rolling pruning
+};
+
+/*! \endcond */
+
 /* With GPU kernels the i and j cluster size is 8 atoms */
 static const int c_nbnxnGpuClusterSize = 8;
 
@@ -159,15 +187,18 @@ typedef struct nbnxn_pairlist_t {
     int                     na_sc;       /* The number of atoms per super cluster    */
     real                    rlist;       /* The radius for constructing the list     */
     int                     nci;         /* The number of i-clusters in the list     */
+    int                     nciOuter;    /* The number of i-clusters in the outer, unpruned list, -1 when invalid */
     nbnxn_ci_t             *ci;          /* The i-cluster list, size nci             */
-    int                     ci_nalloc;   /* The allocation size of ci                */
+    nbnxn_ci_t             *ciOuter;     /* The outer, unpruned i-cluster list, size nciOuter (=-1 when invalid) */
+    int                     ci_nalloc;   /* The allocation size of ci/ciOuter        */
     int                     nsci;        /* The number of i-super-clusters in the list */
     nbnxn_sci_t            *sci;         /* The i-super-cluster list                 */
     int                     sci_nalloc;  /* The allocation size of sci               */
 
     int                     ncj;         /* The number of j-clusters in the list     */
     nbnxn_cj_t             *cj;          /* The j-cluster list, size ncj             */
-    int                     cj_nalloc;   /* The allocation size of cj                */
+    nbnxn_cj_t             *cjOuter;     /* The outer, unpruned j-cluster list, size ncj    */
+    int                     cj_nalloc;   /* The allocation size of cj/cjOuter        */
     int                     ncjInUse;    /* The number of j-clusters that are used by ci entries in this list, will be <= ncj */
 
     int                     ncj4;        /* The total number of 4*j clusters         */
@@ -184,16 +215,17 @@ typedef struct nbnxn_pairlist_t {
 } nbnxn_pairlist_t;
 
 typedef struct {
-    int                nnbl;        /* number of lists */
-    nbnxn_pairlist_t **nbl;         /* lists */
-    nbnxn_pairlist_t **nbl_work;    /* work space for rebalancing lists */
-    gmx_bool           bCombined;   /* TRUE if lists get combined into one (the 1st) */
-    gmx_bool           bSimple;     /* TRUE if the list of of type "simple"
-                                       (na_sc=na_s, no super-clusters used) */
-    int                natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
-    int                natpair_lj;  /* Total number of atom pairs for LJ kernel   */
-    int                natpair_q;   /* Total number of atom pairs for Q kernel    */
-    t_nblist         **nbl_fep;
+    int                nnbl;                  /* number of lists */
+    nbnxn_pairlist_t **nbl;                   /* lists */
+    nbnxn_pairlist_t **nbl_work;              /* work space for rebalancing lists */
+    gmx_bool           bCombined;             /* TRUE if lists get combined into one (the 1st) */
+    gmx_bool           bSimple;               /* TRUE if the list is of type "simple"
+                                                 (na_sc=na_s, no super-clusters used) */
+    int                natpair_ljq;           /* Total number of atom pairs for LJ+Q kernel */
+    int                natpair_lj;            /* Total number of atom pairs for LJ kernel   */
+    int                natpair_q;             /* Total number of atom pairs for Q kernel    */
+    t_nblist         **nbl_fep;               /* List of free-energy atom pair interactions */
+    gmx_int64_t        outerListCreationStep; /* Step at which the outer list was created */
 } nbnxn_pairlist_set_t;
 
 enum {
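
A brief usage sketch of the new struct (the numeric values are made up for illustration; in the real flow the fields are filled in by setupDynamicPairlistPruning() from nbnxn_tuning.cpp below):

    // Construct with pruning disabled: both radii equal ir->rlist.
    NbnxnListParameters listParams(ir->rlist);

    // After tuning, a GPU run might end up with, e.g.:
    //   listParams.useDynamicPruning == true
    //   listParams.nstlistPrune      == 4     (prune interval in steps)
    //   listParams.rlistOuter        == 1.2   (nm; large buffer, long lifetime)
    //   listParams.rlistInner        == 1.0   (nm; small buffer, short lifetime)
    //   listParams.numRollingParts   == 2     (4 / rolling interval of 2)
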
index 56eb5e1150205db56b5d6c769edc45424c7809b5..297c0531e36ab1daa2337627b7a4b3c6c18c2c89 100644 (file)
@@ -737,6 +737,11 @@ static void check_cell_list_space_simple(nbnxn_pairlist_t *nbl,
                            nbl->ncj*sizeof(*nbl->cj),
                            nbl->cj_nalloc*sizeof(*nbl->cj),
                            nbl->alloc, nbl->free);
+
+        nbnxn_realloc_void((void **)&nbl->cjOuter,
+                           nbl->ncj*sizeof(*nbl->cjOuter),
+                           nbl->cj_nalloc*sizeof(*nbl->cjOuter),
+                           nbl->alloc, nbl->free);
     }
 }
 
@@ -2136,6 +2141,11 @@ static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
                        nbl->nci*sizeof(*nbl->ci),
                        nbl->ci_nalloc*sizeof(*nbl->ci),
                        nbl->alloc, nbl->free);
+
+    nbnxn_realloc_void((void **)&nbl->ciOuter,
+                       nbl->nci*sizeof(*nbl->ciOuter),
+                       nbl->ci_nalloc*sizeof(*nbl->ciOuter),
+                       nbl->alloc, nbl->free);
 }
 
 /* Reallocate the super-cell sci list for at least n entries */
@@ -2399,6 +2409,7 @@ static void clear_pairlist(nbnxn_pairlist_t *nbl)
     nbl->ncjInUse      = 0;
     nbl->ncj4          = 0;
     nbl->nci_tot       = 0;
+    nbl->nciOuter      = -1;
     nbl->nexcl         = 1;
 
     nbl->work->ncj_noq = 0;
@@ -4357,6 +4368,11 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
         balance_fep_lists(nbs, nbl_list);
     }
 
+    /* This is a fresh list, so not pruned, stored using ci and nci.
+     * ciOuter and nciOuter are invalid at this point.
+     */
+    GMX_ASSERT(nbl_list->nbl[0]->nciOuter == -1, "nciOuter should have been set to -1 to signal that it is invalid");
+
     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
     if (LOCAL_I(iloc))
     {
@@ -4410,3 +4426,32 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
         }
     }
 }
+
+void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+{
+    /* TODO: Restructure the lists so we have actual outer and inner
+     *       list objects so we can set a single pointer instead of
+     *       swapping several pointers.
+     */
+
+    for (int i = 0; i < listSet->nnbl; i++)
+    {
+        /* The search produced a list in ci/cj.
+         * Swap the list pointers so that the outer list is in ciOuter,cjOuter
+         * and we can prune it to obtain an inner list in ci/cj.
+         */
+        nbnxn_pairlist_t *list = listSet->nbl[i];
+        list->nciOuter         = list->nci;
+
+        nbnxn_ci_t *ciTmp      = list->ciOuter;
+        list->ciOuter          = list->ci;
+        list->ci               = ciTmp;
+
+        nbnxn_cj_t *cjTmp      = list->cjOuter;
+        list->cjOuter          = list->cj;
+        list->cj               = cjTmp;
+
+        /* Signal that this inner list is currently invalid */
+        list->nci              = -1;
+    }
+}
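
After nbnxnPrepareListForDynamicPruning() has swapped the pointers, a prune pass can compact ciOuter/cjOuter into ci/cj. The actual kernels land in later changes, per the commit message; the following is only a conceptual sketch using the field names from nbnxn_pairlist.h, with clusterPairInRange() as a hypothetical distance check:

    list->nci    = 0;
    int ncjInner = 0;
    for (int i = 0; i < list->nciOuter; i++)
    {
        nbnxn_ci_t ciEntry = list->ciOuter[i];
        int        cjStart = ncjInner;
        for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
        {
            /* Keep only j-clusters within the inner cut-off */
            if (clusterPairInRange(ciEntry.ci, list->cjOuter[j].cj, rlistInner))
            {
                list->cj[ncjInner++] = list->cjOuter[j];
            }
        }
        ciEntry.cj_ind_start  = cjStart;
        ciEntry.cj_ind_end    = ncjInner;
        list->ci[list->nci++] = ciEntry;
    }
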
index 8089e3f8268f15e8c21f31f5fc607024beef1ae5..f6ea8462067ac5975f79120191419f35249324f5 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -83,4 +83,10 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
                          int                   nb_kernel_type,
                          t_nrnb               *nrnb);
 
+/*! \brief Prepare the list-set produced by the search for dynamic pruning
+ *
+ * \param[in,out] listSet  The list-set to prepare for dynamic pruning.
+ */
+void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet);
+
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_tuning.cpp b/src/gromacs/mdlib/nbnxn_tuning.cpp
new file mode 100644 (file)
index 0000000..7f06963
--- /dev/null
@@ -0,0 +1,261 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup __module_nb_verlet
+ */
+
+#include "gmxpre.h"
+
+#include "nbnxn_tuning.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include <cmath>
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \brief Returns whether we can (heuristically) change nstlist and rlist
+ *
+ * \param [in] ir  The input parameter record
+ */
+static bool supportsDynamicPairlistGenerationInterval(const t_inputrec &ir)
+{
+    return
+        ir.cutoff_scheme == ecutsVERLET &&
+        EI_DYNAMICS(ir.eI) &&
+        !(EI_MD(ir.eI) && ir.etc == etcNO) &&
+        ir.verletbuf_tol > 0;
+}
+
+/*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
+ *
+ * Ideally we should auto-tune this value.
+ * Not considering overheads, 1 would be the ideal value. But 2 seems
+ * a reasonable compromise that reduces GPU kernel launch overheads and
+ * also avoids inefficiency on large GPUs when pruning small lists.
+ * Because with domain decomposition we alternate local/non-local pruning
+ * at even/odd steps, which gives a period of 2, this value currently needs
+ * to be 2, which is indirectly asserted when the GPU pruning is dispatched
+ * during the force evaluation.
+ */
+static const int c_nbnxnGpuRollingListPruningInterval = 2;
+
+/*! \brief The minimum nstlist for dynamic pair list pruning.
+ *
+ * In most cases going lower than 4 will lead to a too high pruning cost.
+ * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
+ */
+static const int c_nbnxnDynamicListPruningMinLifetime = 4;
+
+/*! \brief Set the dynamic pairlist pruning parameters in \p ic
+ *
+ * \param[in]     ir          The input parameter record
+ * \param[in]     mtop        The global topology
+ * \param[in]     box         The unit cell
+ * \param[in]     useGpu      Tells if we are using a GPU for non-bondeds
+ * \param[in]     listSetup   The nbnxn pair list setup
+ * \param[in]     userSetNstlistPrune  Whether the user set listParams->nstlistPrune (using an env. var.)
+ * \param[in]     ic                   The nonbonded interaction constants
+ * \param[in,out] listParams  The list setup parameters
+ */
+static void
+setDynamicPairlistPruningParameters(const t_inputrec             *ir,
+                                    const gmx_mtop_t             *mtop,
+                                    matrix                        box,
+                                    gmx_bool                      useGpu,
+                                    const verletbuf_list_setup_t &listSetup,
+                                    bool                          userSetNstlistPrune,
+                                    const interaction_const_t    *ic,
+                                    NbnxnListParameters          *listParams)
+{
+    /* When nstlistPrune was set by the user, we need to execute one loop
+     * iteration to determine rlistInner.
+     * Otherwise we compute rlistInner and increase nstlist as long as
+     * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
+     */
+    const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
+    int        tunedNstlistPrune = listParams->nstlistPrune;
+    do
+    {
+        /* Dynamic pruning on the GPU is performed on the list for
+         * the next step on the coordinates of the current step,
+         * so the list lifetime is nstlistPrune (not the usual nstlist-1).
+         */
+        int listLifetime         = tunedNstlistPrune - (useGpu ? 0 : 1);
+        listParams->nstlistPrune = tunedNstlistPrune;
+        calc_verlet_buffer_size(mtop, det(box), ir,
+                                tunedNstlistPrune, listLifetime,
+                                -1, &listSetup, NULL,
+                                &listParams->rlistInner);
+
+        /* On the GPU we apply the dynamic pruning in a rolling fashion
+         * every c_nbnxnGpuRollingListPruningInterval steps,
+         * so keep nstlistPrune a multiple of the interval.
+         */
+        tunedNstlistPrune += useGpu ? c_nbnxnGpuRollingListPruningInterval : 1;
+    }
+    while (!userSetNstlistPrune &&
+           tunedNstlistPrune < ir->nstlist &&
+           listParams->rlistInner == interactionCutoff);
+
+    if (userSetNstlistPrune)
+    {
+        listParams->useDynamicPruning = true;
+    }
+    else
+    {
+        /* Determine the pair list size increase due to zero interactions */
+        real rlistInc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j,
+                                                      mtop->natoms/det(box));
+
+        /* Dynamic pruning is only useful when the inner list is smaller than
+         * the outer. The factor 0.99 ensures at least 3% list size reduction.
+         *
+         * With dynamic pruning on the CPU we prune after updating,
+         * so nstlistPrune=nstlist-1 would add useless extra work.
+         * With the GPU there will probably be more overhead than gain
+         * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
+         * Note that in such cases the first sub-condition is likely also false.
+         */
+        listParams->useDynamicPruning =
+            (listParams->rlistInner + rlistInc < 0.99*(listParams->rlistOuter + rlistInc) &&
+             listParams->nstlistPrune < ir->nstlist - 1);
+    }
+
+    if (!listParams->useDynamicPruning)
+    {
+        /* These parameters should not be used, but set them to useful values */
+        listParams->nstlistPrune  = -1;
+        listParams->rlistInner    = listParams->rlistOuter;
+    }
+}
+
+void setupDynamicPairlistPruning(FILE                      *fplog,
+                                 const t_inputrec          *ir,
+                                 const gmx_mtop_t          *mtop,
+                                 matrix                     box,
+                                 bool                       useGpu,
+                                 const interaction_const_t *ic,
+                                 NbnxnListParameters       *listParams)
+{
+    GMX_RELEASE_ASSERT(listParams->rlistOuter > 0, "With the nbnxn setup rlist should be > 0");
+
+    /* Initialize the parameters to no dynamic list pruning */
+    listParams->useDynamicPruning = false;
+
+    if (supportsDynamicPairlistGenerationInterval(*ir) &&
+        getenv("GMX_DISABLE_DYNAMICPRUNING") == NULL)
+    {
+        verletbuf_list_setup_t ls;
+        verletbuf_get_list_setup(TRUE, TRUE, &ls);
+
+        /* Note that nstlistPrune can take any value independently of nstlist.
+         * Actually applying rolling pruning is only useful when
+         * nstlistPrune < nstlist - 1.
+         */
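+        /* E.g. GMX_NSTLIST_DYNAMICPRUNING=4 with nstlist=10 requests pruning
+         * every 4 steps with the current pair list.
+         */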
+        char *env                 = getenv("GMX_NSTLIST_DYNAMICPRUNING");
+        bool  userSetNstlistPrune = (env != nullptr);
+
+        if (userSetNstlistPrune)
+        {
+            char *end;
+            listParams->nstlistPrune = strtol(env, &end, 10);
+            if (!end || (*end != 0) ||
+                !(listParams->nstlistPrune > 0 && listParams->nstlistPrune < ir->nstlist))
+            {
+                gmx_fatal(FARGS, "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s; the value should be > 0 and < nstlist", env);
+            }
+        }
+        else
+        {
+            static_assert(c_nbnxnDynamicListPruningMinLifetime % c_nbnxnGpuRollingListPruningInterval == 0,
+                          "c_nbnxnDynamicListPruningMinLifetime sets the starting value for nstlistPrune, which should be divisible by the rolling pruning interval for efficiency reasons.");
+
+            // TODO: Use auto-tuning to determine nstlistPrune
+            listParams->nstlistPrune = c_nbnxnDynamicListPruningMinLifetime;
+        }
+
+        setDynamicPairlistPruningParameters(ir, mtop, box, useGpu, ls,
+                                            userSetNstlistPrune, ic,
+                                            listParams);
+
+        // The pruning kernels are added in separate changes;
+        // keep dynamic pruning disabled until they are present.
+        listParams->useDynamicPruning = false;
+
+        if (listParams->useDynamicPruning && useGpu)
+        {
+            /* Note that we can round down here. This makes the effective
+             * rolling pruning interval slightly shorter than nstlistPrune,
+             * thus giving correct results, but a slightly lower efficiency.
+             */
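+            /* E.g. nstlistPrune=9 with a rolling interval of 2 gives
+             * numRollingParts=4, i.e. a full rolling cycle of 8 < 9 steps.
+             */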
+            listParams->numRollingParts = listParams->nstlistPrune/c_nbnxnGpuRollingListPruningInterval;
+        }
+        else
+        {
+            listParams->numRollingParts = 1;
+        }
+
+        if (fplog && listParams->useDynamicPruning)
+        {
+            const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
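+            /* Illustration of the output produced below (made-up values):
+             *   Using a dual pair-list setup updated with dynamic, rolling pruning:
+             *     outer list: updated every  20 steps, buffer 0.214 nm, rlist 1.214 nm
+             *     inner list: updated every   5 steps, buffer 0.047 nm, rlist 1.047 nm
+             */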
+            fprintf(fplog,
+                    "Using a dual pair-list setup updated with dynamic%s pruning:\n"
+                    "  outer list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n"
+                    "  inner list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n",
+                    listParams->numRollingParts > 1 ? ", rolling" : "",
+                    ir->nstlist, listParams->rlistOuter - interactionCutoff, listParams->rlistOuter,
+                    listParams->nstlistPrune, listParams->rlistInner - interactionCutoff, listParams->rlistInner);
+        }
+    }
+}
diff --git a/src/gromacs/mdlib/nbnxn_tuning.h b/src/gromacs/mdlib/nbnxn_tuning.h
new file mode 100644 (file)
index 0000000..c02fdc5
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ *
+ * \brief Declares functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_nb_verlet
+ */
+
+#ifndef NBNXN_TUNING_H
+#define NBNXN_TUNING_H
+
+#include <stdio.h>
+
+#include "gromacs/math/vectypes.h"
+
+struct gmx_mtop_t;
+struct interaction_const_t;
+struct NbnxnListParameters;
+struct t_inputrec;
+
+/*! \brief Set up the dynamic pairlist pruning
+ *
+ * \param[in,out] fplog       Log file
+ * \param[in]     ir          The input parameter record
+ * \param[in]     mtop        The global topology
+ * \param[in]     box         The unit cell
+ * \param[in]     useGpu      Tells if we are using a GPU for non-bondeds
+ * \param[in]     ic          The nonbonded interactions constants
+ * \param[in,out] listParams  The list setup parameters
+ */
+void setupDynamicPairlistPruning(FILE                      *fplog,
+                                 const t_inputrec          *ir,
+                                 const gmx_mtop_t          *mtop,
+                                 matrix                     box,
+                                 bool                       useGpu,
+                                 const interaction_const_t *ic,
+                                 NbnxnListParameters       *listParams);
+
+#endif /* NBNXN_TUNING_H */
index 10d67fca283510f7f3a88a669b071b5bb690f63f..3a37dc62eef562f8d18b9a488cbf551f9cea961a 100644 (file)
@@ -410,20 +410,18 @@ static void do_nb_verlet(t_forcerec *fr,
                          gmx_enerdata_t *enerd,
                          int flags, int ilocality,
                          int clearF,
+                         gmx_int64_t step,
                          t_nrnb *nrnb,
                          gmx_wallcycle_t wcycle)
 {
-    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
-    nonbonded_verlet_group_t  *nbvg;
-    gmx_bool                   bUsingGpuKernels;
-
     if (!(flags & GMX_FORCE_NONBONDED))
     {
         /* skip non-bonded calculation */
         return;
     }
 
-    nbvg = &fr->nbv->grp[ilocality];
+    nonbonded_verlet_t       *nbv  = fr->nbv;
+    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
 
     /* GPU kernel launch overhead is already timed separately */
     if (fr->cutoff_scheme != ecutsVERLET)
@@ -431,12 +429,27 @@ static void do_nb_verlet(t_forcerec *fr,
         gmx_incons("Invalid cut-off scheme passed!");
     }
 
-    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
+    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
 
     if (!bUsingGpuKernels)
     {
+        /* When dynamic pair-list pruning is requested, we need to prune
+         * the list every nstlistPrune steps.
+         */
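+        /* E.g. with nstlist=20 and nstlistPrune=5, the condition below
+         * holds 0, 5, 10 and 15 steps after the last search.
+         */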
+        if (nbv->listParams->useDynamicPruning &&
+            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
+        {
+            /* Prune the pair-list beyond nbv->listParams->rlistInner using
+             * the current coordinates of the atoms.
+             */
+            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+            GMX_RELEASE_ASSERT(false, "The CPU prune kernel will be called here");
+            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+        }
+
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
+
     switch (nbvg->kernel_type)
     {
         case nbnxnk4x4_PlainC:
@@ -455,7 +468,7 @@ static void do_nb_verlet(t_forcerec *fr,
             break;
 
         case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
+            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
             break;
 
         case nbnxnk8x8x8_PlainC:
@@ -473,7 +486,7 @@ static void do_nb_verlet(t_forcerec *fr,
             break;
 
         default:
-            gmx_incons("Invalid nonbonded kernel type passed!");
+            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
 
     }
     if (!bUsingGpuKernels)
@@ -481,12 +494,13 @@ static void do_nb_verlet(t_forcerec *fr,
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
 
+    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
     }
     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
-             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
+             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
     }
@@ -901,12 +915,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
         nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                             &top->excls,
-                            ic->rlist,
+                            nbv->listParams->rlistOuter,
                             nbv->min_ci_balanced,
                             &nbv->grp[eintLocal].nbl_lists,
                             eintLocal,
                             nbv->grp[eintLocal].kernel_type,
                             nrnb);
+        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+        if (nbv->listParams->useDynamicPruning && !bUseGPU)
+        {
+            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+        }
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
 
         if (bUseGPU)
@@ -938,7 +957,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
         /* launch local nonbonded F on GPU */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
-                     nrnb, wcycle);
+                     step, nrnb, wcycle);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
     }
 
@@ -976,13 +995,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
             nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                 &top->excls,
-                                ic->rlist,
+                                nbv->listParams->rlistOuter,
                                 nbv->min_ci_balanced,
                                 &nbv->grp[eintNonlocal].nbl_lists,
                                 eintNonlocal,
                                 nbv->grp[eintNonlocal].kernel_type,
                                 nrnb);
-
+            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+            if (nbv->listParams->useDynamicPruning && !bUseGPU)
+            {
+                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+            }
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
 
             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
@@ -1013,7 +1036,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
             /* launch non-local nonbonded F on GPU */
             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
-                         nrnb, wcycle);
+                         step, nrnb, wcycle);
             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
         }
     }
@@ -1136,9 +1159,8 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
     if (!bUseOrEmulGPU)
     {
-        /* Maybe we should move this into do_force_lowlevel */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
-                     nrnb, wcycle);
+                     step, nrnb, wcycle);
     }
 
     if (fr->efep != efepNO)
@@ -1172,7 +1194,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                          bDiffKernels ? enbvClearFYes : enbvClearFNo,
-                         nrnb, wcycle);
+                         step, nrnb, wcycle);
         }
 
         if (!bUseOrEmulGPU)
@@ -1246,7 +1268,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             {
                 wallcycle_start_nocount(wcycle, ewcFORCE);
                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
-                             nrnb, wcycle);
+                             step, nrnb, wcycle);
                 wallcycle_stop(wcycle, ewcFORCE);
             }
             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
@@ -1319,6 +1341,27 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             /* now clear the GPU outputs while we finish the step on the CPU */
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+
+            /* Is dynamic pair-list pruning activated? */
+            if (nbv->listParams->useDynamicPruning)
+            {
+                /* We should not launch the rolling pruning kernel at a search
+                 * step or just before search steps, since that's useless.
+                 * Without domain decomposition we prune at even steps.
+                 * With domain decomposition we alternate local and non-local
+                 * pruning at even and odd steps.
+                 */
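+                /* E.g. with nstlist=10 and no DD, the condition below
+                 * triggers 2, 4, 6 and 8 steps after the last search;
+                 * with DD, local parts at even and non-local at odd steps.
+                 */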
+                int  numRollingParts     = nbv->listParams->numRollingParts;
+                GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
+                int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+                bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
+                if (stepWithCurrentList > 0 &&
+                    stepWithCurrentList < inputrec->nstlist - 1 &&
+                    (stepIsEven || DOMAINDECOMP(cr)))
+                {
+                    GMX_RELEASE_ASSERT(false, "The GPU prune kernel will be called here");
+                }
+            }
             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
         }
         else
@@ -1326,7 +1369,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start_nocount(wcycle, ewcFORCE);
             do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                          DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
-                         nrnb, wcycle);
+                         step, nrnb, wcycle);
             wallcycle_stop(wcycle, ewcFORCE);
         }
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
index 6682ef7d5b3ea55747ccb6a60d1c3b29083dc5ba..16b0d32e89b8645d2d1f4fb71b4bd20aadf432aa 100644 (file)
@@ -92,9 +92,6 @@ struct interaction_const_t
     /* Coulomb */
     real rcoulomb;
 
-    /* Cut-off */
-    real rlist;
-
     /* PME/Ewald */
     real ewaldcoeff_q;
     real ewaldcoeff_lj;
index 772b5ee39c49032485cce8b42b9b45e5b76975c7..e9ec67d8555e864cdb46ea6918c4a7175b83b327 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -57,14 +57,16 @@ struct gmx_nbnxn_kernel_timing_data_t
 /*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */
 struct gmx_wallclock_gpu_t
 {
-    struct gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
-                                                          versions of the nonbonded kernels: force-only,
-                                                          force+energy, force+pruning, and force+energy+pruning */
-    double  nb_h2d_t;                                  /**< host to device transfer time in nb calculation  */
-    double  nb_d2h_t;                                  /**< device to host transfer time in nb calculation */
-    int     nb_c;                                      /**< total call count of the nonbonded gpu operations */
-    double  pl_h2d_t;                                  /**< pair search step host to device transfer time */
-    int     pl_h2d_c;                                  /**< pair search step  host to device transfer call count */
+    gmx_nbnxn_kernel_timing_data_t ktime[2][2];      /**< table containing the timings of the four
+                                                        versions of the nonbonded kernels: force-only,
+                                                        force+energy, force+pruning, and force+energy+pruning */
+    gmx_nbnxn_kernel_timing_data_t pruneTime;        /**< table containing the timings of the 1st pass prune-only kernels */
+    gmx_nbnxn_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */
+    double                         nb_h2d_t;         /**< host to device transfer time in nb calculation  */
+    double                         nb_d2h_t;         /**< device to host transfer time in nb calculation */
+    int                            nb_c;             /**< total call count of the nonbonded gpu operations */
+    double                         pl_h2d_t;         /**< pair search step host to device transfer time */
+    int                            pl_h2d_c;         /**< pair search step host to device transfer call count */
 };
 
 #ifdef __cplusplus
index 6b18f2dbfb8476c89f7e0e0681a4debcfb8242e1..85bc4967fb7383679649a9b2746c678a77637a77 100644 (file)
@@ -120,6 +120,7 @@ static const char *wcsn[ewcsNR] =
     "Bonded-FEP F",
     "Restraints F",
     "Listed buffer ops.",
+    "Nonbonded pruning",
     "Nonbonded F",
     "Ewald F correction",
     "NB X buffer ops.",
@@ -890,7 +891,7 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
                 tot_k += gpu_t->ktime[i][j].t;
             }
         }
-        tot_gpu += tot_k;
+        tot_gpu += tot_k + gpu_t->pruneTime.t;
 
         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
         if (wc->wcc[ewcPMEMESH].n > 0)
@@ -918,11 +919,23 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
                 }
             }
         }
-
+        if (gpu_t->pruneTime.c)
+        {
+            print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu);
+        }
         print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
         fprintf(fplog, "%s\n", hline);
         print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
         fprintf(fplog, "%s\n", hline);
+        if (gpu_t->dynamicPruneTime.c)
+        {
+            /* We print the dynamic pruning kernel timings after a separator
+             * and avoid adding them to tot_gpu, as this time is not part of
+             * the force overlap. The fraction is printed relative to the rest.
+             */
+            print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu);
+            fprintf(fplog, "%s\n", hline);
+        }
 
         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
         if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
index d69aaa63984f5fc4fcef2495a88cae3fbb27ba85..d40656e4f8c66ece87b60d0f9d26fb1634f17943 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -68,6 +68,7 @@ enum {
     ewcsLISTED_FEP,
     ewcsRESTRAINTS,
     ewcsLISTED_BUF_OPS,
+    ewcsNONBONDED_PRUNING,
     ewcsNONBONDED,
     ewcsEWALD_CORRECTION,
     ewcsNB_X_BUF_OPS,
index 5b74c841e566ba20f1de987c90614ee4096dc7bd..902f7bfb84ad0c192d77a6a8b45a6ae9445e7758 100644 (file)
@@ -520,7 +520,7 @@ double gmx::do_md(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog,
     if (bPMETune)
     {
         pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box,
-                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+                         fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv),
                          &bPMETunePrinting);
     }
 
index 5a71b61c27fb689d723b6ed5017af1a3e59ec4d4..d2b5a531c76133e4709bbaa65eb3376e4c114a90 100644 (file)
@@ -373,8 +373,8 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
      */
     nstlist_prev = ir->nstlist;
     ir->nstlist  = nbnxnReferenceNstlist;
-    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr,
-                            &rlistWithReferenceNstlist);
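+    /* The list is created and first used on the same step, hence the
+     * usual list lifetime of nstlist - 1 steps is passed here.
+     */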
+    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+                            -1, &ls, nullptr, &rlistWithReferenceNstlist);
     ir->nstlist  = nstlist_prev;
 
     /* Determine the pair list size increase due to zero interactions */
@@ -398,7 +398,7 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
         }
 
         /* Set the pair-list buffer size in ir */
-        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr, &rlist_new);
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
 
         /* Does rlist fit in the box? */
         bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
@@ -497,7 +497,7 @@ static void prepare_verlet_scheme(FILE                           *fplog,
          */
         verletbuf_get_list_setup(true, makeGpuPairList, &ls);
 
-        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr, &rlist_new);
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
 
         if (rlist_new != ir->rlist)
         {