From 0e49e1983d6eb51adb3c3cb3e0ba73fe446d9bdf Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Fri, 24 Mar 2017 14:22:21 +0100
Subject: [PATCH] Add dynamic pair-list pruning framework

The change add the logistic for setting up dynamic pruning of the
nbnxn pair-lists. Dynamic pruning allows for an increase in nstlist
and rlist while computing fewer pair interactions in the non-bonded
kernel. This comes at the cost of running an extra pruning kernel
every few steps and added communication due to larger buffers.

The kernels, documentation and heuristic for choosing nstlist
will be added in separate changes.

Change-Id: Id8040b95f812df60f117279267bf551ff4ac8d79
---
 docs/user-guide/environment-variables.rst     |  11 +-
 src/gromacs/ewald/pme-load-balancing.cpp      |  71 +++--
 src/gromacs/ewald/pme-load-balancing.h        |   2 +
 src/gromacs/gmxpreprocess/grompp.cpp          |   8 +-
 src/gromacs/mdlib/calc_verletbuf.cpp          |  10 +-
 src/gromacs/mdlib/calc_verletbuf.h            |   5 +
 src/gromacs/mdlib/forcerec.cpp                |  18 +-
 src/gromacs/mdlib/nb_verlet.h                 |  34 ++-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  |  19 +-
 src/gromacs/mdlib/nbnxn_gpu.h                 |  42 ++-
 src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h       |   5 +-
 .../mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp  |  12 +-
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp   |  19 +-
 src/gromacs/mdlib/nbnxn_pairlist.h            |  56 +++-
 src/gromacs/mdlib/nbnxn_search.cpp            |  45 +++
 src/gromacs/mdlib/nbnxn_search.h              |   8 +-
 src/gromacs/mdlib/nbnxn_tuning.cpp            | 261 ++++++++++++++++++
 src/gromacs/mdlib/nbnxn_tuning.h              |  74 +++++
 src/gromacs/mdlib/sim_util.cpp                |  81 ++++--
 src/gromacs/mdtypes/interaction_const.h       |   3 -
 src/gromacs/timing/gpu_timing.h               |  20 +-
 src/gromacs/timing/wallcycle.cpp              |  17 +-
 src/gromacs/timing/wallcycle.h                |   3 +-
 src/programs/mdrun/md.cpp                     |   2 +-
 src/programs/mdrun/runner.cpp                 |   8 +-
 25 files changed, 709 insertions(+), 125 deletions(-)
 create mode 100644 src/gromacs/mdlib/nbnxn_tuning.cpp
 create mode 100644 src/gromacs/mdlib/nbnxn_tuning.h

diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst
index 43016daf1a..1505969478 100644
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -351,9 +351,14 @@ Performance and Run Control
 ``MDRUN``
         the :ref:`gmx mdrun` command used by :ref:`gmx tune_pme`.
 
-``GMX_NSTLIST``
-        sets the default value for :mdp:`nstlist`, preventing it from being tuned during
-        :ref:`gmx mdrun` startup when using the Verlet cutoff scheme.
+``GMX_DISABLE DYNAMICPRUNING``
+        disables dynamic pair-list pruning. Note that :ref:`gmx mdrun` will
+        still tune nstlist to the optimal value with dynamic pruning. Thus
+        for good performance the -nstlist option should be used.
+
+``GMX_NSTLIST_DYNAMICPRUNING``
+        overrides the dynamic pair-list pruning interval chosen heuristically
+        by mdrun. Values should be between 1 and :mdp:`nstlist - 1`.
 
 ``GMX_USE_TREEREDUCE``
         use tree reduction for nbnxn force reduction. Potentially faster for large number of
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp
index c61e0d640d..384ed8daaa 100644
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -62,7 +62,9 @@
 #include "gromacs/math/functions.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
 #include "gromacs/mdlib/sim_util.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/inputrec.h"
@@ -81,7 +83,8 @@
 /*! \brief Parameters and settings for one PP-PME setup */
 struct pme_setup_t {
     real              rcut_coulomb;    /**< Coulomb cut-off                              */
-    real              rlist;           /**< pair-list cut-off                            */
+    real              rlistOuter;      /**< cut-off for the outer pair-list              */
+    real              rlistInner;      /**< cut-off for the inner pair-list              */
     real              spacing;         /**< (largest) PME grid spacing                   */
     ivec              grid;            /**< the PME grid dimensions                      */
     real              grid_efficiency; /**< ineffiency factor for non-uniform grids <= 1 */
@@ -129,8 +132,10 @@ struct pme_load_balancing_t {
     real         cut_spacing;        /**< the minimum cutoff / PME grid spacing ratio */
     real         rcut_vdw;           /**< Vdw cutoff (does not change) */
     real         rcut_coulomb_start; /**< Initial electrostatics cutoff */
-    real         rbuf_coulomb;       /**< the pairlist buffer size */
-    real         rbuf_vdw;           /**< the pairlist buffer size */
+    real         rbufOuter_coulomb;  /**< the outer pairlist buffer size */
+    real         rbufOuter_vdw;      /**< the outer pairlist buffer size */
+    real         rbufInner_coulomb;  /**< the inner pairlist buffer size */
+    real         rbufInner_vdw;      /**< the inner pairlist buffer size */
     matrix       box_start;          /**< the initial simulation box */
     int          n;                  /**< the count of setup as well as the allocation size */
     pme_setup_t *setup;              /**< the PME+cutoff setups */
@@ -161,6 +166,7 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
                       const t_inputrec          *ir,
                       matrix                     box,
                       const interaction_const_t *ic,
+                      const NbnxnListParameters *listParams,
                       gmx_pme_t                 *pmedata,
                       gmx_bool                   bUseGPU,
                       gmx_bool                  *bPrinting)
@@ -177,18 +183,20 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     snew(pme_lb, 1);
 
-    pme_lb->bSepPMERanks  = !(cr->duty & DUTY_PME);
+    pme_lb->bSepPMERanks      = !(cr->duty & DUTY_PME);
 
     /* Initially we turn on balancing directly on based on PP/PME imbalance */
-    pme_lb->bTriggerOnDLB = FALSE;
+    pme_lb->bTriggerOnDLB     = FALSE;
 
     /* Any number of stages >= 2 is supported */
-    pme_lb->nstage        = 2;
+    pme_lb->nstage            = 2;
 
-    pme_lb->cutoff_scheme = ir->cutoff_scheme;
+    pme_lb->cutoff_scheme     = ir->cutoff_scheme;
 
-    pme_lb->rbuf_coulomb  = ic->rlist - ic->rcoulomb;
-    pme_lb->rbuf_vdw      = ic->rlist - ic->rvdw;
+    pme_lb->rbufOuter_coulomb = listParams->rlistOuter - ic->rcoulomb;
+    pme_lb->rbufOuter_vdw     = listParams->rlistOuter - ic->rvdw;
+    pme_lb->rbufInner_coulomb = listParams->rlistInner - ic->rcoulomb;
+    pme_lb->rbufInner_vdw     = listParams->rlistInner - ic->rvdw;
 
     copy_mat(box, pme_lb->box_start);
     if (ir->ePBC == epbcXY && ir->nwall == 2)
@@ -204,7 +212,8 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     pme_lb->cur                      = 0;
     pme_lb->setup[0].rcut_coulomb    = ic->rcoulomb;
-    pme_lb->setup[0].rlist           = ic->rlist;
+    pme_lb->setup[0].rlistOuter      = listParams->rlistOuter;
+    pme_lb->setup[0].rlistInner      = listParams->rlistInner;
     pme_lb->setup[0].grid[XX]        = ir->nkx;
     pme_lb->setup[0].grid[YY]        = ir->nky;
     pme_lb->setup[0].grid[ZZ]        = ir->nkz;
@@ -356,17 +365,20 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
     if (pme_lb->cutoff_scheme == ecutsVERLET)
     {
         /* Never decrease the Coulomb and VdW list buffers */
-        set->rlist        = std::max(set->rcut_coulomb + pme_lb->rbuf_coulomb,
-                                     pme_lb->rcut_vdw + pme_lb->rbuf_vdw);
+        set->rlistOuter  = std::max(set->rcut_coulomb + pme_lb->rbufOuter_coulomb,
+                                    pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
+        set->rlistInner  = std::max(set->rcut_coulomb + pme_lb->rbufInner_coulomb,
+                                    pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
     }
     else
     {
-        tmpr_coulomb          = set->rcut_coulomb + pme_lb->rbuf_coulomb;
-        tmpr_vdw              = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
-        set->rlist            = std::min(tmpr_coulomb, tmpr_vdw);
+        tmpr_coulomb     = set->rcut_coulomb + pme_lb->rbufOuter_coulomb;
+        tmpr_vdw         = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+        set->rlistOuter  = std::min(tmpr_coulomb, tmpr_vdw);
+        set->rlistInner  = set->rlistOuter;
     }
 
-    set->spacing      = sp;
+    set->spacing         = sp;
     /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
     set->grid_efficiency = 1;
     for (d = 0; d < DIM; d++)
@@ -602,7 +614,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
              * better overal performance can be obtained with a slightly
              * shorter cut-off and better DD load balancing.
              */
-            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlist);
+            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistOuter);
         }
     }
     cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
@@ -644,7 +656,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
 
             if (OK && ir->ePBC != epbcNONE)
             {
-                OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlist)
+                OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlistOuter)
                       <= max_cutoff2(ir->ePBC, state->box));
                 if (!OK)
                 {
@@ -659,7 +671,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
                 if (DOMAINDECOMP(cr))
                 {
                     OK = change_dd_cutoff(cr, state, ir,
-                                          pme_lb->setup[pme_lb->cur].rlist);
+                                          pme_lb->setup[pme_lb->cur].rlistOuter);
                     if (!OK)
                     {
                         /* Failed: do not use this setup */
@@ -731,7 +743,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
 
     if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
     {
-        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlist);
+        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistOuter);
         if (!OK)
         {
             /* For some reason the chosen cut-off is incompatible with DD.
@@ -766,9 +778,12 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
 
     set = &pme_lb->setup[pme_lb->cur];
 
-    ic->rcoulomb     = set->rcut_coulomb;
-    ic->rlist        = set->rlist;
-    ic->ewaldcoeff_q = set->ewaldcoeff_q;
+    NbnxnListParameters *listParams = nbv->listParams.get();
+
+    ic->rcoulomb           = set->rcut_coulomb;
+    listParams->rlistOuter = set->rlistOuter;
+    listParams->rlistInner = set->rlistInner;
+    ic->ewaldcoeff_q       = set->ewaldcoeff_q;
     /* TODO: centralize the code that sets the potentials shifts */
     if (ic->coulomb_modifier == eintmodPOTSHIFT)
     {
@@ -794,7 +809,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
     /* We always re-initialize the tables whether they are used or not */
     init_interaction_const_tables(nullptr, ic, rtab);
 
-    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
+    nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams);
 
     /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
      * also sharing texture references. To keep the code simple, we don't
@@ -978,7 +993,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
              * This also ensures that we won't disable the currently
              * optimal setting during a second round of PME balancing.
              */
-            set_dd_dlb_max_cutoff(cr, fr->ic->rlist);
+            set_dd_dlb_max_cutoff(cr, fr->nbv->listParams->rlistOuter);
         }
     }
 
@@ -997,7 +1012,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
         /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
         fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
         fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-        fr->rlist         = fr->ic->rlist;
+        fr->rlist         = fr->nbv->listParams->rlistOuter;
         fr->rcoulomb      = fr->ic->rcoulomb;
         fr->rvdw          = fr->ic->rvdw;
 
@@ -1040,7 +1055,7 @@ static void print_pme_loadbal_setting(FILE              *fplog,
     fprintf(fplog,
             "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
             name,
-            setup->rcut_coulomb, setup->rlist,
+            setup->rcut_coulomb, setup->rlistInner,
             setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
             setup->spacing, 1/setup->ewaldcoeff_q);
 }
@@ -1054,7 +1069,7 @@ static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
     double     pp_ratio, grid_ratio;
     real       pp_ratio_temporary;
 
-    pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlist / pme_lb->setup[0].rlist;
+    pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlistInner / pme_lb->setup[0].rlistInner;
     pp_ratio           = gmx::power3(pp_ratio_temporary);
     grid_ratio         = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
         (double)pme_grid_points(&pme_lb->setup[0]);
diff --git a/src/gromacs/ewald/pme-load-balancing.h b/src/gromacs/ewald/pme-load-balancing.h
index 3052ac05a5..3edce7e718 100644
--- a/src/gromacs/ewald/pme-load-balancing.h
+++ b/src/gromacs/ewald/pme-load-balancing.h
@@ -50,6 +50,7 @@
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/timing/wallcycle.h"
 
+struct NbnxnListParameters;
 struct t_commrec;
 struct t_inputrec;
 class t_state;
@@ -79,6 +80,7 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
                       const t_inputrec          *ir,
                       matrix                     box,
                       const interaction_const_t *ic,
+                      const NbnxnListParameters *listParams,
                       gmx_pme_t                 *pmedata,
                       gmx_bool                   bUseGPU,
                       gmx_bool                  *bPrinting);
diff --git a/src/gromacs/gmxpreprocess/grompp.cpp b/src/gromacs/gmxpreprocess/grompp.cpp
index 951290fc4c..59e4f73743 100644
--- a/src/gromacs/gmxpreprocess/grompp.cpp
+++ b/src/gromacs/gmxpreprocess/grompp.cpp
@@ -1637,13 +1637,13 @@ static void set_verlet_buffer(const gmx_mtop_t *mtop,
     /* Calculate the buffer size for simple atom vs atoms list */
     ls.cluster_size_i = 1;
     ls.cluster_size_j = 1;
-    calc_verlet_buffer_size(mtop, det(box), ir, buffer_temp,
-                            &ls, &n_nonlin_vsite, &rlist_1x1);
+    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+                            buffer_temp, &ls, &n_nonlin_vsite, &rlist_1x1);
 
     /* Set the pair-list buffer size in ir */
     verletbuf_get_list_setup(FALSE, FALSE, &ls);
-    calc_verlet_buffer_size(mtop, det(box), ir, buffer_temp,
-                            &ls, &n_nonlin_vsite, &ir->rlist);
+    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+                            buffer_temp, &ls, &n_nonlin_vsite, &ir->rlist);
 
     if (n_nonlin_vsite > 0)
     {
diff --git a/src/gromacs/mdlib/calc_verletbuf.cpp b/src/gromacs/mdlib/calc_verletbuf.cpp
index 2b5c938127..5a5e8be919 100644
--- a/src/gromacs/mdlib/calc_verletbuf.cpp
+++ b/src/gromacs/mdlib/calc_verletbuf.cpp
@@ -796,6 +796,8 @@ static real md3_force_switch(real p, real rswitch, real rc)
 
 void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
                              const t_inputrec *ir,
+                             int               nstlist,
+                             int               list_lifetime,
                              real reference_temperature,
                              const verletbuf_list_setup_t *list_setup,
                              int *n_nonlin_vsite,
@@ -999,7 +1001,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
     }
 
     /* Determine the variance of the atomic displacement
-     * over nstlist-1 steps: kT_fac
+     * over list_lifetime steps: kT_fac
      * For inertial dynamics (not Brownian dynamics) the mass factor
      * is not included in kT_fac, it is added later.
      */
@@ -1010,7 +1012,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
          * should be negligible (unless nstlist is extremely large, which
          * you wouldn't do anyhow).
          */
-        kT_fac = 2*BOLTZ*reference_temperature*(ir->nstlist-1)*ir->delta_t;
+        kT_fac = 2*BOLTZ*reference_temperature*list_lifetime*ir->delta_t;
         if (ir->bd_fric > 0)
         {
             /* This is directly sigma^2 of the displacement */
@@ -1041,7 +1043,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
     }
     else
     {
-        kT_fac = BOLTZ*reference_temperature*gmx::square((ir->nstlist-1)*ir->delta_t);
+        kT_fac = BOLTZ*reference_temperature*gmx::square(list_lifetime*ir->delta_t);
     }
 
     mass_min = att[0].prop.mass;
@@ -1091,7 +1093,7 @@ void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
         drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
 
         /* Convert the drift to drift per unit time per atom */
-        drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+        drift /= nstlist*ir->delta_t*mtop->natoms;
 
         if (debug)
         {
diff --git a/src/gromacs/mdlib/calc_verletbuf.h b/src/gromacs/mdlib/calc_verletbuf.h
index 1f073ff758..c494e137ec 100644
--- a/src/gromacs/mdlib/calc_verletbuf.h
+++ b/src/gromacs/mdlib/calc_verletbuf.h
@@ -78,6 +78,9 @@ void verletbuf_get_list_setup(bool                    makeSimdPairList,
 /* Calculate the non-bonded pair-list buffer size for the Verlet list
  * based on the particle masses, temperature, LJ types, charges
  * and constraints as well as the non-bonded force behavior at the cut-off.
+ * The pair list update frequency and the list lifetime, which is nstlist-1
+ * for normal pair-list buffering, are passed separately, as in some cases
+ * we want an estimate for different values than the ones set in the inputrec.
  * If reference_temperature < 0, the maximum coupling temperature will be used.
  * The target is a maximum energy drift of ir->verletbuf_tol.
  * Returns the number of non-linear virtual sites. For these it's difficult
@@ -86,6 +89,8 @@ void verletbuf_get_list_setup(bool                    makeSimdPairList,
  */
 void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
                              const t_inputrec *ir,
+                             int               nstlist,
+                             int               list_lifetime,
                              real reference_temperature,
                              const verletbuf_list_setup_t *list_setup,
                              int *n_nonlin_vsite,
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index 7b97d744ec..ce3af79337 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -99,6 +99,7 @@
 #include "gromacs/utility/strconvert.h"
 
 #include "nbnxn_gpu_jit_support.h"
+#include "nbnxn_tuning.h"
 
 const char *egrp_nm[egNR+1] = {
     "Coul-SR", "LJ-SR", "Buck-SR",
@@ -1911,8 +1912,6 @@ init_interaction_const(FILE                       *fp,
     snew_aligned(ic->tabq_coul_F, 16, 32);
     snew_aligned(ic->tabq_coul_V, 16, 32);
 
-    ic->rlist           = fr->rlist;
-
     /* Lennard-Jones */
     ic->vdwtype         = fr->vdwtype;
     ic->vdw_modifier    = fr->vdw_modifier;
@@ -2037,7 +2036,9 @@ static void init_nb_verlet(FILE                *fp,
                            const t_forcerec    *fr,
                            const t_commrec     *cr,
                            const char          *nbpu_opt,
-                           gmx_device_info_t   *deviceInfo)
+                           gmx_device_info_t   *deviceInfo,
+                           const gmx_mtop_t    *mtop,
+                           matrix               box)
 {
     nonbonded_verlet_t *nbv;
     int                 i;
@@ -2047,7 +2048,7 @@ static void init_nb_verlet(FILE                *fp,
     nbnxn_alloc_t      *nb_alloc;
     nbnxn_free_t       *nb_free;
 
-    snew(nbv, 1);
+    nbv = new nonbonded_verlet_t();
 
     nbv->emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
     nbv->bUseGPU    = deviceInfo != nullptr;
@@ -2100,6 +2101,10 @@ static void init_nb_verlet(FILE                *fp,
         }
     }
 
+    nbv->listParams = std::unique_ptr<NbnxnListParameters>(new NbnxnListParameters(ir->rlist));
+    setupDynamicPairlistPruning(fp, ir, mtop, box, nbv->bUseGPU, fr->ic,
+                                nbv->listParams.get());
+
     nbnxn_init_search(&nbv->nbs,
                       DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr,
                       DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : nullptr,
@@ -2175,6 +2180,7 @@ static void init_nb_verlet(FILE                *fp,
         nbnxn_gpu_init(&nbv->gpu_nbv,
                        deviceInfo,
                        fr->ic,
+                       nbv->listParams.get(),
                        nbv->grp,
                        cr->nodeid,
                        (nbv->ngrp > 1) && !bHybridGPURun);
@@ -3134,7 +3140,9 @@ void init_forcerec(FILE                *fp,
             GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
         }
 
-        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt, deviceInfo);
+        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
+                       cr, nbpu_opt, deviceInfo,
+                       mtop, box);
     }
 
     if (ir->eDispCorr != edispcNO)
diff --git a/src/gromacs/mdlib/nb_verlet.h b/src/gromacs/mdlib/nb_verlet.h
index 886fbd4d4e..0b692333b6 100644
--- a/src/gromacs/mdlib/nb_verlet.h
+++ b/src/gromacs/mdlib/nb_verlet.h
@@ -54,6 +54,19 @@
  * which in the code is referred to as the NxN algorithms ("nbnxn_" prefix);
  * for details of the algorithm see DOI:10.1016/j.cpc.2013.06.003.
  *
+ * Algorithmically, the non-bonded computation has two different modes:
+ * A "classical" mode: generate a list every nstlist steps containing at least
+ * all atom pairs up to a distance of rlistOuter and compute pair interactions
+ * for all pairs that are within the interaction cut-off.
+ * A "dynamic pruning" mode: generate an "outer-list" up to cut-off rlistOuter
+ * every nstlist steps and prune the outer-list using a cut-off of rlistInner
+ * every nstlistPrune steps to obtain a, smaller, "inner-list". This
+ * results in fewer interaction computations and allows for a larger nstlist.
+ * On a GPU, this dynamic pruning is performed in a rolling fashion, pruning
+ * only a sub-part of the list each (second) step. This way it can often
+ * overlap with integration and constraints on the CPU.
+ * Currently a simple heuristic determines which mode will be used.
+ *
  * TODO: add a summary list and brief descriptions of the different submodules:
  * search, CPU kernels, GPU glue code + kernels.
  *
@@ -85,6 +98,8 @@
 #ifndef NB_VERLET_H
 #define NB_VERLET_H
 
+#include <memory>
+
 #include "gromacs/mdlib/nbnxn_gpu_types.h"
 #include "gromacs/mdlib/nbnxn_pairlist.h"
 
@@ -166,15 +181,16 @@ typedef struct nonbonded_verlet_group_t {
 /*! \libinternal
  *  \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
 typedef struct nonbonded_verlet_t {
-    nbnxn_search_t           nbs;             /**< n vs n atom pair searching data       */
-    int                      ngrp;            /**< number of interaction groups          */
-    nonbonded_verlet_group_t grp[2];          /**< local and non-local interaction group */
-
-    gmx_bool                 bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
-    bool                     emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
-    gmx_nbnxn_gpu_t         *gpu_nbv;         /**< pointer to GPU nb verlet data     */
-    int                      min_ci_balanced; /**< pair list balancing parameter
-                                                   used for the 8x8x8 GPU kernels    */
+    std::unique_ptr<NbnxnListParameters> listParams;      /**< Parameters for the search and list pruning setup */
+    nbnxn_search_t                       nbs;             /**< n vs n atom pair searching data       */
+    int                                  ngrp;            /**< number of interaction groups          */
+    nonbonded_verlet_group_t             grp[2];          /**< local and non-local interaction group */
+
+    gmx_bool                             bUseGPU;         /**< TRUE when non-bonded interactions are computed on a physical GPU */
+    bool                                 emulateGpu;      /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+    gmx_nbnxn_gpu_t                     *gpu_nbv;         /**< pointer to GPU nb verlet data     */
+    int                                  min_ci_balanced; /**< pair list balancing parameter
+                                                               used for the 8x8x8 GPU kernels    */
 } nonbonded_verlet_t;
 
 /*! \brief Getter for bUseGPU */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 52e255de56..55c80da775 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -298,7 +298,8 @@ static int pick_ewald_kernel_type(bool                     bTwinCut,
 
 /*! Copies all parameters related to the cut-off from ic to nbp */
 static void set_cutoff_parameters(cu_nbparam_t              *nbp,
-                                  const interaction_const_t *ic)
+                                  const interaction_const_t *ic,
+                                  const NbnxnListParameters *listParams)
 {
     nbp->ewald_beta       = ic->ewaldcoeff_q;
     nbp->sh_ewald         = ic->sh_ewald;
@@ -307,7 +308,7 @@ static void set_cutoff_parameters(cu_nbparam_t              *nbp,
     nbp->c_rf             = ic->c_rf;
     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq         = ic->rlist * ic->rlist;
+    nbp->rlist_sq         = listParams->rlistOuter * listParams->rlistOuter;
 
     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
@@ -362,6 +363,7 @@ static void initParamLookupTable(float                    * &devPtr,
 /*! Initializes the nonbonded parameter data structure. */
 static void init_nbparam(cu_nbparam_t              *nbp,
                          const interaction_const_t *ic,
+                         const NbnxnListParameters *listParams,
                          const nbnxn_atomdata_t    *nbat,
                          const gmx_device_info_t   *dev_info)
 {
@@ -369,7 +371,7 @@ static void init_nbparam(cu_nbparam_t              *nbp,
 
     ntypes  = nbat->ntype;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     /* The kernel code supports LJ combination rules (geometric and LB) for
      * all kernel types, but we only generate useful combination rule kernels.
@@ -477,7 +479,8 @@ static void init_nbparam(cu_nbparam_t              *nbp,
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed. */
 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                        const interaction_const_t   *ic)
+                                        const interaction_const_t   *ic,
+                                        const NbnxnListParameters   *listParams)
 {
     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
     {
@@ -486,7 +489,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
     gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
     cu_nbparam_t     *nbp   = nb->nbparam;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                  nb->dev_info);
@@ -574,10 +577,11 @@ static void init_timings(gmx_wallclock_gpu_t *t)
 /*! Initializes simulation constant data. */
 static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
                                   const interaction_const_t      *ic,
+                                  const NbnxnListParameters      *listParams,
                                   const nonbonded_verlet_group_t *nbv_group)
 {
     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+    init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_info);
 
     /* clear energy and shift force outputs */
     nbnxn_cuda_clear_e_fshift(nb);
@@ -586,6 +590,7 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
 void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
                     const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
+                    const NbnxnListParameters *listParams,
                     nonbonded_verlet_group_t  *nbv_grp,
                     int                        /*rank*/,
                     gmx_bool                   bLocalAndNonlocal)
@@ -667,7 +672,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t         **p_nb,
     /* pick L1 cache configuration */
     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 
-    nbnxn_cuda_init_const(nb, ic, nbv_grp);
+    nbnxn_cuda_init_const(nb, ic, listParams, nbv_grp);
 
     *p_nb = nb;
 
diff --git a/src/gromacs/mdlib/nbnxn_gpu.h b/src/gromacs/mdlib/nbnxn_gpu.h
index 318824de45..1b903f1336 100644
--- a/src/gromacs/mdlib/nbnxn_gpu.h
+++ b/src/gromacs/mdlib/nbnxn_gpu.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -71,6 +71,46 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused               *nb,
                              int gmx_unused                            flags,
                              int gmx_unused                            iloc) GPU_FUNC_TERM
 
+/*! \brief
+ * Launch asynchronously the nonbonded prune-only kernel.
+ *
+ *  The local and non-local list pruning are launched in their separate streams.
+ *
+ *  Notes for future scheduling tuning:
+ *  Currently we schedule the dynamic pruning between two MD steps *after* both local and
+ *  nonlocal force D2H transfers completed. We could launch already after the cpyback
+ *  is launched, but we want to avoid prune kernels (especially in the non-local
+ *  high prio-stream) competing with nonbonded work.
+ *
+ *  However, this is not ideal as this schedule does not expose the available
+ *  concurrency. The dynamic pruning kernel:
+ *    - should be allowed to overlap with any task other than force compute, including
+ *      transfers (F D2H and the next step's x H2D as well as force clearing).
+ *    - we'd prefer to avoid competition with non-bonded force kernels belonging
+ *      to the same rank and ideally other ranks too.
+ *
+ *  In the most general case, the former would require scheduling pruning in a separate
+ *  stream and adding additional event sync points to ensure that force kernels read
+ *  consistent pair list data. This would lead to some overhead (due to extra
+ *  cudaStreamWaitEvent calls, 3-5 us/call) which we might be able to live with.
+ *  The gains from additional overlap might not be significant as long as
+ *  update+constraints anyway takes longer than pruning, but there will still
+ *  be use-cases where more overlap may help (e.g. multiple ranks per GPU,
+ *  no/hbonds only constraints).
+ *  The above second point is harder to address given that multiple ranks will often
+ *  share a GPU. Ranks that complete their nonbondeds sooner can schedule pruning earlier
+ *  and without a third priority level it is difficult to avoid some interference of
+ *  prune kernels with force tasks (in particular preemption of low-prio local force task).
+ *
+ * \param [inout] nb        GPU nonbonded data.
+ * \param [in]    iloc      Interaction locality flag.
+ * \param [in]    numParts  Number of parts the pair list is split into in the rolling kernel.
+ */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
+                                       int gmx_unused              iloc,
+                                       int gmx_unused              numParts) GPU_FUNC_TERM
+
 /*! \brief
  * Launch asynchronously the download of nonbonded forces from the GPU
  * (and energies/shift forces if required).
diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
index cddb0f05a4..ebc1d5171b 100644
--- a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
+++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
@@ -54,6 +54,7 @@ extern "C" {
 struct nonbonded_verlet_group_t;
 struct nbnxn_pairlist_t;
 struct nbnxn_atomdata_t;
+struct NbnxnListParameters;
 struct gmx_wallclock_gpu_t;
 struct gmx_gpu_info_t;
 
@@ -62,6 +63,7 @@ GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused            **p_nb,
                     const gmx_device_info_t gmx_unused     *deviceInfo,
                     const interaction_const_t gmx_unused   *ic,
+                    const NbnxnListParameters gmx_unused   *listParams,
                     nonbonded_verlet_group_t gmx_unused    *nbv_grp,
                     int gmx_unused                          rank,
                     /* true if both local and non-local are done on GPU */
@@ -83,7 +85,8 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused               *nb,
  */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
-                                        const interaction_const_t gmx_unused       *ic) GPU_FUNC_TERM
+                                        const interaction_const_t gmx_unused       *ic,
+                                        const NbnxnListParameters gmx_unused       *listParams) GPU_FUNC_TERM
 
 /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
 GPU_FUNC_QUALIFIER
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
index 4cdd8576c1..cd9fbce337 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_cpu.cpp
@@ -97,10 +97,12 @@ static void clearGroupEnergies(nbnxn_atomdata_output_t *out)
  * \param[in,out] vCoulomb        Coulomb energy output buffer
  */
 template <int unrollj> static void
-reduceGroupEnergySimdBuffers(int numGroups, int numGroups_2log,
-                             const real *vVdwSimd, const real *vCoulombSimd,
-                             real * gmx_restrict vVdw,
-                             real * gmx_restrict vCoulomb)
+reduceGroupEnergySimdBuffers(int                       numGroups,
+                             int                       numGroups_2log,
+                             const real * gmx_restrict vVdwSimd,
+                             const real * gmx_restrict vCoulombSimd,
+                             real * gmx_restrict       vVdw,
+                             real * gmx_restrict       vCoulomb)
 {
     // cppcheck-suppress duplicateExpression
     const int unrollj_half     = unrollj/2;
@@ -220,6 +222,8 @@ nbnxn_kernel_cpu(nonbonded_verlet_group_t  *nbvg,
     int                nnbl = nbvg->nbl_lists.nnbl;
     nbnxn_pairlist_t **nbl  = nbvg->nbl_lists.nbl;
 
+    GMX_ASSERT(nbl[0]->nci >= 0, "nci<0, which signals an invalid pair-list");
+
     // cppcheck-suppress unreadVariable
     int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
 #pragma omp parallel for schedule(static) num_threads(nthreads)
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
index a7b95e49a9..5be02872c6 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -284,7 +284,8 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtim
 /*! \brief Copies all parameters related to the cut-off from ic to nbp
  */
 static void set_cutoff_parameters(cl_nbparam_t              *nbp,
-                                  const interaction_const_t *ic)
+                                  const interaction_const_t *ic,
+                                  const NbnxnListParameters *listParams)
 {
     nbp->ewald_beta       = ic->ewaldcoeff_q;
     nbp->sh_ewald         = ic->sh_ewald;
@@ -293,7 +294,7 @@ static void set_cutoff_parameters(cl_nbparam_t              *nbp,
     nbp->c_rf             = ic->c_rf;
     nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
     nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq         = ic->rlist * ic->rlist;
+    nbp->rlist_sq         = listParams->rlistOuter * listParams->rlistOuter;
 
     nbp->sh_lj_ewald      = ic->sh_lj_ewald;
     nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
@@ -388,6 +389,7 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
  */
 static void init_nbparam(cl_nbparam_t                    *nbp,
                          const interaction_const_t       *ic,
+                         const NbnxnListParameters       *listParams,
                          const nbnxn_atomdata_t          *nbat,
                          const gmx_device_runtime_data_t *runData)
 {
@@ -397,7 +399,7 @@ static void init_nbparam(cl_nbparam_t                    *nbp,
 
     ntypes = nbat->ntype;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     map_interaction_types_to_gpu_kernel_flavors(ic,
                                                 nbat->comb_rule,
@@ -494,7 +496,8 @@ static void init_nbparam(cl_nbparam_t                    *nbp,
 
 //! This function is documented in the header file
 void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                        const interaction_const_t   *ic)
+                                        const interaction_const_t   *ic,
+                                        const NbnxnListParameters   *listParams)
 {
     if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
     {
@@ -503,7 +506,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
     gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
     cl_nbparam_t       *nbp = nb->nbparam;
 
-    set_cutoff_parameters(nbp, ic);
+    set_cutoff_parameters(nbp, ic, listParams);
 
     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
 
@@ -678,10 +681,11 @@ static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
  */
 static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
                                  const interaction_const_t      *ic,
+                                 const NbnxnListParameters      *listParams,
                                  const nonbonded_verlet_group_t *nbv_group)
 {
     init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_rundata);
+    init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_rundata);
 }
 
 
@@ -689,6 +693,7 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
 void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
                     const gmx_device_info_t   *deviceInfo,
                     const interaction_const_t *ic,
+                    const NbnxnListParameters *listParams,
                     nonbonded_verlet_group_t  *nbv_grp,
                     int                        rank,
                     gmx_bool                   bLocalAndNonlocal)
@@ -781,7 +786,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
         init_timings(nb->timings);
     }
 
-    nbnxn_ocl_init_const(nb, ic, nbv_grp);
+    nbnxn_ocl_init_const(nb, ic, listParams, nbv_grp);
 
     /* Enable LJ param manual prefetch for AMD or if we request through env. var.
      * TODO: decide about NVIDIA
diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h
index 7576b68f06..8ccb2224ce 100644
--- a/src/gromacs/mdlib/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -46,6 +46,34 @@
 
 struct tMPI_Atomic;
 
+/*! \cond INTERNAL */
+
+/*! \brief The setup for generating and pruning the nbnxn pair list.
+ *
+ * Without dynamic pruning rlistOuter=rlistInner.
+ */
+struct NbnxnListParameters
+{
+    /*! \brief Constructor producing a struct with dynamic pruning disabled
+     */
+    NbnxnListParameters(real rlist) :
+        useDynamicPruning(false),
+        nstlistPrune(-1),
+        rlistOuter(rlist),
+        rlistInner(rlist),
+        numRollingParts(1)
+    {
+    }
+
+    bool useDynamicPruning; //!< Are we using dynamic pair-list pruning
+    int  nstlistPrune;      //!< Pair-list dynamic pruning interval
+    real rlistOuter;        //!< Cut-off of the larger, outer pair-list
+    real rlistInner;        //!< Cut-off of the smaller, inner pair-list
+    int  numRollingParts;   //!< The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning
+};
+
+/*! \endcond */
+
 /* With GPU kernels the i and j cluster size is 8 atoms */
 static const int c_nbnxnGpuClusterSize = 8;
 
@@ -159,15 +187,18 @@ typedef struct nbnxn_pairlist_t {
     int                     na_sc;       /* The number of atoms per super cluster    */
     real                    rlist;       /* The radius for constructing the list     */
     int                     nci;         /* The number of i-clusters in the list     */
+    int                     nciOuter;    /* The number of i-clusters in the outer, unpruned list, -1 when invalid */
     nbnxn_ci_t             *ci;          /* The i-cluster list, size nci             */
-    int                     ci_nalloc;   /* The allocation size of ci                */
+    nbnxn_ci_t             *ciOuter;     /* The outer, unpruned i-cluster list, size nciOuter(=-1 when invalid) */
+    int                     ci_nalloc;   /* The allocation size of ci/ciOuter        */
     int                     nsci;        /* The number of i-super-clusters in the list */
     nbnxn_sci_t            *sci;         /* The i-super-cluster list                 */
     int                     sci_nalloc;  /* The allocation size of sci               */
 
     int                     ncj;         /* The number of j-clusters in the list     */
     nbnxn_cj_t             *cj;          /* The j-cluster list, size ncj             */
-    int                     cj_nalloc;   /* The allocation size of cj                */
+    nbnxn_cj_t             *cjOuter;     /* The outer, unpruned j-cluster list, size ncj    */
+    int                     cj_nalloc;   /* The allocation size of cj/cj0            */
     int                     ncjInUse;    /* The number of j-clusters that are used by ci entries in this list, will be <= ncj */
 
     int                     ncj4;        /* The total number of 4*j clusters         */
@@ -184,16 +215,17 @@ typedef struct nbnxn_pairlist_t {
 } nbnxn_pairlist_t;
 
 typedef struct {
-    int                nnbl;        /* number of lists */
-    nbnxn_pairlist_t **nbl;         /* lists */
-    nbnxn_pairlist_t **nbl_work;    /* work space for rebalancing lists */
-    gmx_bool           bCombined;   /* TRUE if lists get combined into one (the 1st) */
-    gmx_bool           bSimple;     /* TRUE if the list of of type "simple"
-                                       (na_sc=na_s, no super-clusters used) */
-    int                natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
-    int                natpair_lj;  /* Total number of atom pairs for LJ kernel   */
-    int                natpair_q;   /* Total number of atom pairs for Q kernel    */
-    t_nblist         **nbl_fep;
+    int                nnbl;                  /* number of lists */
+    nbnxn_pairlist_t **nbl;                   /* lists */
+    nbnxn_pairlist_t **nbl_work;              /* work space for rebalancing lists */
+    gmx_bool           bCombined;             /* TRUE if lists get combined into one (the 1st) */
+    gmx_bool           bSimple;               /* TRUE if the list of of type "simple"
+                                                 (na_sc=na_s, no super-clusters used) */
+    int                natpair_ljq;           /* Total number of atom pairs for LJ+Q kernel */
+    int                natpair_lj;            /* Total number of atom pairs for LJ kernel   */
+    int                natpair_q;             /* Total number of atom pairs for Q kernel    */
+    t_nblist         **nbl_fep;               /* List of free-energy atom pair interactions */
+    gmx_int64_t        outerListCreationStep; /* Step at which the outer list was created */
 } nbnxn_pairlist_set_t;
 
 enum {
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp
index 56eb5e1150..297c0531e3 100644
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -737,6 +737,11 @@ static void check_cell_list_space_simple(nbnxn_pairlist_t *nbl,
                            nbl->ncj*sizeof(*nbl->cj),
                            nbl->cj_nalloc*sizeof(*nbl->cj),
                            nbl->alloc, nbl->free);
+
+        nbnxn_realloc_void((void **)&nbl->cjOuter,
+                           nbl->ncj*sizeof(*nbl->cjOuter),
+                           nbl->cj_nalloc*sizeof(*nbl->cjOuter),
+                           nbl->alloc, nbl->free);
     }
 }
 
@@ -2136,6 +2141,11 @@ static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
                        nbl->nci*sizeof(*nbl->ci),
                        nbl->ci_nalloc*sizeof(*nbl->ci),
                        nbl->alloc, nbl->free);
+
+    nbnxn_realloc_void((void **)&nbl->ciOuter,
+                       nbl->nci*sizeof(*nbl->ciOuter),
+                       nbl->ci_nalloc*sizeof(*nbl->ciOuter),
+                       nbl->alloc, nbl->free);
 }
 
 /* Reallocate the super-cell sci list for at least n entries */
@@ -2399,6 +2409,7 @@ static void clear_pairlist(nbnxn_pairlist_t *nbl)
     nbl->ncjInUse      = 0;
     nbl->ncj4          = 0;
     nbl->nci_tot       = 0;
+    nbl->nciOuter      = -1;
     nbl->nexcl         = 1;
 
     nbl->work->ncj_noq = 0;
@@ -4357,6 +4368,11 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
         balance_fep_lists(nbs, nbl_list);
     }
 
+    /* This is a fresh list, so not pruned, stored using ci and nci.
+     * ciOuter and nciOuter are invalid at this point.
+     */
+    GMX_ASSERT(nbl_list->nbl[0]->nciOuter == -1, "nciOuter should have been set to -1 to signal that it is invalid");
+
     /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
     if (LOCAL_I(iloc))
     {
@@ -4410,3 +4426,32 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
         }
     }
 }
+
+void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+{
+    /* TODO: Restructure the lists so we have actual outer and inner
+     *       list objects so we can set a single pointer instead of
+     *       swapping several pointers.
+     */
+
+    for (int i = 0; i < listSet->nnbl; i++)
+    {
+        /* The search produced a list in ci/cj.
+         * Swap the list pointers so we get the outer list is ciOuter,cjOuter
+         * and we can prune that to get an inner list in ci/cj.
+         */
+        nbnxn_pairlist_t *list = listSet->nbl[i];
+        list->nciOuter         = list->nci;
+
+        nbnxn_ci_t *ciTmp      = list->ciOuter;
+        list->ciOuter          = list->ci;
+        list->ci               = ciTmp;
+
+        nbnxn_cj_t *cjTmp      = list->cjOuter;
+        list->cjOuter          = list->cj;
+        list->cj               = cjTmp;
+
+        /* Signal that this inner list is currently invalid */
+        list->nci              = -1;
+    }
+}
diff --git a/src/gromacs/mdlib/nbnxn_search.h b/src/gromacs/mdlib/nbnxn_search.h
index 8089e3f826..f6ea846206 100644
--- a/src/gromacs/mdlib/nbnxn_search.h
+++ b/src/gromacs/mdlib/nbnxn_search.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -83,4 +83,10 @@ void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
                          int                   nb_kernel_type,
                          t_nrnb               *nrnb);
 
+/*! \brief Prepare the list-set produced by the search for dynamic pruning
+ *
+ * \param[in,out] listSet  The list-set to prepare for dynamic pruning.
+ */
+void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet);
+
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_tuning.cpp b/src/gromacs/mdlib/nbnxn_tuning.cpp
new file mode 100644
index 0000000000..7f06963a94
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_tuning.cpp
@@ -0,0 +1,261 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup __module_nb_verlet
+ */
+
+#include "gmxpre.h"
+
+#include "nbnxn_tuning.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include <cmath>
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \brief Returns if we can (heuristically) change nstlist and rlist
+ *
+ * \param [in] ir  The input parameter record
+ */
+static bool supportsDynamicPairlistGenerationInterval(const t_inputrec &ir)
+{
+    return
+        ir.cutoff_scheme == ecutsVERLET &&
+        EI_DYNAMICS(ir.eI) &&
+        !(EI_MD(ir.eI) && ir.etc == etcNO) &&
+        ir.verletbuf_tol > 0;
+}
+
+/*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
+ *
+ * Ideally we should auto-tune this value.
+ * Not considering overheads, 1 would be the ideal value. But 2 seems
+ * a reasonable compromise that reduces GPU kernel launch overheads and
+ * also avoids inefficiency on large GPUs when pruning small lists.
+ * Because with domain decomposition we alternate local/non-local pruning
+ * at even/odd steps, which gives a period of 2, this value currenly needs
+ * to be 2, which is indirectly asserted when the GPU pruning is dispatched
+ * during the force evaluation.
+ */
+static const int c_nbnxnGpuRollingListPruningInterval = 2;
+
+/*! \brief The minimum nstlist for dynamic pair list pruning.
+ *
+ * In most cases going lower than 4 will lead to a too high pruning cost.
+ * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
+ */
+static const int c_nbnxnDynamicListPruningMinLifetime = 4;
+
+/*! \brief Set the dynamic pairlist pruning parameters in \p ic
+ *
+ * \param[in]     ir          The input parameter record
+ * \param[in]     mtop        The global topology
+ * \param[in]     box         The unit cell
+ * \param[in]     useGpu      Tells if we are using a GPU for non-bondeds
+ * \param[in]     listSetup   The nbnxn pair list setup
+ * \param[in]     userSetNstlistPrune  The user set ic->nstlistPrune (using an env.var.)
+ * \param[in] ic              The nonbonded interactions constants
+ * \param[in,out] listParams  The list setup parameters
+ */
+static void
+setDynamicPairlistPruningParameters(const t_inputrec             *ir,
+                                    const gmx_mtop_t             *mtop,
+                                    matrix                        box,
+                                    gmx_bool                      useGpu,
+                                    const verletbuf_list_setup_t &listSetup,
+                                    bool                          userSetNstlistPrune,
+                                    const interaction_const_t    *ic,
+                                    NbnxnListParameters          *listParams)
+{
+    /* When nstlistPrune was set by the user, we need to execute one loop
+     * iteration to determine rlistInner.
+     * Otherwise we compute rlistInner and increase nstlist as long as
+     * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
+     */
+    const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
+    int        tunedNstlistPrune = listParams->nstlistPrune;
+    do
+    {
+        /* Dynamic pruning on the GPU is performed on the list for
+         * the next step on the coordinates of the current step,
+         * so the list lifetime is nstlistPrune (not the usual nstlist-1).
+         */
+        int listLifetime         = tunedNstlistPrune - (useGpu ? 0 : 1);
+        listParams->nstlistPrune = tunedNstlistPrune;
+        calc_verlet_buffer_size(mtop, det(box), ir,
+                                tunedNstlistPrune, listLifetime,
+                                -1, &listSetup, NULL,
+                                &listParams->rlistInner);
+
+        /* On the GPU we apply the dynamic pruning in a rolling fashion
+         * every c_nbnxnGpuRollingListPruningInterval steps,
+         * so keep nstlistPrune a multiple of the interval.
+         */
+        tunedNstlistPrune += useGpu ? c_nbnxnGpuRollingListPruningInterval : 1;
+    }
+    while (!userSetNstlistPrune &&
+           tunedNstlistPrune < ir->nstlist &&
+           listParams->rlistInner == interactionCutoff);
+
+    if (userSetNstlistPrune)
+    {
+        listParams->useDynamicPruning = true;
+    }
+    else
+    {
+        /* Determine the pair list size increase due to zero interactions */
+        real rlistInc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j,
+                                                      mtop->natoms/det(box));
+
+        /* Dynamic pruning is only useful when the inner list is smaller than
+         * the outer. The factor 0.99 ensures at least 3% list size reduction.
+         *
+         * With dynamic pruning on the CPU we prune after updating,
+         * so nstlistPrune=nstlist-1 would add useless extra work.
+         * With the GPU there will probably be more overhead than gain
+         * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
+         * Note that in such cases the first sub-condition is likely also false.
+         */
+        listParams->useDynamicPruning =
+            (listParams->rlistInner + rlistInc < 0.99*(listParams->rlistOuter + rlistInc) &&
+             listParams->nstlistPrune < ir->nstlist - 1);
+    }
+
+    if (!listParams->useDynamicPruning)
+    {
+        /* These parameters should not be used, but set them to useful values */
+        listParams->nstlistPrune  = -1;
+        listParams->rlistInner    = listParams->rlistOuter;
+    }
+}
+
+void setupDynamicPairlistPruning(FILE                      *fplog,
+                                 const t_inputrec          *ir,
+                                 const gmx_mtop_t          *mtop,
+                                 matrix                     box,
+                                 bool                       useGpu,
+                                 const interaction_const_t *ic,
+                                 NbnxnListParameters       *listParams)
+{
+    GMX_RELEASE_ASSERT(listParams->rlistOuter > 0, "With the nbnxn setup rlist should be > 0");
+
+    /* Initialize the parameters to no dynamic list pruning */
+    listParams->useDynamicPruning = false;
+
+    if (supportsDynamicPairlistGenerationInterval(*ir) &&
+        getenv("GMX_DISABLE_DYNAMICPRUNING") == NULL)
+    {
+        verletbuf_list_setup_t ls;
+        verletbuf_get_list_setup(TRUE, TRUE, &ls);
+
+        /* Note that nstlistPrune can have any value independently of nstlist.
+         * Actually applying rolling pruning is only useful when
+         * nstlistPrune < nstlist -1
+         */
+        char *env                 = getenv("GMX_NSTLIST_DYNAMICPRUNING");
+        bool  userSetNstlistPrune = (env != NULL);
+
+        if (userSetNstlistPrune)
+        {
+            char *end;
+            listParams->nstlistPrune = strtol(env, &end, 10);
+            if (!end || (*end != 0) ||
+                !(listParams->nstlistPrune > 0 && listParams->nstlistPrune < ir->nstlist))
+            {
+                gmx_fatal(FARGS, "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s, should be > 0 and < nstlist", env);
+            }
+        }
+        else
+        {
+            static_assert(c_nbnxnDynamicListPruningMinLifetime % c_nbnxnGpuRollingListPruningInterval == 0,
+                          "c_nbnxnDynamicListPruningMinLifetime sets the starting value for nstlistPrune, which should be divisible by the rolling pruning interval for efficiency reasons.");
+
+            // TODO: Use auto-tuning to determine nstlistPrune
+            listParams->nstlistPrune = c_nbnxnDynamicListPruningMinLifetime;
+        }
+
+        setDynamicPairlistPruningParameters(ir, mtop, box, useGpu, ls,
+                                            userSetNstlistPrune, ic,
+                                            listParams);
+
+        // Dynamic pruning disabled until the kernels are present
+        listParams->useDynamicPruning = false;
+
+        if (listParams->useDynamicPruning && useGpu)
+        {
+            /* Note that we can round down here. This makes the effective
+             * rolling pruning interval slightly shorter than nstlistTune,
+             * thus giving correct results, but a slightly lower efficiency.
+             */
+            listParams->numRollingParts = listParams->nstlistPrune/c_nbnxnGpuRollingListPruningInterval;
+        }
+        else
+        {
+            listParams->numRollingParts = 1;
+        }
+
+        if (fplog && listParams->useDynamicPruning)
+        {
+            const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
+            fprintf(fplog,
+                    "Using a dual pair-list setup updated with dynamic%s pruning:\n"
+                    "  outer list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n"
+                    "  inner list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n",
+                    listParams->numRollingParts > 1 ? ", rolling" : "",
+                    ir->nstlist, listParams->rlistOuter - interactionCutoff, listParams->rlistOuter,
+                    listParams->nstlistPrune, listParams->rlistInner - interactionCutoff, listParams->rlistInner);
+        }
+    }
+}
diff --git a/src/gromacs/mdlib/nbnxn_tuning.h b/src/gromacs/mdlib/nbnxn_tuning.h
new file mode 100644
index 0000000000..c02fdc54bd
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_tuning.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ *
+ * \brief Declares functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup __module_nb_verlet
+ */
+
+#ifndef NBNXN_TUNING_H
+#define NBNXN_TUNING_H
+
+#include <stdio.h>
+
+#include "gromacs/math/vectypes.h"
+
+struct gmx_mtop_t;
+struct interaction_const_t;
+struct NbnxnListParameters;
+struct t_inputrec;
+
+/*! \brief Set up the dynamic pairlist pruning
+ *
+ * \param[in,out] fplog       Log file
+ * \param[in]     ir          The input parameter record
+ * \param[in]     mtop        The global topology
+ * \param[in]     box         The unit cell
+ * \param[in]     useGpu      Tells if we are using a GPU for non-bondeds
+ * \param[in]     ic          The nonbonded interactions constants
+ * \param[in,out] listParams  The list setup parameters
+ */
+void setupDynamicPairlistPruning(FILE                      *fplog,
+                                 const t_inputrec          *ir,
+                                 const gmx_mtop_t          *mtop,
+                                 matrix                     box,
+                                 bool                       useGpu,
+                                 const interaction_const_t *ic,
+                                 NbnxnListParameters       *listParams);
+
+#endif /* NBNXN_TUNING_H */
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 10d67fca28..3a37dc62ee 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -410,20 +410,18 @@ static void do_nb_verlet(t_forcerec *fr,
                          gmx_enerdata_t *enerd,
                          int flags, int ilocality,
                          int clearF,
+                         gmx_int64_t step,
                          t_nrnb *nrnb,
                          gmx_wallcycle_t wcycle)
 {
-    int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
-    nonbonded_verlet_group_t  *nbvg;
-    gmx_bool                   bUsingGpuKernels;
-
     if (!(flags & GMX_FORCE_NONBONDED))
     {
         /* skip non-bonded calculation */
         return;
     }
 
-    nbvg = &fr->nbv->grp[ilocality];
+    nonbonded_verlet_t       *nbv  = fr->nbv;
+    nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
 
     /* GPU kernel launch overhead is already timed separately */
     if (fr->cutoff_scheme != ecutsVERLET)
@@ -431,12 +429,27 @@ static void do_nb_verlet(t_forcerec *fr,
         gmx_incons("Invalid cut-off scheme passed!");
     }
 
-    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
+    bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
 
     if (!bUsingGpuKernels)
     {
+        /* When dynamic pair-list  pruning is requested, we need to prune
+         * at nstlistPrune steps.
+         */
+        if (nbv->listParams->useDynamicPruning &&
+            (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
+        {
+            /* Prune the pair-list beyond fr->ic->rlistPrune using
+             * the current coordinates of the atoms.
+             */
+            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+            GMX_RELEASE_ASSERT(false, "The CPU prune kernel will be called here");
+            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+        }
+
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
+
     switch (nbvg->kernel_type)
     {
         case nbnxnk4x4_PlainC:
@@ -455,7 +468,7 @@ static void do_nb_verlet(t_forcerec *fr,
             break;
 
         case nbnxnk8x8x8_GPU:
-            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
+            nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
             break;
 
         case nbnxnk8x8x8_PlainC:
@@ -473,7 +486,7 @@ static void do_nb_verlet(t_forcerec *fr,
             break;
 
         default:
-            gmx_incons("Invalid nonbonded kernel type passed!");
+            GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
 
     }
     if (!bUsingGpuKernels)
@@ -481,12 +494,13 @@ static void do_nb_verlet(t_forcerec *fr,
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
 
+    int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
     }
     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
-             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
+             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
     {
         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
     }
@@ -901,12 +915,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
         nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
                             &top->excls,
-                            ic->rlist,
+                            nbv->listParams->rlistOuter,
                             nbv->min_ci_balanced,
                             &nbv->grp[eintLocal].nbl_lists,
                             eintLocal,
                             nbv->grp[eintLocal].kernel_type,
                             nrnb);
+        nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+        if (nbv->listParams->useDynamicPruning && !bUseGPU)
+        {
+            nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+        }
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
 
         if (bUseGPU)
@@ -938,7 +957,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
         /* launch local nonbonded F on GPU */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
-                     nrnb, wcycle);
+                     step, nrnb, wcycle);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
     }
 
@@ -976,13 +995,17 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
             nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
                                 &top->excls,
-                                ic->rlist,
+                                nbv->listParams->rlistOuter,
                                 nbv->min_ci_balanced,
                                 &nbv->grp[eintNonlocal].nbl_lists,
                                 eintNonlocal,
                                 nbv->grp[eintNonlocal].kernel_type,
                                 nrnb);
-
+            nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+            if (nbv->listParams->useDynamicPruning && !bUseGPU)
+            {
+                nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+            }
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
 
             if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
@@ -1013,7 +1036,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
             /* launch non-local nonbonded F on GPU */
             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
-                         nrnb, wcycle);
+                         step, nrnb, wcycle);
             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
         }
     }
@@ -1136,9 +1159,8 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
     if (!bUseOrEmulGPU)
     {
-        /* Maybe we should move this into do_force_lowlevel */
         do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
-                     nrnb, wcycle);
+                     step, nrnb, wcycle);
     }
 
     if (fr->efep != efepNO)
@@ -1172,7 +1194,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         {
             do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                          bDiffKernels ? enbvClearFYes : enbvClearFNo,
-                         nrnb, wcycle);
+                         step, nrnb, wcycle);
         }
 
         if (!bUseOrEmulGPU)
@@ -1246,7 +1268,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             {
                 wallcycle_start_nocount(wcycle, ewcFORCE);
                 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
-                             nrnb, wcycle);
+                             step, nrnb, wcycle);
                 wallcycle_stop(wcycle, ewcFORCE);
             }
             wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
@@ -1319,6 +1341,27 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             /* now clear the GPU outputs while we finish the step on the CPU */
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+
+            /* Is dynamic pair-list pruning activated? */
+            if (nbv->listParams->useDynamicPruning)
+            {
+                /* We should not launch the rolling pruning kernel at a search
+                 * step or just before search steps, since that's useless.
+                 * Without domain decomposition we prune at even steps.
+                 * With domain decomposition we alternate local and non-local
+                 * pruning at even and odd steps.
+                 */
+                int  numRollingParts     = nbv->listParams->numRollingParts;
+                GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
+                int  stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+                bool stepIsEven          = ((stepWithCurrentList & 1) == 0);
+                if (stepWithCurrentList > 0 &&
+                    stepWithCurrentList < inputrec->nstlist - 1 &&
+                    (stepIsEven || DOMAINDECOMP(cr)))
+                {
+                    GMX_RELEASE_ASSERT(false, "The GPU prune kernel will be called here");
+                }
+            }
             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
         }
         else
@@ -1326,7 +1369,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             wallcycle_start_nocount(wcycle, ewcFORCE);
             do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                          DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
-                         nrnb, wcycle);
+                         step, nrnb, wcycle);
             wallcycle_stop(wcycle, ewcFORCE);
         }
         wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
diff --git a/src/gromacs/mdtypes/interaction_const.h b/src/gromacs/mdtypes/interaction_const.h
index 6682ef7d5b..16b0d32e89 100644
--- a/src/gromacs/mdtypes/interaction_const.h
+++ b/src/gromacs/mdtypes/interaction_const.h
@@ -92,9 +92,6 @@ struct interaction_const_t
     /* Coulomb */
     real rcoulomb;
 
-    /* Cut-off */
-    real rlist;
-
     /* PME/Ewald */
     real ewaldcoeff_q;
     real ewaldcoeff_lj;
diff --git a/src/gromacs/timing/gpu_timing.h b/src/gromacs/timing/gpu_timing.h
index 772b5ee39c..e9ec67d855 100644
--- a/src/gromacs/timing/gpu_timing.h
+++ b/src/gromacs/timing/gpu_timing.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -57,14 +57,16 @@ struct gmx_nbnxn_kernel_timing_data_t
 /*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */
 struct gmx_wallclock_gpu_t
 {
-    struct gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
-                                                          versions of the nonbonded kernels: force-only,
-                                                          force+energy, force+pruning, and force+energy+pruning */
-    double  nb_h2d_t;                                  /**< host to device transfer time in nb calculation  */
-    double  nb_d2h_t;                                  /**< device to host transfer time in nb calculation */
-    int     nb_c;                                      /**< total call count of the nonbonded gpu operations */
-    double  pl_h2d_t;                                  /**< pair search step host to device transfer time */
-    int     pl_h2d_c;                                  /**< pair search step  host to device transfer call count */
+    gmx_nbnxn_kernel_timing_data_t ktime[2][2];      /**< table containing the timings of the four
+                                                        versions of the nonbonded kernels: force-only,
+                                                        force+energy, force+pruning, and force+energy+pruning */
+    gmx_nbnxn_kernel_timing_data_t pruneTime;        /**< table containing the timings of the 1st pass prune-only kernels */
+    gmx_nbnxn_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */
+    double                         nb_h2d_t;         /**< host to device transfer time in nb calculation  */
+    double                         nb_d2h_t;         /**< device to host transfer time in nb calculation */
+    int                            nb_c;             /**< total call count of the nonbonded gpu operations */
+    double                         pl_h2d_t;         /**< pair search step host to device transfer time */
+    int                            pl_h2d_c;         /**< pair search step  host to device transfer call count */
 };
 
 #ifdef __cplusplus
diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp
index 6b18f2dbfb..85bc4967fb 100644
--- a/src/gromacs/timing/wallcycle.cpp
+++ b/src/gromacs/timing/wallcycle.cpp
@@ -120,6 +120,7 @@ static const char *wcsn[ewcsNR] =
     "Bonded-FEP F",
     "Restraints F",
     "Listed buffer ops.",
+    "Nonbonded pruning",
     "Nonbonded F",
     "Ewald F correction",
     "NB X buffer ops.",
@@ -890,7 +891,7 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
                 tot_k += gpu_t->ktime[i][j].t;
             }
         }
-        tot_gpu += tot_k;
+        tot_gpu += tot_k + gpu_t->pruneTime.t;
 
         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
         if (wc->wcc[ewcPMEMESH].n > 0)
@@ -918,11 +919,23 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
                 }
             }
         }
-
+        if (gpu_t->pruneTime.c)
+        {
+            print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu);
+        }
         print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
         fprintf(fplog, "%s\n", hline);
         print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
         fprintf(fplog, "%s\n", hline);
+        if (gpu_t->dynamicPruneTime.c)
+        {
+            /* We print the dynamic pruning kernel timings after a separator
+             * and avoid adding it to tot_gpu as this is not in the force
+             * overlap. We print the fraction as relative to the rest.
+             */
+            print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu);
+            fprintf(fplog, "%s\n", hline);
+        }
 
         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
         if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h
index d69aaa6398..d40656e4f8 100644
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -68,6 +68,7 @@ enum {
     ewcsLISTED_FEP,
     ewcsRESTRAINTS,
     ewcsLISTED_BUF_OPS,
+    ewcsNONBONDED_PRUNING,
     ewcsNONBONDED,
     ewcsEWALD_CORRECTION,
     ewcsNB_X_BUF_OPS,
diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp
index 5b74c841e5..902f7bfb84 100644
--- a/src/programs/mdrun/md.cpp
+++ b/src/programs/mdrun/md.cpp
@@ -520,7 +520,7 @@ double gmx::do_md(FILE *fplog, t_commrec *cr, const gmx::MDLogger &mdlog,
     if (bPMETune)
     {
         pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box,
-                         fr->ic, fr->pmedata, use_GPU(fr->nbv),
+                         fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv),
                          &bPMETunePrinting);
     }
 
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 5a71b61c27..d2b5a531c7 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -373,8 +373,8 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
      */
     nstlist_prev = ir->nstlist;
     ir->nstlist  = nbnxnReferenceNstlist;
-    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr,
-                            &rlistWithReferenceNstlist);
+    calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+                            -1, &ls, nullptr, &rlistWithReferenceNstlist);
     ir->nstlist  = nstlist_prev;
 
     /* Determine the pair list size increase due to zero interactions */
@@ -398,7 +398,7 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
         }
 
         /* Set the pair-list buffer size in ir */
-        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr, &rlist_new);
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
 
         /* Does rlist fit in the box? */
         bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
@@ -497,7 +497,7 @@ static void prepare_verlet_scheme(FILE                           *fplog,
          */
         verletbuf_get_list_setup(true, makeGpuPairList, &ls);
 
-        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr, &rlist_new);
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
 
         if (rlist_new != ir->rlist)
         {
-- 
2.22.0