Refactoring of PME load balancing
author Berk Hess <hess@kth.se>
Fri, 6 Mar 2015 11:32:24 +0000 (12:32 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 27 May 2015 10:36:02 +0000 (12:36 +0200)
Moved all the higher-level PME load balancing management code from
md.cpp to pme-load-balancing.cpp.

Change-Id: Ic9f4be7373ccaa40b3867af6facce66b873efab3

src/gromacs/ewald/pme-load-balancing.cpp
src/gromacs/ewald/pme-load-balancing.h
src/gromacs/timing/wallcycle.c
src/gromacs/timing/wallcycle.h
src/programs/mdrun/md.cpp

index 73327ca167786669d196766db6a3baf08b05fbb8..2d5ff5e6452aed2cbcfe96c54c3adc0868158d92 100644 (file)
 
 #include "config.h"
 
+#include <assert.h>
+
 #include <cmath>
 
 #include <algorithm>
 
 #include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
 #include "gromacs/legacyheaders/calcgrid.h"
 #include "gromacs/legacyheaders/force.h"
 #include "gromacs/legacyheaders/md_logging.h"
@@ -61,6 +64,7 @@
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
 #include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/smalloc.h"
 
@@ -82,9 +86,12 @@ struct pme_setup_t {
     double            cycles;          /**< the fastest time for this setup in cycles    */
 };
 
+/*! \brief After 50 nstlist periods of not observing imbalance: never tune PME */
+const int  PMETunePeriod = 50;
+/*! \brief Trigger PME load balancing at more than 5% PME overload */
+const real loadBalanceTriggerFactor = 1.05;
 /*! \brief In the initial scan, step by grids that are at least a factor 0.8 coarser */
 const real gridScaleFactor = 0.8;
-
 /*! \brief In the initial scan, try to skip grids with uneven x/y/z spacing,
  * checking if the "efficiency" is more than 5% worse than the previous grid.
  */
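
The trigger factor is compared against the PP/PME load ratio that domain decomposition measures (see dd_pme_f_ratio in pme_loadbal_do below). A minimal sketch of how these constants are meant to be applied; the grid-count comparison is an assumption based on the comments above, not code from this patch:

    /* Trigger: start balancing once the measured PME/PP load ratio
     * exceeds the 5% overload threshold. */
    gmx_bool start_balancing =
        (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);

    /* Initial scan (assumed): accept a new, coarser grid only if its
     * total point count shrank by at least gridScaleFactor. */
    gmx_bool coarse_enough =
        (new_grid[XX]*new_grid[YY]*new_grid[ZZ] <
         old_grid[XX]*old_grid[YY]*old_grid[ZZ]*gridScaleFactor);
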
@@ -106,6 +113,9 @@ const char *pmelblim_str[epmelblimNR] =
 { "no", "box size", "domain decompostion", "PME grid restriction" };
 
 struct pme_load_balancing_t {
+    gmx_bool     bSepPMERanks;       /**< do we have separate PME ranks? */
+    gmx_bool     bActive;            /**< is PME tuning active? */
+    gmx_bool     bBalance;           /**< are we in the balancing phase, i.e. trying different setups? */
     int          nstage;             /**< the current maximum number of stages */
 
     real         cut_spacing;        /**< the minimum cutoff / PME grid spacing ratio */
@@ -125,12 +135,17 @@ struct pme_load_balancing_t {
     int          cutoff_scheme;      /**< Verlet or group cut-offs */
 
     int          stage;              /**< the current stage */
+
+    int          cycles_n;           /**< step cycle counter cumulative count */
+    double       cycles_c;           /**< step cycle counter cumulative cycles */
 };
 
 void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
                       const t_inputrec *ir, matrix box,
                       const interaction_const_t *ic,
-                      struct gmx_pme_t *pmedata)
+                      struct gmx_pme_t *pmedata,
+                      gmx_bool bUseGPU, gmx_bool bSepPMERanks,
+                      gmx_bool *bPrinting)
 {
     pme_load_balancing_t *pme_lb;
     real                  spm, sp;
@@ -138,8 +153,10 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
 
     snew(pme_lb, 1);
 
+    pme_lb->bSepPMERanks  = bSepPMERanks;
+
     /* Any number of stages >= 2 is supported */
-    pme_lb->nstage   = 2;
+    pme_lb->nstage        = 2;
 
     pme_lb->cutoff_scheme = ir->cutoff_scheme;
 
@@ -192,7 +209,7 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
     pme_lb->setup[0].ewaldcoeff_q    = ic->ewaldcoeff_q;
     pme_lb->setup[0].ewaldcoeff_lj   = ic->ewaldcoeff_lj;
 
-    pme_lb->setup[0].pmedata  = pmedata;
+    pme_lb->setup[0].pmedata         = pmedata;
 
     spm = 0;
     for (d = 0; d < DIM; d++)
@@ -221,7 +238,24 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
     pme_lb->end      = 0;
     pme_lb->elimited = epmelblimNO;
 
-    *pme_lb_p = pme_lb;
+    pme_lb->cycles_n = 0;
+    pme_lb->cycles_c = 0;
+
+    /* Tune with GPUs and/or separate PME ranks.
+     * When running only on a CPU without PME ranks, PME tuning will only help
+     * with small numbers of atoms in the cut-off sphere.
+     */
+    pme_lb->bActive  = (wallcycle_have_counter() && (bUseGPU || bSepPMERanks));
+
+    /* With GPUs and no separate PME ranks we can't measure the PP/PME
+     * imbalance, so we start balancing right away.
+     * Otherwise we only start balancing after we observe imbalance.
+     */
+    pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !bSepPMERanks));
+
+    *pme_lb_p  = pme_lb;
+
+    *bPrinting = pme_lb->bBalance;
 }
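
For reference, the tuning state these two assignments leave behind, assuming wallcycle_have_counter() returns true (a summary of the code above, not additional logic):

    /* bUseGPU  bSepPMERanks  ->  bActive  bBalance
     *   no        no             no       no      nothing to tune against
     *   no        yes            yes      no      wait for measured imbalance
     *   yes       no             yes      yes     can't measure: start at once
     *   yes       yes            yes      no      wait for measured imbalance
     */
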
 
 /*! \brief Try to increase the cutoff during load balancing */
@@ -230,7 +264,7 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
                                             const gmx_domdec_t   *dd)
 {
     pme_setup_t *set;
-    int          npmenodes_x, npmenodes_y;
+    int          npmeranks_x, npmeranks_y;
     real         fac, sp;
     real         tmpr_coulomb, tmpr_vdw;
     int          d;
@@ -242,7 +276,7 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
     set          = &pme_lb->setup[pme_lb->n-1];
     set->pmedata = NULL;
 
-    get_pme_nnodes(dd, &npmenodes_x, &npmenodes_y);
+    get_pme_nnodes(dd, &npmeranks_x, &npmeranks_y);
 
     fac = 1;
     do
@@ -267,14 +301,14 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
                        &set->grid[YY],
                        &set->grid[ZZ]);
 
-        /* As here we can't easily check if one of the PME nodes
+        /* As here we can't easily check if one of the PME ranks
          * uses threading, we do a conservative grid check.
          * This means we can't use pme_order or less grid lines
-         * per PME node along x, which is not a strong restriction.
+         * per PME rank along x, which is not a strong restriction.
          */
         gmx_pme_check_restrictions(pme_order,
                                    set->grid[XX], set->grid[YY], set->grid[ZZ],
-                                   npmenodes_x, npmenodes_y,
+                                   npmeranks_x, npmeranks_y,
                                    TRUE,
                                    FALSE,
                                    &grid_ok);
@@ -463,17 +497,31 @@ static void switch_to_stage1(pme_load_balancing_t *pme_lb)
     pme_lb->cur = pme_lb->start - 1;
 }
 
-gmx_bool pme_load_balance(pme_load_balancing_t      *pme_lb,
-                          t_commrec                 *cr,
-                          FILE                      *fp_err,
-                          FILE                      *fp_log,
-                          t_inputrec                *ir,
-                          t_state                   *state,
-                          double                     cycles,
-                          interaction_const_t       *ic,
-                          struct nonbonded_verlet_t *nbv,
-                          struct gmx_pme_t **        pmedata,
-                          gmx_int64_t                step)
+/*! \brief Try to adjust the PME grid and Coulomb cut-off
+ *
+ * The adjustment is done to generate a different non-bonded PP and PME load.
+ * With separate PME ranks (PP and PME on different processes) or with
+ * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources
+ * and changing the load will affect the load balance and performance.
+ * The total time for a set of integration steps is monitored and a range
+ * of grid/cut-off setups is scanned. After calling pme_load_balance many
+ * times and acquiring enough statistics, the best performing setup is chosen.
+ * Here we try to take into account fluctuations and changes due to external
+ * factors as well as DD load balancing.
+ * Returns TRUE if the load balancing continues, FALSE if the balancing is done.
+ */
+static gmx_bool
+pme_load_balance(pme_load_balancing_t      *pme_lb,
+                 t_commrec                 *cr,
+                 FILE                      *fp_err,
+                 FILE                      *fp_log,
+                 t_inputrec                *ir,
+                 t_state                   *state,
+                 double                     cycles,
+                 interaction_const_t       *ic,
+                 struct nonbonded_verlet_t *nbv,
+                 struct gmx_pme_t **        pmedata,
+                 gmx_int64_t                step)
 {
     gmx_bool     OK;
     pme_setup_t *set;
@@ -752,7 +800,7 @@ gmx_bool pme_load_balance(pme_load_balancing_t      *pme_lb,
      */
     init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
 
-    if (cr->duty & DUTY_PME)
+    if (!pme_lb->bSepPMERanks)
     {
         if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
         {
@@ -767,7 +815,7 @@ gmx_bool pme_load_balance(pme_load_balancing_t      *pme_lb,
     }
     else
     {
-        /* Tell our PME-only node to switch grid */
+        /* Tell our PME-only rank to switch grid */
         gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
     }
 
@@ -784,6 +832,117 @@ gmx_bool pme_load_balance(pme_load_balancing_t      *pme_lb,
     return TRUE;
 }
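
Most of the body of pme_load_balance is unchanged and elided above. As a reminder of the selection step it performs once enough statistics are gathered, a simplified sketch using only the pme_setup_t fields shown earlier (cycles holds the fastest time observed per setup; the real code also accounts for limitations and stages):

    /* Sketch (simplified): choose the fastest measured setup. */
    int fastest = pme_lb->start;
    for (int i = pme_lb->start + 1; i < pme_lb->end; i++)
    {
        if (pme_lb->setup[i].cycles < pme_lb->setup[fastest].cycles)
        {
            fastest = i;
        }
    }
    pme_lb->cur = fastest;
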
 
+void pme_loadbal_do(pme_load_balancing_t *pme_lb,
+                    t_commrec            *cr,
+                    FILE                 *fp_err,
+                    FILE                 *fp_log,
+                    t_inputrec           *ir,
+                    t_forcerec           *fr,
+                    t_state              *state,
+                    gmx_wallcycle_t       wcycle,
+                    gmx_int64_t           step,
+                    gmx_int64_t           step_rel,
+                    gmx_bool             *bPrinting)
+{
+    int    n_prev;
+    double cycles_prev;
+
+    assert(pme_lb != NULL);
+
+    if (!pme_lb->bActive)
+    {
+        return;
+    }
+
+    n_prev      = pme_lb->cycles_n;
+    cycles_prev = pme_lb->cycles_c;
+    wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
+    if (pme_lb->cycles_n == 0)
+    {
+        /* Before the first step no step cycles have been counted yet */
+        return;
+    }
+    /* Sanity check, we expect nstlist cycle counts */
+    if (pme_lb->cycles_n - n_prev != ir->nstlist)
+    {
+        /* We could return here, but it's safer to issue an error and quit */
+        gmx_incons("pme_loadbal_do called at an interval != nstlist");
+    }
+
+    /* PME grid + cut-off optimization with GPUs or PME ranks */
+    if (!pme_lb->bBalance && pme_lb->bSepPMERanks)
+    {
+        if (DDMASTER(cr->dd))
+        {
+            /* PME rank load is too high, start tuning */
+            pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
+        }
+        dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
+
+        if (pme_lb->bBalance &&
+            use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+            pme_lb->bSepPMERanks)
+        {
+            /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+             * With GPUs + separate PME ranks, we don't want DLB.
+             * This could happen when we scan coarse grids and
+             * it would then never be turned off again.
+             * This would hurt performance at the final, optimal
+             * grid spacing, where DLB almost never helps.
+             * Also, DLB can limit the cut-off for PME tuning.
+             */
+            dd_dlb_set_lock(cr->dd, TRUE);
+        }
+    }
+
+    if (pme_lb->bBalance)
+    {
+        /* init_step might not be a multiple of nstlist,
+         * but the first cycle is always skipped anyhow.
+         */
+        pme_lb->bBalance =
+            pme_load_balance(pme_lb, cr,
+                             fp_err, fp_log,
+                             ir, state, pme_lb->cycles_c - cycles_prev,
+                             fr->ic, fr->nbv, &fr->pmedata,
+                             step);
+
+        /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+        fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
+        fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
+        fr->rlist         = fr->ic->rlist;
+        fr->rlistlong     = fr->ic->rlistlong;
+        fr->rcoulomb      = fr->ic->rcoulomb;
+        fr->rvdw          = fr->ic->rvdw;
+
+        if (ir->eDispCorr != edispcNO)
+        {
+            calc_enervirdiff(NULL, ir->eDispCorr, fr);
+        }
+
+        if (!pme_lb->bBalance &&
+            DOMAINDECOMP(cr) &&
+            dd_dlb_is_locked(cr->dd))
+        {
+            /* Unlock the DLB=auto, DLB is allowed to activate
+             * (but we don't expect it to activate in most cases).
+             */
+            dd_dlb_set_lock(cr->dd, FALSE);
+        }
+    }
+
+    if (!pme_lb->bBalance &&
+        (!pme_lb->bSepPMERanks || step_rel > PMETunePeriod*ir->nstlist))
+    {
+        /* Balancing has just finished, or with separate PME ranks we did
+         * not observe imbalance during the first PMETunePeriod*nstlist
+         * steps: deactivate the tuning for good.
+         */
+        pme_lb->bActive = FALSE;
+    }
+
+    *bPrinting = pme_lb->bBalance;
+}
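
A minimal caller sketch, mirroring the md.cpp hunk further down: pme_loadbal_do is invoked at every neighbor search step, after the previous step's ewcSTEP counter has been stopped:

    if (bPMETune && ir->nstlist > 0 && step % ir->nstlist == 0)
    {
        pme_loadbal_do(pme_loadbal, cr,
                       (bVerbose && MASTER(cr)) ? stderr : NULL, fplog,
                       ir, fr, state, wcycle, step, step_rel,
                       &bPMETunePrinting);
    }
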
+
 void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n)
 {
     pme_lb->nstage += n;
index 721ead76f72497dbd9f77a54b5975fd3bb3c04ec..2a46b2cb1ee2d2560f0c68d7dabeec11c9b7cb47 100644 (file)
 #include "gromacs/legacyheaders/types/inputrec.h"
 #include "gromacs/legacyheaders/types/interaction_const.h"
 #include "gromacs/legacyheaders/types/state.h"
+#include "gromacs/timing/wallcycle.h"
 
 /*! \brief Object to manage PME load balancing */
 struct pme_load_balancing_t;
 
-/*! \brief Initialze the PP-PME load balacing data and infrastructure */
+/*! \brief Initialize the PP-PME load balancing data and infrastructure
+ *
+ * Initialize the PP-PME load balancing data and infrastructure.
+ * The actual load balancing might start right away, later or never.
+ * Returns in bPrinting whether the load balancing is printing to fp_err.
+ * The PME grid in pmedata is reused for smaller grids to lower the memory
+ * usage.
+ */
 void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
                       const t_inputrec          *ir,
                       matrix                     box,
                       const interaction_const_t *ic,
-                      struct gmx_pme_t          *pmedata);
+                      struct gmx_pme_t          *pmedata,
+                      gmx_bool                   bUseGPU,
+                      gmx_bool                   bSepPMERanks,
+                      gmx_bool                  *bPrinting);
 
-/*! \brief Try to adjust the PME grid and Coulomb cut-off.
- *
- * The adjustment is done to generate a different non-bonded PP and PME load.
- * With separate PME nodes (PP and PME on different processes) or with
- * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources
- * and changing the load will affect the load balance and performance.
- * The total time for a set of integration steps is monitored and a range
- * of grid/cut-off setups is scanned. After calling pme_load_balance many
- * times and acquiring enough statistics, the best performing setup is chosen.
- * Here we try to take into account fluctuations and changes due to external
- * factors as well as DD load balancing.
+/*! \brief Process cycle counts and balance the PME load when necessary
  *
- * \return TRUE the load balancing continues, FALSE is the balancing is done.
+ * Process the cycles measured over the last nstlist steps and then
+ * either continue balancing or check if we need to trigger balancing.
+ * Should be called after the ewcSTEP cycle counter has been stopped.
+ * Returns in bPrinting whether the load balancing is printing to fp_err.
  */
-gmx_bool pme_load_balance(pme_load_balancing_t      *pme_lb,
-                          t_commrec                 *cr,
-                          FILE                      *fp_err,
-                          FILE                      *fp_log,
-                          t_inputrec                *ir,
-                          t_state                   *state,
-                          double                     cycles,
-                          interaction_const_t       *ic,
-                          struct nonbonded_verlet_t *nbv,
-                          struct gmx_pme_t **        pmedata,
-                          gmx_int64_t                step);
+void pme_loadbal_do(pme_load_balancing_t *pme_lb,
+                    t_commrec            *cr,
+                    FILE                 *fp_err,
+                    FILE                 *fp_log,
+                    t_inputrec           *ir,
+                    t_forcerec           *fr,
+                    t_state              *state,
+                    gmx_wallcycle_t       wcycle,
+                    gmx_int64_t           step,
+                    gmx_int64_t           step_rel,
+                    gmx_bool             *bPrinting);
 
 /*! \brief Restart the PME load balancing discarding all timings gathered up till now */
 void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n);
index 2c3848b22c83f53abe3b9f65f28ce5e8c548b8d6..a084552875e2d0d566f4352801d3f2eddd9f801e 100644 (file)
@@ -340,6 +340,12 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
     return last;
 }
 
+void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c)
+{
+    *n = wc->wcc[ewc].n;
+    *c = (double)wc->wcc[ewc].c;
+}
+
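
A short usage sketch for the new accessor (variable names are illustrative), following the same n_prev/cycles_prev pattern pme_loadbal_do uses to turn the cumulative counters into a per-interval delta:

    int    n_prev = 0, n_now;
    double c_prev = 0, c_now;

    wallcycle_get(wcycle, ewcSTEP, &n_now, &c_now);
    if (n_now > n_prev)
    {
        /* cycles spent since the previous call */
        double cycles_delta = c_now - c_prev;
    }
    n_prev = n_now;
    c_prev = c_now;
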
 void wallcycle_reset_all(gmx_wallcycle_t wc)
 {
     int i;
index 1ca527949fcf98b18153b03d7b999f66400f8d09..af1f045ac0b5393a7f51490064b0de09b8a30696 100644 (file)
@@ -98,6 +98,9 @@ void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc);
 double wallcycle_stop(gmx_wallcycle_t wc, int ewc);
 /* Stop the cycle count for ewc, returns the last cycle count */
 
+void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c);
+/* Returns the cumulative count and cycle count for ewc */
+
 void wallcycle_reset_all(gmx_wallcycle_t wc);
 /* Resets all cycle counters to zero */
 
index a10d7f6b91d89eadb9c6cca022a7b24b3cd0007e..f6c3465c8df255fe3618928fa972eed7a0fbd994 100644 (file)
@@ -226,9 +226,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                                                           simulation stops. If equal to zero, don't
                                                                           communicate any more between multisims.*/
     /* PME load balancing data for GPU kernels */
-    pme_load_balancing_t *pme_loadbal = NULL;
-    double                cycles_pmes;
-    gmx_bool              bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+    pme_load_balancing_t *pme_loadbal;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
 
     /* Interactive MD */
     gmx_bool          bIMDstep = FALSE;
@@ -465,26 +465,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                         repl_ex_nst, repl_ex_nex, repl_ex_seed);
     }
 
-    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
-     * PME tuning is not supported with PME only for LJ and not for Coulomb.
+    /* PME tuning is only supported with PME for Coulomb. It is not supported
+     * with only LJ PME, or for reruns.
      */
-    if ((Flags & MD_TUNEPME) &&
-        EEL_PME(fr->eeltype) &&
-        ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
-        !bRerunMD)
+    bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+                !(Flags & MD_REPRODUCIBLE));
+    if (bPMETune)
     {
-        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
-        cycles_pmes = 0;
-        if (cr->duty & DUTY_PME)
-        {
-            /* Start tuning right away, as we can't measure the load */
-            bPMETuneRunning = TRUE;
-        }
-        else
-        {
-            /* Separate PME nodes, we can measure the PP/PME load balance */
-            bPMETuneTry = TRUE;
-        }
+        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata,
+                         use_GPU(fr->nbv), !(cr->duty & DUTY_PME),
+                         &bPMETunePrinting);
     }
 
     if (!ir->bContinuation && !bRerunMD)
@@ -729,6 +719,20 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     while (!bLastStep || (bRerunMD && bNotLastFrame))
     {
 
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or separate PME ranks */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (bVerbose && MASTER(cr)) ? stderr : NULL,
+                           fplog,
+                           ir, fr, state, wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
         wallcycle_start(wcycle, ewcSTEP);
 
         if (bRerunMD)
@@ -842,9 +846,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         }
         else
         {
-            /* Determine whether or not to do Neighbour Searching and LR */
-            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
-
             bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP);
         }
 
@@ -920,7 +921,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                     state, &f, mdatoms, top, fr,
                                     vsite, shellfc, constr,
                                     nrnb, wcycle,
-                                    do_verbose && !bPMETuneRunning);
+                                    do_verbose && !bPMETunePrinting);
                 wallcycle_stop(wcycle, ewcDOMDEC);
             }
         }
@@ -1619,7 +1620,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             state->fep_state = lamnew;
         }
         /* Print the remaining wall clock time for the run */
-        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+        if (MULTIMASTER(cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
         {
             if (shellfc)
             {
@@ -1703,97 +1706,17 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
         }
 
-        if (!bRerunMD || !rerun_fr.bStep)
-        {
-            /* increase the MD step number */
-            step++;
-            step_rel++;
-        }
-
         cycles = wallcycle_stop(wcycle, ewcSTEP);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
         }
 
-        if (bPMETuneRunning || bPMETuneTry)
+        if (!bRerunMD || !rerun_fr.bStep)
         {
-            /* PME grid + cut-off optimization with GPUs or PME nodes */
-
-            /* Count the total cycles over the last steps */
-            cycles_pmes += cycles;
-
-            /* We can only switch cut-off at NS steps */
-            if (step % ir->nstlist == 0)
-            {
-                /* PME grid + cut-off optimization with GPUs or PME nodes */
-                if (bPMETuneTry)
-                {
-                    if (DDMASTER(cr->dd))
-                    {
-                        /* PME node load is too high, start tuning */
-                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
-                    }
-                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
-
-                    if (bPMETuneRunning &&
-                        use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
-                        !(cr->duty & DUTY_PME))
-                    {
-                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
-                         * With GPUs + separate PME ranks, we don't want DLB.
-                         * This could happen when we scan coarse grids and
-                         * it would then never be turned off again.
-                         * This would hurt performance at the final, optimal
-                         * grid spacing, where DLB almost never helps.
-                         * Also, DLB can limit the cut-off for PME tuning.
-                         */
-                        dd_dlb_set_lock(cr->dd, TRUE);
-                    }
-
-                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
-                    {
-                        bPMETuneTry     = FALSE;
-                    }
-                }
-                if (bPMETuneRunning)
-                {
-                    /* init_step might not be a multiple of nstlist,
-                     * but the first cycle is always skipped anyhow.
-                     */
-                    bPMETuneRunning =
-                        pme_load_balance(pme_loadbal, cr,
-                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
-                                         fplog,
-                                         ir, state, cycles_pmes,
-                                         fr->ic, fr->nbv, &fr->pmedata,
-                                         step);
-
-                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
-                    fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
-                    fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-                    fr->rlist         = fr->ic->rlist;
-                    fr->rlistlong     = fr->ic->rlistlong;
-                    fr->rcoulomb      = fr->ic->rcoulomb;
-                    fr->rvdw          = fr->ic->rvdw;
-
-                    if (ir->eDispCorr != edispcNO)
-                    {
-                        calc_enervirdiff(NULL, ir->eDispCorr, fr);
-                    }
-
-                    if (!bPMETuneRunning &&
-                        DOMAINDECOMP(cr) &&
-                        dd_dlb_is_locked(cr->dd))
-                    {
-                        /* Unlock the DLB=auto, DLB is allowed to activate
-                         * (but we don't expect it to activate in most cases).
-                         */
-                        dd_dlb_set_lock(cr->dd, FALSE);
-                    }
-                }
-                cycles_pmes = 0;
-            }
+            /* increase the MD step number */
+            step++;
+            step_rel++;
         }
 
         if (step_rel == wcycle_get_reset_counters(wcycle) ||
@@ -1851,10 +1774,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     done_mdoutf(outf);
     debug_gmx();
 
-    if (pme_loadbal != NULL)
+    if (bPMETune)
     {
-        pme_loadbal_done(pme_loadbal, cr, fplog,
-                         use_GPU(fr->nbv));
+        pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
     }
 
     if (shellfc && fplog)