From 34b524272dfe32bc3f34326f2e9158eb701310a1 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 6 Mar 2015 12:32:24 +0100 Subject: [PATCH] Refactoring of PME load balancing Moved all the higher level PME load balancing management code from md.cpp to pme-load-balancing.cpp. Change-Id: Ic9f4be7373ccaa40b3867af6facce66b873efab3 --- src/gromacs/ewald/pme-load-balancing.cpp | 205 ++++++++++++++++++++--- src/gromacs/ewald/pme-load-balancing.h | 54 +++--- src/gromacs/timing/wallcycle.c | 6 + src/gromacs/timing/wallcycle.h | 3 + src/programs/mdrun/md.cpp | 148 ++++------------ 5 files changed, 255 insertions(+), 161 deletions(-) diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp index 73327ca167..2d5ff5e645 100644 --- a/src/gromacs/ewald/pme-load-balancing.cpp +++ b/src/gromacs/ewald/pme-load-balancing.cpp @@ -47,11 +47,14 @@ #include "config.h" +#include <assert.h> + #include #include #include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_network.h" #include "gromacs/legacyheaders/calcgrid.h" #include "gromacs/legacyheaders/force.h" #include "gromacs/legacyheaders/md_logging.h" @@ -61,6 +64,7 @@ #include "gromacs/math/vec.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/smalloc.h" @@ -82,9 +86,12 @@ struct pme_setup_t { double cycles; /**< the fastest time for this setup in cycles */ }; +/*! \brief After 50 nstlist periods of not observing imbalance: never tune PME */ +const int PMETunePeriod = 50; +/*! \brief Trigger PME load balancing at more than 5% PME overload */ +const real loadBalanceTriggerFactor = 1.05; /*! \brief In the initial scan, step by grids that are at least a factor 0.8 coarser */ const real gridScaleFactor = 0.8; - /*! \brief In the initial scan, try to skip grids with uneven x/y/z spacing, * checking if the "efficiency" is more than 5% worse than the previous grid. */ @@ -106,6 +113,9 @@ const char *pmelblim_str[epmelblimNR] = { "no", "box size", "domain decompostion", "PME grid restriction" }; struct pme_load_balancing_t { + gmx_bool bSepPMERanks; /**< do we have separate PME ranks? */ + gmx_bool bActive; /**< is PME tuning active? */ + gmx_bool bBalance; /**< are we in the balancing phase, i.e. trying different setups? 
*/ int nstage; /**< the current maximum number of stages */ real cut_spacing; /**< the minimum cutoff / PME grid spacing ratio */ @@ -125,12 +135,17 @@ struct pme_load_balancing_t { int cutoff_scheme; /**< Verlet or group cut-offs */ int stage; /**< the current stage */ + + int cycles_n; /**< step cycle counter cumulative count */ + double cycles_c; /**< step cycle counter cumulative cycles */ }; void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, const t_inputrec *ir, matrix box, const interaction_const_t *ic, - struct gmx_pme_t *pmedata) + struct gmx_pme_t *pmedata, + gmx_bool bUseGPU, gmx_bool bSepPMERanks, + gmx_bool *bPrinting) { pme_load_balancing_t *pme_lb; real spm, sp; @@ -138,8 +153,10 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, snew(pme_lb, 1); + pme_lb->bSepPMERanks = bSepPMERanks; + /* Any number of stages >= 2 is supported */ - pme_lb->nstage = 2; + pme_lb->nstage = 2; pme_lb->cutoff_scheme = ir->cutoff_scheme; @@ -192,7 +209,7 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, pme_lb->setup[0].ewaldcoeff_q = ic->ewaldcoeff_q; pme_lb->setup[0].ewaldcoeff_lj = ic->ewaldcoeff_lj; - pme_lb->setup[0].pmedata = pmedata; + pme_lb->setup[0].pmedata = pmedata; spm = 0; for (d = 0; d < DIM; d++) @@ -221,7 +238,24 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, pme_lb->end = 0; pme_lb->elimited = epmelblimNO; - *pme_lb_p = pme_lb; + pme_lb->cycles_n = 0; + pme_lb->cycles_c = 0; + + /* Tune with GPUs and/or separate PME ranks. + * When running only on a CPU without PME ranks, PME tuning will only help + * with small numbers of atoms in the cut-off sphere. + */ + pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || bSepPMERanks)); + + /* With GPUs and no separate PME ranks we can't measure the PP/PME + * imbalance, so we start balancing right away. + * Otherwise we only start balancing after we observe imbalance. + */ + pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !bSepPMERanks)); + + *pme_lb_p = pme_lb; + + *bPrinting = pme_lb->bBalance; } /*! \brief Try to increase the cutoff during load balancing */ @@ -230,7 +264,7 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb, const gmx_domdec_t *dd) { pme_setup_t *set; - int npmenodes_x, npmenodes_y; + int npmeranks_x, npmeranks_y; real fac, sp; real tmpr_coulomb, tmpr_vdw; int d; @@ -242,7 +276,7 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb, set = &pme_lb->setup[pme_lb->n-1]; set->pmedata = NULL; - get_pme_nnodes(dd, &npmenodes_x, &npmenodes_y); + get_pme_nnodes(dd, &npmeranks_x, &npmeranks_y); fac = 1; do @@ -267,14 +301,14 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb, &set->grid[YY], &set->grid[ZZ]); - /* As here we can't easily check if one of the PME nodes + /* As here we can't easily check if one of the PME ranks * uses threading, we do a conservative grid check. * This means we can't use pme_order or less grid lines - * per PME node along x, which is not a strong restriction. + * per PME rank along x, which is not a strong restriction. 
*/ gmx_pme_check_restrictions(pme_order, set->grid[XX], set->grid[YY], set->grid[ZZ], - npmenodes_x, npmenodes_y, + npmeranks_x, npmeranks_y, TRUE, FALSE, &grid_ok); @@ -463,17 +497,31 @@ static void switch_to_stage1(pme_load_balancing_t *pme_lb) pme_lb->cur = pme_lb->start - 1; } -gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb, - t_commrec *cr, - FILE *fp_err, - FILE *fp_log, - t_inputrec *ir, - t_state *state, - double cycles, - interaction_const_t *ic, - struct nonbonded_verlet_t *nbv, - struct gmx_pme_t ** pmedata, - gmx_int64_t step) +/*! \brief Try to adjust the PME grid and Coulomb cut-off + * + * The adjustment is done to generate a different non-bonded PP and PME load. + * With separate PME ranks (PP and PME on different processes) or with + * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources + * and changing the load will affect the load balance and performance. + * The total time for a set of integration steps is monitored and a range + * of grid/cut-off setups is scanned. After calling pme_load_balance many + * times and acquiring enough statistics, the best performing setup is chosen. + * Here we try to take into account fluctuations and changes due to external + * factors as well as DD load balancing. + * Returns TRUE if the load balancing continues, FALSE if the balancing is done. + */ +static gmx_bool +pme_load_balance(pme_load_balancing_t *pme_lb, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_state *state, + double cycles, + interaction_const_t *ic, + struct nonbonded_verlet_t *nbv, + struct gmx_pme_t ** pmedata, + gmx_int64_t step) { gmx_bool OK; pme_setup_t *set; @@ -752,7 +800,7 @@ gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb, */ init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab); - if (cr->duty & DUTY_PME) + if (!pme_lb->bSepPMERanks) { if (pme_lb->setup[pme_lb->cur].pmedata == NULL) { @@ -767,7 +815,7 @@ gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb, } else { - /* Tell our PME-only node to switch grid */ + /* Tell our PME-only rank to switch grid */ gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj); } @@ -784,6 +832,117 @@ gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb, return TRUE; } +void pme_loadbal_do(pme_load_balancing_t *pme_lb, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_forcerec *fr, + t_state *state, + gmx_wallcycle_t wcycle, + gmx_int64_t step, + gmx_int64_t step_rel, + gmx_bool *bPrinting) +{ + int n_prev; + double cycles_prev; + + assert(pme_lb != NULL); + + if (!pme_lb->bActive) + { + return; + } + + n_prev = pme_lb->cycles_n; + cycles_prev = pme_lb->cycles_c; + wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c); + if (pme_lb->cycles_n == 0) + { + /* Before the first step we haven't done any steps yet */ + return; + } + /* Sanity check, we expect nstlist cycle counts */ + if (pme_lb->cycles_n - n_prev != ir->nstlist) + { + /* We could return here, but it's safer to issue an error and quit */ + gmx_incons("pme_loadbal_do called at an interval != nstlist"); + } + + /* PME grid + cut-off optimization with GPUs or PME ranks */ + if (!pme_lb->bBalance && pme_lb->bSepPMERanks) + { + if (DDMASTER(cr->dd)) + { + /* PME rank load is too high, start tuning */ + pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor); + } + dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance); + + if (pme_lb->bBalance && + use_GPU(fr->nbv) && DOMAINDECOMP(cr) && + pme_lb->bSepPMERanks) + { + /* Lock 
DLB=auto to off (does nothing when DLB=yes/no). + * With GPUs + separate PME ranks, we don't want DLB. + * This could happen when we scan coarse grids and + * it would then never be turned off again. + * This would hurt performance at the final, optimal + * grid spacing, where DLB almost never helps. + * Also, DLB can limit the cut-off for PME tuning. + */ + dd_dlb_set_lock(cr->dd, TRUE); + } + } + + if (pme_lb->bBalance) + { + /* init_step might not be a multiple of nstlist, + * but the first cycle is always skipped anyhow. + */ + pme_lb->bBalance = + pme_load_balance(pme_lb, cr, + fp_err, fp_log, + ir, state, pme_lb->cycles_c - cycles_prev, + fr->ic, fr->nbv, &fr->pmedata, + step); + + /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ + fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; + fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; + fr->rlist = fr->ic->rlist; + fr->rlistlong = fr->ic->rlistlong; + fr->rcoulomb = fr->ic->rcoulomb; + fr->rvdw = fr->ic->rvdw; + + if (ir->eDispCorr != edispcNO) + { + calc_enervirdiff(NULL, ir->eDispCorr, fr); + } + + if (!pme_lb->bBalance && + DOMAINDECOMP(cr) && + dd_dlb_is_locked(cr->dd)) + { + /* Unlock the DLB=auto, DLB is allowed to activate + * (but we don't expect it to activate in most cases). + */ + dd_dlb_set_lock(cr->dd, FALSE); + } + } + + if (!pme_lb->bBalance && + (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist))) + { + /* We have just deactivated the balancing and we're not measuring PP/PME + * imbalance during the first 50*nstlist steps: deactivate the tuning. + */ + pme_lb->bActive = FALSE; + } + + *bPrinting = pme_lb->bBalance; +} + void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n) { pme_lb->nstage += n; diff --git a/src/gromacs/ewald/pme-load-balancing.h b/src/gromacs/ewald/pme-load-balancing.h index 721ead76f7..2a46b2cb1e 100644 --- a/src/gromacs/ewald/pme-load-balancing.h +++ b/src/gromacs/ewald/pme-load-balancing.h @@ -51,42 +51,46 @@ #include "gromacs/legacyheaders/types/inputrec.h" #include "gromacs/legacyheaders/types/interaction_const.h" #include "gromacs/legacyheaders/types/state.h" +#include "gromacs/timing/wallcycle.h" /*! \brief Object to manage PME load balancing */ struct pme_load_balancing_t; -/*! \brief Initialze the PP-PME load balacing data and infrastructure */ +/*! \brief Initialize the PP-PME load balancing data and infrastructure + * + * Initialize the PP-PME load balancing data and infrastructure. + * The actual load balancing might start right away, later or never. + * Returns in bPrinting whether the load balancing is printing to fp_err. + * The PME grid in pmedata is reused for smaller grids to lower the memory + * usage. + */ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, const t_inputrec *ir, matrix box, const interaction_const_t *ic, - struct gmx_pme_t *pmedata); + struct gmx_pme_t *pmedata, + gmx_bool bUseGPU, + gmx_bool bSepPMERanks, + gmx_bool *bPrinting); -/*! \brief Try to adjust the PME grid and Coulomb cut-off. - * - * The adjustment is done to generate a different non-bonded PP and PME load. - * With separate PME nodes (PP and PME on different processes) or with - * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources - * and changing the load will affect the load balance and performance. - * The total time for a set of integration steps is monitored and a range - * of grid/cut-off setups is scanned. After calling pme_load_balance many - * times and acquiring enough statistics, the best performing setup is chosen. 
- * Here we try to take into account fluctuations and changes due to external - * factors as well as DD load balancing. +/*! \brief Process cycles and PME load balance when necessary * - * \return TRUE the load balancing continues, FALSE is the balancing is done. + * Process the cycles measured over the last nstlist steps and then + * either continue balancing or check if we need to trigger balancing. + * Should be called after the ewcSTEP cycle counter has been stopped. + * Returns in bPrinting whether the load balancing is printing to fp_err. */ -gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb, - t_commrec *cr, - FILE *fp_err, - FILE *fp_log, - t_inputrec *ir, - t_state *state, - double cycles, - interaction_const_t *ic, - struct nonbonded_verlet_t *nbv, - struct gmx_pme_t ** pmedata, - gmx_int64_t step); +void pme_loadbal_do(pme_load_balancing_t *pme_lb, + t_commrec *cr, + FILE *fp_err, + FILE *fp_log, + t_inputrec *ir, + t_forcerec *fr, + t_state *state, + gmx_wallcycle_t wcycle, + gmx_int64_t step, + gmx_int64_t step_rel, + gmx_bool *bPrinting); /*! \brief Restart the PME load balancing discarding all timings gathered up till now */ void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n); diff --git a/src/gromacs/timing/wallcycle.c b/src/gromacs/timing/wallcycle.c index 2c3848b22c..a084552875 100644 --- a/src/gromacs/timing/wallcycle.c +++ b/src/gromacs/timing/wallcycle.c @@ -340,6 +340,12 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc) return last; } +void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c) +{ + *n = wc->wcc[ewc].n; + *c = (double)wc->wcc[ewc].c; +} + void wallcycle_reset_all(gmx_wallcycle_t wc) { int i; diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 1ca527949f..af1f045ac0 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -98,6 +98,9 @@ void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc); double wallcycle_stop(gmx_wallcycle_t wc, int ewc); /* Stop the cycle count for ewc, returns the last cycle count */ +void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c); +/* Returns the cumulative count and cycle count for ewc */ + void wallcycle_reset_all(gmx_wallcycle_t wc); /* Resets all cycle counters to zero */ diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index a10d7f6b91..f6c3465c8d 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -226,9 +226,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], simulation stops. If equal to zero, don't communicate any more between multisims.*/ /* PME load balancing data for GPU kernels */ - pme_load_balancing_t *pme_loadbal = NULL; - double cycles_pmes; - gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE; + pme_load_balancing_t *pme_loadbal; + gmx_bool bPMETune = FALSE; + gmx_bool bPMETunePrinting = FALSE; /* Interactive MD */ gmx_bool bIMDstep = FALSE; @@ -465,26 +465,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], repl_ex_nst, repl_ex_nex, repl_ex_seed); } - /* PME tuning is only supported with GPUs or PME nodes and not with rerun. - * PME tuning is not supported with PME only for LJ and not for Coulomb. + /* PME tuning is only supported with PME for Coulomb. It is not supported + * with only LJ PME, or for reruns. 
*/ - if ((Flags & MD_TUNEPME) && - EEL_PME(fr->eeltype) && - ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) && - !bRerunMD) + bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD && + !(Flags & MD_REPRODUCIBLE)); + if (bPMETune) { - pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); - cycles_pmes = 0; - if (cr->duty & DUTY_PME) - { - /* Start tuning right away, as we can't measure the load */ - bPMETuneRunning = TRUE; - } - else - { - /* Separate PME nodes, we can measure the PP/PME load balance */ - bPMETuneTry = TRUE; - } + pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata, + use_GPU(fr->nbv), !(cr->duty & DUTY_PME), + &bPMETunePrinting); } if (!ir->bContinuation && !bRerunMD) @@ -729,6 +719,20 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], while (!bLastStep || (bRerunMD && bNotLastFrame)) { + /* Determine if this is a neighbor search step */ + bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); + + if (bPMETune && bNStList) + { + /* PME grid + cut-off optimization with GPUs or PME nodes */ + pme_loadbal_do(pme_loadbal, cr, + (bVerbose && MASTER(cr)) ? stderr : NULL, + fplog, + ir, fr, state, wcycle, + step, step_rel, + &bPMETunePrinting); + } + wallcycle_start(wcycle, ewcSTEP); if (bRerunMD) @@ -842,9 +846,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], } else { - /* Determine whether or not to do Neighbour Searching and LR */ - bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0); - bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP); } @@ -920,7 +921,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], state, &f, mdatoms, top, fr, vsite, shellfc, constr, nrnb, wcycle, - do_verbose && !bPMETuneRunning); + do_verbose && !bPMETunePrinting); wallcycle_stop(wcycle, ewcDOMDEC); } } @@ -1619,7 +1620,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], state->fep_state = lamnew; } /* Print the remaining wall clock time for the run */ - if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning) + if (MULTIMASTER(cr) && + (do_verbose || gmx_got_usr_signal()) && + !bPMETunePrinting) { if (shellfc) { @@ -1703,97 +1706,17 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], } } - if (!bRerunMD || !rerun_fr.bStep) - { - /* increase the MD step number */ - step++; - step_rel++; - } - cycles = wallcycle_stop(wcycle, ewcSTEP); if (DOMAINDECOMP(cr) && wcycle) { dd_cycles_add(cr->dd, cycles, ddCyclStep); } - if (bPMETuneRunning || bPMETuneTry) + if (!bRerunMD || !rerun_fr.bStep) { - /* PME grid + cut-off optimization with GPUs or PME nodes */ - - /* Count the total cycles over the last steps */ - cycles_pmes += cycles; - - /* We can only switch cut-off at NS steps */ - if (step % ir->nstlist == 0) - { - /* PME grid + cut-off optimization with GPUs or PME nodes */ - if (bPMETuneTry) - { - if (DDMASTER(cr->dd)) - { - /* PME node load is too high, start tuning */ - bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05); - } - dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); - - if (bPMETuneRunning && - use_GPU(fr->nbv) && DOMAINDECOMP(cr) && - !(cr->duty & DUTY_PME)) - { - /* Lock DLB=auto to off (does nothing when DLB=yes/no). - * With GPUs + separate PME ranks, we don't want DLB. - * This could happen when we scan coarse grids and - * it would then never be turned off again. 
- * This would hurt performance at the final, optimal - * grid spacing, where DLB almost never helps. - * Also, DLB can limit the cut-off for PME tuning. - */ - dd_dlb_set_lock(cr->dd, TRUE); - } - - if (bPMETuneRunning || step_rel > ir->nstlist*50) - { - bPMETuneTry = FALSE; - } - } - if (bPMETuneRunning) - { - /* init_step might not be a multiple of nstlist, - * but the first cycle is always skipped anyhow. - */ - bPMETuneRunning = - pme_load_balance(pme_loadbal, cr, - (bVerbose && MASTER(cr)) ? stderr : NULL, - fplog, - ir, state, cycles_pmes, - fr->ic, fr->nbv, &fr->pmedata, - step); - - /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ - fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; - fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; - fr->rlist = fr->ic->rlist; - fr->rlistlong = fr->ic->rlistlong; - fr->rcoulomb = fr->ic->rcoulomb; - fr->rvdw = fr->ic->rvdw; - - if (ir->eDispCorr != edispcNO) - { - calc_enervirdiff(NULL, ir->eDispCorr, fr); - } - - if (!bPMETuneRunning && - DOMAINDECOMP(cr) && - dd_dlb_is_locked(cr->dd)) - { - /* Unlock the DLB=auto, DLB is allowed to activate - * (but we don't expect it to activate in most cases). - */ - dd_dlb_set_lock(cr->dd, FALSE); - } - } - cycles_pmes = 0; - } + /* increase the MD step number */ + step++; + step_rel++; } if (step_rel == wcycle_get_reset_counters(wcycle) || @@ -1851,10 +1774,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], done_mdoutf(outf); debug_gmx(); - if (pme_loadbal != NULL) + if (bPMETune) { - pme_loadbal_done(pme_loadbal, cr, fplog, - use_GPU(fr->nbv)); + pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv)); } if (shellfc && fplog) -- 2.22.0
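Note (not part of the patch): the bPMETuneTry/bPMETuneRunning flag pair removed from md.cpp above becomes the bActive/bBalance fields of pme_load_balancing_t. The stand-alone C program below is only a toy model of that state machine for review purposes; the toy_* names are hypothetical and none of this is GROMACS code. It mimics the trigger at loadBalanceTriggerFactor (1.05) and the deactivation tied to PMETunePeriod as written in pme_loadbal_do() above.

#include <stdbool.h>
#include <stdio.h>

typedef struct {
    bool sepPmeRanks; /* like pme_lb->bSepPMERanks */
    bool active;      /* like pme_lb->bActive */
    bool balancing;   /* like pme_lb->bBalance */
} toy_pme_lb_t;

static const int    tunePeriod    = 50;   /* PMETunePeriod */
static const double triggerFactor = 1.05; /* loadBalanceTriggerFactor */

/* mirrors pme_loadbal_init(): tuning needs a GPU and/or separate PME ranks */
static void toy_init(toy_pme_lb_t *lb, bool useGpu, bool sepPmeRanks)
{
    lb->sepPmeRanks = sepPmeRanks;
    lb->active      = useGpu || sepPmeRanks;
    /* without separate PME ranks there is no imbalance to wait for */
    lb->balancing   = lb->active && useGpu && !sepPmeRanks;
}

/* mirrors the decisions in pme_loadbal_do(), called once per nstlist steps;
 * pmeRatio stands in for dd_pme_f_ratio(), scanDone for pme_load_balance()
 * returning FALSE */
static void toy_do(toy_pme_lb_t *lb, long stepRel, int nstlist,
                   double pmeRatio, bool scanDone)
{
    if (!lb->active)
    {
        return;
    }
    if (!lb->balancing && lb->sepPmeRanks)
    {
        /* PME ranks overloaded by more than 5%: start balancing */
        lb->balancing = (pmeRatio >= triggerFactor);
    }
    if (lb->balancing && scanDone)
    {
        lb->balancing = false;
    }
    if (!lb->balancing &&
        (!lb->sepPmeRanks || stepRel <= tunePeriod*nstlist))
    {
        lb->active = false; /* tuning stays off for the rest of the run */
    }
}

int main(void)
{
    toy_pme_lb_t lb;

    toy_init(&lb, /* useGpu = */ false, /* sepPmeRanks = */ true);
    /* first check after nstlist = 10 steps: 8% PME overload starts tuning */
    toy_do(&lb, 10, 10, 1.08, false);
    printf("balancing: %d, active: %d\n", (int)lb.balancing, (int)lb.active);
    return 0;
}

Built with e.g. cc -std=c99, the example reports that balancing starts on the first check because the simulated PME load ratio 1.08 exceeds the 1.05 trigger.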
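The new wallcycle_get() deliberately returns only cumulative values; pme_loadbal_do() derives per-interval numbers by differencing two successive reads and sanity-checking that exactly nstlist step counts passed in between. A minimal self-contained sketch of that delta pattern follows; toy_counter_t and toy_wallcycle_get() are hypothetical stand-ins for the counter entry in wallcycle.c and the real accessor.

#include <stdio.h>

typedef struct { int n; double c; } toy_counter_t;

/* same contract as the added wallcycle_get(): cumulative count and cycles */
static void toy_wallcycle_get(const toy_counter_t *wcc, int *n, double *c)
{
    *n = wcc->n;
    *c = wcc->c;
}

int main(void)
{
    toy_counter_t step    = { 0, 0.0 };
    int           nstlist = 10;
    int           n_prev  = 0, n = 0;
    double        c_prev  = 0.0, c = 0.0;

    for (int i = 1; i <= 2*nstlist; i++)
    {
        /* stand-in for wallcycle_start/stop(wcycle, ewcSTEP) around one step */
        step.n += 1;
        step.c += 1000.0 + (i > nstlist ? 250.0 : 0.0);

        if (i % nstlist == 0)
        {
            n_prev = n;
            c_prev = c;
            toy_wallcycle_get(&step, &n, &c);
            /* the sanity check from pme_loadbal_do() */
            if (n - n_prev != nstlist)
            {
                fprintf(stderr, "called at an interval != nstlist\n");
                return 1;
            }
            printf("cycles over the last %d steps: %g\n", nstlist, c - c_prev);
        }
    }
    return 0;
}

Keeping the accessor dumb and doing the differencing in the caller means wallcycle.c needs no knowledge of nstlist intervals.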