#include "config.h"
+#include <assert.h>
+
#include <cmath>
#include <algorithm>
#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
#include "gromacs/legacyheaders/calcgrid.h"
#include "gromacs/legacyheaders/force.h"
#include "gromacs/legacyheaders/md_logging.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/smalloc.h"
double cycles; /**< the fastest time for this setup in cycles */
};
+/*! \brief After 50 nstlist periods of not observing imbalance: never tune PME */
+const int PMETunePeriod = 50;
+/*! \brief Trigger PME load balancing at more than 5% PME overload */
+const real loadBalanceTriggerFactor = 1.05;
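+/* Worked example (editorial illustration; nstlist = 10 is a hypothetical
+ * value): with separate PME ranks, balancing starts once dd_pme_f_ratio()
+ * reports at least loadBalanceTriggerFactor (1.05, i.e. more than 5% PME
+ * overload); if that never happens within PMETunePeriod*nstlist =
+ * 50*10 = 500 steps, tuning is switched off for the rest of the run.
+ */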
/*! \brief In the initial scan, step by grids that are at least a factor 0.8 coarser */
const real gridScaleFactor = 0.8;
-
/*! \brief In the initial scan, try to skip grids with uneven x/y/z spacing,
* checking if the "efficiency" is more than 5% worse than the previous grid.
*/
{ "no", "box size", "domain decompostion", "PME grid restriction" };
struct pme_load_balancing_t {
+ gmx_bool bSepPMERanks; /**< do we have separate PME ranks? */
+ gmx_bool bActive; /**< is PME tuning active? */
+ gmx_bool bBalance; /**< are we in the balancing phase, i.e. trying different setups? */
int nstage; /**< the current maximum number of stages */
real cut_spacing; /**< the minimum cutoff / PME grid spacing ratio */
int cutoff_scheme; /**< Verlet or group cut-offs */
int stage; /**< the current stage */
+
+ int cycles_n; /**< step cycle counter cumulative count */
+ double cycles_c; /**< step cycle counter cumulative cycles */
};
void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
const t_inputrec *ir, matrix box,
const interaction_const_t *ic,
- struct gmx_pme_t *pmedata)
+ struct gmx_pme_t *pmedata,
+ gmx_bool bUseGPU, gmx_bool bSepPMERanks,
+ gmx_bool *bPrinting)
{
pme_load_balancing_t *pme_lb;
real spm, sp;
snew(pme_lb, 1);
+ pme_lb->bSepPMERanks = bSepPMERanks;
+
/* Any number of stages >= 2 is supported */
- pme_lb->nstage = 2;
+ pme_lb->nstage = 2;
pme_lb->cutoff_scheme = ir->cutoff_scheme;
pme_lb->setup[0].ewaldcoeff_q = ic->ewaldcoeff_q;
pme_lb->setup[0].ewaldcoeff_lj = ic->ewaldcoeff_lj;
- pme_lb->setup[0].pmedata = pmedata;
+ pme_lb->setup[0].pmedata = pmedata;
spm = 0;
for (d = 0; d < DIM; d++)
pme_lb->end = 0;
pme_lb->elimited = epmelblimNO;
- *pme_lb_p = pme_lb;
+ pme_lb->cycles_n = 0;
+ pme_lb->cycles_c = 0;
+
+ /* Tune with GPUs and/or separate PME ranks.
+ * When running only on a CPU without PME ranks, PME tuning will only help
+ * with small numbers of atoms in the cut-off sphere.
+ */
+ pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || bSepPMERanks));
+
+ /* With GPUs and no separate PME ranks we can't measure the PP/PME
+ * imbalance, so we start balancing right away.
+ * Otherwise we only start balancing after we observe imbalance.
+ */
+ pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !bSepPMERanks));
+
+ *pme_lb_p = pme_lb;
+
+ *bPrinting = pme_lb->bBalance;
}
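+
+/* Illustrative decision table (an editorial sketch, assuming
+ * wallcycle_have_counter() returns TRUE) for the activation logic above:
+ *
+ *   bUseGPU  bSepPMERanks  ->  bActive  bBalance at init
+ *   FALSE    FALSE             FALSE    FALSE   (CPU-only: no tuning)
+ *   FALSE    TRUE              TRUE     FALSE   (wait for PME overload)
+ *   TRUE     FALSE             TRUE     TRUE    (can't measure: start now)
+ *   TRUE     TRUE              TRUE     FALSE   (wait for PME overload)
+ */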
/*! \brief Try to increase the cutoff during load balancing */
const gmx_domdec_t *dd)
{
pme_setup_t *set;
- int npmenodes_x, npmenodes_y;
+ int npmeranks_x, npmeranks_y;
real fac, sp;
real tmpr_coulomb, tmpr_vdw;
int d;
set = &pme_lb->setup[pme_lb->n-1];
set->pmedata = NULL;
- get_pme_nnodes(dd, &npmenodes_x, &npmenodes_y);
+ get_pme_nnodes(dd, &npmeranks_x, &npmeranks_y);
fac = 1;
do
&set->grid[YY],
&set->grid[ZZ]);
- /* As here we can't easily check if one of the PME nodes
+ /* As here we can't easily check if one of the PME ranks
* uses threading, we do a conservative grid check.
* This means we can't use pme_order or less grid lines
- * per PME node along x, which is not a strong restriction.
+ * per PME rank along x, which is not a strong restriction.
*/
gmx_pme_check_restrictions(pme_order,
set->grid[XX], set->grid[YY], set->grid[ZZ],
- npmenodes_x, npmenodes_y,
+ npmeranks_x, npmeranks_y,
TRUE,
FALSE,
&grid_ok);
pme_lb->cur = pme_lb->start - 1;
}
-gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb,
- t_commrec *cr,
- FILE *fp_err,
- FILE *fp_log,
- t_inputrec *ir,
- t_state *state,
- double cycles,
- interaction_const_t *ic,
- struct nonbonded_verlet_t *nbv,
- struct gmx_pme_t ** pmedata,
- gmx_int64_t step)
+/*! \brief Try to adjust the PME grid and Coulomb cut-off
+ *
+ * The adjustment is done to generate a different non-bonded PP and PME load.
+ * With separate PME ranks (PP and PME on different processes) or with
+ * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources
+ * and changing the load will affect the load balance and performance.
+ * The total time for a set of integration steps is monitored and a range
+ * of grid/cut-off setups is scanned. After calling pme_load_balance many
+ * times and acquiring enough statistics, the best performing setup is chosen.
+ * Here we try to take into account fluctuations and changes due to external
+ * factors as well as DD load balancing.
+ * Returns TRUE if the load balancing continues, FALSE if the balancing is done.
+ */
+static gmx_bool
+pme_load_balance(pme_load_balancing_t *pme_lb,
+ t_commrec *cr,
+ FILE *fp_err,
+ FILE *fp_log,
+ t_inputrec *ir,
+ t_state *state,
+ double cycles,
+ interaction_const_t *ic,
+ struct nonbonded_verlet_t *nbv,
+ struct gmx_pme_t ** pmedata,
+ gmx_int64_t step)
{
gmx_bool OK;
pme_setup_t *set;
*/
init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
- if (cr->duty & DUTY_PME)
+ if (!pme_lb->bSepPMERanks)
{
if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
{
}
else
{
- /* Tell our PME-only node to switch grid */
+ /* Tell our PME-only rank to switch grid */
gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
}
return TRUE;
}
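+
+/* A minimal sketch (hypothetical helper, simplified from the real
+ * bookkeeping) of how the winning setup is picked from the per-setup
+ * timings: each pme_setup_t keeps the fastest observed interval time in
+ * its 'cycles' field, and after the final stage the setup with the
+ * minimum value wins.
+ *
+ *   static int fastest_setup(const pme_load_balancing_t *pme_lb)
+ *   {
+ *       int i, best = 0;
+ *
+ *       for (i = 1; i < pme_lb->n; i++)
+ *       {
+ *           if (pme_lb->setup[i].cycles < pme_lb->setup[best].cycles)
+ *           {
+ *               best = i;
+ *           }
+ *       }
+ *       return best;
+ *   }
+ */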
+void pme_loadbal_do(pme_load_balancing_t *pme_lb,
+ t_commrec *cr,
+ FILE *fp_err,
+ FILE *fp_log,
+ t_inputrec *ir,
+ t_forcerec *fr,
+ t_state *state,
+ gmx_wallcycle_t wcycle,
+ gmx_int64_t step,
+ gmx_int64_t step_rel,
+ gmx_bool *bPrinting)
+{
+ int n_prev;
+ double cycles_prev;
+
+ assert(pme_lb != NULL);
+
+ if (!pme_lb->bActive)
+ {
+ return;
+ }
+
+ n_prev = pme_lb->cycles_n;
+ cycles_prev = pme_lb->cycles_c;
+ wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
+ if (pme_lb->cycles_n == 0)
+ {
+ /* Before the first step we haven't done any steps yet */
+ return;
+ }
+ /* Sanity check, we expect nstlist cycle counts */
+ if (pme_lb->cycles_n - n_prev != ir->nstlist)
+ {
+ /* We could return here, but it's safer to issue an error and quit */
+ gmx_incons("pme_loadbal_do called at an interval != nstlist");
+ }
+
+ /* PME grid + cut-off optimization with GPUs or PME ranks */
+ if (!pme_lb->bBalance && pme_lb->bSepPMERanks)
+ {
+ if (DDMASTER(cr->dd))
+ {
+ /* PME rank load is too high, start tuning */
+ pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
+ }
+ dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
+
+ if (pme_lb->bBalance &&
+ use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
+ pme_lb->bSepPMERanks)
+ {
+ /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+ * With GPUs + separate PME ranks, we don't want DLB.
+ * This could happen when we scan coarse grids and
+ * it would then never be turned off again.
+ * This would hurt performance at the final, optimal
+ * grid spacing, where DLB almost never helps.
+ * Also, DLB can limit the cut-off for PME tuning.
+ */
+ dd_dlb_set_lock(cr->dd, TRUE);
+ }
+ }
+
+ if (pme_lb->bBalance)
+ {
+ /* init_step might not be a multiple of nstlist,
+ * but the first cycle is always skipped anyhow.
+ */
+ pme_lb->bBalance =
+ pme_load_balance(pme_lb, cr,
+ fp_err, fp_log,
+ ir, state, pme_lb->cycles_c - cycles_prev,
+ fr->ic, fr->nbv, &fr->pmedata,
+ step);
+
+ /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q;
+ fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
+ fr->rlist = fr->ic->rlist;
+ fr->rlistlong = fr->ic->rlistlong;
+ fr->rcoulomb = fr->ic->rcoulomb;
+ fr->rvdw = fr->ic->rvdw;
+
+ if (ir->eDispCorr != edispcNO)
+ {
+ calc_enervirdiff(NULL, ir->eDispCorr, fr);
+ }
+
+ if (!pme_lb->bBalance &&
+ DOMAINDECOMP(cr) &&
+ dd_dlb_is_locked(cr->dd))
+ {
+ /* Unlock the DLB=auto, DLB is allowed to activate
+ * (but we don't expect it to activate in most cases).
+ */
+ dd_dlb_set_lock(cr->dd, FALSE);
+ }
+ }
+
+ if (!pme_lb->bBalance &&
+ (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist)))
+ {
+ /* We have just deactivated the balancing and we're not measuring PP/PME
+ * imbalance during the first 50*nstlist steps: deactivate the tuning.
+ */
+ pme_lb->bActive = FALSE;
+ }
+
+ *bPrinting = pme_lb->bBalance;
+}
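+
+/* Illustrative accounting for the cycle bookkeeping above (cycles_n and
+ * cycles_c are cumulative, as filled in by wallcycle_get): the cost of the
+ * last nstlist steps is obtained as a difference of running totals,
+ *
+ *   interval_cycles = pme_lb->cycles_c - cycles_prev;
+ *
+ * which is the 'cycles' value passed to pme_load_balance, so no per-step
+ * timing or counter reset is needed between intervals.
+ */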
+
void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n)
{
pme_lb->nstage += n;
#include "gromacs/legacyheaders/types/inputrec.h"
#include "gromacs/legacyheaders/types/interaction_const.h"
#include "gromacs/legacyheaders/types/state.h"
+#include "gromacs/timing/wallcycle.h"
/*! \brief Object to manage PME load balancing */
struct pme_load_balancing_t;
-/*! \brief Initialze the PP-PME load balacing data and infrastructure */
+/*! \brief Initialize the PP-PME load balancing data and infrastructure
+ *
+ * Initialize the PP-PME load balancing data and infrastructure.
+ * The actual load balancing might start right away, later or never.
+ * Returns in bPrinting whether the load balancing is printing to fp_err.
+ * The PME grid in pmedata is reused for smaller grids to lower the memory
+ * usage.
+ */
void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
const t_inputrec *ir,
matrix box,
const interaction_const_t *ic,
- struct gmx_pme_t *pmedata);
+ struct gmx_pme_t *pmedata,
+ gmx_bool bUseGPU,
+ gmx_bool bSepPMERanks,
+ gmx_bool *bPrinting);
-/*! \brief Try to adjust the PME grid and Coulomb cut-off.
- *
- * The adjustment is done to generate a different non-bonded PP and PME load.
- * With separate PME nodes (PP and PME on different processes) or with
- * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources
- * and changing the load will affect the load balance and performance.
- * The total time for a set of integration steps is monitored and a range
- * of grid/cut-off setups is scanned. After calling pme_load_balance many
- * times and acquiring enough statistics, the best performing setup is chosen.
- * Here we try to take into account fluctuations and changes due to external
- * factors as well as DD load balancing.
+/*! \brief Process cycles and perform PME load balancing when necessary
*
- * \return TRUE the load balancing continues, FALSE is the balancing is done.
+ * Process the cycles measured over the last nstlist steps and then
+ * either continue balancing or check if we need to trigger balancing.
+ * Should be called after the ewcSTEP cycle counter has been stopped.
+ * Returns in bPrinting whether the load balancing is printing to fp_err.
*/
-gmx_bool pme_load_balance(pme_load_balancing_t *pme_lb,
- t_commrec *cr,
- FILE *fp_err,
- FILE *fp_log,
- t_inputrec *ir,
- t_state *state,
- double cycles,
- interaction_const_t *ic,
- struct nonbonded_verlet_t *nbv,
- struct gmx_pme_t ** pmedata,
- gmx_int64_t step);
+void pme_loadbal_do(pme_load_balancing_t *pme_lb,
+ t_commrec *cr,
+ FILE *fp_err,
+ FILE *fp_log,
+ t_inputrec *ir,
+ t_forcerec *fr,
+ t_state *state,
+ gmx_wallcycle_t wcycle,
+ gmx_int64_t step,
+ gmx_int64_t step_rel,
+ gmx_bool *bPrinting);
/*! \brief Restart the PME load balancing discarding all timings gathered up till now */
void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n);
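+
+/* Sketch of the intended call sequence (editorial illustration; the loop is
+ * schematic and pme_loadbal_done is the existing cleanup call used from
+ * md.cpp, not declared in this excerpt):
+ *
+ *   pme_load_balancing_t *pme_lb;
+ *   gmx_bool              bPrinting;
+ *
+ *   pme_loadbal_init(&pme_lb, ir, state->box, fr->ic, fr->pmedata,
+ *                    bUseGPU, bSepPMERanks, &bPrinting);
+ *   for (step = ir->init_step; !bLastStep; step++)
+ *   {
+ *       if (ir->nstlist > 0 && step % ir->nstlist == 0)
+ *       {
+ *           // process cycles of the previous interval, possibly rebalance
+ *           pme_loadbal_do(pme_lb, cr, fp_err, fp_log, ir, fr, state,
+ *                          wcycle, step, step - ir->init_step, &bPrinting);
+ *       }
+ *       // ... one MD step, wrapped in ewcSTEP cycle counting ...
+ *   }
+ */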
simulation stops. If equal to zero, don't
communicate any more between multisims.*/
/* PME load balancing data for GPU kernels */
- pme_load_balancing_t *pme_loadbal = NULL;
- double cycles_pmes;
- gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+ pme_load_balancing_t *pme_loadbal;
+ gmx_bool bPMETune = FALSE;
+ gmx_bool bPMETunePrinting = FALSE;
/* Interactive MD */
gmx_bool bIMDstep = FALSE;
repl_ex_nst, repl_ex_nex, repl_ex_seed);
}
- /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
- * PME tuning is not supported with PME only for LJ and not for Coulomb.
+ /* PME tuning is only supported with PME for Coulomb. It is not supported
+ * with LJ-only PME, with reruns, or when reproducibility is requested.
*/
- if ((Flags & MD_TUNEPME) &&
- EEL_PME(fr->eeltype) &&
- ( use_GPU(fr->nbv) || !(cr->duty & DUTY_PME)) &&
- !bRerunMD)
+ bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+ !(Flags & MD_REPRODUCIBLE));
+ if (bPMETune)
{
- pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
- cycles_pmes = 0;
- if (cr->duty & DUTY_PME)
- {
- /* Start tuning right away, as we can't measure the load */
- bPMETuneRunning = TRUE;
- }
- else
- {
- /* Separate PME nodes, we can measure the PP/PME load balance */
- bPMETuneTry = TRUE;
- }
+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata,
+ use_GPU(fr->nbv), !(cr->duty & DUTY_PME),
+ &bPMETunePrinting);
}
if (!ir->bContinuation && !bRerunMD)
while (!bLastStep || (bRerunMD && bNotLastFrame))
{
+ /* Determine if this is a neighbor search step */
+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
+
+ if (bPMETune && bNStList)
+ {
+ /* PME grid + cut-off optimization with GPUs or PME ranks */
+ pme_loadbal_do(pme_loadbal, cr,
+ (bVerbose && MASTER(cr)) ? stderr : NULL,
+ fplog,
+ ir, fr, state, wcycle,
+ step, step_rel,
+ &bPMETunePrinting);
+ }
+
wallcycle_start(wcycle, ewcSTEP);
if (bRerunMD)
}
else
{
- /* Determine whether or not to do Neighbour Searching and LR */
- bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
-
bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP);
}
state, &f, mdatoms, top, fr,
vsite, shellfc, constr,
nrnb, wcycle,
- do_verbose && !bPMETuneRunning);
+ do_verbose && !bPMETunePrinting);
wallcycle_stop(wcycle, ewcDOMDEC);
}
}
state->fep_state = lamnew;
}
/* Print the remaining wall clock time for the run */
- if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+ if (MULTIMASTER(cr) &&
+ (do_verbose || gmx_got_usr_signal()) &&
+ !bPMETunePrinting)
{
if (shellfc)
{
}
}
- if (!bRerunMD || !rerun_fr.bStep)
- {
- /* increase the MD step number */
- step++;
- step_rel++;
- }
-
cycles = wallcycle_stop(wcycle, ewcSTEP);
if (DOMAINDECOMP(cr) && wcycle)
{
dd_cycles_add(cr->dd, cycles, ddCyclStep);
}
- if (bPMETuneRunning || bPMETuneTry)
+ if (!bRerunMD || !rerun_fr.bStep)
{
- /* PME grid + cut-off optimization with GPUs or PME nodes */
-
- /* Count the total cycles over the last steps */
- cycles_pmes += cycles;
-
- /* We can only switch cut-off at NS steps */
- if (step % ir->nstlist == 0)
- {
- /* PME grid + cut-off optimization with GPUs or PME nodes */
- if (bPMETuneTry)
- {
- if (DDMASTER(cr->dd))
- {
- /* PME node load is too high, start tuning */
- bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
- }
- dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
-
- if (bPMETuneRunning &&
- use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
- !(cr->duty & DUTY_PME))
- {
- /* Lock DLB=auto to off (does nothing when DLB=yes/no).
- * With GPUs + separate PME ranks, we don't want DLB.
- * This could happen when we scan coarse grids and
- * it would then never be turned off again.
- * This would hurt performance at the final, optimal
- * grid spacing, where DLB almost never helps.
- * Also, DLB can limit the cut-off for PME tuning.
- */
- dd_dlb_set_lock(cr->dd, TRUE);
- }
-
- if (bPMETuneRunning || step_rel > ir->nstlist*50)
- {
- bPMETuneTry = FALSE;
- }
- }
- if (bPMETuneRunning)
- {
- /* init_step might not be a multiple of nstlist,
- * but the first cycle is always skipped anyhow.
- */
- bPMETuneRunning =
- pme_load_balance(pme_loadbal, cr,
- (bVerbose && MASTER(cr)) ? stderr : NULL,
- fplog,
- ir, state, cycles_pmes,
- fr->ic, fr->nbv, &fr->pmedata,
- step);
-
- /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
- fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q;
- fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
- fr->rlist = fr->ic->rlist;
- fr->rlistlong = fr->ic->rlistlong;
- fr->rcoulomb = fr->ic->rcoulomb;
- fr->rvdw = fr->ic->rvdw;
-
- if (ir->eDispCorr != edispcNO)
- {
- calc_enervirdiff(NULL, ir->eDispCorr, fr);
- }
-
- if (!bPMETuneRunning &&
- DOMAINDECOMP(cr) &&
- dd_dlb_is_locked(cr->dd))
- {
- /* Unlock the DLB=auto, DLB is allowed to activate
- * (but we don't expect it to activate in most cases).
- */
- dd_dlb_set_lock(cr->dd, FALSE);
- }
- }
- cycles_pmes = 0;
- }
+ /* increase the MD step number */
+ step++;
+ step_rel++;
}
if (step_rel == wcycle_get_reset_counters(wcycle) ||
done_mdoutf(outf);
debug_gmx();
- if (pme_loadbal != NULL)
+ if (bPMETune)
{
- pme_loadbal_done(pme_loadbal, cr, fplog,
- use_GPU(fr->nbv));
+ pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
}
if (shellfc && fplog)