int eDLB;
/* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
gmx_bool bDLB_locked;
+ /* With eDLB=edlbAUTO, should we check whether to turn DLB on at the next DD? */
+ gmx_bool bCheckWhetherToTurnDlbOn;
/* Are we actually using DLB? */
gmx_bool bDynLoadBal;
/* Initialize to GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
- comm->bDLB_locked = FALSE;
+ comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ comm->bDLB_locked = FALSE;
+ comm->bCheckWhetherToTurnDlbOn = TRUE;
comm->bDynLoadBal = (comm->eDLB == edlbYES);
if (fplog)
return bCutoffAllowed;
}
-void change_dd_dlb_cutoff_limit(t_commrec *cr)
+void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
{
gmx_domdec_comm_t *comm;
comm->bPMELoadBalDLBLimits = TRUE;
/* Change the cut-off limit */
- comm->PMELoadBal_max_cutoff = comm->cutoff;
+ comm->PMELoadBal_max_cutoff = cutoff;
+
+ if (debug)
+ {
+ fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
+ comm->PMELoadBal_max_cutoff);
+ }
+}
+
+/* Records whether the next domain decomposition should examine the
+ * load imbalance data to decide if dynamic load balancing is to be
+ * turned on.
+ *
+ * The request is silently ignored unless eDLB is edlbAUTO and DLB
+ * is not locked.
+ *
+ * Called when PME load balancing unlocks DLB, so that the usefulness
+ * of DLB can be evaluated right away.
+ */
+static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
+{
+    if (dd->comm->eDLB != edlbAUTO || dd_dlb_is_locked(dd))
+    {
+        /* DLB can not be turned on now, leave the flag untouched */
+        return;
+    }
+
+    dd->comm->bCheckWhetherToTurnDlbOn = bValue;
+}
+
+/* Returns if we should check whether there has been enough load
+ * imbalance to trigger dynamic load balancing.
+ *
+ * FALSE is returned whenever DLB can not be turned on anyhow: when
+ * eDLB is not edlbAUTO, when DLB is already in use, or when DLB is
+ * currently locked. Otherwise TRUE is returned once after a check
+ * was requested (the request flag is cleared here), and whenever
+ * n_load_have modulo nddp_chk_dlb equals nddp_chk_dlb - 1.
+ */
+static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
+{
+    const int nddp_chk_dlb = 100;
+
+    if (dd->comm->eDLB != edlbAUTO)
+    {
+        return FALSE;
+    }
+
+    /* There is nothing to turn on when DLB is already active, and
+     * nothing may be turned on while DLB is locked. Without the lock
+     * test the request flag below could never be cleared while
+     * locked (the setter ignores requests in that state), so we
+     * would ask for a check, and its communication, at every
+     * partitioning.
+     */
+    if (dd->comm->bDynLoadBal || dd_dlb_is_locked(dd))
+    {
+        return FALSE;
+    }
+
+    /* We should check whether we should use DLB directly after
+     * unlocking DLB. */
+    if (dd->comm->bCheckWhetherToTurnDlbOn)
+    {
+        /* This flag was set at initialization or when the PME
+         * load-balancing routines unlocked DLB, and should now be
+         * cleared. */
+        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
+        return TRUE;
+    }
+
+    /* We should also check whether we should use DLB every 100
+     * partitionings (we do not do this every partitioning, so that we
+     * avoid excessive communication). */
+    return (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
+}
+
+/* Returns the bDynLoadBal flag of the DD communication setup, i.e.
+ * whether dynamic load balancing is actually in use.
+ */
+gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
+{
+ return dd->comm->bDynLoadBal;
}
gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
return dd->comm->bDLB_locked;
}
-void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+/* Sets the DLB lock flag (bDLB_locked), which marks DLB as currently
+ * not allowed to turn on. Only has an effect when eDLB is edlbAUTO.
+ */
+void dd_dlb_lock(gmx_domdec_t *dd)
{
- /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+ /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
if (dd->comm->eDLB == edlbAUTO)
{
- dd->comm->bDLB_locked = bValue;
+ dd->comm->bDLB_locked = TRUE;
+ }
+}
+
+/* Clears the DLB lock flag and, when DLB is not in use yet, requests
+ * a check on whether DLB should be turned on. Only has an effect
+ * when eDLB is edlbAUTO.
+ */
+void dd_dlb_unlock(gmx_domdec_t *dd)
+{
+    /* A lock can only be present when DLB is set to auto, otherwise don't do anything */
+    if (edlbAUTO == dd->comm->eDLB)
+    {
+        /* A check is only of use when DLB is not running already */
+        const gmx_bool bRequestCheck = !dd->comm->bDynLoadBal;
+
+        /* Release the lock first: the request below is ignored while locked */
+        dd->comm->bDLB_locked = FALSE;
+        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, bRequestCheck);
}
}
gmx_int64_t step_pcoupl;
rvec cell_ns_x0, cell_ns_x1;
int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
- gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
+ gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
gmx_bool bRedist, bSortCG, bResortAll;
ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
real grid_density;
/* Check if we have recorded loads on the nodes */
if (comm->bRecordLoad && dd_load_count(comm) > 0)
{
- if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
- {
- /* Check if we should use DLB at the second partitioning
- * and every 100 partitionings,
- * so the extra communication cost is negligible.
- */
- const int nddp_chk_dlb = 100;
- bCheckDLB = (comm->n_load_collect == 0 ||
- comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
- }
- else
- {
- bCheckDLB = FALSE;
- }
+ bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);
/* Print load every nstlog, first and last step to the log file */
bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
/* Avoid extra communication due to verbose screen output
* when nstglobalcomm is set.
*/
- if (bDoDLB || bLogLoad || bCheckDLB ||
+ if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
(bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
{
get_load_distribution(dd, wcycle);
}
comm->n_load_collect++;
- if (bCheckDLB)
+ if (bCheckWhetherToTurnDlbOn)
{
/* Since the timings are node dependent, the master decides */
if (DDMASTER(dd))
struct pme_load_balancing_t {
gmx_bool bSepPMERanks; /**< do we have separate PME ranks? */
gmx_bool bActive; /**< is PME tuning active? */
+ gmx_int64_t step_rel_stop; /**< stop the tuning after this value of step_rel */
+ gmx_bool bTriggerOnDLB; /**< trigger balancing only on DD DLB */
gmx_bool bBalance; /**< are we in the balancing phase, i.e. trying different setups? */
int nstage; /**< the current maximum number of stages */
matrix box_start; /**< the initial simulation box */
int n; /**< the count of setup as well as the allocation size */
pme_setup_t *setup; /**< the PME+cutoff setups */
- int cur; /**< the current setup */
- int fastest; /**< fastest setup up till now */
- int start; /**< start of setup range to consider in stage>0 */
- int end; /**< end of setup range to consider in stage>0 */
+ int cur; /**< the index (in setup) of the current setup */
+ int fastest; /**< index of the fastest setup up till now */
+ int lower_limit; /**< don't go below this setup index */
+ int start; /**< start of setup index range to consider in stage>0 */
+ int end; /**< end of setup index range to consider in stage>0 */
int elimited; /**< was the balancing limited, uses enum above */
int cutoff_scheme; /**< Verlet or group cut-offs */
double cycles_c; /**< step cycle counter cummulative cycles */
};
-void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
- const t_inputrec *ir, matrix box,
+void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
+ t_commrec *cr,
+ FILE *fp_log,
+ const t_inputrec *ir,
+ matrix box,
const interaction_const_t *ic,
- struct gmx_pme_t *pmedata,
- gmx_bool bUseGPU, gmx_bool bSepPMERanks,
- gmx_bool *bPrinting)
+ struct gmx_pme_t *pmedata,
+ gmx_bool bUseGPU,
+ gmx_bool *bPrinting)
{
pme_load_balancing_t *pme_lb;
real spm, sp;
snew(pme_lb, 1);
- pme_lb->bSepPMERanks = bSepPMERanks;
+ pme_lb->bSepPMERanks = !(cr->duty & DUTY_PME);
+
+ /* Initially we trigger balancing directly, based on the PP/PME imbalance */
+ pme_lb->bTriggerOnDLB = FALSE;
/* Any number of stages >= 2 is supported */
pme_lb->nstage = 2;
pme_lb->stage = 0;
- pme_lb->fastest = 0;
- pme_lb->start = 0;
- pme_lb->end = 0;
- pme_lb->elimited = epmelblimNO;
+ pme_lb->fastest = 0;
+ pme_lb->lower_limit = 0;
+ pme_lb->start = 0;
+ pme_lb->end = 0;
+ pme_lb->elimited = epmelblimNO;
pme_lb->cycles_n = 0;
pme_lb->cycles_c = 0;
* When running only on a CPU without PME ranks, PME tuning will only help
* with small numbers of atoms in the cut-off sphere.
*/
- pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || bSepPMERanks));
+ pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU ||
+ pme_lb->bSepPMERanks));
/* With GPUs and no separate PME ranks we can't measure the PP/PME
* imbalance, so we start balancing right away.
* Otherwise we only start balancing after we observe imbalance.
*/
- pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !bSepPMERanks));
+ pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks));
+
+ pme_lb->step_rel_stop = PMETunePeriod*ir->nstlist;
+
+ /* Delay DD load balancing when GPUs are used */
+ if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU)
+ {
+ /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+ * With GPUs and separate PME nodes, we want to first
+ * do PME tuning without DLB, since DLB might limit
+ * the cut-off, which never improves performance.
+ * We allow for DLB + PME tuning after a first round of tuning.
+ */
+ dd_dlb_lock(cr->dd);
+ if (dd_dlb_is_locked(cr->dd))
+ {
+ md_print_warn(cr, fp_log, "NOTE: DLB will not turn on during the first phase of PME tuning\n");
+ }
+ }
- *pme_lb_p = pme_lb;
+ *pme_lb_p = pme_lb;
*bPrinting = pme_lb->bBalance;
}
* In this stage, only reasonably fast setups are run again. */
static void switch_to_stage1(pme_load_balancing_t *pme_lb)
{
- pme_lb->start = 0;
- while (pme_lb->start+1 < pme_lb->n &&
+ pme_lb->start = pme_lb->lower_limit;
+ while (pme_lb->start + 1 < pme_lb->n &&
(pme_lb->setup[pme_lb->start].count == 0 ||
pme_lb->setup[pme_lb->start].cycles >
pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted))
{
pme_lb->start++;
}
- while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
+ while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start - 1].cycles == 0)
{
pme_lb->start--;
}
pme_lb->end = pme_lb->n;
- if (pme_lb->setup[pme_lb->end-1].count > 0 &&
- pme_lb->setup[pme_lb->end-1].cycles >
+ if (pme_lb->setup[pme_lb->end - 1].count > 0 &&
+ pme_lb->setup[pme_lb->end - 1].cycles >
pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)
{
pme_lb->end--;
pme_lb->stage = 1;
- /* Next we want to choose setup pme_lb->start, but as we will increase
- * pme_ln->cur by one right after returning, we subtract 1 here.
+ /* Next we want to choose setup pme_lb->end-1, but as we will decrease
+ * pme_lb->cur by one right after returning, we set cur to end.
*/
- pme_lb->cur = pme_lb->start - 1;
+ pme_lb->cur = pme_lb->end;
}
-/*! \brief Try to adjust the PME grid and Coulomb cut-off
+/*! \brief Process the timings and try to adjust the PME grid and Coulomb cut-off
*
* The adjustment is done to generate a different non-bonded PP and PME load.
* With separate PME ranks (PP and PME on different processes) or with
* times and acquiring enough statistics, the best performing setup is chosen.
* Here we try to take into account fluctuations and changes due to external
* factors as well as DD load balancing.
- * Returns TRUE the load balancing continues, FALSE is the balancing is done.
*/
-static gmx_bool
+static void
pme_load_balance(pme_load_balancing_t *pme_lb,
t_commrec *cr,
FILE *fp_err,
char buf[STRLEN], sbuf[22];
real rtab;
- if (pme_lb->stage == pme_lb->nstage)
- {
- return FALSE;
- }
-
if (PAR(cr))
{
gmx_sumd(1, &cycles, cr);
/* Skip the first cycle, because the first step after a switch
* is much slower due to allocation and/or caching effects.
*/
- return TRUE;
+ return;
}
sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
"Increased the number stages to %d"
" and ignoring the previous performance\n",
set->grid[XX], set->grid[YY], set->grid[ZZ],
- cycles*1e-6, set->cycles*1e-6, maxFluctuationAccepted,
+ set->cycles*1e-6, cycles*1e-6, maxFluctuationAccepted,
pme_lb->nstage);
}
}
* better overal performance can be obtained with a slightly
* shorter cut-off and better DD load balancing.
*/
- change_dd_dlb_cutoff_limit(cr);
+ set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistlong);
}
}
cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
if (pme_lb->stage > 0 && pme_lb->end == 1)
{
- pme_lb->cur = 0;
+ pme_lb->cur = pme_lb->lower_limit;
pme_lb->stage = pme_lb->nstage;
}
else if (pme_lb->stage > 0 && pme_lb->end > 1)
* which are not much slower than the fastest
* else:
* use the next setup
+ * Note that we loop backward to minimize the risk of the cut-off
+ * getting limited by DD DLB, since the DLB cut-off limit is set
+ * to the fastest PME setup.
*/
do
{
- pme_lb->cur++;
- if (pme_lb->cur == pme_lb->end)
+ /* Step down through the range [start, end). We only wrap around
+ * once setup start itself has been visited: wrapping on
+ * cur == start after decrementing would skip that setup, and
+ * would let cur drop below start when we enter with cur == start
+ * (as the DD failsafe path sets it).
+ */
+ if (pme_lb->cur > pme_lb->start)
+ {
+ pme_lb->cur--;
+ }
+ else
{
pme_lb->stage++;
- pme_lb->cur = pme_lb->start;
+
+ pme_lb->cur = pme_lb->end - 1;
}
}
while (pme_lb->stage == pme_lb->nstage - 1 &&
OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
if (!OK)
{
- /* Failsafe solution */
+ /* For some reason the chosen cut-off is incompatible with DD.
+ * We should continue scanning a more limited range of cut-off's.
+ */
if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
{
+ /* stage=nstage says we're finished, but we should continue
+ * balancing, so we set back stage which was just incremented.
+ */
pme_lb->stage--;
}
- pme_lb->fastest = 0;
- pme_lb->start = 0;
- pme_lb->end = pme_lb->cur;
- pme_lb->cur = pme_lb->start;
- pme_lb->elimited = epmelblimDD;
+ if (pme_lb->cur <= pme_lb->fastest)
+ {
+ /* This should not happen, as we set limits on the DLB bounds.
+ * But we implement a complete failsafe solution anyhow.
+ */
+ /* The format contains %.3f, so a value has to be passed; report
+ * the cut-off of the fastest setup before fastest is reset below.
+ * NOTE(review): rlistlong is the cut-off that is handed to
+ * change_dd_cutoff(); confirm this is the value to report.
+ */
+ md_print_warn(cr, fp_log, "The fastest PP/PME load balancing setting (cutoff %.3f nm) is no longer available due to DD DLB or box size limitations\n",
+ pme_lb->setup[pme_lb->fastest].rlistlong);
+ pme_lb->fastest = pme_lb->lower_limit;
+ pme_lb->start = pme_lb->lower_limit;
+ }
+ /* Limit the range to below the current cut-off, scan from start */
+ pme_lb->end = pme_lb->cur;
+ pme_lb->cur = pme_lb->start;
+ pme_lb->elimited = epmelblimDD;
print_loadbal_limited(fp_err, fp_log, step, pme_lb);
}
}
{
print_grid(fp_err, fp_log, "", "optimal", set, -1);
}
+}
- return TRUE;
+/*! \brief Set up a further round of PME load balancing
+ *
+ * \param[in,out] pme_lb        Pointer to PME load balancing struct
+ * \param[in]     bDlbUnlocked  TRUE if DLB was locked and is now unlocked
+ *
+ * When conditions change (e.g. DLB off/on, CPU/GPU throttling etc.),
+ * the PP/PME balance can shift and tuning again might pay off.
+ * This function raises the stage limit by 2 and resets the start of
+ * the setup range to the lower limit; the end of the range is kept.
+ */
+static void continue_pme_loadbal(pme_load_balancing_t *pme_lb,
+                                 gmx_bool              bDlbUnlocked)
+{
+    /* With separate PME ranks, DLB should always lower the PP load and
+     * can only increase the PME load (more communication and imbalance),
+     * so after unlocking DLB only longer cut-off's need to be scanned.
+     */
+    const gmx_bool bOnlyScanLongerCutoffs = (bDlbUnlocked && pme_lb->bSepPMERanks);
+
+    /* Allow for 2 more tuning stages */
+    pme_lb->nstage += 2;
+
+    if (bOnlyScanLongerCutoffs)
+    {
+        /* Never go below the setup that is currently in use */
+        pme_lb->lower_limit = pme_lb->cur;
+    }
+
+    pme_lb->start = pme_lb->lower_limit;
}
void pme_loadbal_do(pme_load_balancing_t *pme_lb,
/* PME grid + cut-off optimization with GPUs or PME ranks */
if (!pme_lb->bBalance && pme_lb->bSepPMERanks)
{
- if (DDMASTER(cr->dd))
+ if (pme_lb->bTriggerOnDLB)
{
- /* PME rank load is too high, start tuning */
- pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
+ pme_lb->bBalance = dd_dlb_is_on(cr->dd);
+ }
+ else
+ {
+ if (DDMASTER(cr->dd))
+ {
+ /* PME node load is too high, start tuning */
+ pme_lb->bBalance =
+ (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
+ }
+ dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
+ }
+
+ pme_lb->bActive = (pme_lb->bBalance ||
+ step_rel <= pme_lb->step_rel_stop);
+ }
+
+ /* The location in the code of this balancing termination is strange.
+ * You would expect to have it after the call to pme_load_balance()
+ * below, since there pme_lb->stage is updated.
+ * But when terminating directly after deciding on and selecting the
+ * optimal setup, DLB will turn on right away if it was locked before.
+ * This might be due to PME reinitialization. So we check stage here
+ * to allow for another nstlist steps with DLB locked to stabilize
+ * the performance.
+ */
+ if (pme_lb->bBalance && pme_lb->stage == pme_lb->nstage)
+ {
+ pme_lb->bBalance = FALSE;
+
+ if (DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd))
+ {
+ /* Unlock the DLB=auto, DLB is allowed to activate */
+ dd_dlb_unlock(cr->dd);
+ md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n");
+
+ /* We don't deactivate the tuning yet, since we will balance again
+ * after DLB gets turned on, if it does within PMETunePeriod*nstlist steps.
+ */
+ continue_pme_loadbal(pme_lb, TRUE);
+ pme_lb->bTriggerOnDLB = TRUE;
+ pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir->nstlist;
+ }
+ else
+ {
+ /* We're completely done with PME tuning */
+ pme_lb->bActive = FALSE;
}
- dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
- if (pme_lb->bBalance &&
- use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
- pme_lb->bSepPMERanks)
+ if (DOMAINDECOMP(cr))
{
- /* Lock DLB=auto to off (does nothing when DLB=yes/no).
- * With GPUs + separate PME ranks, we don't want DLB.
- * This could happen when we scan coarse grids and
- * it would then never be turned off again.
- * This would hurt performance at the final, optimal
- * grid spacing, where DLB almost never helps.
- * Also, DLB can limit the cut-off for PME tuning.
+ /* Set the cut-off limit to the final selected cut-off,
+ * so we don't have artificial DLB limits.
+ * This also ensures that we won't disable the currently
+ * optimal setting during a second round of PME balancing.
*/
- dd_dlb_set_lock(cr->dd, TRUE);
+ set_dd_dlb_max_cutoff(cr, fr->ic->rlistlong);
}
}
if (pme_lb->bBalance)
{
- /* init_step might not be a multiple of nstlist,
- * but the first cycle is always skipped anyhow.
+ /* We might not have collected nstlist steps in cycles yet,
+ * since init_step might not be a multiple of nstlist,
+ * but the first data collected is skipped anyhow.
*/
- pme_lb->bBalance =
- pme_load_balance(pme_lb, cr,
- fp_err, fp_log,
- ir, state, pme_lb->cycles_c - cycles_prev,
- fr->ic, fr->nbv, &fr->pmedata,
- step);
+ pme_load_balance(pme_lb, cr,
+ fp_err, fp_log,
+ ir, state, pme_lb->cycles_c - cycles_prev,
+ fr->ic, fr->nbv, &fr->pmedata,
+ step);
/* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q;
{
calc_enervirdiff(NULL, ir->eDispCorr, fr);
}
-
- if (!pme_lb->bBalance &&
- DOMAINDECOMP(cr) &&
- dd_dlb_is_locked(cr->dd))
- {
- /* Unlock the DLB=auto, DLB is allowed to activate
- * (but we don't expect it to activate in most cases).
- */
- dd_dlb_set_lock(cr->dd, FALSE);
- }
}
if (!pme_lb->bBalance &&
- (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist)))
+ (!pme_lb->bSepPMERanks || (step_rel > pme_lb->step_rel_stop)))
{
/* We have just deactivated the balancing and we're not measuring PP/PME
- * imbalance during the first 50*nstlist steps: deactivate the tuning.
+ * imbalance, i.e. we have no separate PME ranks or step_rel has passed
+ * step_rel_stop: deactivate the tuning. Up to step_rel_stop the tuning
+ * has to stay active, so that balancing can still be triggered.
*/
pme_lb->bActive = FALSE;
}
- *bPrinting = pme_lb->bBalance;
-}
+ if (!(pme_lb->bActive) && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd))
+ {
+ /* Make sure DLB is allowed when we deactivate PME tuning */
+ dd_dlb_unlock(cr->dd);
+ md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n");
+ }
-void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n)
-{
- pme_lb->nstage += n;
+ *bPrinting = pme_lb->bBalance;
}
/*! \brief Return product of the number of PME grid points in each dimension */