/* The DLB option */
int eDLB;
+ /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+ gmx_bool bDLB_locked;
/* Are we actually using DLB? */
gmx_bool bDynLoadBal;
int eFlop;
double flop;
int flop_n;
- /* Have often have did we have load measurements */
+ /* How many times have we had load measurements */
int n_load_have;
- /* Have often have we collected the load measurements */
+ /* How many times have we collected the load measurements */
int n_load_collect;
/* Statistics */
cell_size[i] = 1.0/ncd;
}
}
- else if (dd_load_count(comm))
+ else if (dd_load_count(comm) > 0)
{
load_aver = comm->load[d].sum_m/ncd;
change_max = 0;
/* Initialize to GPU share count to 0, might change later */
comm->nrank_gpu_shared = 0;
- comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+ comm->bDLB_locked = FALSE;
comm->bDynLoadBal = (comm->eDLB == edlbYES);
if (fplog)
comm->PMELoadBal_max_cutoff = comm->cutoff;
}
+/* Returns whether the DLB lock is set, i.e. whether with eDLB=edlbAUTO
+ * dynamic load balancing can currently not be turned on.
+ */
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+{
+ return dd->comm->bDLB_locked;
+}
+
+/* Sets (bValue=TRUE) or clears (bValue=FALSE) the DLB lock.
+ * Only has an effect with eDLB=edlbAUTO; with edlbYES/edlbNO the
+ * call is silently ignored, so dd_dlb_is_locked() stays FALSE.
+ */
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+{
+ /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+ if (dd->comm->eDLB == edlbAUTO)
+ {
+ dd->comm->bDLB_locked = bValue;
+ }
+}
+
static void merge_cg_buffers(int ncell,
gmx_domdec_comm_dim_t *cd, int pulse,
int *ncg_cell,
}
/* Check if we have recorded loads on the nodes */
- if (comm->bRecordLoad && dd_load_count(comm))
+ if (comm->bRecordLoad && dd_load_count(comm) > 0)
{
- if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+ if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
{
/* Check if we should use DLB at the second partitioning
* and every 100 partitionings,
* so the extra communication cost is negligible.
*/
- n = max(100, nstglobalcomm);
+ const int nddp_chk_dlb = 100;
+
bCheckDLB = (comm->n_load_collect == 0 ||
- comm->n_load_have % n == n-1);
+ comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
}
else
{
/* Since the timings are node dependent, the master decides */
if (DDMASTER(dd))
{
- bTurnOnDLB =
- (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ /* Here we check if the max PME rank load is more than 0.98
+ * the max PP force load. If so, PP DLB will not help,
+ * since we are (almost) limited by PME. Furthermore,
+ * DLB will cause a significant extra x/f redistribution
+ * cost on the PME ranks, which will then surely result
+ * in lower total performance.
+ * This check might be fragile, since one measurement
+ * below 0.98 (although only done once every 100 DD part.)
+ * could turn on DLB for the rest of the run.
+ */
+ if (cr->npmenodes > 0 &&
+ dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+ {
+ bTurnOnDLB = FALSE;
+ }
+ else
+ {
+ bTurnOnDLB =
+ (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+ }
if (debug)
{
fprintf(debug, "step %s, imb loss %f\n",
}
dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+ if (bPMETuneRunning &&
+ fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+ !(cr->duty & DUTY_PME))
+ {
+ /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+ * With GPUs + separate PME ranks, we don't want DLB.
+ * DLB could get turned on while we scan coarse grids and
+ * would then never be turned off again.
+ * This would hurt performance at the final, optimal
+ * grid spacing, where DLB almost never helps.
+ * Also, DLB can limit the cut-off for PME tuning.
+ */
+ dd_dlb_set_lock(cr->dd, TRUE);
+ }
+
if (bPMETuneRunning || step_rel > ir->nstlist*50)
{
bPMETuneTry = FALSE;
{
calc_enervirdiff(NULL, ir->eDispCorr, fr);
}
+
+ if (!bPMETuneRunning &&
+ DOMAINDECOMP(cr) &&
+ dd_dlb_is_locked(cr->dd))
+ {
+ /* Unlock the DLB=auto, DLB is allowed to activate
+ * (but we don't expect it to activate in most cases).
+ */
+ dd_dlb_set_lock(cr->dd, FALSE);
+ }
}
cycles_pmes = 0;
}