From 95b9f1461089c8e677df8b2f948c7b851c436f3a Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 5 Sep 2014 10:29:04 +0200 Subject: [PATCH] Avoid DLB with overloaded PME ranks When separate PME ranks have more load than the PP ranks, DLB cannot improve performance. It will actually make it worse, because the PME x/f redistribution time goes up. Now with -dlb=auto DLB is not turned on in this situation. Also DLB is not activated during PME tuning with GPUs and separate PME nodes, since it then nearly always deteriorates the performance. Change-Id: I1f5e649a9562fdca9ba538196f41a12feb0a4a24 --- src/gromacs/legacyheaders/domdec.h | 6 ++++ src/gromacs/mdlib/domdec.c | 56 ++++++++++++++++++++++++------ src/programs/mdrun/md.c | 25 +++++++++++++ 3 files changed, 77 insertions(+), 10 deletions(-) diff --git a/src/gromacs/legacyheaders/domdec.h b/src/gromacs/legacyheaders/domdec.h index efeef40614..240a7901cc 100644 --- a/src/gromacs/legacyheaders/domdec.h +++ b/src/gromacs/legacyheaders/domdec.h @@ -138,6 +138,12 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr); * possible after subsequently setting a shorter cut-off with change_dd_cutoff. */ +gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd); +/* Return if the DLB lock is set */ + +void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue); +/* Set a lock such that with DLB=auto DLB can (not) get turned on */ + void dd_setup_dlb_resource_sharing(t_commrec *cr, const gmx_hw_info_t *hwinfo, const gmx_hw_opt_t *hw_opt); diff --git a/src/gromacs/mdlib/domdec.c b/src/gromacs/mdlib/domdec.c index ca2514a5dd..6cc6b73895 100644 --- a/src/gromacs/mdlib/domdec.c +++ b/src/gromacs/mdlib/domdec.c @@ -269,6 +269,8 @@ typedef struct gmx_domdec_comm /* The DLB option */ int eDLB; + /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */ + gmx_bool bDLB_locked; /* Are we actually using DLB? 
*/ gmx_bool bDynLoadBal; @@ -385,9 +387,9 @@ typedef struct gmx_domdec_comm int eFlop; double flop; int flop_n; - /* Have often have did we have load measurements */ + /* How many times did we have load measurements */ int n_load_have; - /* Have often have we collected the load measurements */ + /* How many times have we collected the load measurements */ int n_load_collect; /* Statistics */ @@ -3462,7 +3464,7 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd, cell_size[i] = 1.0/ncd; } } - else if (dd_load_count(comm)) + else if (dd_load_count(comm) > 0) { load_aver = comm->load[d].sum_m/ncd; change_max = 0; @@ -6685,7 +6687,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, /* Initialize to GPU share count to 0, might change later */ comm->nrank_gpu_shared = 0; - comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->bDLB_locked = FALSE; comm->bDynLoadBal = (comm->eDLB == edlbYES); if (fplog) @@ -7585,6 +7588,20 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr) comm->PMELoadBal_max_cutoff = comm->cutoff; } +gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd) +{ + return dd->comm->bDLB_locked; +} + +void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue) +{ + /* We can only lock the DLB when it is set to auto, otherwise don't lock */ + if (dd->comm->eDLB == edlbAUTO) + { + dd->comm->bDLB_locked = bValue; + } +} + static void merge_cg_buffers(int ncell, gmx_domdec_comm_dim_t *cd, int pulse, int *ncg_cell, @@ -9349,17 +9366,18 @@ void dd_partition_system(FILE *fplog, } /* Check if we have recorded loads on the nodes */ - if (comm->bRecordLoad && dd_load_count(comm)) + if (comm->bRecordLoad && dd_load_count(comm) > 0) { - if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal) + if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd)) { /* Check if we should use DLB at the second partitioning * and every 
100 partitionings, * so the extra communication cost is negligible. */ - n = max(100, nstglobalcomm); + const int nddp_chk_dlb = 100; + bCheckDLB = (comm->n_load_collect == 0 || - comm->n_load_have % n == n-1); + comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1); } else { @@ -9397,8 +9415,26 @@ void dd_partition_system(FILE *fplog, /* Since the timings are node dependent, the master decides */ if (DDMASTER(dd)) { - bTurnOnDLB = - (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON); + /* Here we check if the max PME rank load is more than 0.98 + * the max PP force load. If so, PP DLB will not help, + * since we are (almost) limited by PME. Furthermore, + * DLB will cause a significant extra x/f redistribution + * cost on the PME ranks, which will then surely result + * in lower total performance. + * This check might be fragile, since one measurement + * below 0.98 (although only done once every 100 DD part.) + * could turn on DLB for the rest of the run. + */ + if (cr->npmenodes > 0 && + dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON) + { + bTurnOnDLB = FALSE; + } + else + { + bTurnOnDLB = + (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON); + } if (debug) { fprintf(debug, "step %s, imb loss %f\n", diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c index dc01b6de0c..3d98d597c7 100644 --- a/src/programs/mdrun/md.c +++ b/src/programs/mdrun/md.c @@ -1909,6 +1909,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], } dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning); + if (bPMETuneRunning && + fr->nbv->bUseGPU && DOMAINDECOMP(cr) && + !(cr->duty & DUTY_PME)) + { + /* Lock DLB=auto to off (does nothing when DLB=yes/no). + * With GPUs + separate PME ranks, we don't want DLB. + * This could happen when we scan coarse grids and + * it would then never be turned off again. + * This would hurt performance at the final, optimal + * grid spacing, where DLB almost never helps. + * Also, DLB can limit the cut-off for PME tuning. 
+ */ + dd_dlb_set_lock(cr->dd, TRUE); + } + if (bPMETuneRunning || step_rel > ir->nstlist*50) { bPMETuneTry = FALSE; @@ -1939,6 +1954,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], { calc_enervirdiff(NULL, ir->eDispCorr, fr); } + + if (!bPMETuneRunning && + DOMAINDECOMP(cr) && + dd_dlb_is_locked(cr->dd)) + { + /* Unlock the DLB=auto, DLB is allowed to activate + * (but we don't expect it to activate in most cases). + */ + dd_dlb_set_lock(cr->dd, FALSE); + } } cycles_pmes = 0; } -- 2.22.0