From: Berk Hess
Date: Wed, 1 Jul 2015 13:04:13 +0000 (+0200)
Subject: Fix two PME DLB trigger issues
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=5e1339d82fbbb7e4a231ca7ebb290729e8beee56;p=alexxy%2Fgromacs.git

Fix two PME DLB trigger issues

Dynamic load balancing got triggered while locked by PME
load balancing, because a check was placed incorrectly.
PME load balancing would never trigger with separate PME ranks
because a comparison was inverted.

Fixes #1760.
Fixes #1763.

Change-Id: I75eeb32423b864f84bfd45ecb61d169b473ed74a
---

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index e8de945b94..e6910aefbc 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -5580,6 +5580,9 @@ static float dd_f_imbal(gmx_domdec_t *dd)
 
 float dd_pme_f_ratio(gmx_domdec_t *dd)
 {
+    /* Should only be called on the DD master rank */
+    assert(DDMASTER(dd));
+
     if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
     {
         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
@@ -7655,8 +7658,10 @@ void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff)
  */
 static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
 {
-    if (dd->comm->eDLB == edlbAUTO && !dd_dlb_is_locked(dd))
+    if (dd->comm->eDLB == edlbAUTO)
     {
+        assert(!dd_dlb_is_locked(dd));
+
         dd->comm->bCheckWhetherToTurnDlbOn = bValue;
     }
 }
@@ -7668,7 +7673,7 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
 {
     const int nddp_chk_dlb = 100;
 
-    if (dd->comm->eDLB != edlbAUTO)
+    if (dd->comm->eDLB != edlbAUTO || dd_dlb_is_locked(dd))
     {
         return FALSE;
     }
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp
index f8facc5621..089361143a 100644
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -934,11 +934,17 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
         {
             pme_lb->bBalance = dd_dlb_is_on(cr->dd);
         }
-        else
+        /* We should ignore the first timing to avoid timing allocation
+         * overhead. And since the PME load balancing is called just
+         * before DD repartitioning, the ratio returned by dd_pme_f_ratio
+         * is not over the last nstlist steps, but the nstlist steps before
+         * that. So the first useful ratio is available at step_rel=3*nstlist.
+         */
+        else if (step_rel >= 3*ir->nstlist)
         {
             if (DDMASTER(cr->dd))
             {
-                /* PME node load is too high, start tuning */
+                /* If PME rank load is too high, start tuning */
                 pme_lb->bBalance =
                     (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
             }
@@ -1019,7 +1025,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
     }
 
     if (!pme_lb->bBalance &&
-        (!pme_lb->bSepPMERanks || (step_rel <= pme_lb->step_rel_stop)))
+        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
     {
         /* We have just deactivated the balancing and we're not measuring PP/PME
          * imbalance during the first steps of the run: deactivate the tuning.