From: Berk Hess Date: Mon, 4 Aug 2014 15:51:03 +0000 (+0200) Subject: Improve DLB+PME tuning with GPUs X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=9975f1ad4a07dfa3d9c8aeeb83adf1b13c1cab18;p=alexxy%2Fgromacs.git Improve DLB+PME tuning with GPUs With GPUs, the DD DLB can quickly limit the room for PME load balancing too much. In such cases (and only with DLB=auto) we now first do PME load balancing without DLB and then, if DLB gets turned on, a second round of PME load balancing. Also fixed that when DLB limited the tuning, the fastest choice was reset, which would often lead to stronger limitations. Change-Id: I0087e6b8512d5574d8d0fa2db82e6e38279a82f1 --- diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 4f31d1fe54..fb7b8eec2f 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -301,6 +301,8 @@ typedef struct gmx_domdec_comm int eDLB; /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */ gmx_bool bDLB_locked; + /* With eDLB=edlbAUTO, should we check whether to turn DLB on at the next DD? */ + gmx_bool bCheckWhetherToTurnDlbOn; /* Are we actually using DLB? 
*/ gmx_bool bDynLoadBal; @@ -6703,8 +6705,9 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr, /* Initialize to GPU share count to 0, might change later */ comm->nrank_gpu_shared = 0; - comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); - comm->bDLB_locked = FALSE; + comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir); + comm->bDLB_locked = FALSE; + comm->bCheckWhetherToTurnDlbOn = TRUE; comm->bDynLoadBal = (comm->eDLB == edlbYES); if (fplog) @@ -7590,7 +7593,7 @@ gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir, return bCutoffAllowed; } -void change_dd_dlb_cutoff_limit(t_commrec *cr) +void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff) { gmx_domdec_comm_t *comm; @@ -7600,7 +7603,65 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr) comm->bPMELoadBalDLBLimits = TRUE; /* Change the cut-off limit */ - comm->PMELoadBal_max_cutoff = comm->cutoff; + comm->PMELoadBal_max_cutoff = cutoff; + + if (debug) + { + fprintf(debug, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n", + comm->PMELoadBal_max_cutoff); + } +} + +/* Sets whether we should later check the load imbalance data, so that + * we can trigger dynamic load balancing if enough imbalance has + * arisen. + * + * Used after PME load balancing unlocks DLB, so that the check + * whether DLB will be useful can happen immediately. + */ +static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue) +{ + if (dd->comm->eDLB == edlbAUTO && !dd_dlb_is_locked(dd)) + { + dd->comm->bCheckWhetherToTurnDlbOn = bValue; + } +} + +/* Returns if we should check whether there has been enough load + * imbalance to trigger dynamic load balancing. 
+ */ +static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd) +{ + const int nddp_chk_dlb = 100; + + if (dd->comm->eDLB != edlbAUTO) + { + return FALSE; + } + + /* We should check whether we should use DLB directly after + * unlocking DLB. */ + if (dd->comm->bCheckWhetherToTurnDlbOn) + { + /* This flag was set when the PME load-balancing routines + unlocked DLB, and should now be cleared. */ + dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE); + return TRUE; + } + /* We should also check whether we should use DLB every 100 + * partitionings (we do not do this every partitioning, so that we + * avoid excessive communication). */ + if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1) + { + return TRUE; + } + + return FALSE; +} + +gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd) +{ + return dd->comm->bDynLoadBal; } gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd) @@ -7608,12 +7669,22 @@ gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd) return dd->comm->bDLB_locked; } -void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue) +void dd_dlb_lock(gmx_domdec_t *dd) { - /* We can only lock the DLB when it is set to auto, otherwise don't lock */ + /* We can only lock the DLB when it is set to auto, otherwise don't do anything */ if (dd->comm->eDLB == edlbAUTO) { - dd->comm->bDLB_locked = bValue; + dd->comm->bDLB_locked = TRUE; + } +} + +void dd_dlb_unlock(gmx_domdec_t *dd) +{ + /* We can only unlock the DLB when it is set to auto, otherwise don't do anything */ + if (dd->comm->eDLB == edlbAUTO) + { + dd->comm->bDLB_locked = FALSE; + dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, !dd->comm->bDynLoadBal); } } @@ -9324,7 +9395,7 @@ void dd_partition_system(FILE *fplog, gmx_int64_t step_pcoupl; rvec cell_ns_x0, cell_ns_x1; int i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum; - gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad; + gmx_bool bBoxChanged, bNStGlobalComm, bDoDLB, 
bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad; gmx_bool bRedist, bSortCG, bResortAll; ivec ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np; real grid_density; @@ -9387,20 +9458,7 @@ void dd_partition_system(FILE *fplog, /* Check if we have recorded loads on the nodes */ if (comm->bRecordLoad && dd_load_count(comm) > 0) { - if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd)) - { - /* Check if we should use DLB at the second partitioning - * and every 100 partitionings, - * so the extra communication cost is negligible. - */ - const int nddp_chk_dlb = 100; - bCheckDLB = (comm->n_load_collect == 0 || - comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1); - } - else - { - bCheckDLB = FALSE; - } + bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd); /* Print load every nstlog, first and last step to the log file */ bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) || @@ -9411,7 +9469,7 @@ void dd_partition_system(FILE *fplog, /* Avoid extra communication due to verbose screen output * when nstglobalcomm is set. */ - if (bDoDLB || bLogLoad || bCheckDLB || + if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn || (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist))) { get_load_distribution(dd, wcycle); @@ -9428,7 +9486,7 @@ void dd_partition_system(FILE *fplog, } comm->n_load_collect++; - if (bCheckDLB) + if (bCheckWhetherToTurnDlbOn) { /* Since the timings are node dependent, the master decides */ if (DDMASTER(dd)) diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index 645dbcc951..f7cf983207 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2005,2006,2007,2008,2009,2010,2012,2013,2014, by the GROMACS development team, led by + * Copyright (c) 2005,2006,2007,2008,2009,2010,2012,2013,2014,2015, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -173,16 +173,23 @@ gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir, * * Domain boundary changes due to the DD dynamic load balancing can limit * the cut-off distance that can be set in change_dd_cutoff. This function - * limits the DLB such that using the currently set cut-off should still be - * possible after subsequently setting a shorter cut-off with change_dd_cutoff. + * sets/changes the DLB limit such that using the passed (pair-list) cut-off + * should still be possible after subsequently setting a shorter cut-off + * with change_dd_cutoff. */ -void change_dd_dlb_cutoff_limit(t_commrec *cr); +void set_dd_dlb_max_cutoff(t_commrec *cr, real cutoff); + +/*! \brief Return if we are currently using dynamic load balancing */ +gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd); /*! \brief Return if the DLB lock is set */ gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd); -/*! \brief Set a lock such that with DLB=auto DLB can (not) get turned on */ -void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue); +/*! \brief Set a lock such that with DLB=auto DLB cannot get turned on */ +void dd_dlb_lock(gmx_domdec_t *dd); + +/*! \brief Clear a lock such that with DLB=auto DLB may get turned on later */ +void dd_dlb_unlock(gmx_domdec_t *dd); /*! 
\brief Set up communication for averaging GPU wait times over ranks * diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp index fab16ec2b7..f8facc5621 100644 --- a/src/gromacs/ewald/pme-load-balancing.cpp +++ b/src/gromacs/ewald/pme-load-balancing.cpp @@ -115,6 +115,8 @@ const char *pmelblim_str[epmelblimNR] = struct pme_load_balancing_t { gmx_bool bSepPMERanks; /**< do we have separate PME ranks? */ gmx_bool bActive; /**< is PME tuning active? */ + gmx_int64_t step_rel_stop; /**< stop the tuning after this value of step_rel */ + gmx_bool bTriggerOnDLB; /**< trigger balancing only on DD DLB */ gmx_bool bBalance; /**< are we in the balancing phase, i.e. trying different setups? */ int nstage; /**< the current maximum number of stages */ @@ -127,10 +129,11 @@ struct pme_load_balancing_t { matrix box_start; /**< the initial simulation box */ int n; /**< the count of setup as well as the allocation size */ pme_setup_t *setup; /**< the PME+cutoff setups */ - int cur; /**< the current setup */ - int fastest; /**< fastest setup up till now */ - int start; /**< start of setup range to consider in stage>0 */ - int end; /**< end of setup range to consider in stage>0 */ + int cur; /**< the index (in setup) of the current setup */ + int fastest; /**< index of the fastest setup up till now */ + int lower_limit; /**< don't go below this setup index */ + int start; /**< start of setup index range to consider in stage>0 */ + int end; /**< end of setup index range to consider in stage>0 */ int elimited; /**< was the balancing limited, uses enum above */ int cutoff_scheme; /**< Verlet or group cut-offs */ @@ -140,12 +143,15 @@ struct pme_load_balancing_t { double cycles_c; /**< step cycle counter cummulative cycles */ }; -void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, - const t_inputrec *ir, matrix box, +void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, + t_commrec *cr, + FILE *fp_log, + const t_inputrec *ir, + matrix box, 
const interaction_const_t *ic, - struct gmx_pme_t *pmedata, - gmx_bool bUseGPU, gmx_bool bSepPMERanks, - gmx_bool *bPrinting) + struct gmx_pme_t *pmedata, + gmx_bool bUseGPU, + gmx_bool *bPrinting) { pme_load_balancing_t *pme_lb; real spm, sp; @@ -153,7 +159,10 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, snew(pme_lb, 1); - pme_lb->bSepPMERanks = bSepPMERanks; + pme_lb->bSepPMERanks = !(cr->duty & DUTY_PME); + + /* Initially we turn on balancing directly on based on PP/PME imbalance */ + pme_lb->bTriggerOnDLB = FALSE; /* Any number of stages >= 2 is supported */ pme_lb->nstage = 2; @@ -233,10 +242,11 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, pme_lb->stage = 0; - pme_lb->fastest = 0; - pme_lb->start = 0; - pme_lb->end = 0; - pme_lb->elimited = epmelblimNO; + pme_lb->fastest = 0; + pme_lb->lower_limit = 0; + pme_lb->start = 0; + pme_lb->end = 0; + pme_lb->elimited = epmelblimNO; pme_lb->cycles_n = 0; pme_lb->cycles_c = 0; @@ -245,15 +255,34 @@ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, * When running only on a CPU without PME ranks, PME tuning will only help * with small numbers of atoms in the cut-off sphere. */ - pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || bSepPMERanks)); + pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || + pme_lb->bSepPMERanks)); /* With GPUs and no separate PME ranks we can't measure the PP/PME * imbalance, so we start balancing right away. * Otherwise we only start balancing after we observe imbalance. */ - pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !bSepPMERanks)); + pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks)); + + pme_lb->step_rel_stop = PMETunePeriod*ir->nstlist; + + /* Delay DD load balancing when GPUs are used */ + if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU) + { + /* Lock DLB=auto to off (does nothing when DLB=yes/no. 
+ * With GPUs and separate PME nodes, we want to first + * do PME tuning without DLB, since DLB might limit + * the cut-off, which never improves performance. + * We allow for DLB + PME tuning after a first round of tuning. + */ + dd_dlb_lock(cr->dd); + if (dd_dlb_is_locked(cr->dd)) + { + md_print_warn(cr, fp_log, "NOTE: DLB will not turn on during the first phase of PME tuning\n"); + } + } - *pme_lb_p = pme_lb; + *pme_lb_p = pme_lb; *bPrinting = pme_lb->bBalance; } @@ -468,22 +497,22 @@ static void print_loadbal_limited(FILE *fp_err, FILE *fp_log, * In this stage, only reasonably fast setups are run again. */ static void switch_to_stage1(pme_load_balancing_t *pme_lb) { - pme_lb->start = 0; - while (pme_lb->start+1 < pme_lb->n && + pme_lb->start = pme_lb->lower_limit; + while (pme_lb->start + 1 < pme_lb->n && (pme_lb->setup[pme_lb->start].count == 0 || pme_lb->setup[pme_lb->start].cycles > pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)) { pme_lb->start++; } - while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0) + while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start - 1].cycles == 0) { pme_lb->start--; } pme_lb->end = pme_lb->n; - if (pme_lb->setup[pme_lb->end-1].count > 0 && - pme_lb->setup[pme_lb->end-1].cycles > + if (pme_lb->setup[pme_lb->end - 1].count > 0 && + pme_lb->setup[pme_lb->end - 1].cycles > pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted) { pme_lb->end--; @@ -491,13 +520,13 @@ static void switch_to_stage1(pme_load_balancing_t *pme_lb) pme_lb->stage = 1; - /* Next we want to choose setup pme_lb->start, but as we will increase - * pme_ln->cur by one right after returning, we subtract 1 here. + /* Next we want to choose setup pme_lb->end-1, but as we will decrease + * pme_ln->cur by one right after returning, we set cur to end. */ - pme_lb->cur = pme_lb->start - 1; + pme_lb->cur = pme_lb->end; } -/*! \brief Try to adjust the PME grid and Coulomb cut-off +/*! 
\brief Process the timings and try to adjust the PME grid and Coulomb cut-off * * The adjustment is done to generate a different non-bonded PP and PME load. * With separate PME ranks (PP and PME on different processes) or with @@ -508,9 +537,8 @@ static void switch_to_stage1(pme_load_balancing_t *pme_lb) * times and acquiring enough statistics, the best performing setup is chosen. * Here we try to take into account fluctuations and changes due to external * factors as well as DD load balancing. - * Returns TRUE the load balancing continues, FALSE is the balancing is done. */ -static gmx_bool +static void pme_load_balance(pme_load_balancing_t *pme_lb, t_commrec *cr, FILE *fp_err, @@ -529,11 +557,6 @@ pme_load_balance(pme_load_balancing_t *pme_lb, char buf[STRLEN], sbuf[22]; real rtab; - if (pme_lb->stage == pme_lb->nstage) - { - return FALSE; - } - if (PAR(cr)) { gmx_sumd(1, &cycles, cr); @@ -550,7 +573,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, /* Skip the first cycle, because the first step after a switch * is much slower due to allocation and/or caching effects. */ - return TRUE; + return; } sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf)); @@ -576,7 +599,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, "Increased the number stages to %d" " and ignoring the previous performance\n", set->grid[XX], set->grid[YY], set->grid[ZZ], - cycles*1e-6, set->cycles*1e-6, maxFluctuationAccepted, + set->cycles*1e-6, cycles*1e-6, maxFluctuationAccepted, pme_lb->nstage); } } @@ -601,7 +624,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, * better overal performance can be obtained with a slightly * shorter cut-off and better DD load balancing. 
*/ - change_dd_dlb_cutoff_limit(cr); + set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistlong); } } cycles_fast = pme_lb->setup[pme_lb->fastest].cycles; @@ -690,7 +713,7 @@ pme_load_balance(pme_load_balancing_t *pme_lb, if (pme_lb->stage > 0 && pme_lb->end == 1) { - pme_lb->cur = 0; + pme_lb->cur = pme_lb->lower_limit; pme_lb->stage = pme_lb->nstage; } else if (pme_lb->stage > 0 && pme_lb->end > 1) @@ -700,14 +723,18 @@ pme_load_balance(pme_load_balancing_t *pme_lb, * which are not much slower than the fastest * else: * use the next setup + * Note that we loop backward to minimize the risk of the cut-off + * getting limited by DD DLB, since the DLB cut-off limit is set + * to the fastest PME setup. */ do { - pme_lb->cur++; - if (pme_lb->cur == pme_lb->end) + pme_lb->cur--; + if (pme_lb->cur == pme_lb->start) { pme_lb->stage++; - pme_lb->cur = pme_lb->start; + + pme_lb->cur = pme_lb->end - 1; } } while (pme_lb->stage == pme_lb->nstage - 1 && @@ -726,16 +753,29 @@ pme_load_balance(pme_load_balancing_t *pme_lb, OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong); if (!OK) { - /* Failsafe solution */ + /* For some reason the chosen cut-off is incompatible with DD. + * We should continue scanning a more limited range of cut-off's. + */ if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage) { + /* stage=nstage says we're finished, but we should continue + * balancing, so we set back stage which was just incremented. + */ pme_lb->stage--; } - pme_lb->fastest = 0; - pme_lb->start = 0; - pme_lb->end = pme_lb->cur; - pme_lb->cur = pme_lb->start; - pme_lb->elimited = epmelblimDD; + if (pme_lb->cur <= pme_lb->fastest) + { + /* This should not happen, as we set limits on the DLB bounds. + * But we implement a complete failsafe solution anyhow. 
+ */ + md_print_warn(cr, fp_log, "The fastest PP/PME load balancing setting (cutoff %.3f nm) is no longer available due to DD DLB or box size limitations\n"); + pme_lb->fastest = pme_lb->lower_limit; + pme_lb->start = pme_lb->lower_limit; + } + /* Limit the range to below the current cut-off, scan from start */ + pme_lb->end = pme_lb->cur; + pme_lb->cur = pme_lb->start; + pme_lb->elimited = epmelblimDD; print_loadbal_limited(fp_err, fp_log, step, pme_lb); } } @@ -823,8 +863,31 @@ pme_load_balance(pme_load_balancing_t *pme_lb, { print_grid(fp_err, fp_log, "", "optimal", set, -1); } +} - return TRUE; +/*! \brief Prepare for another round of PME load balancing + * + * \param[in,out] pme_lb Pointer to PME load balancing struct + * \param[in] bDlbUnlocked TRUE if DLB was locked and is now unlocked + * + * If the conditions (e.g. DLB off/on, CPU/GPU throttling etc.) changed, + * the PP/PME balance might change and re-balancing can improve performance. + * This function adds 2 stages and adjusts the considered setup range. + */ +static void continue_pme_loadbal(pme_load_balancing_t *pme_lb, + gmx_bool bDlbUnlocked) +{ + /* Add 2 tuning stages, keep the detected end of the setup range */ + pme_lb->nstage += 2; + if (bDlbUnlocked && pme_lb->bSepPMERanks) + { + /* With separate PME ranks, DLB should always lower the PP load and + * can only increase the PME load (more communication and imbalance), + * so we only need to scan longer cut-off's. 
+ */ + pme_lb->lower_limit = pme_lb->cur; + } + pme_lb->start = pme_lb->lower_limit; } void pme_loadbal_do(pme_load_balancing_t *pme_lb, @@ -867,40 +930,79 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, /* PME grid + cut-off optimization with GPUs or PME ranks */ if (!pme_lb->bBalance && pme_lb->bSepPMERanks) { - if (DDMASTER(cr->dd)) + if (pme_lb->bTriggerOnDLB) { - /* PME rank load is too high, start tuning */ - pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor); + pme_lb->bBalance = dd_dlb_is_on(cr->dd); + } + else + { + if (DDMASTER(cr->dd)) + { + /* PME node load is too high, start tuning */ + pme_lb->bBalance = + (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor); + } + dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance); + } + + pme_lb->bActive = (pme_lb->bBalance || + step_rel <= pme_lb->step_rel_stop); + } + + /* The location in the code of this balancing termination is strange. + * You would expect to have it after the call to pme_load_balance() + * below, since there pme_lb->stage is updated. + * But when terminating directly after deciding on and selecting the + * optimal setup, DLB will turn on right away if it was locked before. + * This might be due to PME reinitialization. So we check stage here + * to allow for another nstlist steps with DLB locked to stabilize + * the performance. + */ + if (pme_lb->bBalance && pme_lb->stage == pme_lb->nstage) + { + pme_lb->bBalance = FALSE; + + if (DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd)) + { + /* Unlock the DLB=auto, DLB is allowed to activate */ + dd_dlb_unlock(cr->dd); + md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n"); + + /* We don't deactivate the tuning yet, since we will balance again + * after DLB gets turned on, if it does within PMETune_period. 
+ */ + continue_pme_loadbal(pme_lb, TRUE); + pme_lb->bTriggerOnDLB = TRUE; + pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir->nstlist; + } + else + { + /* We're completely done with PME tuning */ + pme_lb->bActive = FALSE; } - dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance); - if (pme_lb->bBalance && - use_GPU(fr->nbv) && DOMAINDECOMP(cr) && - pme_lb->bSepPMERanks) + if (DOMAINDECOMP(cr)) { - /* Lock DLB=auto to off (does nothing when DLB=yes/no). - * With GPUs + separate PME ranks, we don't want DLB. - * This could happen when we scan coarse grids and - * it would then never be turned off again. - * This would hurt performance at the final, optimal - * grid spacing, where DLB almost never helps. - * Also, DLB can limit the cut-off for PME tuning. + /* Set the cut-off limit to the final selected cut-off, + * so we don't have artificial DLB limits. + * This also ensures that we won't disable the currently + * optimal setting during a second round of PME balancing. */ - dd_dlb_set_lock(cr->dd, TRUE); + set_dd_dlb_max_cutoff(cr, fr->ic->rlistlong); } } if (pme_lb->bBalance) { - /* init_step might not be a multiple of nstlist, - * but the first cycle is always skipped anyhow. + /* We might not have collected nstlist steps in cycles yet, + * since init_step might not be a multiple of nstlist, + * but the first data collected is skipped anyhow. 
*/ - pme_lb->bBalance = - pme_load_balance(pme_lb, cr, - fp_err, fp_log, - ir, state, pme_lb->cycles_c - cycles_prev, - fr->ic, fr->nbv, &fr->pmedata, - step); + pme_load_balance(pme_lb, cr, + fp_err, fp_log, + ir, state, pme_lb->cycles_c - cycles_prev, + fr->ic, fr->nbv, &fr->pmedata, + step); /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; @@ -914,33 +1016,25 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, { calc_enervirdiff(NULL, ir->eDispCorr, fr); } - - if (!pme_lb->bBalance && - DOMAINDECOMP(cr) && - dd_dlb_is_locked(cr->dd)) - { - /* Unlock the DLB=auto, DLB is allowed to activate - * (but we don't expect it to activate in most cases). - */ - dd_dlb_set_lock(cr->dd, FALSE); - } } if (!pme_lb->bBalance && - (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist))) + (!pme_lb->bSepPMERanks || (step_rel <= pme_lb->step_rel_stop))) { /* We have just deactivated the balancing and we're not measuring PP/PME - * imbalance during the first 50*nstlist steps: deactivate the tuning. + * imbalance during the first steps of the run: deactivate the tuning. */ pme_lb->bActive = FALSE; } - *bPrinting = pme_lb->bBalance; -} + if (!(pme_lb->bActive) && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd)) + { + /* Make sure DLB is allowed when we deactivate PME tuning */ + dd_dlb_unlock(cr->dd); + md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n"); + } -void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n) -{ - pme_lb->nstage += n; + *bPrinting = pme_lb->bBalance; } /*! \brief Return product of the number of PME grid points in each dimension */ diff --git a/src/gromacs/ewald/pme-load-balancing.h b/src/gromacs/ewald/pme-load-balancing.h index 2a46b2cb1e..4b307cd76d 100644 --- a/src/gromacs/ewald/pme-load-balancing.h +++ b/src/gromacs/ewald/pme-load-balancing.h @@ -65,12 +65,13 @@ struct pme_load_balancing_t; * usage. 
*/ void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, + t_commrec *cr, + FILE *fp_log, const t_inputrec *ir, matrix box, const interaction_const_t *ic, struct gmx_pme_t *pmedata, gmx_bool bUseGPU, - gmx_bool bSepPMERanks, gmx_bool *bPrinting); /*! \brief Process cycles and PME load balance when necessary @@ -92,12 +93,10 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb, gmx_int64_t step_rel, gmx_bool *bPrinting); -/*! \brief Restart the PME load balancing discarding all timings gathered up till now */ -void restart_pme_loadbal(pme_load_balancing_t *pme_lb, int n); - /*! \brief Finish the PME load balancing and print the settings when fplog!=NULL */ void pme_loadbal_done(pme_load_balancing_t *pme_lb, - t_commrec *cr, FILE *fplog, - gmx_bool bNonBondedOnGPU); + t_commrec *cr, + FILE *fplog, + gmx_bool bNonBondedOnGPU); #endif diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp index 505b9a7ca3..f137273195 100644 --- a/src/programs/mdrun/md.cpp +++ b/src/programs/mdrun/md.cpp @@ -475,8 +475,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], !(Flags & MD_REPRODUCIBLE)); if (bPMETune) { - pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata, - use_GPU(fr->nbv), !(cr->duty & DUTY_PME), + pme_loadbal_init(&pme_loadbal, cr, fplog, ir, state->box, + fr->ic, fr->pmedata, use_GPU(fr->nbv), &bPMETunePrinting); }