#include "pme_internal.h"
/*! \brief Parameters and settings for one PP-PME setup */
-struct pme_setup_t {
- real rcut_coulomb; /**< Coulomb cut-off */
- real rlistOuter; /**< cut-off for the outer pair-list */
- real rlistInner; /**< cut-off for the inner pair-list */
- real spacing; /**< (largest) PME grid spacing */
- ivec grid; /**< the PME grid dimensions */
- real grid_efficiency; /**< ineffiency factor for non-uniform grids <= 1 */
- real ewaldcoeff_q; /**< Electrostatic Ewald coefficient */
- real ewaldcoeff_lj; /**< LJ Ewald coefficient, only for the call to send_switchgrid */
- struct gmx_pme_t *pmedata; /**< the data structure used in the PME code */
- int count; /**< number of times this setup has been timed */
- double cycles; /**< the fastest time for this setup in cycles */
+struct pme_setup_t
+{
+ real rcut_coulomb; /**< Coulomb cut-off */
+ real rlistOuter; /**< cut-off for the outer pair-list */
+ real rlistInner; /**< cut-off for the inner pair-list */
+ real spacing; /**< (largest) PME grid spacing */
+ ivec grid; /**< the PME grid dimensions */
+    real grid_efficiency; /**< inefficiency factor for non-uniform grids <= 1 */
+ real ewaldcoeff_q; /**< Electrostatic Ewald coefficient */
+ real ewaldcoeff_lj; /**< LJ Ewald coefficient, only for the call to send_switchgrid */
+ struct gmx_pme_t* pmedata; /**< the data structure used in the PME code */
+ int count; /**< number of times this setup has been timed */
+ double cycles; /**< the fastest time for this setup in cycles */
};
/*! \brief After 50 nstlist periods of not observing imbalance: never tune PME */
-const int PMETunePeriod = 50;
+const int PMETunePeriod = 50;
/*! \brief Trigger PME load balancing at more than 5% PME overload */
const real loadBalanceTriggerFactor = 1.05;
/*! \brief Scale the grid by at most a factor of 1.7.
const real maxFluctuationAccepted = 1.02;
/*! \brief Enumeration whose values describe the effect limiting the load balancing */
-enum epmelb {
- epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimPMEGRID, epmelblimMAXSCALING, epmelblimNR
+enum epmelb
+{
+ epmelblimNO,
+ epmelblimBOX,
+ epmelblimDD,
+ epmelblimPMEGRID,
+ epmelblimMAXSCALING,
+ epmelblimNR
};
/*! \brief Descriptive strings matching ::epmelb */
-static const char *pmelblim_str[epmelblimNR] =
-{ "no", "box size", "domain decompostion", "PME grid restriction", "maximum allowed grid scaling" };
+static const char* pmelblim_str[epmelblimNR] = { "no", "box size", "domain decomposition",
+ "PME grid restriction",
+ "maximum allowed grid scaling" };
-struct pme_load_balancing_t {
- gmx_bool bSepPMERanks; /**< do we have separate PME ranks? */
- gmx_bool bActive; /**< is PME tuning active? */
- int64_t step_rel_stop; /**< stop the tuning after this value of step_rel */
- gmx_bool bTriggerOnDLB; /**< trigger balancing only on DD DLB */
- gmx_bool bBalance; /**< are we in the balancing phase, i.e. trying different setups? */
- int nstage; /**< the current maximum number of stages */
+struct pme_load_balancing_t
+{
+ gmx_bool bSepPMERanks; /**< do we have separate PME ranks? */
+ gmx_bool bActive; /**< is PME tuning active? */
+ int64_t step_rel_stop; /**< stop the tuning after this value of step_rel */
+ gmx_bool bTriggerOnDLB; /**< trigger balancing only on DD DLB */
+ gmx_bool bBalance; /**< are we in the balancing phase, i.e. trying different setups? */
+ int nstage; /**< the current maximum number of stages */
real cut_spacing; /**< the minimum cutoff / PME grid spacing ratio */
real rcut_vdw; /**< Vdw cutoff (does not change) */
int cur; /**< the index (in setup) of the current setup */
int fastest; /**< index of the fastest setup up till now */
int lower_limit; /**< don't go below this setup index */
- int start; /**< start of setup index range to consider in stage>0 */
- int end; /**< end of setup index range to consider in stage>0 */
- int elimited; /**< was the balancing limited, uses enum above */
- int cutoff_scheme; /**< Verlet or group cut-offs */
+ int start; /**< start of setup index range to consider in stage>0 */
+ int end; /**< end of setup index range to consider in stage>0 */
+ int elimited; /**< was the balancing limited, uses enum above */
+ int cutoff_scheme; /**< Verlet or group cut-offs */
- int stage; /**< the current stage */
+ int stage; /**< the current stage */
- int cycles_n; /**< step cycle counter cummulative count */
- double cycles_c; /**< step cycle counter cummulative cycles */
+    int    cycles_n; /**< step cycle counter cumulative count */
+    double cycles_c; /**< step cycle counter cumulative cycles */
};
/* TODO The code in this file should call this getter, rather than
* read bActive anywhere */
-bool pme_loadbal_is_active(const pme_load_balancing_t *pme_lb)
+bool pme_loadbal_is_active(const pme_load_balancing_t* pme_lb)
{
return pme_lb != nullptr && pme_lb->bActive;
}
// TODO Return a unique_ptr to pme_load_balancing_t
-void pme_loadbal_init(pme_load_balancing_t **pme_lb_p,
- t_commrec *cr,
- const gmx::MDLogger &mdlog,
- const t_inputrec &ir,
+void pme_loadbal_init(pme_load_balancing_t** pme_lb_p,
+ t_commrec* cr,
+ const gmx::MDLogger& mdlog,
+ const t_inputrec& ir,
const matrix box,
- const interaction_const_t &ic,
- const nonbonded_verlet_t &nbv,
- gmx_pme_t *pmedata,
+ const interaction_const_t& ic,
+ const nonbonded_verlet_t& nbv,
+ gmx_pme_t* pmedata,
gmx_bool bUseGPU,
- gmx_bool *bPrinting)
+ gmx_bool* bPrinting)
{
- pme_load_balancing_t *pme_lb;
+ pme_load_balancing_t* pme_lb;
real spm, sp;
int d;
// Note that we don't (yet) support PME load balancing with LJ-PME only.
- GMX_RELEASE_ASSERT(EEL_PME(ir.coulombtype), "pme_loadbal_init called without PME electrostatics");
+ GMX_RELEASE_ASSERT(EEL_PME(ir.coulombtype),
+ "pme_loadbal_init called without PME electrostatics");
// To avoid complexity, we require a single cut-off with PME for q+LJ.
// This is checked by grompp, but it doesn't hurt to check again.
- GMX_RELEASE_ASSERT(!(EEL_PME(ir.coulombtype) && EVDW_PME(ir.vdwtype) && ir.rcoulomb != ir.rvdw), "With Coulomb and LJ PME, rcoulomb should be equal to rvdw");
+ GMX_RELEASE_ASSERT(!(EEL_PME(ir.coulombtype) && EVDW_PME(ir.vdwtype) && ir.rcoulomb != ir.rvdw),
+ "With Coulomb and LJ PME, rcoulomb should be equal to rvdw");
pme_lb = new pme_load_balancing_t;
- pme_lb->bSepPMERanks = !thisRankHasDuty(cr, DUTY_PME);
+ pme_lb->bSepPMERanks = !thisRankHasDuty(cr, DUTY_PME);
 /* Initially we turn balancing on directly, based on PP/PME imbalance */
- pme_lb->bTriggerOnDLB = FALSE;
+ pme_lb->bTriggerOnDLB = FALSE;
/* Any number of stages >= 2 is supported */
- pme_lb->nstage = 2;
+ pme_lb->nstage = 2;
- pme_lb->cutoff_scheme = ir.cutoff_scheme;
+ pme_lb->cutoff_scheme = ir.cutoff_scheme;
pme_lb->rbufOuter_coulomb = nbv.pairlistOuterRadius() - ic.rcoulomb;
pme_lb->rbufOuter_vdw = nbv.pairlistOuterRadius() - ic.rvdw;
pme_lb->setup.resize(1);
- pme_lb->rcut_vdw = ic.rvdw;
- pme_lb->rcut_coulomb_start = ir.rcoulomb;
+ pme_lb->rcut_vdw = ic.rvdw;
+ pme_lb->rcut_coulomb_start = ir.rcoulomb;
- pme_lb->cur = 0;
- pme_lb->setup[0].rcut_coulomb = ic.rcoulomb;
- pme_lb->setup[0].rlistOuter = nbv.pairlistOuterRadius();
- pme_lb->setup[0].rlistInner = nbv.pairlistInnerRadius();
- pme_lb->setup[0].grid[XX] = ir.nkx;
- pme_lb->setup[0].grid[YY] = ir.nky;
- pme_lb->setup[0].grid[ZZ] = ir.nkz;
- pme_lb->setup[0].ewaldcoeff_q = ic.ewaldcoeff_q;
- pme_lb->setup[0].ewaldcoeff_lj = ic.ewaldcoeff_lj;
+ pme_lb->cur = 0;
+ pme_lb->setup[0].rcut_coulomb = ic.rcoulomb;
+ pme_lb->setup[0].rlistOuter = nbv.pairlistOuterRadius();
+ pme_lb->setup[0].rlistInner = nbv.pairlistInnerRadius();
+ pme_lb->setup[0].grid[XX] = ir.nkx;
+ pme_lb->setup[0].grid[YY] = ir.nky;
+ pme_lb->setup[0].grid[ZZ] = ir.nkz;
+ pme_lb->setup[0].ewaldcoeff_q = ic.ewaldcoeff_q;
+ pme_lb->setup[0].ewaldcoeff_lj = ic.ewaldcoeff_lj;
if (!pme_lb->bSepPMERanks)
{
- GMX_RELEASE_ASSERT(pmedata, "On ranks doing both PP and PME we need a valid pmedata object");
- pme_lb->setup[0].pmedata = pmedata;
+ GMX_RELEASE_ASSERT(pmedata,
+ "On ranks doing both PP and PME we need a valid pmedata object");
+ pme_lb->setup[0].pmedata = pmedata;
}
spm = 0;
for (d = 0; d < DIM; d++)
{
- sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
+ sp = norm(pme_lb->box_start[d]) / pme_lb->setup[0].grid[d];
if (sp > spm)
{
spm = sp;
if (ir.fourier_spacing > 0)
{
- pme_lb->cut_spacing = ir.rcoulomb/ir.fourier_spacing;
+ pme_lb->cut_spacing = ir.rcoulomb / ir.fourier_spacing;
}
else
{
- pme_lb->cut_spacing = ir.rcoulomb/pme_lb->setup[0].spacing;
+ pme_lb->cut_spacing = ir.rcoulomb / pme_lb->setup[0].spacing;
}
pme_lb->stage = 0;
if (!wallcycle_have_counter())
{
- GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use PME-PP balancing.");
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use "
+ "PME-PP balancing.");
}
/* Tune with GPUs and/or separate PME ranks.
* When running only on a CPU without PME ranks, PME tuning will only help
* with small numbers of atoms in the cut-off sphere.
*/
- pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU ||
- pme_lb->bSepPMERanks));
+ pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || pme_lb->bSepPMERanks));
/* With GPUs and no separate PME ranks we can't measure the PP/PME
* imbalance, so we start balancing right away.
*/
pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks));
- pme_lb->step_rel_stop = PMETunePeriod*ir.nstlist;
+ pme_lb->step_rel_stop = PMETunePeriod * ir.nstlist;
/* Delay DD load balancing when GPUs are used */
if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU)
dd_dlb_lock(cr->dd);
if (dd_dlb_is_locked(cr->dd))
{
- GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: DLB will not turn on during the first phase of PME tuning");
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText("NOTE: DLB will not turn on during the first phase of PME tuning");
}
}
}
/*! \brief Try to increase the cutoff during load balancing */
-static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
- int pme_order,
- const gmx_domdec_t *dd)
+static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t* pme_lb, int pme_order, const gmx_domdec_t* dd)
{
- real fac, sp;
- real tmpr_coulomb, tmpr_vdw;
- int d;
- bool grid_ok;
+ real fac, sp;
+ real tmpr_coulomb, tmpr_vdw;
+ int d;
+ bool grid_ok;
/* Try to add a new setup with next larger cut-off to the list */
pme_setup_t set;
fac *= 1.01;
clear_ivec(set.grid);
- sp = calcFftGrid(nullptr, pme_lb->box_start,
- fac*pme_lb->setup[pme_lb->cur].spacing,
- minimalPmeGridSize(pme_order),
- &set.grid[XX],
- &set.grid[YY],
- &set.grid[ZZ]);
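+    /* calcFftGrid rounds the grid dimensions to FFT-friendly sizes, so the realized
+     * spacing sp can be smaller than the requested fac*spacing; the loop keeps
+     * increasing fac until the grid is actually coarser and passes the checks below. */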
+ sp = calcFftGrid(nullptr, pme_lb->box_start, fac * pme_lb->setup[pme_lb->cur].spacing,
+ minimalPmeGridSize(pme_order), &set.grid[XX], &set.grid[YY], &set.grid[ZZ]);
/* As here we can't easily check if one of the PME ranks
* uses threading, we do a conservative grid check.
 * This means we can't use pme_order or fewer grid lines
* per PME rank along x, which is not a strong restriction.
*/
- grid_ok = gmx_pme_check_restrictions(pme_order,
- set.grid[XX], set.grid[YY], set.grid[ZZ],
- numPmeDomains.x,
- true,
- false);
- }
- while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
+ grid_ok = gmx_pme_check_restrictions(pme_order, set.grid[XX], set.grid[YY], set.grid[ZZ],
+ numPmeDomains.x, true, false);
+ } while (sp <= 1.001 * pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
- set.rcut_coulomb = pme_lb->cut_spacing*sp;
+ set.rcut_coulomb = pme_lb->cut_spacing * sp;
if (set.rcut_coulomb < pme_lb->rcut_coulomb_start)
{
/* This is unlikely, but can happen when e.g. continuing from
if (pme_lb->cutoff_scheme == ecutsVERLET)
{
/* Never decrease the Coulomb and VdW list buffers */
- set.rlistOuter = std::max(set.rcut_coulomb + pme_lb->rbufOuter_coulomb,
- pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
- set.rlistInner = std::max(set.rcut_coulomb + pme_lb->rbufInner_coulomb,
- pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
+ set.rlistOuter = std::max(set.rcut_coulomb + pme_lb->rbufOuter_coulomb,
+ pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
+ set.rlistInner = std::max(set.rcut_coulomb + pme_lb->rbufInner_coulomb,
+ pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
}
else
{
/* TODO Remove these lines and pme_lb->cutoff_scheme */
- tmpr_coulomb = set.rcut_coulomb + pme_lb->rbufOuter_coulomb;
- tmpr_vdw = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+ tmpr_coulomb = set.rcut_coulomb + pme_lb->rbufOuter_coulomb;
+ tmpr_vdw = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
/* Two (known) bugs with cutoff-scheme=group here:
 * - This modification of rlist results in incorrect DD communication.
* - We should set fr->bTwinRange = (fr->rlistlong > fr->rlist).
*/
- set.rlistOuter = std::min(tmpr_coulomb, tmpr_vdw);
- set.rlistInner = set.rlistOuter;
+ set.rlistOuter = std::min(tmpr_coulomb, tmpr_vdw);
+ set.rlistInner = set.rlistOuter;
}
- set.spacing = sp;
+ set.spacing = sp;
/* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
set.grid_efficiency = 1;
for (d = 0; d < DIM; d++)
{
- set.grid_efficiency *= (set.grid[d]*sp)/norm(pme_lb->box_start[d]);
+ set.grid_efficiency *= (set.grid[d] * sp) / norm(pme_lb->box_start[d]);
}
 /* The Ewald coefficient is inversely proportional to the cut-off */
- set.ewaldcoeff_q =
- pme_lb->setup[0].ewaldcoeff_q*pme_lb->setup[0].rcut_coulomb/set.rcut_coulomb;
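+    /* This keeps ewaldcoeff_q*rcut_coulomb equal to its value for setup 0, so the
+     * real-space Ewald error at the cut-off (set by ewald-rtol) stays the same. */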
+ set.ewaldcoeff_q = pme_lb->setup[0].ewaldcoeff_q * pme_lb->setup[0].rcut_coulomb / set.rcut_coulomb;
/* We set ewaldcoeff_lj in set, even when LJ-PME is not used */
- set.ewaldcoeff_lj =
- pme_lb->setup[0].ewaldcoeff_lj*pme_lb->setup[0].rcut_coulomb/set.rcut_coulomb;
+ set.ewaldcoeff_lj = pme_lb->setup[0].ewaldcoeff_lj * pme_lb->setup[0].rcut_coulomb / set.rcut_coulomb;
- set.count = 0;
- set.cycles = 0;
+ set.count = 0;
+ set.cycles = 0;
if (debug)
{
- fprintf(debug, "PME loadbal: grid %d %d %d, coulomb cutoff %f\n",
- set.grid[XX], set.grid[YY], set.grid[ZZ], set.rcut_coulomb);
+ fprintf(debug, "PME loadbal: grid %d %d %d, coulomb cutoff %f\n", set.grid[XX],
+ set.grid[YY], set.grid[ZZ], set.rcut_coulomb);
}
pme_lb->setup.push_back(set);
return TRUE;
}
/*! \brief Print the PME grid */
-static void print_grid(FILE *fp_err, FILE *fp_log,
- const char *pre,
- const char *desc,
- const pme_setup_t *set,
- double cycles)
+static void print_grid(FILE* fp_err, FILE* fp_log, const char* pre, const char* desc, const pme_setup_t* set, double cycles)
{
- auto buf = gmx::formatString("%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f",
- pre, desc,
+ auto buf = gmx::formatString("%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f", pre, desc,
set->grid[XX], set->grid[YY], set->grid[ZZ], set->rcut_coulomb);
if (cycles >= 0)
{
- buf += gmx::formatString(": %.1f M-cycles", cycles*1e-6);
+ buf += gmx::formatString(": %.1f M-cycles", cycles * 1e-6);
}
if (fp_err != nullptr)
{
}
/*! \brief Return one past the index of the last setup considered in PME load balancing */
-static int pme_loadbal_end(pme_load_balancing_t *pme_lb)
+static int pme_loadbal_end(pme_load_balancing_t* pme_lb)
{
/* In the initial stage only n is set; end is not set yet */
if (pme_lb->end > 0)
}
/*! \brief Print descriptive string about what limits PME load balancing */
-static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
- int64_t step,
- pme_load_balancing_t *pme_lb)
+static void print_loadbal_limited(FILE* fp_err, FILE* fp_log, int64_t step, pme_load_balancing_t* pme_lb)
{
- auto buf = gmx::formatString("step %4s: the %s limits the PME load balancing to a coulomb cut-off of %.3f",
- gmx::int64ToString(step).c_str(),
- pmelblim_str[pme_lb->elimited],
- pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut_coulomb);
+ auto buf = gmx::formatString(
+ "step %4s: the %s limits the PME load balancing to a coulomb cut-off of %.3f",
+ gmx::int64ToString(step).c_str(), pmelblim_str[pme_lb->elimited],
+ pme_lb->setup[pme_loadbal_end(pme_lb) - 1].rcut_coulomb);
if (fp_err != nullptr)
{
fprintf(fp_err, "\r%s\n", buf.c_str());
/*! \brief Switch load balancing to stage 1
*
* In this stage, only reasonably fast setups are run again. */
-static void switch_to_stage1(pme_load_balancing_t *pme_lb)
+static void switch_to_stage1(pme_load_balancing_t* pme_lb)
{
/* Increase start until we find a setup that is not slower than
* maxRelativeSlowdownAccepted times the fastest setup.
*/
pme_lb->start = pme_lb->lower_limit;
- while (pme_lb->start + 1 < gmx::ssize(pme_lb->setup) &&
- (pme_lb->setup[pme_lb->start].count == 0 ||
- pme_lb->setup[pme_lb->start].cycles >
- pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted))
+ while (pme_lb->start + 1 < gmx::ssize(pme_lb->setup)
+ && (pme_lb->setup[pme_lb->start].count == 0
+ || pme_lb->setup[pme_lb->start].cycles
+ > pme_lb->setup[pme_lb->fastest].cycles * maxRelativeSlowdownAccepted))
{
pme_lb->start++;
}
* any skipped setups that lie between setups that were measured to be
* acceptably fast and too slow.
*/
- while (pme_lb->start > pme_lb->lower_limit &&
- pme_lb->setup[pme_lb->start - 1].count == 0)
+ while (pme_lb->start > pme_lb->lower_limit && pme_lb->setup[pme_lb->start - 1].count == 0)
{
pme_lb->start--;
}
/* Decrease end only with setups that we timed and that are slow. */
pme_lb->end = pme_lb->setup.size();
- if (pme_lb->setup[pme_lb->end - 1].count > 0 &&
- pme_lb->setup[pme_lb->end - 1].cycles >
- pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)
+ if (pme_lb->setup[pme_lb->end - 1].count > 0
+ && pme_lb->setup[pme_lb->end - 1].cycles
+ > pme_lb->setup[pme_lb->fastest].cycles * maxRelativeSlowdownAccepted)
{
pme_lb->end--;
}
* Here we try to take into account fluctuations and changes due to external
* factors as well as DD load balancing.
*/
-static void
-pme_load_balance(pme_load_balancing_t *pme_lb,
- t_commrec *cr,
- FILE *fp_err,
- FILE *fp_log,
- const gmx::MDLogger &mdlog,
- const t_inputrec &ir,
- const matrix box,
- gmx::ArrayRef<const gmx::RVec> x,
- double cycles,
- interaction_const_t *ic,
- struct nonbonded_verlet_t *nbv,
- struct gmx_pme_t ** pmedata,
- int64_t step)
+static void pme_load_balance(pme_load_balancing_t* pme_lb,
+ t_commrec* cr,
+ FILE* fp_err,
+ FILE* fp_log,
+ const gmx::MDLogger& mdlog,
+ const t_inputrec& ir,
+ const matrix box,
+ gmx::ArrayRef<const gmx::RVec> x,
+ double cycles,
+ interaction_const_t* ic,
+ struct nonbonded_verlet_t* nbv,
+ struct gmx_pme_t** pmedata,
+ int64_t step)
{
gmx_bool OK;
- pme_setup_t *set;
+ pme_setup_t* set;
double cycles_fast;
char buf[STRLEN], sbuf[22];
}
else
{
- if (cycles*maxFluctuationAccepted < set->cycles &&
- pme_lb->stage == pme_lb->nstage - 1)
+ if (cycles * maxFluctuationAccepted < set->cycles && pme_lb->stage == pme_lb->nstage - 1)
{
/* The performance went up a lot (due to e.g. DD load balancing).
* Add a stage, keep the minima, but rescan all setups.
if (debug)
{
- fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
+ fprintf(debug,
+ "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this "
+ "is more than %f\n"
"Increased the number stages to %d"
" and ignoring the previous performance\n",
- set->grid[XX], set->grid[YY], set->grid[ZZ],
- set->cycles*1e-6, cycles*1e-6, maxFluctuationAccepted,
- pme_lb->nstage);
+ set->grid[XX], set->grid[YY], set->grid[ZZ], set->cycles * 1e-6,
+ cycles * 1e-6, maxFluctuationAccepted, pme_lb->nstage);
}
}
set->cycles = std::min(set->cycles, cycles);
/* Check in stage 0 if we should stop scanning grids.
 * Stop when the time is more than maxRelativeSlowdownAccepted times that of the fastest.
*/
- if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
- cycles > pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)
+ if (pme_lb->stage == 0 && pme_lb->cur > 0
+ && cycles > pme_lb->setup[pme_lb->fastest].cycles * maxRelativeSlowdownAccepted)
{
pme_lb->setup.resize(pme_lb->cur + 1);
/* Done with scanning, go to stage 1 */
{
int gridsize_start;
- gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
+ gridsize_start = set->grid[XX] * set->grid[YY] * set->grid[ZZ];
do
{
- if (pme_lb->cur+1 < gmx::ssize(pme_lb->setup))
+ if (pme_lb->cur + 1 < gmx::ssize(pme_lb->setup))
{
/* We had already generated the next setup */
OK = TRUE;
}
}
- if (OK &&
- pme_lb->setup[pme_lb->cur+1].spacing > c_maxSpacingScaling*pme_lb->setup[0].spacing)
+ if (OK
+ && pme_lb->setup[pme_lb->cur + 1].spacing > c_maxSpacingScaling * pme_lb->setup[0].spacing)
{
OK = FALSE;
pme_lb->elimited = epmelblimMAXSCALING;
if (OK && ir.ePBC != epbcNONE)
{
- OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlistOuter)
- <= max_cutoff2(ir.ePBC, box));
+ OK = (gmx::square(pme_lb->setup[pme_lb->cur + 1].rlistOuter) <= max_cutoff2(ir.ePBC, box));
if (!OK)
{
pme_lb->elimited = epmelblimBOX;
if (DOMAINDECOMP(cr))
{
- OK = change_dd_cutoff(cr, box, x,
- pme_lb->setup[pme_lb->cur].rlistOuter);
+ OK = change_dd_cutoff(cr, box, x, pme_lb->setup[pme_lb->cur].rlistOuter);
if (!OK)
{
/* Failed: do not use this setup */
/* Switch to the next stage */
switch_to_stage1(pme_lb);
}
- }
- while (OK &&
- !(pme_lb->setup[pme_lb->cur].grid[XX]*
- pme_lb->setup[pme_lb->cur].grid[YY]*
- pme_lb->setup[pme_lb->cur].grid[ZZ] <
- gridsize_start*gridpointsScaleFactor
- &&
- pme_lb->setup[pme_lb->cur].grid_efficiency <
- pme_lb->setup[pme_lb->cur-1].grid_efficiency*relativeEfficiencyFactor));
+ } while (OK
+ && !(pme_lb->setup[pme_lb->cur].grid[XX] * pme_lb->setup[pme_lb->cur].grid[YY]
+ * pme_lb->setup[pme_lb->cur].grid[ZZ]
+ < gridsize_start * gridpointsScaleFactor
+ && pme_lb->setup[pme_lb->cur].grid_efficiency
+ < pme_lb->setup[pme_lb->cur - 1].grid_efficiency * relativeEfficiencyFactor));
}
if (pme_lb->stage > 0 && pme_lb->end == 1)
pme_lb->cur = pme_lb->end - 1;
}
- }
- while (pme_lb->stage == pme_lb->nstage - 1 &&
- pme_lb->setup[pme_lb->cur].count > 0 &&
- pme_lb->setup[pme_lb->cur].cycles > cycles_fast*maxRelativeSlowdownAccepted);
+ } while (pme_lb->stage == pme_lb->nstage - 1 && pme_lb->setup[pme_lb->cur].count > 0
+ && pme_lb->setup[pme_lb->cur].cycles > cycles_fast * maxRelativeSlowdownAccepted);
if (pme_lb->stage == pme_lb->nstage)
{
/* This should not happen, as we set limits on the DLB bounds.
* But we implement a complete failsafe solution anyhow.
*/
- GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "The fastest PP/PME load balancing setting (cutoff %.3d nm) is no longer available due to DD DLB or box size limitations", pme_lb->fastest);
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "The fastest PP/PME load balancing setting (cutoff %.3f nm) is no "
+                        "longer available due to DD DLB or box size limitations",
+                        pme_lb->setup[pme_lb->fastest].rcut_coulomb);
pme_lb->fastest = pme_lb->lower_limit;
pme_lb->start = pme_lb->lower_limit;
}
/* Limit the range to below the current cut-off, scan from start */
- pme_lb->end = pme_lb->cur;
- pme_lb->cur = pme_lb->start;
- pme_lb->elimited = epmelblimDD;
+ pme_lb->end = pme_lb->cur;
+ pme_lb->cur = pme_lb->start;
+ pme_lb->elimited = epmelblimDD;
print_loadbal_limited(fp_err, fp_log, step, pme_lb);
}
}
set = &pme_lb->setup[pme_lb->cur];
- ic->rcoulomb = set->rcut_coulomb;
+ ic->rcoulomb = set->rcut_coulomb;
nbv->changePairlistRadii(set->rlistOuter, set->rlistInner);
- ic->ewaldcoeff_q = set->ewaldcoeff_q;
+ ic->ewaldcoeff_q = set->ewaldcoeff_q;
 /* TODO: centralize the code that sets the potential shifts */
if (ic->coulomb_modifier == eintmodPOTSHIFT)
{
GMX_RELEASE_ASSERT(ic->rcoulomb != 0, "Cutoff radius cannot be zero");
- ic->sh_ewald = std::erfc(ic->ewaldcoeff_q*ic->rcoulomb) / ic->rcoulomb;
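+        /* With the potential-shift modifier, the Coulomb shift is the value of
+         * erfc(beta*r)/r at the new cut-off. */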
+ ic->sh_ewald = std::erfc(ic->ewaldcoeff_q * ic->rcoulomb) / ic->rcoulomb;
}
if (EVDW_PME(ic->vdwtype))
{
/* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
- ic->rvdw = set->rcut_coulomb;
- ic->ewaldcoeff_lj = set->ewaldcoeff_lj;
+ ic->rvdw = set->rcut_coulomb;
+ ic->ewaldcoeff_lj = set->ewaldcoeff_lj;
if (ic->vdw_modifier == eintmodPOTSHIFT)
{
- real crc2;
+ real crc2;
- ic->dispersion_shift.cpot = -1.0/gmx::power6(static_cast<double>(ic->rvdw));
- ic->repulsion_shift.cpot = -1.0/gmx::power12(static_cast<double>(ic->rvdw));
- crc2 = gmx::square(ic->ewaldcoeff_lj*ic->rvdw);
- ic->sh_lj_ewald = (std::exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)/gmx::power6(ic->rvdw);
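+            /* Re-evaluate the potential-shift constants at the new rvdw so the shifted
+             * LJ and LJ-PME potentials are still zero at the cut-off. */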
+ ic->dispersion_shift.cpot = -1.0 / gmx::power6(static_cast<double>(ic->rvdw));
+ ic->repulsion_shift.cpot = -1.0 / gmx::power12(static_cast<double>(ic->rvdw));
+ crc2 = gmx::square(ic->ewaldcoeff_lj * ic->rvdw);
+ ic->sh_lj_ewald =
+ (std::exp(-crc2) * (1 + crc2 + 0.5 * crc2 * crc2) - 1) / gmx::power6(ic->rvdw);
}
}
* This can lead to a lot of reallocations for PME GPU.
* Would be nicer if the allocated grid list was hidden within a single pmedata structure.
*/
- if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr) || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata))
+ if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr)
+ || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata))
{
/* Generate a new PME data structure,
* copying part of the old pointers.
*/
- gmx_pme_reinit(&set->pmedata,
- cr, pme_lb->setup[0].pmedata, &ir,
- set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
+ gmx_pme_reinit(&set->pmedata, cr, pme_lb->setup[0].pmedata, &ir, set->grid,
+ set->ewaldcoeff_q, set->ewaldcoeff_lj);
}
*pmedata = set->pmedata;
}
* the PP/PME balance might change and re-balancing can improve performance.
* This function adds 2 stages and adjusts the considered setup range.
*/
-static void continue_pme_loadbal(pme_load_balancing_t *pme_lb,
- gmx_bool bDlbUnlocked)
+static void continue_pme_loadbal(pme_load_balancing_t* pme_lb, gmx_bool bDlbUnlocked)
{
/* Add 2 tuning stages, keep the detected end of the setup range */
- pme_lb->nstage += 2;
+ pme_lb->nstage += 2;
if (bDlbUnlocked && pme_lb->bSepPMERanks)
{
/* With separate PME ranks, DLB should always lower the PP load and
* can only increase the PME load (more communication and imbalance),
* so we only need to scan longer cut-off's.
*/
- pme_lb->lower_limit = pme_lb->cur;
+ pme_lb->lower_limit = pme_lb->cur;
}
- pme_lb->start = pme_lb->lower_limit;
+ pme_lb->start = pme_lb->lower_limit;
}
-void pme_loadbal_do(pme_load_balancing_t *pme_lb,
- t_commrec *cr,
- FILE *fp_err,
- FILE *fp_log,
- const gmx::MDLogger &mdlog,
- const t_inputrec &ir,
- t_forcerec *fr,
+void pme_loadbal_do(pme_load_balancing_t* pme_lb,
+ t_commrec* cr,
+ FILE* fp_err,
+ FILE* fp_log,
+ const gmx::MDLogger& mdlog,
+ const t_inputrec& ir,
+ t_forcerec* fr,
const matrix box,
gmx::ArrayRef<const gmx::RVec> x,
gmx_wallcycle_t wcycle,
int64_t step,
int64_t step_rel,
- gmx_bool *bPrinting)
+ gmx_bool* bPrinting)
{
int n_prev;
double cycles_prev;
* is not over the last nstlist steps, but the nstlist steps before
* that. So the first useful ratio is available at step_rel=3*nstlist.
*/
- else if (step_rel >= 3*ir.nstlist)
+ else if (step_rel >= 3 * ir.nstlist)
{
if (DDMASTER(cr->dd))
{
/* If PME rank load is too high, start tuning */
- pme_lb->bBalance =
- (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
+ pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
}
dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
}
- pme_lb->bActive = (pme_lb->bBalance ||
- step_rel <= pme_lb->step_rel_stop);
+ pme_lb->bActive = (pme_lb->bBalance || step_rel <= pme_lb->step_rel_stop);
}
/* The location in the code of this balancing termination is strange.
{
/* Unlock the DLB=auto, DLB is allowed to activate */
dd_dlb_unlock(cr->dd);
- GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: DLB can now turn on, when beneficial");
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText("NOTE: DLB can now turn on, when beneficial");
/* We don't deactivate the tuning yet, since we will balance again
 * after DLB gets turned on, if it does within PMETunePeriod.
*/
continue_pme_loadbal(pme_lb, TRUE);
pme_lb->bTriggerOnDLB = TRUE;
- pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir.nstlist;
+ pme_lb->step_rel_stop = step_rel + PMETunePeriod * ir.nstlist;
}
else
{
* since init_step might not be a multiple of nstlist,
* but the first data collected is skipped anyhow.
*/
- pme_load_balance(pme_lb, cr,
- fp_err, fp_log, mdlog,
- ir, box, x, pme_lb->cycles_c - cycles_prev,
- fr->ic, fr->nbv.get(), &fr->pmedata,
- step);
+ pme_load_balance(pme_lb, cr, fp_err, fp_log, mdlog, ir, box, x,
+ pme_lb->cycles_c - cycles_prev, fr->ic, fr->nbv.get(), &fr->pmedata, step);
/* Update deprecated rlist in forcerec to stay in sync with fr->nbv */
- fr->rlist = fr->nbv->pairlistOuterRadius();
+ fr->rlist = fr->nbv->pairlistOuterRadius();
if (ir.eDispCorr != edispcNO)
{
}
}
- if (!pme_lb->bBalance &&
- (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
+ if (!pme_lb->bBalance && (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
{
/* We have just deactivated the balancing and we're not measuring PP/PME
* imbalance during the first steps of the run: deactivate the tuning.
{
/* Make sure DLB is allowed when we deactivate PME tuning */
dd_dlb_unlock(cr->dd);
- GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: DLB can now turn on, when beneficial");
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText("NOTE: DLB can now turn on, when beneficial");
}
*bPrinting = pme_lb->bBalance;
}
/*! \brief Return product of the number of PME grid points in each dimension */
-static int pme_grid_points(const pme_setup_t *setup)
+static int pme_grid_points(const pme_setup_t* setup)
{
- return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
+ return setup->grid[XX] * setup->grid[YY] * setup->grid[ZZ];
}
/*! \brief Print one load-balancing setting */
-static void print_pme_loadbal_setting(FILE *fplog,
- const char *name,
- const pme_setup_t *setup)
+static void print_pme_loadbal_setting(FILE* fplog, const char* name, const pme_setup_t* setup)
{
- fprintf(fplog,
- " %-7s %6.3f nm %6.3f nm %3d %3d %3d %5.3f nm %5.3f nm\n",
- name,
- setup->rcut_coulomb, setup->rlistInner,
- setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
- setup->spacing, 1/setup->ewaldcoeff_q);
+ fprintf(fplog, " %-7s %6.3f nm %6.3f nm %3d %3d %3d %5.3f nm %5.3f nm\n", name,
+ setup->rcut_coulomb, setup->rlistInner, setup->grid[XX], setup->grid[YY],
+ setup->grid[ZZ], setup->spacing, 1 / setup->ewaldcoeff_q);
}
/*! \brief Print all load-balancing settings */
-static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
- FILE *fplog,
- const gmx::MDLogger &mdlog,
+static void print_pme_loadbal_settings(pme_load_balancing_t* pme_lb,
+ FILE* fplog,
+ const gmx::MDLogger& mdlog,
gmx_bool bNonBondedOnGPU)
{
- double pp_ratio, grid_ratio;
- real pp_ratio_temporary;
+ double pp_ratio, grid_ratio;
+ real pp_ratio_temporary;
pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlistInner / pme_lb->setup[0].rlistInner;
pp_ratio = gmx::power3(pp_ratio_temporary);
- grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
- static_cast<double>(pme_grid_points(&pme_lb->setup[0]));
+ grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])
+ / static_cast<double>(pme_grid_points(&pme_lb->setup[0]));
fprintf(fplog, "\n");
fprintf(fplog, " P P - P M E L O A D B A L A N C I N G\n");
fprintf(fplog, "\n");
/* Here we only warn when the optimal setting is the last one */
- if (pme_lb->elimited != epmelblimNO &&
- pme_lb->cur == pme_loadbal_end(pme_lb)-1)
+ if (pme_lb->elimited != epmelblimNO && pme_lb->cur == pme_loadbal_end(pme_lb) - 1)
{
fprintf(fplog, " NOTE: The PP/PME load balancing was limited by the %s,\n",
pmelblim_str[pme_lb->elimited]);
fprintf(fplog, " rcoulomb rlist grid spacing 1/beta\n");
print_pme_loadbal_setting(fplog, "initial", &pme_lb->setup[0]);
print_pme_loadbal_setting(fplog, "final", &pme_lb->setup[pme_lb->cur]);
- fprintf(fplog, " cost-ratio %4.2f %4.2f\n",
- pp_ratio, grid_ratio);
+ fprintf(fplog, " cost-ratio %4.2f %4.2f\n", pp_ratio, grid_ratio);
fprintf(fplog, " (note that these numbers concern only part of the total PP and PME load)\n");
if (pp_ratio > 1.5 && !bNonBondedOnGPU)
{
- GMX_LOG(mdlog.warning).asParagraph().appendText(
- "NOTE: PME load balancing increased the non-bonded workload by more than 50%.\n"
- " For better performance, use (more) PME ranks (mdrun -npme),\n"
- " or if you are beyond the scaling limit, use fewer total ranks (or nodes).");
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "NOTE: PME load balancing increased the non-bonded workload by more than "
+ "50%.\n"
+ " For better performance, use (more) PME ranks (mdrun -npme),\n"
+ " or if you are beyond the scaling limit, use fewer total ranks (or "
+ "nodes).");
}
else
{
}
}
-void pme_loadbal_done(pme_load_balancing_t *pme_lb,
- FILE *fplog,
- const gmx::MDLogger &mdlog,
- gmx_bool bNonBondedOnGPU)
+void pme_loadbal_done(pme_load_balancing_t* pme_lb, FILE* fplog, const gmx::MDLogger& mdlog, gmx_bool bNonBondedOnGPU)
{
if (fplog != nullptr && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
{