Apply clang-format to source tree
[alexxy/gromacs.git] / src / gromacs / ewald / pme_load_balancing.cpp
index 60611d8f6e1db45e363fdcdbc2cb4fcd8c932a66..98d0d9e2fcc1a616f36d63ef8e72f3a51ff7f95b 100644
 #include "pme_internal.h"
 
 /*! \brief Parameters and settings for one PP-PME setup */
-struct pme_setup_t {
-    real              rcut_coulomb;    /**< Coulomb cut-off                              */
-    real              rlistOuter;      /**< cut-off for the outer pair-list              */
-    real              rlistInner;      /**< cut-off for the inner pair-list              */
-    real              spacing;         /**< (largest) PME grid spacing                   */
-    ivec              grid;            /**< the PME grid dimensions                      */
-    real              grid_efficiency; /**< inefficiency factor for non-uniform grids <= 1 */
-    real              ewaldcoeff_q;    /**< Electrostatic Ewald coefficient            */
-    real              ewaldcoeff_lj;   /**< LJ Ewald coefficient, only for the call to send_switchgrid */
-    struct gmx_pme_t *pmedata;         /**< the data structure used in the PME code      */
-    int               count;           /**< number of times this setup has been timed    */
-    double            cycles;          /**< the fastest time for this setup in cycles    */
+struct pme_setup_t
+{
+    real rcut_coulomb;         /**< Coulomb cut-off                              */
+    real rlistOuter;           /**< cut-off for the outer pair-list              */
+    real rlistInner;           /**< cut-off for the inner pair-list              */
+    real spacing;              /**< (largest) PME grid spacing                   */
+    ivec grid;                 /**< the PME grid dimensions                      */
+    real grid_efficiency;      /**< inefficiency factor for non-uniform grids <= 1 */
+    real ewaldcoeff_q;         /**< Electrostatic Ewald coefficient            */
+    real ewaldcoeff_lj;        /**< LJ Ewald coefficient, only for the call to send_switchgrid */
+    struct gmx_pme_t* pmedata; /**< the data structure used in the PME code      */
+    int               count;   /**< number of times this setup has been timed    */
+    double            cycles;  /**< the fastest time for this setup in cycles    */
 };
 
 /*! \brief After 50 nstlist periods of not observing imbalance: never tune PME */
-const int  PMETunePeriod = 50;
+const int PMETunePeriod = 50;
 /*! \brief Trigger PME load balancing at more than 5% PME overload */
 const real loadBalanceTriggerFactor = 1.05;
 /*! \brief Scale the grid by at most a factor 1.7.
@@ -124,21 +125,29 @@ const real maxRelativeSlowdownAccepted = 1.12;
 const real maxFluctuationAccepted = 1.02;
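
For orientation, these constants encode the whole tuning policy: balancing starts once the PME ranks measure more than 5% overloaded relative to PP, a setup is dropped once it is more than 12% slower than the fastest one seen, and a re-measurement more than 2% faster than the stored minimum is taken as a sign that external conditions (e.g. DD load balancing) changed. A minimal sketch of these threshold checks, with hypothetical helper names that are not part of the GROMACS API:

    #include <cassert>

    // Illustrative only; the constants mirror the values in the diff above.
    constexpr double loadBalanceTriggerFactor    = 1.05;
    constexpr double maxRelativeSlowdownAccepted = 1.12;
    constexpr double maxFluctuationAccepted      = 1.02;

    bool shouldStartBalancing(double pmeToPpForceTimeRatio)
    {
        return pmeToPpForceTimeRatio >= loadBalanceTriggerFactor;
    }

    bool setupTooSlow(double setupCycles, double fastestCycles)
    {
        return setupCycles > fastestCycles * maxRelativeSlowdownAccepted;
    }

    bool externalSpeedupDetected(double newCycles, double storedCycles)
    {
        return newCycles * maxFluctuationAccepted < storedCycles;
    }

    int main()
    {
        assert(shouldStartBalancing(1.10));
        assert(setupTooSlow(1.2e6, 1.0e6));
        assert(externalSpeedupDetected(0.9e6, 1.0e6));
    }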
 
 /*! \brief Enumeration whose values describe the effect limiting the load balancing */
-enum epmelb {
-    epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimPMEGRID, epmelblimMAXSCALING, epmelblimNR
+enum epmelb
+{
+    epmelblimNO,
+    epmelblimBOX,
+    epmelblimDD,
+    epmelblimPMEGRID,
+    epmelblimMAXSCALING,
+    epmelblimNR
 };
 
 /*! \brief Descriptive strings matching ::epmelb */
-static const char *pmelblim_str[epmelblimNR] =
-{ "no", "box size", "domain decompostion", "PME grid restriction", "maximum allowed grid scaling" };
+static const char* pmelblim_str[epmelblimNR] = { "no", "box size", "domain decompostion",
+                                                 "PME grid restriction",
+                                                 "maximum allowed grid scaling" };
 
-struct pme_load_balancing_t {
-    gmx_bool                 bSepPMERanks;       /**< do we have separate PME ranks? */
-    gmx_bool                 bActive;            /**< is PME tuning active? */
-    int64_t                  step_rel_stop;      /**< stop the tuning after this value of step_rel */
-    gmx_bool                 bTriggerOnDLB;      /**< trigger balancing only on DD DLB */
-    gmx_bool                 bBalance;           /**< are we in the balancing phase, i.e. trying different setups? */
-    int                      nstage;             /**< the current maximum number of stages */
+struct pme_load_balancing_t
+{
+    gmx_bool bSepPMERanks;  /**< do we have separate PME ranks? */
+    gmx_bool bActive;       /**< is PME tuning active? */
+    int64_t  step_rel_stop; /**< stop the tuning after this value of step_rel */
+    gmx_bool bTriggerOnDLB; /**< trigger balancing only on DD DLB */
+    gmx_bool bBalance;      /**< are we in the balancing phase, i.e. trying different setups? */
+    int      nstage;        /**< the current maximum number of stages */
 
     real                     cut_spacing;        /**< the minimum cutoff / PME grid spacing ratio */
     real                     rcut_vdw;           /**< Vdw cutoff (does not change) */
@@ -152,58 +161,60 @@ struct pme_load_balancing_t {
     int                      cur;                /**< the index (in setup) of the current setup */
     int                      fastest;            /**< index of the fastest setup up till now */
     int                      lower_limit;        /**< don't go below this setup index */
-    int                      start;              /**< start of setup index range to consider in stage>0 */
-    int                      end;                /**< end   of setup index range to consider in stage>0 */
-    int                      elimited;           /**< was the balancing limited, uses enum above */
-    int                      cutoff_scheme;      /**< Verlet or group cut-offs */
+    int                      start;    /**< start of setup index range to consider in stage>0 */
+    int                      end;      /**< end   of setup index range to consider in stage>0 */
+    int                      elimited; /**< was the balancing limited, uses enum above */
+    int                      cutoff_scheme; /**< Verlet or group cut-offs */
 
-    int                      stage;              /**< the current stage */
+    int stage; /**< the current stage */
 
-    int                      cycles_n;           /**< step cycle counter cumulative count */
-    double                   cycles_c;           /**< step cycle counter cumulative cycles */
+    int    cycles_n; /**< step cycle counter cumulative count */
+    double cycles_c; /**< step cycle counter cumulative cycles */
 };
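
The stage/start/end/fastest fields implement a staged search over the setup list, which is ordered by increasing Coulomb cut-off: stage 0 appends and times ever larger cut-offs, while later stages re-time only the window of setups that stayed competitive. A sketch of that window narrowing under simplified assumptions (illustrative types, not the actual GROMACS code):

    #include <vector>

    struct Setup
    {
        int    count  = 0; // number of timings taken for this setup
        double cycles = 0; // fastest time observed, in cycles
    };

    // Keep [start, end) restricted to setups that were timed and are within
    // the accepted slowdown of the fastest one (cf. switch_to_stage1 below).
    void narrowWindow(const std::vector<Setup>& setups, double fastestCycles, int& start, int& end)
    {
        const double limit = fastestCycles * 1.12; // maxRelativeSlowdownAccepted
        while (start + 1 < end && (setups[start].count == 0 || setups[start].cycles > limit))
        {
            start++;
        }
        if (end > start + 1 && setups[end - 1].count > 0 && setups[end - 1].cycles > limit)
        {
            end--;
        }
    }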
 
 /* TODO The code in this file should call this getter, rather than
  * read bActive anywhere */
-bool pme_loadbal_is_active(const pme_load_balancing_t *pme_lb)
+bool pme_loadbal_is_active(const pme_load_balancing_t* pme_lb)
 {
     return pme_lb != nullptr && pme_lb->bActive;
 }
 
 // TODO Return a unique_ptr to pme_load_balancing_t
-void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
-                      t_commrec                 *cr,
-                      const gmx::MDLogger       &mdlog,
-                      const t_inputrec          &ir,
+void pme_loadbal_init(pme_load_balancing_t**     pme_lb_p,
+                      t_commrec*                 cr,
+                      const gmx::MDLogger&       mdlog,
+                      const t_inputrec&          ir,
                       const matrix               box,
-                      const interaction_const_t &ic,
-                      const nonbonded_verlet_t  &nbv,
-                      gmx_pme_t                 *pmedata,
+                      const interaction_const_t& ic,
+                      const nonbonded_verlet_t&  nbv,
+                      gmx_pme_t*                 pmedata,
                       gmx_bool                   bUseGPU,
-                      gmx_bool                  *bPrinting)
+                      gmx_bool*                  bPrinting)
 {
 
-    pme_load_balancing_t *pme_lb;
+    pme_load_balancing_t* pme_lb;
     real                  spm, sp;
     int                   d;
 
     // Note that we don't (yet) support PME load balancing with LJ-PME only.
-    GMX_RELEASE_ASSERT(EEL_PME(ir.coulombtype), "pme_loadbal_init called without PME electrostatics");
+    GMX_RELEASE_ASSERT(EEL_PME(ir.coulombtype),
+                       "pme_loadbal_init called without PME electrostatics");
     // To avoid complexity, we require a single cut-off with PME for q+LJ.
     // This is checked by grompp, but it doesn't hurt to check again.
-    GMX_RELEASE_ASSERT(!(EEL_PME(ir.coulombtype) && EVDW_PME(ir.vdwtype) && ir.rcoulomb != ir.rvdw), "With Coulomb and LJ PME, rcoulomb should be equal to rvdw");
+    GMX_RELEASE_ASSERT(!(EEL_PME(ir.coulombtype) && EVDW_PME(ir.vdwtype) && ir.rcoulomb != ir.rvdw),
+                       "With Coulomb and LJ PME, rcoulomb should be equal to rvdw");
 
     pme_lb = new pme_load_balancing_t;
 
-    pme_lb->bSepPMERanks      = !thisRankHasDuty(cr, DUTY_PME);
+    pme_lb->bSepPMERanks = !thisRankHasDuty(cr, DUTY_PME);
 
     /* Initially we turn balancing on directly, based on PP/PME imbalance */
-    pme_lb->bTriggerOnDLB     = FALSE;
+    pme_lb->bTriggerOnDLB = FALSE;
 
     /* Any number of stages >= 2 is supported */
-    pme_lb->nstage            = 2;
+    pme_lb->nstage = 2;
 
-    pme_lb->cutoff_scheme     = ir.cutoff_scheme;
+    pme_lb->cutoff_scheme = ir.cutoff_scheme;
 
     pme_lb->rbufOuter_coulomb = nbv.pairlistOuterRadius() - ic.rcoulomb;
     pme_lb->rbufOuter_vdw     = nbv.pairlistOuterRadius() - ic.rvdw;
@@ -218,29 +229,30 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     pme_lb->setup.resize(1);
 
-    pme_lb->rcut_vdw                 = ic.rvdw;
-    pme_lb->rcut_coulomb_start       = ir.rcoulomb;
+    pme_lb->rcut_vdw           = ic.rvdw;
+    pme_lb->rcut_coulomb_start = ir.rcoulomb;
 
-    pme_lb->cur                      = 0;
-    pme_lb->setup[0].rcut_coulomb    = ic.rcoulomb;
-    pme_lb->setup[0].rlistOuter      = nbv.pairlistOuterRadius();
-    pme_lb->setup[0].rlistInner      = nbv.pairlistInnerRadius();
-    pme_lb->setup[0].grid[XX]        = ir.nkx;
-    pme_lb->setup[0].grid[YY]        = ir.nky;
-    pme_lb->setup[0].grid[ZZ]        = ir.nkz;
-    pme_lb->setup[0].ewaldcoeff_q    = ic.ewaldcoeff_q;
-    pme_lb->setup[0].ewaldcoeff_lj   = ic.ewaldcoeff_lj;
+    pme_lb->cur                    = 0;
+    pme_lb->setup[0].rcut_coulomb  = ic.rcoulomb;
+    pme_lb->setup[0].rlistOuter    = nbv.pairlistOuterRadius();
+    pme_lb->setup[0].rlistInner    = nbv.pairlistInnerRadius();
+    pme_lb->setup[0].grid[XX]      = ir.nkx;
+    pme_lb->setup[0].grid[YY]      = ir.nky;
+    pme_lb->setup[0].grid[ZZ]      = ir.nkz;
+    pme_lb->setup[0].ewaldcoeff_q  = ic.ewaldcoeff_q;
+    pme_lb->setup[0].ewaldcoeff_lj = ic.ewaldcoeff_lj;
 
     if (!pme_lb->bSepPMERanks)
     {
-        GMX_RELEASE_ASSERT(pmedata, "On ranks doing both PP and PME we need a valid pmedata object");
-        pme_lb->setup[0].pmedata     = pmedata;
+        GMX_RELEASE_ASSERT(pmedata,
+                           "On ranks doing both PP and PME we need a valid pmedata object");
+        pme_lb->setup[0].pmedata = pmedata;
     }
 
     spm = 0;
     for (d = 0; d < DIM; d++)
     {
-        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
+        sp = norm(pme_lb->box_start[d]) / pme_lb->setup[0].grid[d];
         if (sp > spm)
         {
             spm = sp;
@@ -250,11 +262,11 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     if (ir.fourier_spacing > 0)
     {
-        pme_lb->cut_spacing = ir.rcoulomb/ir.fourier_spacing;
+        pme_lb->cut_spacing = ir.rcoulomb / ir.fourier_spacing;
     }
     else
     {
-        pme_lb->cut_spacing = ir.rcoulomb/pme_lb->setup[0].spacing;
+        pme_lb->cut_spacing = ir.rcoulomb / pme_lb->setup[0].spacing;
     }
 
     pme_lb->stage = 0;
@@ -270,15 +282,18 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 
     if (!wallcycle_have_counter())
     {
-        GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use PME-PP balancing.");
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendText(
+                        "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use "
+                        "PME-PP balancing.");
     }
 
     /* Tune with GPUs and/or separate PME ranks.
      * When running only on a CPU without PME ranks, PME tuning will only help
      * with small numbers of atoms in the cut-off sphere.
      */
-    pme_lb->bActive  = (wallcycle_have_counter() && (bUseGPU ||
-                                                     pme_lb->bSepPMERanks));
+    pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || pme_lb->bSepPMERanks));
 
     /* With GPUs and no separate PME ranks we can't measure the PP/PME
      * imbalance, so we start balancing right away.
@@ -286,7 +301,7 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
      */
     pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks));
 
-    pme_lb->step_rel_stop = PMETunePeriod*ir.nstlist;
+    pme_lb->step_rel_stop = PMETunePeriod * ir.nstlist;
 
     /* Delay DD load balancing when GPUs are used */
     if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU)
@@ -300,7 +315,9 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
         dd_dlb_lock(cr->dd);
         if (dd_dlb_is_locked(cr->dd))
         {
-            GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: DLB will not turn on during the first phase of PME tuning");
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendText("NOTE: DLB will not turn on during the first phase of PME tuning");
         }
     }
 
@@ -310,14 +327,12 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
 }
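
At this point pme_loadbal_init has fixed the two reference quantities the tuner scales against: the initial grid spacing (the largest per-dimension box length divided by the grid size) and cut_spacing, the cut-off/spacing ratio that is held constant when new setups are generated. Worked through with illustrative numbers (a hypothetical 4 nm cubic box, not from a real run):

    #include <algorithm>

    int main()
    {
        const double boxNorm[3] = { 4.0, 4.0, 4.0 }; // |box[d]| in nm
        const int    grid[3]    = { 40, 40, 40 };    // ir.nkx, ir.nky, ir.nkz
        const double rcoulomb   = 1.0;               // nm

        double spacing = 0;
        for (int d = 0; d < 3; d++)
        {
            spacing = std::max(spacing, boxNorm[d] / grid[d]); // largest spacing
        }
        // With fourier_spacing unset this gives cut_spacing = 1.0 / 0.1 = 10.
        const double cutSpacing = rcoulomb / spacing;
        (void)cutSpacing;
    }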
 
 /*! \brief Try to increase the cutoff during load balancing */
-static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
-                                            int                   pme_order,
-                                            const gmx_domdec_t   *dd)
+static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t* pme_lb, int pme_order, const gmx_domdec_t* dd)
 {
-    real         fac, sp;
-    real         tmpr_coulomb, tmpr_vdw;
-    int          d;
-    bool         grid_ok;
+    real fac, sp;
+    real tmpr_coulomb, tmpr_vdw;
+    int  d;
+    bool grid_ok;
 
     /* Try to add a new setup with next larger cut-off to the list */
     pme_setup_t set;
@@ -341,27 +356,19 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
 
         fac *= 1.01;
         clear_ivec(set.grid);
-        sp = calcFftGrid(nullptr, pme_lb->box_start,
-                         fac*pme_lb->setup[pme_lb->cur].spacing,
-                         minimalPmeGridSize(pme_order),
-                         &set.grid[XX],
-                         &set.grid[YY],
-                         &set.grid[ZZ]);
+        sp = calcFftGrid(nullptr, pme_lb->box_start, fac * pme_lb->setup[pme_lb->cur].spacing,
+                         minimalPmeGridSize(pme_order), &set.grid[XX], &set.grid[YY], &set.grid[ZZ]);
 
         /* As here we can't easily check if one of the PME ranks
          * uses threading, we do a conservative grid check.
          * This means we can't use pme_order or less grid lines
          * per PME rank along x, which is not a strong restriction.
          */
-        grid_ok = gmx_pme_check_restrictions(pme_order,
-                                             set.grid[XX], set.grid[YY], set.grid[ZZ],
-                                             numPmeDomains.x,
-                                             true,
-                                             false);
-    }
-    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
+        grid_ok = gmx_pme_check_restrictions(pme_order, set.grid[XX], set.grid[YY], set.grid[ZZ],
+                                             numPmeDomains.x, true, false);
+    } while (sp <= 1.001 * pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
 
-    set.rcut_coulomb = pme_lb->cut_spacing*sp;
+    set.rcut_coulomb = pme_lb->cut_spacing * sp;
     if (set.rcut_coulomb < pme_lb->rcut_coulomb_start)
     {
         /* This is unlikely, but can happen when e.g. continuing from
@@ -375,63 +382,56 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
     if (pme_lb->cutoff_scheme == ecutsVERLET)
     {
         /* Never decrease the Coulomb and VdW list buffers */
-        set.rlistOuter  = std::max(set.rcut_coulomb + pme_lb->rbufOuter_coulomb,
-                                   pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
-        set.rlistInner  = std::max(set.rcut_coulomb + pme_lb->rbufInner_coulomb,
-                                   pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
+        set.rlistOuter = std::max(set.rcut_coulomb + pme_lb->rbufOuter_coulomb,
+                                  pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
+        set.rlistInner = std::max(set.rcut_coulomb + pme_lb->rbufInner_coulomb,
+                                  pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
     }
     else
     {
         /* TODO Remove these lines and pme_lb->cutoff_scheme */
-        tmpr_coulomb     = set.rcut_coulomb + pme_lb->rbufOuter_coulomb;
-        tmpr_vdw         = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+        tmpr_coulomb = set.rcut_coulomb + pme_lb->rbufOuter_coulomb;
+        tmpr_vdw     = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
         /* Two (known) bugs with cutoff-scheme=group here:
          * - This modification of rlist results in incorrect DD communication.
          * - We should set fr->bTwinRange = (fr->rlistlong > fr->rlist).
          */
-        set.rlistOuter  = std::min(tmpr_coulomb, tmpr_vdw);
-        set.rlistInner  = set.rlistOuter;
+        set.rlistOuter = std::min(tmpr_coulomb, tmpr_vdw);
+        set.rlistInner = set.rlistOuter;
     }
 
-    set.spacing         = sp;
+    set.spacing = sp;
     /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
     set.grid_efficiency = 1;
     for (d = 0; d < DIM; d++)
     {
-        set.grid_efficiency *= (set.grid[d]*sp)/norm(pme_lb->box_start[d]);
+        set.grid_efficiency *= (set.grid[d] * sp) / norm(pme_lb->box_start[d]);
     }
     /* The Ewald coefficient is inversely proportional to the cut-off */
-    set.ewaldcoeff_q =
-        pme_lb->setup[0].ewaldcoeff_q*pme_lb->setup[0].rcut_coulomb/set.rcut_coulomb;
+    set.ewaldcoeff_q = pme_lb->setup[0].ewaldcoeff_q * pme_lb->setup[0].rcut_coulomb / set.rcut_coulomb;
     /* We set ewaldcoeff_lj in set, even when LJ-PME is not used */
-    set.ewaldcoeff_lj =
-        pme_lb->setup[0].ewaldcoeff_lj*pme_lb->setup[0].rcut_coulomb/set.rcut_coulomb;
+    set.ewaldcoeff_lj = pme_lb->setup[0].ewaldcoeff_lj * pme_lb->setup[0].rcut_coulomb / set.rcut_coulomb;
 
-    set.count   = 0;
-    set.cycles  = 0;
+    set.count  = 0;
+    set.cycles = 0;
 
     if (debug)
     {
-        fprintf(debug, "PME loadbal: grid %d %d %d, coulomb cutoff %f\n",
-                set.grid[XX], set.grid[YY], set.grid[ZZ], set.rcut_coulomb);
+        fprintf(debug, "PME loadbal: grid %d %d %d, coulomb cutoff %f\n", set.grid[XX],
+                set.grid[YY], set.grid[ZZ], set.rcut_coulomb);
     }
     pme_lb->setup.push_back(set);
     return TRUE;
 }
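
Each new setup thus scales the grid spacing up, derives the cut-off from the fixed cut_spacing ratio, and scales the Ewald coefficients down in inverse proportion, so the product ewaldcoeff_q * rcut_coulomb, and with it the neglected real-space tail erfc(beta*rc), stays constant across setups. A small standalone check of that invariant with illustrative values:

    #include <cassert>
    #include <cmath>

    int main()
    {
        const double rcut0      = 1.0;  // nm, setup[0].rcut_coulomb
        const double beta0      = 3.12; // 1/nm, setup[0].ewaldcoeff_q
        const double cutSpacing = 10.0; // rcut/spacing ratio fixed at init
        const double sp         = 0.12; // new (largest) grid spacing, nm

        const double rcutNew = cutSpacing * sp;         // 1.2 nm
        const double betaNew = beta0 * rcut0 / rcutNew; // inverse scaling

        // beta * rcut is invariant, so erfc(beta * rcut) is too.
        assert(std::fabs(betaNew * rcutNew - beta0 * rcut0) < 1e-12);
    }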
 
 /*! \brief Print the PME grid */
-static void print_grid(FILE *fp_err, FILE *fp_log,
-                       const char *pre,
-                       const char *desc,
-                       const pme_setup_t *set,
-                       double cycles)
+static void print_grid(FILE* fp_err, FILE* fp_log, const char* pre, const char* desc, const pme_setup_t* set, double cycles)
 {
-    auto buf = gmx::formatString("%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f",
-                                 pre, desc,
+    auto buf = gmx::formatString("%-11s%10s pme grid %d %d %d, coulomb cutoff %.3f", pre, desc,
                                  set->grid[XX], set->grid[YY], set->grid[ZZ], set->rcut_coulomb);
     if (cycles >= 0)
     {
-        buf += gmx::formatString(": %.1f M-cycles", cycles*1e-6);
+        buf += gmx::formatString(": %.1f M-cycles", cycles * 1e-6);
     }
     if (fp_err != nullptr)
     {
@@ -445,7 +445,7 @@ static void print_grid(FILE *fp_err, FILE *fp_log,
 }
 
 /*! \brief Return the index of the last setup used in PME load balancing */
-static int pme_loadbal_end(pme_load_balancing_t *pme_lb)
+static int pme_loadbal_end(pme_load_balancing_t* pme_lb)
 {
     /* In the initial stage only n is set; end is not set yet */
     if (pme_lb->end > 0)
@@ -459,14 +459,12 @@ static int pme_loadbal_end(pme_load_balancing_t *pme_lb)
 }
 
 /*! \brief Print descriptive string about what limits PME load balancing */
-static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
-                                  int64_t step,
-                                  pme_load_balancing_t *pme_lb)
+static void print_loadbal_limited(FILE* fp_err, FILE* fp_log, int64_t step, pme_load_balancing_t* pme_lb)
 {
-    auto buf = gmx::formatString("step %4s: the %s limits the PME load balancing to a coulomb cut-off of %.3f",
-                                 gmx::int64ToString(step).c_str(),
-                                 pmelblim_str[pme_lb->elimited],
-                                 pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut_coulomb);
+    auto buf = gmx::formatString(
+            "step %4s: the %s limits the PME load balancing to a coulomb cut-off of %.3f",
+            gmx::int64ToString(step).c_str(), pmelblim_str[pme_lb->elimited],
+            pme_lb->setup[pme_loadbal_end(pme_lb) - 1].rcut_coulomb);
     if (fp_err != nullptr)
     {
         fprintf(fp_err, "\r%s\n", buf.c_str());
@@ -481,16 +479,16 @@ static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
 /*! \brief Switch load balancing to stage 1
  *
  * In this stage, only reasonably fast setups are run again. */
-static void switch_to_stage1(pme_load_balancing_t *pme_lb)
+static void switch_to_stage1(pme_load_balancing_t* pme_lb)
 {
     /* Increase start until we find a setup that is not slower than
      * maxRelativeSlowdownAccepted times the fastest setup.
      */
     pme_lb->start = pme_lb->lower_limit;
-    while (pme_lb->start + 1 < gmx::ssize(pme_lb->setup) &&
-           (pme_lb->setup[pme_lb->start].count == 0 ||
-            pme_lb->setup[pme_lb->start].cycles >
-            pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted))
+    while (pme_lb->start + 1 < gmx::ssize(pme_lb->setup)
+           && (pme_lb->setup[pme_lb->start].count == 0
+               || pme_lb->setup[pme_lb->start].cycles
+                          > pme_lb->setup[pme_lb->fastest].cycles * maxRelativeSlowdownAccepted))
     {
         pme_lb->start++;
     }
@@ -499,17 +497,16 @@ static void switch_to_stage1(pme_load_balancing_t *pme_lb)
      * any skipped setups that lie between setups that were measured to be
      * acceptably fast and too slow.
      */
-    while (pme_lb->start > pme_lb->lower_limit &&
-           pme_lb->setup[pme_lb->start - 1].count == 0)
+    while (pme_lb->start > pme_lb->lower_limit && pme_lb->setup[pme_lb->start - 1].count == 0)
     {
         pme_lb->start--;
     }
 
     /* Decrease end only with setups that we timed and that are slow. */
     pme_lb->end = pme_lb->setup.size();
-    if (pme_lb->setup[pme_lb->end - 1].count > 0 &&
-        pme_lb->setup[pme_lb->end - 1].cycles >
-        pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)
+    if (pme_lb->setup[pme_lb->end - 1].count > 0
+        && pme_lb->setup[pme_lb->end - 1].cycles
+                   > pme_lb->setup[pme_lb->fastest].cycles * maxRelativeSlowdownAccepted)
     {
         pme_lb->end--;
     }
@@ -534,23 +531,22 @@ static void switch_to_stage1(pme_load_balancing_t *pme_lb)
  * Here we try to take into account fluctuations and changes due to external
  * factors as well as DD load balancing.
  */
-static void
-pme_load_balance(pme_load_balancing_t          *pme_lb,
-                 t_commrec                     *cr,
-                 FILE                          *fp_err,
-                 FILE                          *fp_log,
-                 const gmx::MDLogger           &mdlog,
-                 const t_inputrec              &ir,
-                 const matrix                   box,
-                 gmx::ArrayRef<const gmx::RVec> x,
-                 double                         cycles,
-                 interaction_const_t           *ic,
-                 struct nonbonded_verlet_t     *nbv,
-                 struct gmx_pme_t     **        pmedata,
-                 int64_t                        step)
+static void pme_load_balance(pme_load_balancing_t*          pme_lb,
+                             t_commrec*                     cr,
+                             FILE*                          fp_err,
+                             FILE*                          fp_log,
+                             const gmx::MDLogger&           mdlog,
+                             const t_inputrec&              ir,
+                             const matrix                   box,
+                             gmx::ArrayRef<const gmx::RVec> x,
+                             double                         cycles,
+                             interaction_const_t*           ic,
+                             struct nonbonded_verlet_t*     nbv,
+                             struct gmx_pme_t**             pmedata,
+                             int64_t                        step)
 {
     gmx_bool     OK;
-    pme_setup_t *set;
+    pme_setup_t* set;
     double       cycles_fast;
     char         buf[STRLEN], sbuf[22];
 
@@ -580,8 +576,7 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
     }
     else
     {
-        if (cycles*maxFluctuationAccepted < set->cycles &&
-            pme_lb->stage == pme_lb->nstage - 1)
+        if (cycles * maxFluctuationAccepted < set->cycles && pme_lb->stage == pme_lb->nstage - 1)
         {
             /* The performance went up a lot (due to e.g. DD load balancing).
              * Add a stage, keep the minima, but rescan all setups.
@@ -590,12 +585,13 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
 
             if (debug)
             {
-                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
+                fprintf(debug,
+                        "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this "
+                        "is more than %f\n"
                         "Increased the number stages to %d"
                         " and ignoring the previous performance\n",
-                        set->grid[XX], set->grid[YY], set->grid[ZZ],
-                        set->cycles*1e-6, cycles*1e-6, maxFluctuationAccepted,
-                        pme_lb->nstage);
+                        set->grid[XX], set->grid[YY], set->grid[ZZ], set->cycles * 1e-6,
+                        cycles * 1e-6, maxFluctuationAccepted, pme_lb->nstage);
             }
         }
         set->cycles = std::min(set->cycles, cycles);
@@ -627,8 +623,8 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
     /* Check in stage 0 if we should stop scanning grids.
      * Stop when the time is more than maxRelativeSlowdownAccepted longer than the fastest.
      */
-    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
-        cycles > pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)
+    if (pme_lb->stage == 0 && pme_lb->cur > 0
+        && cycles > pme_lb->setup[pme_lb->fastest].cycles * maxRelativeSlowdownAccepted)
     {
         pme_lb->setup.resize(pme_lb->cur + 1);
         /* Done with scanning, go to stage 1 */
@@ -639,11 +635,11 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
     {
         int gridsize_start;
 
-        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
+        gridsize_start = set->grid[XX] * set->grid[YY] * set->grid[ZZ];
 
         do
         {
-            if (pme_lb->cur+1 < gmx::ssize(pme_lb->setup))
+            if (pme_lb->cur + 1 < gmx::ssize(pme_lb->setup))
             {
                 /* We had already generated the next setup */
                 OK = TRUE;
@@ -659,8 +655,8 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
                 }
             }
 
-            if (OK &&
-                pme_lb->setup[pme_lb->cur+1].spacing > c_maxSpacingScaling*pme_lb->setup[0].spacing)
+            if (OK
+                && pme_lb->setup[pme_lb->cur + 1].spacing > c_maxSpacingScaling * pme_lb->setup[0].spacing)
             {
                 OK               = FALSE;
                 pme_lb->elimited = epmelblimMAXSCALING;
@@ -668,8 +664,7 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
 
             if (OK && ir.ePBC != epbcNONE)
             {
-                OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlistOuter)
-                      <= max_cutoff2(ir.ePBC, box));
+                OK = (gmx::square(pme_lb->setup[pme_lb->cur + 1].rlistOuter) <= max_cutoff2(ir.ePBC, box));
                 if (!OK)
                 {
                     pme_lb->elimited = epmelblimBOX;
@@ -682,8 +677,7 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
 
                 if (DOMAINDECOMP(cr))
                 {
-                    OK = change_dd_cutoff(cr, box, x,
-                                          pme_lb->setup[pme_lb->cur].rlistOuter);
+                    OK = change_dd_cutoff(cr, box, x, pme_lb->setup[pme_lb->cur].rlistOuter);
                     if (!OK)
                     {
                         /* Failed: do not use this setup */
@@ -702,15 +696,12 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
                 /* Switch to the next stage */
                 switch_to_stage1(pme_lb);
             }
-        }
-        while (OK &&
-               !(pme_lb->setup[pme_lb->cur].grid[XX]*
-                 pme_lb->setup[pme_lb->cur].grid[YY]*
-                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
-                 gridsize_start*gridpointsScaleFactor
-                 &&
-                 pme_lb->setup[pme_lb->cur].grid_efficiency <
-                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*relativeEfficiencyFactor));
+        } while (OK
+                 && !(pme_lb->setup[pme_lb->cur].grid[XX] * pme_lb->setup[pme_lb->cur].grid[YY]
+                                      * pme_lb->setup[pme_lb->cur].grid[ZZ]
+                              < gridsize_start * gridpointsScaleFactor
+                      && pme_lb->setup[pme_lb->cur].grid_efficiency
+                                 < pme_lb->setup[pme_lb->cur - 1].grid_efficiency * relativeEfficiencyFactor));
     }
 
     if (pme_lb->stage > 0 && pme_lb->end == 1)
@@ -741,10 +732,8 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
 
                 pme_lb->cur = pme_lb->end - 1;
             }
-        }
-        while (pme_lb->stage == pme_lb->nstage - 1 &&
-               pme_lb->setup[pme_lb->cur].count > 0 &&
-               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*maxRelativeSlowdownAccepted);
+        } while (pme_lb->stage == pme_lb->nstage - 1 && pme_lb->setup[pme_lb->cur].count > 0
+                 && pme_lb->setup[pme_lb->cur].cycles > cycles_fast * maxRelativeSlowdownAccepted);
 
         if (pme_lb->stage == pme_lb->nstage)
         {
@@ -773,15 +762,19 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
                 /* This should not happen, as we set limits on the DLB bounds.
                  * But we implement a complete failsafe solution anyhow.
                  */
-                GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                        "The fastest PP/PME load balancing setting (cutoff %.3d nm) is no longer available due to DD DLB or box size limitations", pme_lb->fastest);
+                GMX_LOG(mdlog.warning)
+                        .asParagraph()
+                        .appendTextFormatted(
+                                "The fastest PP/PME load balancing setting (cutoff %.3d nm) is no "
+                                "longer available due to DD DLB or box size limitations",
+                                pme_lb->fastest);
                 pme_lb->fastest = pme_lb->lower_limit;
                 pme_lb->start   = pme_lb->lower_limit;
             }
             /* Limit the range to below the current cut-off, scan from start */
-            pme_lb->end         = pme_lb->cur;
-            pme_lb->cur         = pme_lb->start;
-            pme_lb->elimited    = epmelblimDD;
+            pme_lb->end      = pme_lb->cur;
+            pme_lb->cur      = pme_lb->start;
+            pme_lb->elimited = epmelblimDD;
             print_loadbal_limited(fp_err, fp_log, step, pme_lb);
         }
     }
@@ -790,28 +783,29 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
 
     set = &pme_lb->setup[pme_lb->cur];
 
-    ic->rcoulomb           = set->rcut_coulomb;
+    ic->rcoulomb = set->rcut_coulomb;
     nbv->changePairlistRadii(set->rlistOuter, set->rlistInner);
-    ic->ewaldcoeff_q       = set->ewaldcoeff_q;
+    ic->ewaldcoeff_q = set->ewaldcoeff_q;
     /* TODO: centralize the code that sets the potentials shifts */
     if (ic->coulomb_modifier == eintmodPOTSHIFT)
     {
         GMX_RELEASE_ASSERT(ic->rcoulomb != 0, "Cutoff radius cannot be zero");
-        ic->sh_ewald = std::erfc(ic->ewaldcoeff_q*ic->rcoulomb) / ic->rcoulomb;
+        ic->sh_ewald = std::erfc(ic->ewaldcoeff_q * ic->rcoulomb) / ic->rcoulomb;
     }
     if (EVDW_PME(ic->vdwtype))
     {
         /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
-        ic->rvdw            = set->rcut_coulomb;
-        ic->ewaldcoeff_lj   = set->ewaldcoeff_lj;
+        ic->rvdw          = set->rcut_coulomb;
+        ic->ewaldcoeff_lj = set->ewaldcoeff_lj;
         if (ic->vdw_modifier == eintmodPOTSHIFT)
         {
-            real       crc2;
+            real crc2;
 
-            ic->dispersion_shift.cpot = -1.0/gmx::power6(static_cast<double>(ic->rvdw));
-            ic->repulsion_shift.cpot  = -1.0/gmx::power12(static_cast<double>(ic->rvdw));
-            crc2                      = gmx::square(ic->ewaldcoeff_lj*ic->rvdw);
-            ic->sh_lj_ewald           = (std::exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)/gmx::power6(ic->rvdw);
+            ic->dispersion_shift.cpot = -1.0 / gmx::power6(static_cast<double>(ic->rvdw));
+            ic->repulsion_shift.cpot  = -1.0 / gmx::power12(static_cast<double>(ic->rvdw));
+            crc2                      = gmx::square(ic->ewaldcoeff_lj * ic->rvdw);
+            ic->sh_lj_ewald =
+                    (std::exp(-crc2) * (1 + crc2 + 0.5 * crc2 * crc2) - 1) / gmx::power6(ic->rvdw);
         }
     }
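
Once a setup is adopted, the potential shifts must follow the new cut-off: with a potential-shift modifier the Coulomb shift becomes erfc(beta*rc)/rc, and with LJ-PME the dispersion/repulsion shifts and sh_lj_ewald are recomputed from rvdw. The same formulas in standalone form, with illustrative parameter values:

    #include <cmath>

    int main()
    {
        const double beta    = 2.6; // ewaldcoeff_q in 1/nm (illustrative)
        const double rc      = 1.2; // new rcoulomb in nm
        const double shEwald = std::erfc(beta * rc) / rc;

        const double betaLj = 2.0; // ewaldcoeff_lj (illustrative)
        const double rvdw   = rc;  // with LJ-PME, rvdw equals rcoulomb
        const double crc2   = (betaLj * rvdw) * (betaLj * rvdw);

        const double dispersionShift = -1.0 / std::pow(rvdw, 6);
        const double repulsionShift  = -1.0 / std::pow(rvdw, 12);
        const double shLjEwald =
                (std::exp(-crc2) * (1 + crc2 + 0.5 * crc2 * crc2) - 1) / std::pow(rvdw, 6);

        (void)shEwald;
        (void)dispersionShift;
        (void)repulsionShift;
        (void)shLjEwald;
    }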
 
@@ -829,14 +823,14 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
          * This can lead to a lot of reallocations for PME GPU.
          * Would be nicer if the allocated grid list was hidden within a single pmedata structure.
          */
-        if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr) || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata))
+        if ((pme_lb->setup[pme_lb->cur].pmedata == nullptr)
+            || pme_gpu_task_enabled(pme_lb->setup[pme_lb->cur].pmedata))
         {
             /* Generate a new PME data structure,
              * copying part of the old pointers.
              */
-            gmx_pme_reinit(&set->pmedata,
-                           cr, pme_lb->setup[0].pmedata, &ir,
-                           set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
+            gmx_pme_reinit(&set->pmedata, cr, pme_lb->setup[0].pmedata, &ir, set->grid,
+                           set->ewaldcoeff_q, set->ewaldcoeff_lj);
         }
         *pmedata = set->pmedata;
     }
@@ -866,35 +860,34 @@ pme_load_balance(pme_load_balancing_t          *pme_lb,
  * the PP/PME balance might change and re-balancing can improve performance.
  * This function adds 2 stages and adjusts the considered setup range.
  */
-static void continue_pme_loadbal(pme_load_balancing_t *pme_lb,
-                                 gmx_bool              bDlbUnlocked)
+static void continue_pme_loadbal(pme_load_balancing_t* pme_lb, gmx_bool bDlbUnlocked)
 {
     /* Add 2 tuning stages, keep the detected end of the setup range */
-    pme_lb->nstage          += 2;
+    pme_lb->nstage += 2;
     if (bDlbUnlocked && pme_lb->bSepPMERanks)
     {
         /* With separate PME ranks, DLB should always lower the PP load and
          * can only increase the PME load (more communication and imbalance),
          * so we only need to scan longer cut-off's.
          */
-        pme_lb->lower_limit  = pme_lb->cur;
+        pme_lb->lower_limit = pme_lb->cur;
     }
-    pme_lb->start            = pme_lb->lower_limit;
+    pme_lb->start = pme_lb->lower_limit;
 }
 
-void pme_loadbal_do(pme_load_balancing_t          *pme_lb,
-                    t_commrec                     *cr,
-                    FILE                          *fp_err,
-                    FILE                          *fp_log,
-                    const gmx::MDLogger           &mdlog,
-                    const t_inputrec              &ir,
-                    t_forcerec                    *fr,
+void pme_loadbal_do(pme_load_balancing_t*          pme_lb,
+                    t_commrec*                     cr,
+                    FILE*                          fp_err,
+                    FILE*                          fp_log,
+                    const gmx::MDLogger&           mdlog,
+                    const t_inputrec&              ir,
+                    t_forcerec*                    fr,
                     const matrix                   box,
                     gmx::ArrayRef<const gmx::RVec> x,
                     gmx_wallcycle_t                wcycle,
                     int64_t                        step,
                     int64_t                        step_rel,
-                    gmx_bool                      *bPrinting)
+                    gmx_bool*                      bPrinting)
 {
     int    n_prev;
     double cycles_prev;
@@ -937,19 +930,17 @@ void pme_loadbal_do(pme_load_balancing_t          *pme_lb,
          * is not over the last nstlist steps, but the nstlist steps before
          * that. So the first useful ratio is available at step_rel=3*nstlist.
          */
-        else if (step_rel >= 3*ir.nstlist)
+        else if (step_rel >= 3 * ir.nstlist)
         {
             if (DDMASTER(cr->dd))
             {
                 /* If PME rank load is too high, start tuning */
-                pme_lb->bBalance =
-                    (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
+                pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
             }
             dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
         }
 
-        pme_lb->bActive = (pme_lb->bBalance ||
-                           step_rel <= pme_lb->step_rel_stop);
+        pme_lb->bActive = (pme_lb->bBalance || step_rel <= pme_lb->step_rel_stop);
     }
 
     /* The location in the code of this balancing termination is strange.
@@ -969,14 +960,16 @@ void pme_loadbal_do(pme_load_balancing_t          *pme_lb,
         {
             /* Unlock the DLB=auto, DLB is allowed to activate */
             dd_dlb_unlock(cr->dd);
-            GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: DLB can now turn on, when beneficial");
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendText("NOTE: DLB can now turn on, when beneficial");
 
             /* We don't deactivate the tuning yet, since we will balance again
              * after DLB gets turned on, if it does within PMETunePeriod.
              */
             continue_pme_loadbal(pme_lb, TRUE);
             pme_lb->bTriggerOnDLB = TRUE;
-            pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir.nstlist;
+            pme_lb->step_rel_stop = step_rel + PMETunePeriod * ir.nstlist;
         }
         else
         {
@@ -1001,14 +994,11 @@ void pme_loadbal_do(pme_load_balancing_t          *pme_lb,
          * since init_step might not be a multiple of nstlist,
          * but the first data collected is skipped anyhow.
          */
-        pme_load_balance(pme_lb, cr,
-                         fp_err, fp_log, mdlog,
-                         ir, box, x, pme_lb->cycles_c - cycles_prev,
-                         fr->ic, fr->nbv.get(), &fr->pmedata,
-                         step);
+        pme_load_balance(pme_lb, cr, fp_err, fp_log, mdlog, ir, box, x,
+                         pme_lb->cycles_c - cycles_prev, fr->ic, fr->nbv.get(), &fr->pmedata, step);
 
         /* Update deprecated rlist in forcerec to stay in sync with fr->nbv */
-        fr->rlist         = fr->nbv->pairlistOuterRadius();
+        fr->rlist = fr->nbv->pairlistOuterRadius();
 
         if (ir.eDispCorr != edispcNO)
         {
@@ -1016,8 +1006,7 @@ void pme_loadbal_do(pme_load_balancing_t          *pme_lb,
         }
     }
 
-    if (!pme_lb->bBalance &&
-        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
+    if (!pme_lb->bBalance && (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
     {
         /* We have just deactivated the balancing and we're not measuring PP/PME
          * imbalance during the first steps of the run: deactivate the tuning.
@@ -1029,51 +1018,47 @@ void pme_loadbal_do(pme_load_balancing_t          *pme_lb,
     {
         /* Make sure DLB is allowed when we deactivate PME tuning */
         dd_dlb_unlock(cr->dd);
-        GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: DLB can now turn on, when beneficial");
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendText("NOTE: DLB can now turn on, when beneficial");
     }
 
     *bPrinting = pme_lb->bBalance;
 }
 
 /*! \brief Return product of the number of PME grid points in each dimension */
-static int pme_grid_points(const pme_setup_t *setup)
+static int pme_grid_points(const pme_setup_t* setup)
 {
-    return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
+    return setup->grid[XX] * setup->grid[YY] * setup->grid[ZZ];
 }
 
 /*! \brief Print one load-balancing setting */
-static void print_pme_loadbal_setting(FILE              *fplog,
-                                      const char        *name,
-                                      const pme_setup_t *setup)
+static void print_pme_loadbal_setting(FILE* fplog, const char* name, const pme_setup_t* setup)
 {
-    fprintf(fplog,
-            "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
-            name,
-            setup->rcut_coulomb, setup->rlistInner,
-            setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
-            setup->spacing, 1/setup->ewaldcoeff_q);
+    fprintf(fplog, "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n", name,
+            setup->rcut_coulomb, setup->rlistInner, setup->grid[XX], setup->grid[YY],
+            setup->grid[ZZ], setup->spacing, 1 / setup->ewaldcoeff_q);
 }
 
 /*! \brief Print all load-balancing settings */
-static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
-                                       FILE                 *fplog,
-                                       const gmx::MDLogger  &mdlog,
+static void print_pme_loadbal_settings(pme_load_balancing_t* pme_lb,
+                                       FILE*                 fplog,
+                                       const gmx::MDLogger&  mdlog,
                                        gmx_bool              bNonBondedOnGPU)
 {
-    double     pp_ratio, grid_ratio;
-    real       pp_ratio_temporary;
+    double pp_ratio, grid_ratio;
+    real   pp_ratio_temporary;
 
     pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlistInner / pme_lb->setup[0].rlistInner;
     pp_ratio           = gmx::power3(pp_ratio_temporary);
-    grid_ratio         = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
-        static_cast<double>(pme_grid_points(&pme_lb->setup[0]));
+    grid_ratio         = pme_grid_points(&pme_lb->setup[pme_lb->cur])
+                 / static_cast<double>(pme_grid_points(&pme_lb->setup[0]));
 
     fprintf(fplog, "\n");
     fprintf(fplog, "       P P   -   P M E   L O A D   B A L A N C I N G\n");
     fprintf(fplog, "\n");
     /* Here we only warn when the optimal setting is the last one */
-    if (pme_lb->elimited != epmelblimNO &&
-        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
+    if (pme_lb->elimited != epmelblimNO && pme_lb->cur == pme_loadbal_end(pme_lb) - 1)
     {
         fprintf(fplog, " NOTE: The PP/PME load balancing was limited by the %s,\n",
                 pmelblim_str[pme_lb->elimited]);
@@ -1089,16 +1074,19 @@ static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
     fprintf(fplog, "            rcoulomb  rlist            grid      spacing   1/beta\n");
     print_pme_loadbal_setting(fplog, "initial", &pme_lb->setup[0]);
     print_pme_loadbal_setting(fplog, "final", &pme_lb->setup[pme_lb->cur]);
-    fprintf(fplog, " cost-ratio           %4.2f             %4.2f\n",
-            pp_ratio, grid_ratio);
+    fprintf(fplog, " cost-ratio           %4.2f             %4.2f\n", pp_ratio, grid_ratio);
     fprintf(fplog, " (note that these numbers concern only part of the total PP and PME load)\n");
 
     if (pp_ratio > 1.5 && !bNonBondedOnGPU)
     {
-        GMX_LOG(mdlog.warning).asParagraph().appendText(
-                "NOTE: PME load balancing increased the non-bonded workload by more than 50%.\n"
-                "      For better performance, use (more) PME ranks (mdrun -npme),\n"
-                "      or if you are beyond the scaling limit, use fewer total ranks (or nodes).");
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendText(
+                        "NOTE: PME load balancing increased the non-bonded workload by more than "
+                        "50%.\n"
+                        "      For better performance, use (more) PME ranks (mdrun -npme),\n"
+                        "      or if you are beyond the scaling limit, use fewer total ranks (or "
+                        "nodes).");
     }
     else
     {
@@ -1106,10 +1094,7 @@ static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
     }
 }
 
-void pme_loadbal_done(pme_load_balancing_t *pme_lb,
-                      FILE                 *fplog,
-                      const gmx::MDLogger  &mdlog,
-                      gmx_bool              bNonBondedOnGPU)
+void pme_loadbal_done(pme_load_balancing_t* pme_lb, FILE* fplog, const gmx::MDLogger& mdlog, gmx_bool bNonBondedOnGPU)
 {
     if (fplog != nullptr && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
     {