Avoid DLB with overloaded PME ranks
author    Berk Hess <hess@kth.se>
Fri, 5 Sep 2014 08:29:04 +0000 (10:29 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Tue, 30 Sep 2014 11:11:32 +0000 (13:11 +0200)
When separate PME ranks have more load than the PP ranks, DLB
cannot improve performance. It will actually make performance
worse, because the PME x/f redistribution time goes up. With
-dlb=auto, DLB is now not turned on in this situation.
DLB is also not activated during PME tuning with GPUs and separate
PME ranks, since it then nearly always degrades performance.

Change-Id: I1f5e649a9562fdca9ba538196f41a12feb0a4a24

src/gromacs/legacyheaders/domdec.h
src/gromacs/mdlib/domdec.c
src/programs/mdrun/md.c
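
As a minimal, hypothetical sketch of the activation decision this change introduces (the helper name, the standalone driver, and the macro value are assumptions; the diff itself expresses the check as dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON, which its comment equates with 0.98 times the max PP force load):

    /* Standalone sketch, not the GROMACS API. */
    #include <stdbool.h>
    #include <stdio.h>

    #define PERF_LOSS_DLB_ON 0.02  /* assumed stand-in for DD_PERF_LOSS_DLB_ON */

    static bool should_turn_on_dlb(int npmeranks, double pme_pp_load_ratio,
                                   double force_imb_perf_loss)
    {
        /* If separate PME ranks are already (nearly) the bottleneck, PP DLB
         * cannot help and only adds x/f redistribution cost on the PME ranks. */
        if (npmeranks > 0 && pme_pp_load_ratio > 1 - PERF_LOSS_DLB_ON)
        {
            return false;
        }
        /* Otherwise the existing imbalance criterion decides. */
        return force_imb_perf_loss >= PERF_LOSS_DLB_ON;
    }

    int main(void)
    {
        /* PME ranks 99% as loaded as PP: DLB stays off despite 5% imbalance loss. */
        printf("%d\n", should_turn_on_dlb(8, 0.99, 0.05));
        /* PME ranks lightly loaded: the imbalance criterion turns DLB on. */
        printf("%d\n", should_turn_on_dlb(8, 0.60, 0.05));
        return 0;
    }

In this sketch the first call prints 0 and the second prints 1.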

src/gromacs/legacyheaders/domdec.h
index efeef40614522c918880397a85b0a4b1697f591e..240a7901ccd0592922e98459d37437374285905a 100644
@@ -138,6 +138,12 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr);
  * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
  */
 
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd);
+/* Return whether the DLB lock is set */
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue);
+/* Set a lock such that with DLB=auto DLB cannot get turned on while the lock is set */
+
 void dd_setup_dlb_resource_sharing(t_commrec           *cr,
                                    const gmx_hw_info_t *hwinfo,
                                    const gmx_hw_opt_t  *hw_opt);
src/gromacs/mdlib/domdec.c
index ca2514a5dd4df5bd133c842e9890aa7c54a4f297..6cc6b73895d08a33eadcd83abcb492bd2dffcbef 100644
@@ -269,6 +269,8 @@ typedef struct gmx_domdec_comm
 
     /* The DLB option */
     int      eDLB;
+    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+    gmx_bool bDLB_locked;
     /* Are we actually using DLB? */
     gmx_bool bDynLoadBal;
 
@@ -385,9 +387,9 @@ typedef struct gmx_domdec_comm
     int    eFlop;
     double flop;
     int    flop_n;
-    /* Have often have did we have load measurements */
+    /* How many times have we had load measurements */
     int    n_load_have;
-    /* Have often have we collected the load measurements */
+    /* How many times have we collected the load measurements */
     int    n_load_collect;
 
     /* Statistics */
@@ -3462,7 +3464,7 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
             cell_size[i] = 1.0/ncd;
         }
     }
-    else if (dd_load_count(comm))
+    else if (dd_load_count(comm) > 0)
     {
         load_aver  = comm->load[d].sum_m/ncd;
         change_max = 0;
@@ -6685,7 +6687,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
     /* Initialize to GPU share count to 0, might change later */
     comm->nrank_gpu_shared = 0;
 
-    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+    comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+    comm->bDLB_locked = FALSE;
 
     comm->bDynLoadBal = (comm->eDLB == edlbYES);
     if (fplog)
@@ -7585,6 +7588,20 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr)
     comm->PMELoadBal_max_cutoff = comm->cutoff;
 }
 
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+{
+    return dd->comm->bDLB_locked;
+}
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+{
+    /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+    if (dd->comm->eDLB == edlbAUTO)
+    {
+        dd->comm->bDLB_locked = bValue;
+    }
+}
+
 static void merge_cg_buffers(int ncell,
                              gmx_domdec_comm_dim_t *cd, int pulse,
                              int  *ncg_cell,
@@ -9349,17 +9366,18 @@ void dd_partition_system(FILE                *fplog,
     }
 
     /* Check if we have recorded loads on the nodes */
-    if (comm->bRecordLoad && dd_load_count(comm))
+    if (comm->bRecordLoad && dd_load_count(comm) > 0)
     {
-        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
         {
             /* Check if we should use DLB at the second partitioning
              * and every 100 partitionings,
              * so the extra communication cost is negligible.
              */
-            n         = max(100, nstglobalcomm);
+            const int nddp_chk_dlb = 100;
+
             bCheckDLB = (comm->n_load_collect == 0 ||
-                         comm->n_load_have % n == n-1);
+                         comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
         }
         else
         {
@@ -9397,8 +9415,26 @@ void dd_partition_system(FILE                *fplog,
                 /* Since the timings are node dependent, the master decides */
                 if (DDMASTER(dd))
                 {
-                    bTurnOnDLB =
-                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                    /* Here we check if the max PME rank load is more than 0.98
+                     * times the max PP force load. If so, PP DLB will not help,
+                     * since we are (almost) limited by PME. Furthermore,
+                     * DLB will cause a significant extra x/f redistribution
+                     * cost on the PME ranks, which will then surely result
+                     * in lower total performance.
+                     * This check might be fragile, since one measurement
+                     * below 0.98 (although only done once every 100 DD part.)
+                     * could turn on DLB for the rest of the run.
+                     */
+                    if (cr->npmenodes > 0 &&
+                        dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                    {
+                        bTurnOnDLB = FALSE;
+                    }
+                    else
+                    {
+                        bTurnOnDLB =
+                            (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                    }
                     if (debug)
                     {
                         fprintf(debug, "step %s, imb loss %f\n",
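
The dd_partition_system() hunk above also replaces the nstglobalcomm-dependent interval with a fixed one: with loads recorded, DLB activation is considered at the second partitioning and then once every nddp_chk_dlb = 100 load measurements. A simplified, standalone illustration of that cadence (the driver loop and the counter updates are assumptions; the constant and the modulo test come from the hunk):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        const int nddp_chk_dlb   = 100; /* consider DLB once per 100 measurements */
        int       n_load_collect = 0;   /* how many times loads were collected */

        for (int n_load_have = 0; n_load_have < 300; n_load_have++)
        {
            bool bCheckDLB = (n_load_collect == 0 ||
                              n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
            if (bCheckDLB)
            {
                printf("consider DLB at load measurement %d\n", n_load_have);
                n_load_collect++;
            }
        }
        return 0;
    }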
src/programs/mdrun/md.c
index dc01b6de0cbd0a69b693000144bf3ada388142f2..3d98d597c7d0e9c009e02d816e631b3f36846a25 100644
@@ -1909,6 +1909,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     }
                     dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
 
+                    if (bPMETuneRunning &&
+                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                        !(cr->duty & DUTY_PME))
+                    {
+                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                         * With GPUs + separate PME ranks, we don't want DLB.
+                         * It could get turned on while we scan coarse grids,
+                         * and would then never be turned off again.
+                         * This would hurt performance at the final, optimal
+                         * grid spacing, where DLB almost never helps.
+                         * Also, DLB can limit the cut-off for PME tuning.
+                         */
+                        dd_dlb_set_lock(cr->dd, TRUE);
+                    }
+
                     if (bPMETuneRunning || step_rel > ir->nstlist*50)
                     {
                         bPMETuneTry     = FALSE;
@@ -1939,6 +1954,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     {
                         calc_enervirdiff(NULL, ir->eDispCorr, fr);
                     }
+
+                    if (!bPMETuneRunning &&
+                        DOMAINDECOMP(cr) &&
+                        dd_dlb_is_locked(cr->dd))
+                    {
+                        /* Unlock DLB=auto: DLB is now allowed to activate
+                         * (but we don't expect it to activate in most cases).
+                         */
+                        dd_dlb_set_lock(cr->dd, FALSE);
+                    }
                 }
                 cycles_pmes = 0;
             }
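
As a usage note on the new dd_dlb_is_locked()/dd_dlb_set_lock() pair, a self-contained sketch of the lock semantics (the types, names, and driver below are hypothetical stand-ins, not the GROMACS API; the only behaviour taken from the diff is that the lock has an effect only while DLB is set to auto):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { DLB_AUTO, DLB_YES, DLB_NO } dlb_opt_t; /* stand-in for edlbAUTO etc. */

    typedef struct {
        dlb_opt_t dlb_opt;
        bool      dlb_locked;
    } dd_sketch_t;

    static void sketch_dlb_set_lock(dd_sketch_t *dd, bool value)
    {
        /* Mirrors the intent of dd_dlb_set_lock(): only lockable with DLB=auto. */
        if (dd->dlb_opt == DLB_AUTO)
        {
            dd->dlb_locked = value;
        }
    }

    int main(void)
    {
        dd_sketch_t dd = { DLB_AUTO, false };

        /* PME grid scan starts (GPUs + separate PME ranks): keep DLB from turning on. */
        sketch_dlb_set_lock(&dd, true);
        printf("locked during tuning: %d\n", dd.dlb_locked);

        /* Tuning finished: release the lock so DLB=auto may activate again. */
        sketch_dlb_set_lock(&dd, false);
        printf("locked after tuning:  %d\n", dd.dlb_locked);
        return 0;
    }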