Avoid DLB with overloaded PME ranks
author    Berk Hess <hess@kth.se>
Fri, 5 Sep 2014 08:29:04 +0000 (10:29 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Tue, 30 Sep 2014 11:11:32 +0000 (13:11 +0200)
When separate PME ranks have more load than the PP ranks, DLB
cannot improve performance. It will actually make performance
worse, because the PME x/f redistribution time goes up. With
-dlb=auto, DLB is now not turned on in this situation.
DLB is also not activated during PME tuning with GPUs and separate
PME ranks, since it then nearly always degrades performance.

Change-Id: I1f5e649a9562fdca9ba538196f41a12feb0a4a24

src/gromacs/legacyheaders/domdec.h
src/gromacs/mdlib/domdec.c
src/programs/mdrun/md.c
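
As a minimal, hypothetical sketch of the activation decision this change introduces (the helper name, the standalone driver, and the macro value are assumptions; the diff itself expresses the check as dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON, which its comment equates with 0.98 times the max PP force load):

    /* Standalone sketch, not the GROMACS API. */
    #include <stdbool.h>
    #include <stdio.h>

    #define PERF_LOSS_DLB_ON 0.02  /* assumed stand-in for DD_PERF_LOSS_DLB_ON */

    static bool should_turn_on_dlb(int npmeranks, double pme_pp_load_ratio,
                                   double force_imb_perf_loss)
    {
        /* If separate PME ranks are already (nearly) the bottleneck, PP DLB
         * cannot help and only adds x/f redistribution cost on the PME ranks. */
        if (npmeranks > 0 && pme_pp_load_ratio > 1 - PERF_LOSS_DLB_ON)
        {
            return false;
        }
        /* Otherwise the existing imbalance criterion decides. */
        return force_imb_perf_loss >= PERF_LOSS_DLB_ON;
    }

    int main(void)
    {
        /* PME ranks 99% as loaded as PP: DLB stays off despite 5% imbalance loss. */
        printf("%d\n", should_turn_on_dlb(8, 0.99, 0.05));
        /* PME ranks lightly loaded: the imbalance criterion turns DLB on. */
        printf("%d\n", should_turn_on_dlb(8, 0.60, 0.05));
        return 0;
    }

In this sketch the first call prints 0 and the second prints 1.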

src/gromacs/legacyheaders/domdec.h
index efeef40614522c918880397a85b0a4b1697f591e..240a7901ccd0592922e98459d37437374285905a 100644
@@ -138,6 +138,12 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr);
  * possible after subsequently setting a shorter cut-off with change_dd_cutoff.
  */
 
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd);
+/* Return whether the DLB lock is set */
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue);
+/* Set a lock such that with DLB=auto DLB cannot get turned on while the lock is set */
+
 void dd_setup_dlb_resource_sharing(t_commrec           *cr,
                                    const gmx_hw_info_t *hwinfo,
                                    const gmx_hw_opt_t  *hw_opt);
src/gromacs/mdlib/domdec.c
index ca2514a5dd4df5bd133c842e9890aa7c54a4f297..6cc6b73895d08a33eadcd83abcb492bd2dffcbef 100644
@@ -269,6 +269,8 @@ typedef struct gmx_domdec_comm
 
     /* The DLB option */
     int      eDLB;
+    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+    gmx_bool bDLB_locked;
     /* Are we actually using DLB? */
     gmx_bool bDynLoadBal;
 
@@ -385,9 +387,9 @@ typedef struct gmx_domdec_comm
     int    eFlop;
     double flop;
     int    flop_n;
-    /* Have often have did we have load measurements */
+    /* How many times have we had load measurements */
     int    n_load_have;
-    /* Have often have we collected the load measurements */
+    /* How many times have we collected the load measurements */
     int    n_load_collect;
 
     /* Statistics */
@@ -3462,7 +3464,7 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
             cell_size[i] = 1.0/ncd;
         }
     }
-    else if (dd_load_count(comm))
+    else if (dd_load_count(comm) > 0)
     {
         load_aver  = comm->load[d].sum_m/ncd;
         change_max = 0;
@@ -6685,7 +6687,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
     /* Initialize to GPU share count to 0, might change later */
     comm->nrank_gpu_shared = 0;
 
-    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+    comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+    comm->bDLB_locked = FALSE;
 
     comm->bDynLoadBal = (comm->eDLB == edlbYES);
     if (fplog)
@@ -7585,6 +7588,20 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr)
     comm->PMELoadBal_max_cutoff = comm->cutoff;
 }
 
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+{
+    return dd->comm->bDLB_locked;
+}
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+{
+    /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+    if (dd->comm->eDLB == edlbAUTO)
+    {
+        dd->comm->bDLB_locked = bValue;
+    }
+}
+
 static void merge_cg_buffers(int ncell,
                              gmx_domdec_comm_dim_t *cd, int pulse,
                              int  *ncg_cell,
@@ -9349,17 +9366,18 @@ void dd_partition_system(FILE                *fplog,
     }
 
     /* Check if we have recorded loads on the nodes */
-    if (comm->bRecordLoad && dd_load_count(comm))
+    if (comm->bRecordLoad && dd_load_count(comm) > 0)
     {
-        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
         {
             /* Check if we should use DLB at the second partitioning
              * and every 100 partitionings,
              * so the extra communication cost is negligible.
              */
-            n         = max(100, nstglobalcomm);
+            const int nddp_chk_dlb = 100;
+
             bCheckDLB = (comm->n_load_collect == 0 ||
-                         comm->n_load_have % n == n-1);
+                         comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
         }
         else
         {
@@ -9397,8 +9415,26 @@ void dd_partition_system(FILE                *fplog,
                 /* Since the timings are node dependent, the master decides */
                 if (DDMASTER(dd))
                 {
-                    bTurnOnDLB =
-                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                    /* Here we check if the max PME rank load is more than 0.98
+                     * times the max PP force load. If so, PP DLB will not help,
+                     * since we are (almost) limited by PME. Furthermore,
+                     * DLB will cause a significant extra x/f redistribution
+                     * cost on the PME ranks, which will then surely result
+                     * in lower total performance.
+                     * This check might be fragile, since one measurement
+                     * below 0.98 (although only done once every 100 DD part.)
+                     * could turn on DLB for the rest of the run.
+                     */
+                    if (cr->npmenodes > 0 &&
+                        dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                    {
+                        bTurnOnDLB = FALSE;
+                    }
+                    else
+                    {
+                        bTurnOnDLB =
+                            (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                    }
                     if (debug)
                     {
                         fprintf(debug, "step %s, imb loss %f\n",
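
The dd_partition_system() hunk above also replaces the nstglobalcomm-dependent interval with a fixed one: with loads recorded, DLB activation is considered at the second partitioning and then once every nddp_chk_dlb = 100 load measurements. A simplified, standalone illustration of that cadence (the driver loop and the counter updates are assumptions; the constant and the modulo test come from the hunk):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        const int nddp_chk_dlb   = 100; /* consider DLB once per 100 measurements */
        int       n_load_collect = 0;   /* how many times loads were collected */

        for (int n_load_have = 0; n_load_have < 300; n_load_have++)
        {
            bool bCheckDLB = (n_load_collect == 0 ||
                              n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
            if (bCheckDLB)
            {
                printf("consider DLB at load measurement %d\n", n_load_have);
                n_load_collect++;
            }
        }
        return 0;
    }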
src/programs/mdrun/md.c
index dc01b6de0cbd0a69b693000144bf3ada388142f2..3d98d597c7d0e9c009e02d816e631b3f36846a25 100644
@@ -1909,6 +1909,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     }
                     dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
 
+                    if (bPMETuneRunning &&
+                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                        !(cr->duty & DUTY_PME))
+                    {
+                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                         * With GPUs + separate PME ranks, we don't want DLB.
+                         * It could get turned on while we scan coarse grids,
+                         * and would then never be turned off again.
+                         * This would hurt performance at the final, optimal
+                         * grid spacing, where DLB almost never helps.
+                         * Also, DLB can limit the cut-off for PME tuning.
+                         */
+                        dd_dlb_set_lock(cr->dd, TRUE);
+                    }
+
                     if (bPMETuneRunning || step_rel > ir->nstlist*50)
                     {
                         bPMETuneTry     = FALSE;
@@ -1939,6 +1954,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                     {
                         calc_enervirdiff(NULL, ir->eDispCorr, fr);
                     }
+
+                    if (!bPMETuneRunning &&
+                        DOMAINDECOMP(cr) &&
+                        dd_dlb_is_locked(cr->dd))
+                    {
+                        /* Unlock DLB=auto: DLB is now allowed to activate
+                         * (but we don't expect it to activate in most cases).
+                         */
+                        dd_dlb_set_lock(cr->dd, FALSE);
+                    }
                 }
                 cycles_pmes = 0;
             }
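
As a usage note on the new dd_dlb_is_locked()/dd_dlb_set_lock() pair, a self-contained sketch of the lock semantics (the types, names, and driver below are hypothetical stand-ins, not the GROMACS API; the only behaviour taken from the diff is that the lock has an effect only while DLB is set to auto):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { DLB_AUTO, DLB_YES, DLB_NO } dlb_opt_t; /* stand-in for edlbAUTO etc. */

    typedef struct {
        dlb_opt_t dlb_opt;
        bool      dlb_locked;
    } dd_sketch_t;

    static void sketch_dlb_set_lock(dd_sketch_t *dd, bool value)
    {
        /* Mirrors the intent of dd_dlb_set_lock(): only lockable with DLB=auto. */
        if (dd->dlb_opt == DLB_AUTO)
        {
            dd->dlb_locked = value;
        }
    }

    int main(void)
    {
        dd_sketch_t dd = { DLB_AUTO, false };

        /* PME grid scan starts (GPUs + separate PME ranks): keep DLB from turning on. */
        sketch_dlb_set_lock(&dd, true);
        printf("locked during tuning: %d\n", dd.dlb_locked);

        /* Tuning finished: release the lock so DLB=auto may activate again. */
        sketch_dlb_set_lock(&dd, false);
        printf("locked after tuning:  %d\n", dd.dlb_locked);
        return 0;
    }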