Avoid DLB with overloaded PME ranks

[alexxy/gromacs.git] / src / gromacs / mdlib / domdec.c
diff --git a/src/gromacs/mdlib/domdec.c b/src/gromacs/mdlib/domdec.c

index ae2a4f8098b696a91f846ef2e483abcfbc01b82f..6cc6b73895d08a33eadcd83abcb492bd2dffcbef 100644 (file)
--- a/src/gromacs/mdlib/domdec.c
+++ b/src/gromacs/mdlib/domdec.c
@@ -269,6 +269,8 @@ typedef struct gmx_domdec_comm
  
      /* The DLB option */
      int      eDLB;
+    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
+    gmx_bool bDLB_locked;
      /* Are we actually using DLB? */
      gmx_bool bDynLoadBal;
  
@@ -385,9 +387,9 @@ typedef struct gmx_domdec_comm
      int    eFlop;
      double flop;
      int    flop_n;
-    /* Have often have did we have load measurements */
+    /* How many times have did we have load measurements */
      int    n_load_have;
-    /* Have often have we collected the load measurements */
+    /* How many times have we collected the load measurements */
      int    n_load_collect;
  
      /* Statistics */
@@ -1320,7 +1322,6 @@ static void dd_collect_cg(gmx_domdec_t *dd,
  {
      gmx_domdec_master_t *ma = NULL;
      int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
-    t_block             *cgs_gl;
  
      if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
      {
@@ -1330,12 +1331,18 @@ static void dd_collect_cg(gmx_domdec_t *dd,
  
      if (state_local->ddp_count == dd->ddp_count)
      {
+        /* The local state and DD are in sync, use the DD indices */
          ncg_home = dd->ncg_home;
          cg       = dd->index_gl;
          nat_home = dd->nat_home;
      }
      else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
      {
+        /* The DD is out of sync with the local state, but we have stored
+         * the cg indices with the local state, so we can use those.
+         */
+        t_block *cgs_gl;
+
          cgs_gl = &dd->comm->cgs_gl;
  
          ncg_home = state_local->ncg_gl;
@@ -1351,8 +1358,8 @@ static void dd_collect_cg(gmx_domdec_t *dd,
          gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
      }
  
-    buf2[0] = dd->ncg_home;
-    buf2[1] = dd->nat_home;
+    buf2[0] = ncg_home;
+    buf2[1] = nat_home;
      if (DDMASTER(dd))
      {
          ma   = dd->ma;
@@ -1394,7 +1401,7 @@ static void dd_collect_cg(gmx_domdec_t *dd,
  
      /* Collect the charge group indices on the master */
      dd_gatherv(dd,
-               dd->ncg_home*sizeof(int), dd->index_gl,
+               ncg_home*sizeof(int), cg,
                 DDMASTER(dd) ? ma->ibuf : NULL,
                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
                 DDMASTER(dd) ? ma->cg : NULL);
@@ -3457,7 +3464,7 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
              cell_size[i] = 1.0/ncd;
          }
      }
-    else if (dd_load_count(comm))
+    else if (dd_load_count(comm) > 0)
      {
          load_aver  = comm->load[d].sum_m/ncd;
          change_max = 0;
@@ -4328,7 +4335,7 @@ static void clear_and_mark_ind(int ncg, int *move,
  static void print_cg_move(FILE *fplog,
                            gmx_domdec_t *dd,
                            gmx_int64_t step, int cg, int dim, int dir,
-                          gmx_bool bHaveLimitdAndCMOld, real limitd,
+                          gmx_bool bHaveCgcmOld, real limitd,
                            rvec cm_old, rvec cm_new, real pos_d)
  {
      gmx_domdec_comm_t *comm;
@@ -4337,19 +4344,22 @@ static void print_cg_move(FILE *fplog,
      comm = dd->comm;
  
      fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
-    if (bHaveLimitdAndCMOld)
+    if (limitd > 0)
      {
-        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+        fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+                dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
                  ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
      }
      else
      {
-        fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
+        /* We don't have a limiting distance available: don't print it */
+        fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+                dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
                  ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
      }
      fprintf(fplog, "distance out of cell %f\n",
              dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
-    if (bHaveLimitdAndCMOld)
+    if (bHaveCgcmOld)
      {
          fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
                  cm_old[XX], cm_old[YY], cm_old[ZZ]);
@@ -4367,19 +4377,20 @@ static void print_cg_move(FILE *fplog,
  static void cg_move_error(FILE *fplog,
                            gmx_domdec_t *dd,
                            gmx_int64_t step, int cg, int dim, int dir,
-                          gmx_bool bHaveLimitdAndCMOld, real limitd,
+                          gmx_bool bHaveCgcmOld, real limitd,
                            rvec cm_old, rvec cm_new, real pos_d)
  {
      if (fplog)
      {
          print_cg_move(fplog, dd, step, cg, dim, dir,
-                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+                      bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
      }
      print_cg_move(stderr, dd, step, cg, dim, dir,
-                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
+                  bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
      gmx_fatal(FARGS,
-              "A charge group moved too far between two domain decomposition steps\n"
-              "This usually means that your system is not well equilibrated");
+              "%s moved too far between two domain decomposition steps\n"
+              "This usually means that your system is not well equilibrated",
+              dd->comm->bCGs ? "A charge group" : "An atom");
  }
  
  static void rotate_state_atom(t_state *state, int a)
@@ -4501,7 +4512,8 @@ static void calc_cg_move(FILE *fplog, gmx_int64_t step,
                  {
                      if (pos_d >= limit1[d])
                      {
-                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
+                        cg_move_error(fplog, dd, step, cg, d, 1,
+                                      cg_cm != state->x, limitd[d],
                                        cg_cm[cg], cm_new, pos_d);
                      }
                      dev[d] = 1;
@@ -4527,7 +4539,8 @@ static void calc_cg_move(FILE *fplog, gmx_int64_t step,
                  {
                      if (pos_d < limit0[d])
                      {
-                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
+                        cg_move_error(fplog, dd, step, cg, d, -1,
+                                      cg_cm != state->x, limitd[d],
                                        cg_cm[cg], cm_new, pos_d);
                      }
                      dev[d] = -1;
@@ -4945,7 +4958,7 @@ static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
                  {
                      cg_move_error(fplog, dd, step, cg, dim,
                                    (flag & DD_FLAG_FW(d)) ? 1 : 0,
-                                  FALSE, 0,
+                                  fr->cutoff_scheme == ecutsGROUP, 0,
                                    comm->vbuf.v[buf_pos],
                                    comm->vbuf.v[buf_pos],
                                    comm->vbuf.v[buf_pos][dim]);
@@ -6674,7 +6687,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
      /* Initialize to GPU share count to 0, might change later */
      comm->nrank_gpu_shared = 0;
  
-    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+    comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+    comm->bDLB_locked = FALSE;
  
      comm->bDynLoadBal = (comm->eDLB == edlbYES);
      if (fplog)
@@ -6740,6 +6754,13 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
      comm->cellsize_limit = 0;
      comm->bBondComm      = FALSE;
  
+    /* Atoms should be able to move by up to half the list buffer size (if > 0)
+     * within nstlist steps. Since boundaries are allowed to displace by half
+     * a cell size, DD cells should be at least the size of the list buffer.
+     */
+    comm->cellsize_limit = max(comm->cellsize_limit,
+                               ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
+
      if (comm->bInterCGBondeds)
      {
          if (comm_distance_min > 0)
@@ -7567,6 +7588,20 @@ void change_dd_dlb_cutoff_limit(t_commrec *cr)
      comm->PMELoadBal_max_cutoff = comm->cutoff;
  }
  
+gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
+{
+    return dd->comm->bDLB_locked;
+}
+
+void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
+{
+    /* We can only lock the DLB when it is set to auto, otherwise don't lock */
+    if (dd->comm->eDLB == edlbAUTO)
+    {
+        dd->comm->bDLB_locked = bValue;
+    }
+}
+
  static void merge_cg_buffers(int ncell,
                               gmx_domdec_comm_dim_t *cd, int pulse,
                               int  *ncg_cell,
@@ -9331,17 +9366,18 @@ void dd_partition_system(FILE                *fplog,
      }
  
      /* Check if we have recorded loads on the nodes */
-    if (comm->bRecordLoad && dd_load_count(comm))
+    if (comm->bRecordLoad && dd_load_count(comm) > 0)
      {
-        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
+        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
          {
              /* Check if we should use DLB at the second partitioning
               * and every 100 partitionings,
               * so the extra communication cost is negligible.
               */
-            n         = max(100, nstglobalcomm);
+            const int nddp_chk_dlb = 100;
+
              bCheckDLB = (comm->n_load_collect == 0 ||
-                         comm->n_load_have % n == n-1);
+                         comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
          }
          else
          {
@@ -9379,8 +9415,26 @@ void dd_partition_system(FILE                *fplog,
                  /* Since the timings are node dependent, the master decides */
                  if (DDMASTER(dd))
                  {
-                    bTurnOnDLB =
-                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                    /* Here we check if the max PME rank load is more than 0.98
+                     * the max PP force load. If so, PP DLB will not help,
+                     * since we are (almost) limited by PME. Furthermore,
+                     * DLB will cause a significant extra x/f redistribution
+                     * cost on the PME ranks, which will then surely result
+                     * in lower total performance.
+                     * This check might be fragile, since one measurement
+                     * below 0.98 (although only done once every 100 DD part.)
+                     * could turn on DLB for the rest of the run.
+                     */
+                    if (cr->npmenodes > 0 &&
+                        dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
+                    {
+                        bTurnOnDLB = FALSE;
+                    }
+                    else
+                    {
+                        bTurnOnDLB =
+                            (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
+                    }
                      if (debug)
                      {
                          fprintf(debug, "step %s, imb loss %f\n",
@@ -9750,7 +9804,8 @@ void dd_partition_system(FILE                *fplog,
      if (vsite != NULL)
      {
          /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
-        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
+        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
+                                  mdatoms, FALSE, vsite);
      }
  
      if (shellfc)