Avoid DLB with overloaded PME ranks

[alexxy/gromacs.git] / src / programs / mdrun / md.c
diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c

index 2b320246b588bfd34c87f02885847096fa53499f..3d98d597c7d0e9c009e02d816e631b3f36846a25 100644 (file)
--- a/src/programs/mdrun/md.c
+++ b/src/programs/mdrun/md.c
@@ -298,7 +298,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
              &(state_global->fep_state), lam0,
              nrnb, top_global, &upd,
              nfile, fnm, &outf, &mdebin,
-            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags);
+            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, Flags, wcycle);
  
      clear_mat(total_vir);
      clear_mat(pres);
@@ -1327,7 +1327,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
          do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
                                   ir, state, state_global, top_global, fr,
                                   outf, mdebin, ekind, f, f_global,
-                                 wcycle, &nchkpt,
+                                 &nchkpt,
                                   bCPT, bRerunMD, bLastStep, (Flags & MD_CONFOUT),
                                   bSumEkinhOld);
          /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
@@ -1909,6 +1909,21 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                      }
                      dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
  
+                    if (bPMETuneRunning &&
+                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
+                        !(cr->duty & DUTY_PME))
+                    {
+                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
+                         * With GPUs + separate PME ranks, we don't want DLB.
+                         * This could happen when we scan coarse grids and
+                         * it would then never be turned off again.
+                         * This would hurt performance at the final, optimal
+                         * grid spacing, where DLB almost never helps.
+                         * Also, DLB can limit the cut-off for PME tuning.
+                         */
+                        dd_dlb_set_lock(cr->dd, TRUE);
+                    }
+
                      if (bPMETuneRunning || step_rel > ir->nstlist*50)
                      {
                          bPMETuneTry     = FALSE;
@@ -1939,6 +1954,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                      {
                          calc_enervirdiff(NULL, ir->eDispCorr, fr);
                      }
+
+                    if (!bPMETuneRunning &&
+                        DOMAINDECOMP(cr) &&
+                        dd_dlb_is_locked(cr->dd))
+                    {
+                        /* Unlock the DLB=auto, DLB is allowed to activate
+                         * (but we don't expect it to activate in most cases).
+                         */
+                        dd_dlb_set_lock(cr->dd, FALSE);
+                    }
                  }
                  cycles_pmes = 0;
              }
@@ -1969,6 +1994,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
      /* End of main MD loop */
      debug_gmx();
  
+    /* Closing TNG files can include compressing data. Therefore it is good to do that
+     * before stopping the time measurements. */
+    mdoutf_tng_close(outf);
+
      /* Stop measuring walltime */
      walltime_accounting_end(walltime_accounting);