Fix handling of counter resets
authorMark Abraham <mark.j.abraham@gmail.com>
Fri, 27 Apr 2018 16:35:34 +0000 (18:35 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 22 Aug 2018 13:59:10 +0000 (15:59 +0200)
There is no reason for or need to change max_hours, ir->nsteps or
step_rel when doing a counter reset.

This makes clear that the behaviour for the combination

mdrun -maxh t -resetstep n

matches the documentation of -maxh.

Updated the API for walltime_acccounting and its usage, because
elapsed time is an insufficiently clear context. Changed the names of
the start and stop functions so that no callers can silently rely on
semantics that have changed.

Avoided variables such as elapsed_time and max_hours, which were
insufficiently precisely worded.

Refs #1781

Change-Id: I16c96985f43a7b4ac75b94f378da3d05914d6986

src/gromacs/ewald/pme-only.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/minimize.cpp
src/gromacs/mdrun/tpi.cpp
src/gromacs/timing/walltime_accounting.cpp
src/gromacs/timing/walltime_accounting.h

index bec9a2e44eac8a4c74e8e1091dcc8254f74132f3..3aca73b868819abdfdc1ba0b319eaac1d76c0889 100644 (file)
@@ -154,24 +154,18 @@ static std::unique_ptr<gmx_pme_pp> gmx_pme_pp_init(const t_commrec *cr)
     return pme_pp;
 }
 
-static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle_t           wcycle,
                                    gmx_walltime_accounting_t walltime_accounting,
-                                   t_nrnb *nrnb, t_inputrec *ir,
-                                   int64_t step,
-                                   bool useGpuForPme)
+                                   t_nrnb                   *nrnb,
+                                   int64_t                   step,
+                                   bool                      useGpuForPme)
 {
     /* Reset all the counters related to performance over the run */
     wallcycle_stop(wcycle, ewcRUN);
     wallcycle_reset_all(wcycle);
     init_nrnb(nrnb);
-    if (ir->nsteps >= 0)
-    {
-        /* ir->nsteps is not used here, but we update it for consistency */
-        ir->nsteps -= step - ir->init_step;
-    }
-    ir->init_step = step;
     wallcycle_start(wcycle, ewcRUN);
-    walltime_accounting_start(walltime_accounting);
+    walltime_accounting_reset_time(walltime_accounting, step);
 
     if (useGpuForPme)
     {
@@ -601,7 +595,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             if (ret == pmerecvqxRESETCOUNTERS)
             {
                 /* Reset the cycle and flop counters */
-                reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step, useGpuForPme);
+                reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, step, useGpuForPme);
             }
         }
         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
@@ -615,7 +609,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
         if (count == 0)
         {
             wallcycle_start(wcycle, ewcRUN);
-            walltime_accounting_start(walltime_accounting);
+            walltime_accounting_start_time(walltime_accounting);
         }
 
         wallcycle_start(wcycle, ewcPMEMESH);
@@ -668,7 +662,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
     } /***** end of quasi-loop, we stop with the break above */
     while (TRUE);
 
-    walltime_accounting_end(walltime_accounting);
+    walltime_accounting_end_time(walltime_accounting);
 
     return 0;
 }
index fec5890fe5f110bdf753488e70e072c8e4d27a99..82b986ecf45e7b6bf6c0cba6c3ead412acd15c58 100644 (file)
@@ -2752,13 +2752,11 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
         nrnb_tot = nrnb;
     }
 
-    elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
-    elapsed_time_over_all_ranks                  = elapsed_time;
-    elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
-    elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
-#if GMX_MPI
+    elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
+    elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
     if (cr->nnodes > 1)
     {
+#if GMX_MPI
         /* reduce elapsed_time over all MPI ranks in the current simulation */
         MPI_Allreduce(&elapsed_time,
                       &elapsed_time_over_all_ranks,
@@ -2771,8 +2769,13 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
                       &elapsed_time_over_all_threads_over_all_ranks,
                       1, MPI_DOUBLE, MPI_SUM,
                       cr->mpi_comm_mysim);
-    }
 #endif
+    }
+    else
+    {
+        elapsed_time_over_all_ranks                  = elapsed_time;
+        elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
+    }
 
     if (printReport)
     {
@@ -2820,14 +2823,14 @@ void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, const t_commrec *cr,
         {
             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
                        elapsed_time_over_all_ranks,
-                       walltime_accounting_get_nsteps_done(walltime_accounting),
+                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
                        delta_t, nbfs, mflop);
         }
         if (bWriteStat)
         {
             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
                        elapsed_time_over_all_ranks,
-                       walltime_accounting_get_nsteps_done(walltime_accounting),
+                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
                        delta_t, nbfs, mflop);
         }
     }
index 3702b59561330a150192813795b5c9911f2e9237..14f6d5b8f63298ddfcb39e9286dff838f3d3cb8a 100644 (file)
@@ -142,7 +142,6 @@ using gmx::SimulationSignaller;
 //! Resets all the counters.
 static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
                                int64_t step,
-                               int64_t *step_rel, t_inputrec *ir,
                                gmx_wallcycle_t wcycle, t_nrnb *nrnb,
                                gmx_walltime_accounting_t walltime_accounting,
                                struct nonbonded_verlet_t *nbv,
@@ -177,11 +176,8 @@ static void reset_all_counters(FILE *fplog, const gmx::MDLogger &mdlog, t_commre
         reset_dd_statistics_counters(cr->dd);
     }
     init_nrnb(nrnb);
-    ir->init_step += *step_rel;
-    ir->nsteps    -= *step_rel;
-    *step_rel      = 0;
     wallcycle_start(wcycle, ewcRUN);
-    walltime_accounting_start(walltime_accounting);
+    walltime_accounting_reset_time(walltime_accounting, step);
     print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime());
 }
 
@@ -268,7 +264,6 @@ void gmx::Integrator::do_md()
     t_inputrec       *ir   = inputrec;
     gmx_mdoutf       *outf = nullptr;
     int64_t           step, step_rel;
-    double            elapsed_time;
     double            t, t0, lam0[efptNR];
     gmx_bool          bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
     gmx_bool          bNS, bNStList, bSimAnn, bStopCM,
@@ -734,7 +729,7 @@ void gmx::Integrator::do_md()
         fprintf(fplog, "\n");
     }
 
-    walltime_accounting_start(walltime_accounting);
+    walltime_accounting_start_time(walltime_accounting);
     wallcycle_start(wcycle, ewcRUN);
     print_start(fplog, cr, walltime_accounting, "mdrun");
 
@@ -802,10 +797,6 @@ void gmx::Integrator::do_md()
         }
     }
 
-    /* Loop over MD steps or if rerunMD to end of input trajectory,
-     * or, if max_hours>0, until max_hours is reached.
-     */
-    real max_hours   = mdrunOptions.maximumHoursToRun;
     bFirstStep       = TRUE;
     /* Skip the first Nose-Hoover integration when we get the state from tpx */
     bInitStep        = !startingFromCheckpoint || EI_VV(ir->eI);
@@ -1338,7 +1329,7 @@ void gmx::Integrator::do_md()
             copy_mat(state->fvir_prev, force_vir);
         }
 
-        elapsed_time = walltime_accounting_get_current_elapsed_time(walltime_accounting);
+        double secondsSinceStart = walltime_accounting_get_time_since_start(walltime_accounting);
 
         /* Check whether everything is still allright */
         if ((static_cast<int>(gmx_get_stop_condition()) > handled_stop_condition)
@@ -1385,20 +1376,23 @@ void gmx::Integrator::do_md()
             handled_stop_condition = static_cast<int>(gmx_get_stop_condition());
         }
         else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
-                 (max_hours > 0 && elapsed_time > max_hours*60.0*60.0*0.99) &&
+                 (mdrunOptions.maximumHoursToRun > 0 &&
+                  secondsSinceStart > mdrunOptions.maximumHoursToRun*60.0*60.0*0.99) &&
                  signals[eglsSTOPCOND].sig == 0 && signals[eglsSTOPCOND].set == 0)
         {
             /* Signal to terminate the run */
             signals[eglsSTOPCOND].sig = 1;
             if (fplog)
             {
-                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",
+                        gmx_step_str(step, sbuf), mdrunOptions.maximumHoursToRun*0.99);
             }
-            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",
+                    gmx_step_str(step, sbuf), mdrunOptions.maximumHoursToRun*0.99);
         }
 
         if (bResetCountersHalfMaxH && MASTER(cr) &&
-            elapsed_time > max_hours*60.0*60.0*0.495)
+            secondsSinceStart > mdrunOptions.maximumHoursToRun*60.0*60.0*0.495)
         {
             /* Set flag that will communicate the signal to all ranks in the simulation */
             signals[eglsRESETCOUNTERS].sig = 1;
@@ -1412,7 +1406,7 @@ void gmx::Integrator::do_md()
         if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
                            cpt_period >= 0 &&
                            (cpt_period == 0 ||
-                            elapsed_time >= nchkpt*cpt_period*60.0)) &&
+                            secondsSinceStart >= nchkpt*cpt_period*60.0)) &&
             signals[eglsCHKPT].set == 0)
         {
             signals[eglsCHKPT].sig = 1;
@@ -1850,7 +1844,7 @@ void gmx::Integrator::do_md()
                           "resetting counters later in the run, e.g. with gmx "
                           "mdrun -resetstep.", step);
             }
-            reset_all_counters(fplog, mdlog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
+            reset_all_counters(fplog, mdlog, cr, step, wcycle, nrnb, walltime_accounting,
                                use_GPU(fr->nbv) ? fr->nbv : nullptr, fr->pmedata);
             wcycle_set_reset_counters(wcycle, -1);
             if (!thisRankHasDuty(cr, DUTY_PME))
@@ -1858,10 +1852,8 @@ void gmx::Integrator::do_md()
                 /* Tell our PME node to reset its counters */
                 gmx_pme_send_resetcounters(cr, step);
             }
-            /* Correct max_hours for the elapsed time */
-            max_hours                -= elapsed_time/(60.0*60.0);
             /* If mdrun -maxh -resethway was active, it can only trigger once */
-            bResetCountersHalfMaxH    = FALSE; /* TODO move this to where signals[eglsRESETCOUNTERS].sig is set */
+            bResetCountersHalfMaxH = FALSE; /* TODO move this to where signals[eglsRESETCOUNTERS].sig is set */
             /* Reset can only happen once, so clear the triggering flag. */
             signals[eglsRESETCOUNTERS].set = 0;
         }
@@ -1877,7 +1869,7 @@ void gmx::Integrator::do_md()
     mdoutf_tng_close(outf);
 
     /* Stop measuring walltime */
-    walltime_accounting_end(walltime_accounting);
+    walltime_accounting_end_time(walltime_accounting);
 
     if (bRerunMD && MASTER(cr))
     {
@@ -1922,7 +1914,7 @@ void gmx::Integrator::do_md()
     /* IMD cleanup, if bIMD is TRUE. */
     IMD_finalize(ir->bIMD, ir->imd);
 
-    walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
+    walltime_accounting_set_nsteps_done(walltime_accounting, step);
     if (step_rel >= wcycle_get_reset_counters(wcycle) &&
         signals[eglsRESETCOUNTERS].set == 0 &&
         !bResetCountersHalfMaxH)
index f80e2934bec80f5ba61c9f30b774ddbdfa409379..17f77cf0f078ca154934174229c275a99477877d 100644 (file)
@@ -123,7 +123,7 @@ static void print_em_start(FILE                     *fplog,
                            gmx_wallcycle_t           wcycle,
                            const char               *name)
 {
-    walltime_accounting_start(walltime_accounting);
+    walltime_accounting_start_time(walltime_accounting);
     wallcycle_start(wcycle, ewcRUN);
     print_start(fplog, cr, walltime_accounting, name);
 }
@@ -134,7 +134,7 @@ static void em_time_end(gmx_walltime_accounting_t walltime_accounting,
 {
     wallcycle_stop(wcycle, ewcRUN);
 
-    walltime_accounting_end(walltime_accounting);
+    walltime_accounting_end_time(walltime_accounting);
 }
 
 //! Printing a log file and console header
index 1bdbdfa98e852dadf9583aca4ea68739649b73be..66d8f30ba655378bcddccb3e61da2d96816a5ec3 100644 (file)
@@ -264,7 +264,7 @@ Integrator::do_tpi()
     f.resize(gmx::paddedRVecVectorSize(top_global->natoms));
 
     /* Print to log file  */
-    walltime_accounting_start(walltime_accounting);
+    walltime_accounting_start_time(walltime_accounting);
     wallcycle_start(wcycle, ewcRUN);
     print_start(fplog, cr, walltime_accounting, "Test Particle Insertion");
 
@@ -804,7 +804,7 @@ Integrator::do_tpi()
 
         bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
     }   /* End of the loop  */
-    walltime_accounting_end(walltime_accounting);
+    walltime_accounting_end_time(walltime_accounting);
 
     close_trx(status);
 
index b3b071ba55eb959c3d559f6b3701a564cd26b4af..0ad9fbcd73bbbd864363ca71f28ea223bb92a784 100644 (file)
 typedef struct gmx_walltime_accounting {
     //! Seconds since the epoch recorded at the start of the simulation
     double          start_time_stamp;
-    //! Seconds since the epoch recorded at the start of the simulation for this thread
-    double          start_time_stamp_per_thread;
-    //! Total seconds elapsed over the simulation
+    /*! \brief Seconds since the epoch recorded at the reset of
+     * counters for the simulation (or the start, if no reset has
+     * occured). */
+    double          reset_time_stamp;
+    /*! \brief Seconds since the epoch recorded at the reset of
+     * counters for the simulation for this thread (or the start, if
+     * no reset has occured). */
+    double          reset_time_stamp_per_thread;
+    //! Total seconds elapsed over the simulation since counter reset
     double          elapsed_time;
-    //! Total seconds elapsed over the simulation running this thread
+    //! Total seconds elapsed over the simulation since counter reset running this thread
     double          elapsed_time_over_all_threads;
     /*! \brief Number of OpenMP threads that will be launched by this
      * MPI rank.
@@ -84,6 +90,8 @@ typedef struct gmx_walltime_accounting {
      * elapsed_time_over_all_threads over all threads was constant
      * with respect to parallelism implementation. */
     int             numOpenMPThreads;
+    //! Numbers of steps done before reset of counters
+    int64_t         nsteps_done_at_reset;
     //! Set by integrators to report the amount of work they did
     int64_t         nsteps_done;
     //! Whether the simulation has finished in a way valid for walltime reporting.
@@ -114,8 +122,10 @@ walltime_accounting_init(int numOpenMPThreads)
 
     snew(walltime_accounting, 1);
     walltime_accounting->start_time_stamp            = 0;
-    walltime_accounting->start_time_stamp_per_thread = 0;
+    walltime_accounting->reset_time_stamp            = 0;
+    walltime_accounting->reset_time_stamp_per_thread = 0;
     walltime_accounting->elapsed_time                = 0;
+    walltime_accounting->nsteps_done_at_reset        = 0;
     walltime_accounting->nsteps_done                 = 0;
     walltime_accounting->numOpenMPThreads            = numOpenMPThreads;
     walltime_accounting->isValidFinish               = false;
@@ -130,24 +140,33 @@ walltime_accounting_destroy(gmx_walltime_accounting_t walltime_accounting)
 }
 
 void
-walltime_accounting_start(gmx_walltime_accounting_t walltime_accounting)
+walltime_accounting_reset_time(gmx_walltime_accounting_t walltime_accounting,
+                               int64_t                   step)
 {
-    walltime_accounting->start_time_stamp            = gmx_gettime();
-    walltime_accounting->start_time_stamp_per_thread = gmx_gettime_per_thread();
+    walltime_accounting->reset_time_stamp            = gmx_gettime();
+    walltime_accounting->reset_time_stamp_per_thread = gmx_gettime_per_thread();
     walltime_accounting->elapsed_time                = 0;
     walltime_accounting->nsteps_done                 = 0;
+    walltime_accounting->nsteps_done_at_reset        = step;
 }
 
 void
-walltime_accounting_end(gmx_walltime_accounting_t walltime_accounting)
+walltime_accounting_start_time(gmx_walltime_accounting_t walltime_accounting)
+{
+    walltime_accounting_reset_time(walltime_accounting, 0);
+    walltime_accounting->start_time_stamp = walltime_accounting->reset_time_stamp;
+}
+
+void
+walltime_accounting_end_time(gmx_walltime_accounting_t walltime_accounting)
 {
     double now, now_per_thread;
 
     now            = gmx_gettime();
     now_per_thread = gmx_gettime_per_thread();
 
-    walltime_accounting->elapsed_time                  = now - walltime_accounting->start_time_stamp;
-    walltime_accounting->elapsed_time_over_all_threads = now_per_thread - walltime_accounting->start_time_stamp_per_thread;
+    walltime_accounting->elapsed_time                  = now - walltime_accounting->reset_time_stamp;
+    walltime_accounting->elapsed_time_over_all_threads = now_per_thread - walltime_accounting->reset_time_stamp_per_thread;
     /* For thread-MPI, the per-thread CPU timer makes this just
      * work. For OpenMP threads, the per-thread CPU timer measurement
      * needs to be multiplied by the number of OpenMP threads used,
@@ -159,19 +178,19 @@ walltime_accounting_end(gmx_walltime_accounting_t walltime_accounting)
 }
 
 double
-walltime_accounting_get_current_elapsed_time(gmx_walltime_accounting_t walltime_accounting)
+walltime_accounting_get_time_since_start(gmx_walltime_accounting_t walltime_accounting)
 {
     return gmx_gettime() - walltime_accounting->start_time_stamp;
 }
 
 double
-walltime_accounting_get_elapsed_time(gmx_walltime_accounting_t walltime_accounting)
+walltime_accounting_get_time_since_reset(gmx_walltime_accounting_t walltime_accounting)
 {
-    return walltime_accounting->elapsed_time;
+    return gmx_gettime() - walltime_accounting->reset_time_stamp;
 }
 
 double
-walltime_accounting_get_elapsed_time_over_all_threads(gmx_walltime_accounting_t walltime_accounting)
+walltime_accounting_get_time_since_reset_over_all_threads(gmx_walltime_accounting_t walltime_accounting)
 {
     return walltime_accounting->elapsed_time_over_all_threads;
 }
@@ -183,9 +202,9 @@ walltime_accounting_get_start_time_stamp(gmx_walltime_accounting_t walltime_acco
 }
 
 int64_t
-walltime_accounting_get_nsteps_done(gmx_walltime_accounting_t walltime_accounting)
+walltime_accounting_get_nsteps_done_since_reset(gmx_walltime_accounting_t walltime_accounting)
 {
-    return walltime_accounting->nsteps_done;
+    return walltime_accounting->nsteps_done - walltime_accounting->nsteps_done_at_reset;
 }
 
 void
index a7b72f86f1eee88ddbf487484201fe9f14ae8b3a..351840370e50bcfc59c42e8a60a0a0b32ec6bf3e 100644 (file)
@@ -62,30 +62,37 @@ void
 walltime_accounting_destroy(gmx_walltime_accounting_t walltime_accounting);
 
 /*! \brief
- * Record initial time stamps, e.g. at run end or counter re-initalization time
+ * Record initial time stamps, e.g. at run start
  */
 void
-walltime_accounting_start(gmx_walltime_accounting_t walltime_accounting);
+walltime_accounting_start_time(gmx_walltime_accounting_t walltime_accounting);
+
+/*! \brief
+ * Reset time stamps, e.g. at counter re-initalization time
+ */
+void
+walltime_accounting_reset_time(gmx_walltime_accounting_t walltime_accounting,
+                               int64_t                   step);
 
 /*! \brief
  * Measure and cache the elapsed wall-clock time since
- * walltime_accounting_start() */
+ * walltime_accounting_reset_time() */
 void
-walltime_accounting_end(gmx_walltime_accounting_t walltime_accounting);
+walltime_accounting_end_time(gmx_walltime_accounting_t walltime_accounting);
 
 /*! \brief
  * Measure and return the elapsed wall-clock time since
- * walltime_accounting_start() */
+ * walltime_accounting_reset_time() */
 double
-walltime_accounting_get_current_elapsed_time(gmx_walltime_accounting_t walltime_accounting);
+walltime_accounting_get_time_since_reset(gmx_walltime_accounting_t walltime_accounting);
 
-//! Get the cached wall-clock time for this node
+//! Get the wall-clock time since the actual start of the run (regardless of any resets).
 double
-walltime_accounting_get_elapsed_time(gmx_walltime_accounting_t walltime_accounting);
+walltime_accounting_get_time_since_start(gmx_walltime_accounting_t walltime_accounting);
 
 //! Get the cached wall-clock time, multiplied by the number of OpenMP threads
 double
-walltime_accounting_get_elapsed_time_over_all_threads(gmx_walltime_accounting_t walltime_accounting);
+walltime_accounting_get_time_since_reset_over_all_threads(gmx_walltime_accounting_t walltime_accounting);
 
 //! Get the cached initial time stamp for this node
 double
@@ -93,7 +100,7 @@ walltime_accounting_get_start_time_stamp(gmx_walltime_accounting_t walltime_acco
 
 //! Get the number of integration steps done
 int64_t
-walltime_accounting_get_nsteps_done(gmx_walltime_accounting_t walltime_accounting);
+walltime_accounting_get_nsteps_done_since_reset(gmx_walltime_accounting_t walltime_accounting);
 
 /*! \brief Set the number of integration steps done
  *