a471a6c828721422171c68b0347fd28556d380b3
[alexxy/gromacs.git] / src / gromacs / mdlib / resethandler.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  * \brief
37  * Defines the reset handler class.
38  *
39  * \author Pascal Merz <pascal.merz@colorado.edu>
40  * \ingroup module_mdlib
41  */
42
43 #include "gmxpre.h"
44
45 #include "resethandler.h"
46
47 #include "gromacs/domdec/domdec.h"
48 #include "gromacs/ewald/pme.h"
49 #include "gromacs/ewald/pme_load_balancing.h"
50 #include "gromacs/ewald/pme_pp.h"
51 #include "gromacs/gmxlib/nrnb.h"
52 #include "gromacs/gpu_utils/gpu_utils.h"
53 #include "gromacs/mdrunutility/printtime.h"
54 #include "gromacs/mdtypes/commrec.h"
55 #include "gromacs/nbnxm/gpu_data_mgmt.h"
56 #include "gromacs/nbnxm/nbnxm.h"
57 #include "gromacs/timing/walltime_accounting.h"
58 #include "gromacs/utility/cstringutil.h"
59 #include "gromacs/utility/fatalerror.h"
60
61 namespace gmx
62 {
63
64 /*! \brief Convert signed char (as used by SimulationSignal) to ResetSignal enum
65  *
66  * Expected values are
67  *   \p sig == 0 -- no signal
68  *   \p sig >= 1 -- signal received
69  */
70 static inline ResetSignal convertToResetSignal(signed char sig)
71 {
72     GMX_ASSERT(sig >= 0, "Unexpected reset signal < 0 received");
73     return sig >= 1 ? ResetSignal::doResetCounters : ResetSignal::noSignal;
74 }
75
76 ResetHandler::ResetHandler(compat::not_null<SimulationSignal*> signal,
77                            bool                                simulationsShareState,
78                            int64_t                             nsteps,
79                            bool                                isMaster,
80                            bool                                resetHalfway,
81                            real                                maximumHoursToRun,
82                            const MDLogger&                     mdlog,
83                            gmx_wallcycle*                      wcycle,
84                            gmx_walltime_accounting_t           walltime_accounting) :
85     signal_(*signal),
86     rankCanSetSignal_(false),
87     simulationNeedsReset_(false),
88     maximumHoursToRun_(maximumHoursToRun)
89 {
90     if (simulationsShareState)
91     {
92         signal_.isLocal = false;
93     }
94     if (resetHalfway)
95     {
96         GMX_LOG(mdlog.info)
97                 .asParagraph()
98                 .appendText(
99                         "The -resethway functionality is deprecated, and may be removed in a "
100                         "future version.");
101         if (nsteps > 0)
102         {
103             /* Signal to reset the counters half the simulation steps. */
104             wcycle_set_reset_counters(wcycle, nsteps / 2);
105         }
106         simulationNeedsReset_ = true;
107
108         if (isMaster && (maximumHoursToRun > 0))
109         {
110             rankCanSetSignal_ = true;
111         }
112     }
113     else if (wcycle_get_reset_counters(wcycle) > 0)
114     {
115         simulationNeedsReset_ = true;
116     }
117     else
118     {
119         // if no reset is happening, this will always be valid
120         walltime_accounting_set_valid_finish(walltime_accounting);
121     }
122 }
123
124 bool ResetHandler::setSignalImpl(gmx_walltime_accounting_t walltime_accounting)
125 {
126     const double secondsSinceStart = walltime_accounting_get_time_since_start(walltime_accounting);
127     if (secondsSinceStart > maximumHoursToRun_ * 60.0 * 60.0 * 0.495)
128     {
129         /* Set flag that will communicate the signal to all ranks in the simulation */
130         signal_.sig = static_cast<signed char>(ResetSignal::doResetCounters);
131         /* Let handler know that we did signal a reset */
132         return true;
133     }
134     /* Let handler know that we did not signal a reset */
135     return false;
136 }
137
138 bool ResetHandler::resetCountersImpl(int64_t                     step,
139                                      int64_t                     step_rel,
140                                      const MDLogger&             mdlog,
141                                      FILE*                       fplog,
142                                      const t_commrec*            cr,
143                                      nonbonded_verlet_t*         nbv,
144                                      t_nrnb*                     nrnb,
145                                      const gmx_pme_t*            pme,
146                                      const pme_load_balancing_t* pme_loadbal,
147                                      gmx_wallcycle*              wcycle,
148                                      gmx_walltime_accounting_t   walltime_accounting)
149 {
150     /* Reset either if signal has been passed, or if reset step has been reached */
151     if (convertToResetSignal(signal_.set) == ResetSignal::doResetCounters
152         || step_rel == wcycle_get_reset_counters(wcycle))
153     {
154         if (pme_loadbal_is_active(pme_loadbal))
155         {
156             /* Do not permit counter reset while PME load
157              * balancing is active. The only purpose for resetting
158              * counters is to measure reliable performance data,
159              * and that can't be done before balancing
160              * completes.
161              *
162              * TODO consider fixing this by delaying the reset
163              * until after load balancing completes,
164              * e.g. https://gerrit.gromacs.org/#/c/4964/2 */
165             gmx_fatal(FARGS,
166                       "PME tuning was still active when attempting to "
167                       "reset mdrun counters at step %" PRId64
168                       ". Try "
169                       "resetting counters later in the run, e.g. with gmx "
170                       "mdrun -resetstep.",
171                       step);
172         }
173
174         char sbuf[STEPSTRSIZE];
175
176         /* Reset all the counters related to performance over the run */
177         GMX_LOG(mdlog.warning)
178                 .asParagraph()
179                 .appendTextFormatted("step %s: resetting all time and cycle counters",
180                                      gmx_step_str(step, sbuf));
181
182         if (nbv && nbv->useGpu())
183         {
184             Nbnxm::gpu_reset_timings(nbv);
185         }
186
187         if (pme_gpu_task_enabled(pme))
188         {
189             pme_gpu_reset_timings(pme);
190         }
191
192         if ((nbv && nbv->useGpu()) || pme_gpu_task_enabled(pme))
193         {
194             resetGpuProfiler();
195         }
196
197         wallcycle_stop(wcycle, WallCycleCounter::Run);
198         wallcycle_reset_all(wcycle);
199         if (DOMAINDECOMP(cr))
200         {
201             reset_dd_statistics_counters(cr->dd);
202         }
203         clear_nrnb(nrnb);
204         wallcycle_start(wcycle, WallCycleCounter::Run);
205         walltime_accounting_reset_time(walltime_accounting, step);
206         print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime());
207
208         wcycle_set_reset_counters(wcycle, -1);
209         if (!thisRankHasDuty(cr, DUTY_PME))
210         {
211             /* Tell our PME node to reset its counters */
212             gmx_pme_send_resetcounters(cr, step);
213         }
214         /* Reset can only happen once, so clear the triggering flag. */
215         signal_.set = static_cast<signed char>(ResetSignal::noSignal);
216         /* We have done a reset, so the finish will be valid. */
217         walltime_accounting_set_valid_finish(walltime_accounting);
218         /* Let handler know that we handled a reset */
219         return true;
220     }
221
222     /* Let handler know that we did not handle a reset */
223     return false;
224 }
225
226 } // namespace gmx