Fix wallcycle counting
src/gromacs/timing/wallcycle.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2008, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "wallcycle.h"
41
42 #include "config.h"
43
44 #include <cstdlib>
45
46 #include <array>
47 #include <memory>
48 #include <vector>
49
50 #include "gromacs/math/functions.h"
51 #include "gromacs/mdtypes/commrec.h"
52 #include "gromacs/timing/cyclecounter.h"
53 #include "gromacs/timing/gpu_timing.h"
54 #include "gromacs/timing/wallcyclereporting.h"
55 #include "gromacs/utility/arrayref.h"
56 #include "gromacs/utility/cstringutil.h"
57 #include "gromacs/utility/enumerationhelpers.h"
58 #include "gromacs/utility/gmxassert.h"
59 #include "gromacs/utility/gmxmpi.h"
60 #include "gromacs/utility/logger.h"
61 #include "gromacs/utility/smalloc.h"
62 #include "gromacs/utility/snprintf.h"
63 #include "gromacs/utility/stringutil.h"
64
65 //! Whether wallcycle debugging is enabled
66 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
67 //! True if only the master rank should print debugging output
68 constexpr bool gmx_unused onlyMasterDebugPrints = true;
69 //! True if cycle counter nesting depth debugging prints are enabled
70 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
71
72 #if DEBUG_WCYCLE
73 #    include "gromacs/utility/fatalerror.h"
74 #endif
75
76 /* Each name should not exceed 19 printing characters
77    (i.e. the terminating null can be the twentieth) */
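/* These name tables are gmx::EnumerationArray objects, so their entries must stay
   in the same order as the corresponding enums (WallCycleCounter, WallCycleSubCounter,
   and PmeStage, presumably declared in wallcycle.h and gpu_timing.h). */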
78 static const char* enumValuetoString(WallCycleCounter enumValue)
79 {
80     constexpr gmx::EnumerationArray<WallCycleCounter, const char*> wallCycleCounterNames = {
81         "Run",
82         "Step",
83         "PP during PME",
84         "Domain decomp.",
85         "DD comm. load",
86         "DD comm. bounds",
87         "Vsite constr.",
88         "Send X to PME",
89         "Neighbor search",
90         "Launch GPU ops.",
91         "Comm. coord.",
92         "Force",
93         "Wait + Comm. F",
94         "PME mesh",
95         "PME redist. X/F",
96         "PME spread",
97         "PME gather",
98         "PME 3D-FFT",
99         "PME 3D-FFT Comm.",
100         "PME solve LJ",
101         "PME solve Elec",
102         "PME wait for PP",
103         "Wait + Recv. PME F",
104         "Wait PME GPU spread",
105         "PME 3D-FFT",
106         "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
107         "Wait PME GPU gather",
108         "Wait Bonded GPU",
109         "Reduce GPU PME F",
110         "Wait GPU NB nonloc.",
111         "Wait GPU NB local",
112         "Wait GPU state copy",
113         "NB X/F buffer ops.",
114         "Vsite spread",
115         "COM pull force",
116         "AWH",
117         "Write traj.",
118         "Update",
119         "Constraints",
120         "Comm. energies",
121         "Enforced rotation",
122         "Add rot. forces",
123         "Position swapping",
124         "IMD",
125         "Test"
126     };
127     return wallCycleCounterNames[enumValue];
128 }
129
130 static const char* enumValuetoString(WallCycleSubCounter enumValue)
131 {
132     constexpr gmx::EnumerationArray<WallCycleSubCounter, const char*> wallCycleSubCounterNames = {
133         "DD redist.",
134         "DD NS grid + sort",
135         "DD setup comm.",
136         "DD make top.",
137         "DD make constr.",
138         "DD top. other",
139         "DD GPU ops.",
140         "NS grid local",
141         "NS grid non-loc.",
142         "NS search local",
143         "NS search non-loc.",
144         "Bonded F",
145         "Bonded-FEP F",
146         "Restraints F",
147         "Listed buffer ops.",
148         "Nonbonded pruning",
149         "Nonbonded F kernel",
150         "Nonbonded F clear",
151         "Nonbonded FEP",
152         "Launch NB GPU tasks",
153         "Launch Bonded GPU tasks",
154         "Launch PME GPU tasks",
155         "Launch state copy",
156         "Ewald F correction",
157         "NB X buffer ops.",
158         "NB F buffer ops.",
159         "Clear force buffer",
160         "Launch GPU NB X buffer ops.",
161         "Launch GPU NB F buffer ops.",
162         "Launch GPU Comm. coord.",
163         "Launch GPU Comm. force.",
164         "Launch GPU update",
165         "Test subcounter"
166     };
167     return wallCycleSubCounterNames[enumValue];
168 }
169
170 /* PME GPU timing events' names - they correspond to the enum in gpu_timing.h */
171 static const char* enumValuetoString(PmeStage enumValue)
172 {
173     constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
174         "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
175         "PME solve",  "PME 3D-FFT c2r", "PME gather"
176     };
177     return pmeStageNames[enumValue];
178 }
179
180 bool wallcycle_have_counter()
181 {
182     return gmx_cycles_have_counter();
183 }
184
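/* Create and initialize a wallcycle structure. Returns nullptr when no usable
 * cycle counter is available. The GMX_CYCLE_BARRIER and GMX_CYCLE_ALL environment
 * variables tune the counting behavior, and resetstep is stored as the step at
 * which the counters are to be reset. */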
185 std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
186 {
187     std::unique_ptr<gmx_wallcycle> wc;
188
189
190     if (!wallcycle_have_counter())
191     {
192         return wc;
193     }
194
195     wc = std::make_unique<gmx_wallcycle>();
196
197     wc->haveInvalidCount = false;
198     wc->wc_barrier       = false;
199     wc->wc_depth         = 0;
200     wc->ewc_prev         = WallCycleCounter::Count;
201     wc->reset_counters   = resetstep;
202     wc->cr               = cr;
203
204
205 #if GMX_MPI
206     if (cr != nullptr && PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
207     {
208         if (fplog)
209         {
210             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
211         }
212         wc->wc_barrier = true;
213     }
214 #endif
215
216     if (getenv("GMX_CYCLE_ALL") != nullptr)
217     {
218         if (fplog)
219         {
220             fprintf(fplog, "\nWill time all the code during the run\n\n");
221         }
222         wc->wcc_all.resize(sc_numWallCycleCountersSquared);
223     }
224
225 #if DEBUG_WCYCLE
226     wc->count_depth  = 0;
227     wc->isMasterRank = MASTER(cr);
228 #endif
229
230     return wc;
231 }
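/* A minimal usage sketch, assuming the wallcycle_start()/wallcycle_stop()
 * counting helpers declared in wallcycle.h:
 *
 *     std::unique_ptr<gmx_wallcycle> wcycle = wallcycle_init(fplog, 0, cr);
 *     wallcycle_start(wcycle.get(), WallCycleCounter::Run);
 *     // ... timed work ...
 *     wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
 */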
232
233 #if DEBUG_WCYCLE
234 static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc)
235 {
236     if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth)
237     {
238         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
239     }
240     wc->counterlist[wc->count_depth] = ewc;
241     wc->count_depth++;
242
243     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
244     {
245         std::string indentStr(4 * wc->count_depth, ' ');
246         fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
247     }
248 }
249
250 static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc)
251 {
252     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
253     {
254         std::string indentStr(4 * wc->count_depth, ' ');
255         fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
256     }
257
258     wc->count_depth--;
259
260     if (wc->count_depth < 0)
261     {
262         gmx_fatal(FARGS,
263                   "wallcycle counter depth out of range when stopping %s: %d",
264                   enumValuetoString(ewc),
265                   wc->count_depth);
266     }
267     if (wc->counterlist[wc->count_depth] != ewc)
268     {
269         gmx_fatal(FARGS,
270                   "wallcycle mismatch at stop, start %s, stop %s",
271                   enumValuetoString(wc->counterlist[wc->count_depth]),
272                   enumValuetoString(ewc));
273     }
274 }
275 #endif
276
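/* Return the call count and accumulated cycle count of main counter ewc. */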
277 void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c)
278 {
279     *n = wc->wcc[ewc].n;
280     *c = static_cast<double>(wc->wcc[ewc].c);
281 }
282
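/* Return the call count and accumulated cycle count of sub-counter ewcs;
 * does nothing when sub-counters are compiled out or wc is null. */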
283 void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c)
284 {
285     if (sc_useCycleSubcounters && wc != nullptr)
286     {
287         *n = wc->wcsc[ewcs].n;
288         *c = static_cast<double>(wc->wcsc[ewcs].c);
289     }
290 }
291
292 void wallcycle_reset_all(gmx_wallcycle* wc)
293 {
294     if (wc == nullptr)
295     {
296         return;
297     }
298
299     for (auto& counter : wc->wcc)
300     {
301         counter.n = 0;
302         counter.c = 0;
303     }
304     wc->haveInvalidCount = false;
305
306     if (!wc->wcc_all.empty())
307     {
308         for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
309         {
310             wc->wcc_all[i].n = 0;
311             wc->wcc_all[i].c = 0;
312         }
313     }
314     for (auto& counter : wc->wcsc)
315     {
316         counter.n = 0;
317         counter.c = 0;
318     }
319 }
320
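/* The two helpers below classify counters by the PME range: is_pme_counter()
 * covers PmeMesh up to and including PmeWaitComm, while is_pme_subcounter()
 * covers the sub-range that is reported as the PME mesh breakdown. */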
321 static bool is_pme_counter(WallCycleCounter ewc)
322 {
323     return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm);
324 }
325
326 static bool is_pme_subcounter(WallCycleCounter ewc)
327 {
328     return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm);
329 }
330
331 void wallcycleBarrier(gmx_wallcycle* wc)
332 {
333 #if GMX_MPI
334     if (wc->wc_barrier)
335     {
336         MPI_Barrier(wc->cr->mpi_comm_mygroup);
337     }
338 #else
339     GMX_UNUSED_VALUE(wc);
340 #endif
341 }
342
343 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
344 // NOLINTNEXTLINE(google-runtime-references)
345 static void subtract_cycles(gmx::EnumerationArray<WallCycleCounter, wallcc_t>& wcc,
346                             WallCycleCounter                                   ewc_main,
347                             WallCycleCounter                                   ewc_sub)
348 {
349     if (wcc[ewc_sub].n > 0)
350     {
351         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
352         {
353             wcc[ewc_main].c -= wcc[ewc_sub].c;
354         }
355         else
356         {
357             /* Something is wrong with the cycle counting */
358             wcc[ewc_main].c = 0;
359         }
360     }
361 }
362
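/* Scale the accumulated cycle counts by the number of OpenMP threads they were
 * counted on (PME counters, and Run on a PME rank, by nthreads_pme; everything
 * else by nthreads_pp), so that the reported totals are sums over all threads. */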
363 void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
364 {
365     if (wc == nullptr)
366     {
367         return;
368     }
369
370     for (auto key : keysOf(wc->wcc))
371     {
372         if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank))
373         {
374             wc->wcc[key].c *= nthreads_pme;
375
376             if (!wc->wcc_all.empty())
377             {
378                 const int current = static_cast<int>(key);
379                 for (int j = 0; j < sc_numWallCycleCounters; j++)
380                 {
381                     wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme;
382                 }
383             }
384         }
385         else
386         {
387             wc->wcc[key].c *= nthreads_pp;
388
389             if (!wc->wcc_all.empty())
390             {
391                 const int current = static_cast<int>(key);
392                 for (int j = 0; j < sc_numWallCycleCounters; j++)
393                 {
394                     wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp;
395                 }
396             }
397         }
398     }
399     if (sc_useCycleSubcounters && !isPmeRank)
400     {
401         for (auto& counter : wc->wcsc)
402         {
403             counter.c *= nthreads_pp;
404         }
405     }
406 }
407
408 /* TODO Make an object for this function to return, containing some
409  * vectors of something like wallcc_t for the summed wcc, wcc_all and
410  * wcsc, AND the original wcc for rank 0.
411  *
412  * The GPU timing is reported only for rank 0, so we want to preserve
413  * the original wcycle on that rank. Rank 0 also reports the global
414  * counts before that, so needs something to contain the global data
415  * without over-writing the rank-0 data. The current implementation
416  * uses cycles_sum to manage this, which works OK now because wcsc and
417  * wcc_all are unused by the GPU reporting, but it is not satisfactory
418  * for the future. Also, there's no need for MPI_Allreduce, since
419  * only MASTERRANK uses any of the results. */
420 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc)
421 {
422     WallcycleCounts                                    cycles_sum;
423     gmx::EnumerationArray<WallCycleCounter, double>    cyclesMain;
424     gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSub;
425 #if GMX_MPI
426     gmx::EnumerationArray<WallCycleCounter, double>    cyclesMainOnNode;
427     gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSubOnNode;
428 #endif
429
430     if (wc == nullptr)
431     {
432         /* Default construction of std::array of non-class T can leave
433            the values indeterminate, just like a C array */
434         cycles_sum.fill(0);
435         return cycles_sum;
436     }
437
438     auto& wcc = wc->wcc;
439
440     subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad);
441     subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound);
442
443     subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm);
444
445     if (cr->npmenodes == 0)
446     {
447         /* All nodes do PME (or no PME at all) */
448         subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh);
449     }
450     else
451     {
452         /* There are PME-only nodes */
453         if (wcc[WallCycleCounter::PmeMesh].n > 0)
454         {
455             /* This must be a PME-only node, so calculate the Wait + Comm. time */
456             GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c,
457                        "Total run ticks must be greater than PME-only ticks");
458             wcc[WallCycleCounter::PmeWaitComm].c =
459                     wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c;
460         }
461     }
462
463     /* Store the cycles in a double buffer for summing */
464     for (auto key : keysOf(wcc))
465     {
466 #if GMX_MPI
467         cyclesMainOnNode[key] = static_cast<double>(wcc[key].n);
468 #endif
469         cyclesMain[key] = static_cast<double>(wcc[key].c);
470     }
471     if (sc_useCycleSubcounters)
472     {
473         for (auto key : keysOf(wc->wcsc))
474         {
475 #if GMX_MPI
476             cyclesSubOnNode[key] = static_cast<double>(wc->wcsc[key].n);
477 #endif
478             cyclesSub[key] = static_cast<double>(wc->wcsc[key].c);
479         }
480     }
481
482 #if GMX_MPI
483     if (cr->nnodes > 1)
484     {
485         gmx::EnumerationArray<WallCycleCounter, double>    bufMain;
486         gmx::EnumerationArray<WallCycleSubCounter, double> bufSub;
487
488         // TODO this code is used only at the end of the run, so we
489         // can just do a simple reduce of haveInvalidCount in
490         // wallcycle_print, and avoid bugs
491         double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0);
492         // TODO Use MPI_Reduce
493         MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
494         if (sc_useCycleSubcounters)
495         {
496             MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
497         }
498         MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
499         for (auto key : keysOf(wcc))
500         {
501             wcc[key].n = gmx::roundToInt(bufMain[key]);
502         }
503         wc->haveInvalidCount = (haveInvalidCount > 0);
504         if (sc_useCycleSubcounters)
505         {
506             for (auto key : keysOf(wc->wcsc))
507             {
508                 wc->wcsc[key].n = gmx::roundToInt(bufSub[key]);
509             }
510         }
511
512         // TODO Use MPI_Reduce
513         MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
514         if (sc_useCycleSubcounters)
515         {
516             MPI_Allreduce(cyclesSub.data(),
517                           cycles_sum.data() + sc_numWallCycleCounters,
518                           cyclesSub.size(),
519                           MPI_DOUBLE,
520                           MPI_SUM,
521                           cr->mpi_comm_mysim);
522         }
523
524         if (!wc->wcc_all.empty())
525         {
526             std::array<double, sc_numWallCycleCountersSquared> cyc_all;
527             std::array<double, sc_numWallCycleCountersSquared> buf_all;
528
529             for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
530             {
531                 cyc_all[i] = wc->wcc_all[i].c;
532             }
533             // TODO Use MPI_Reduce
534             MPI_Allreduce(cyc_all.data(),
535                           buf_all.data(),
536                           sc_numWallCycleCountersSquared,
537                           MPI_DOUBLE,
538                           MPI_SUM,
539                           cr->mpi_comm_mysim);
540             for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
541             {
542                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
543             }
544         }
545     }
546     else
547 #endif
548     {
549         for (auto key : keysOf(cyclesMain))
550         {
551             cycles_sum[static_cast<int>(key)] = cyclesMain[key];
552         }
553         if (sc_useCycleSubcounters)
554         {
555             for (auto key : keysOf(cyclesSub))
556             {
557                 const int offset   = static_cast<int>(key) + sc_numWallCycleCounters;
558                 cycles_sum[offset] = cyclesSub[key];
559             }
560         }
561     }
562
563     return cycles_sum;
564 }
565
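/* Print one row of the cycle accounting table: counter name, rank and thread
 * counts, call count, wall time (via the cycles-to-seconds factor c2t),
 * giga-cycles, and the percentage of the total cycle count. */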
566 static void
567 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
568 {
569     char   nnodes_str[STRLEN];
570     char   nthreads_str[STRLEN];
571     char   ncalls_str[STRLEN];
572     double wallt;
573     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
574
575     if (c_sum > 0)
576     {
577         if (ncalls > 0)
578         {
579             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
580             if (nnodes < 0)
581             {
582                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
583             }
584             else
585             {
586                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
587             }
588             if (nthreads < 0)
589             {
590                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
591             }
592             else
593             {
594                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
595             }
596         }
597         else
598         {
599             nnodes_str[0]   = 0;
600             nthreads_str[0] = 0;
601             ncalls_str[0]   = 0;
602         }
603         /* Convert the cycle count to wallclock time for this task */
604         wallt = c_sum * c2t;
605
606         fprintf(fplog,
607                 " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
608                 name,
609                 nnodes_str,
610                 nthreads_str,
611                 ncalls_str,
612                 wallt,
613                 c_sum * 1e-9,
614                 percentage);
615     }
616 }
617
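/* Print one row of the GPU timing table; t and tot_t are in ms, so the wall-time
 * column divides by 1000 and the average column reports ms per call. */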
618 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
619 {
620     char num[11];
621     char avg_perf[11];
622
623     if (n > 0)
624     {
625         snprintf(num, sizeof(num), "%10d", n);
626         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
627     }
628     else
629     {
630         sprintf(num, "          ");
631         sprintf(avg_perf, "          ");
632     }
633     if (t != tot_t && tot_t > 0)
634     {
635         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
636     }
637     else
638     {
639         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
640     }
641 }
642
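/* Print the run decomposition (PP/PME ranks and their OpenMP threads) and the
 * column headers of the cycle accounting table. */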
643 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
644 {
645     int nrank_tot = nrank_pp + nrank_pme;
646     if (0 == nrank_pme)
647     {
648         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
649         if (nth_pp > 1)
650         {
651             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
652         }
653         /* Don't report doing PP+PME, because we can't tell here if
654          * this is RF, etc. */
655     }
656     else
657     {
658         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
659         if (nth_pp > 1)
660         {
661             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
662         }
663         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
664         if (nth_pme > 1)
665         {
666             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
667         }
668     }
669
670     fprintf(fplog, "\n\n");
671     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
672     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
673 }
674
675
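/* Print the complete cycle and time accounting: the main counter table, the
 * optional PME mesh and PP sub-counter breakdowns, GPU timings when available,
 * and notes on CPU-GPU and communication load imbalance. */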
676 void wallcycle_print(FILE*                            fplog,
677                      const gmx::MDLogger&             mdlog,
678                      int                              nnodes,
679                      int                              npme,
680                      int                              nth_pp,
681                      int                              nth_pme,
682                      double                           realtime,
683                      gmx_wallcycle*                   wc,
684                      const WallcycleCounts&           cyc_sum,
685                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
686                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
687 {
688     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
689     double      c2t, c2t_pp, c2t_pme = 0;
690     int         npp, nth_tot;
691     char        buf[STRLEN];
692     const char* hline =
693             "-----------------------------------------------------------------------------";
694
695     if (wc == nullptr)
696     {
697         return;
698     }
699
700     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
701     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
702     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
703     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
704     npp = nnodes - npme;
705     /* npme is the number of PME-only ranks used, and we always do PP work */
706     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
707
708     nth_tot = npp * nth_pp + npme * nth_pme;
709
710     /* When using PME-only nodes, the next line is valid for both
711        PP-only and PME-only nodes because they started the WallCycleCounter::Run
712        counter at the same time. */
713     tot        = cyc_sum[static_cast<int>(WallCycleCounter::Run)];
714     tot_for_pp = 0;
715
716     if (tot <= 0.0)
717     {
718         /* TODO This is heavy handed, but until someone reworks the
719            code so that it is provably robust with respect to
720            non-positive values for all possible timer and cycle
721            counters, there is less value gained from printing whatever
722            timing data might still be sensible for some non-Jenkins
723            run, than is lost from diagnosing Jenkins FP exceptions on
724            runs about whose execution time we don't care. */
725         GMX_LOG(mdlog.warning)
726                 .asParagraph()
727                 .appendTextFormatted(
728                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
729                         "time accounting",
730                         tot);
731         return;
732     }
733
734     if (wc->haveInvalidCount)
735     {
736         GMX_LOG(mdlog.warning)
737                 .asParagraph()
738                 .appendText(
739                         "NOTE: Detected invalid cycle counts, probably because threads moved "
740                         "between CPU cores that do not have synchronized cycle counters. Will not "
741                         "print the cycle accounting.");
742         return;
743     }
744
745
746     /* Conversion factor from cycles to seconds */
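    /* The summed cycle counts were scaled by thread counts and summed over ranks,
     * so c2t_pp and c2t_pme rescale by each group's share of the total thread
     * count to convert that group's cycles into wall-clock seconds. */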
747     c2t    = realtime / tot;
748     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
749     if (npme > 0)
750     {
751         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
752     }
753     else
754     {
755         c2t_pme = 0;
756     }
757
758     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
759
760     print_header(fplog, npp, nth_pp, npme, nth_pme);
761
762     fprintf(fplog, "%s\n", hline);
763     gmx::EnumerationWrapper<WallCycleCounter> iter;
764     for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
765          key != iter.end();
766          ++key)
767     {
768
769         if (is_pme_subcounter(*key))
770         {
771             /* Do not count these at all */
772         }
773         else if (npme > 0 && is_pme_counter(*key))
774         {
775             /* Print timing information for PME-only nodes, but add an
776              * asterisk so the reader of the table can know that the
777              * walltimes are not meant to add up. The asterisk still
778              * fits in the required maximum of 19 characters. */
779             std::string message = gmx::formatString("%s *", enumValuetoString(*key));
780             print_cycles(fplog,
781                          c2t_pme,
782                          message.c_str(),
783                          npme,
784                          nth_pme,
785                          wc->wcc[*key].n,
786                          cyc_sum[static_cast<int>(*key)],
787                          tot);
788         }
789         else
790         {
791             /* Print timing information when it is for a PP or PP+PME
792                node */
793             print_cycles(fplog,
794                          c2t_pp,
795                          enumValuetoString(*key),
796                          npp,
797                          nth_pp,
798                          wc->wcc[*key].n,
799                          cyc_sum[static_cast<int>(*key)],
800                          tot);
801             tot_for_pp += cyc_sum[static_cast<int>(*key)];
802         }
803     }
804     if (!wc->wcc_all.empty())
805     {
806         for (auto i : keysOf(wc->wcc))
807         {
808             const int countI = static_cast<int>(i);
809             for (auto j : keysOf(wc->wcc))
810             {
811                 const int countJ = static_cast<int>(j);
812                 snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j));
813                 print_cycles(fplog,
814                              c2t_pp,
815                              buf,
816                              npp,
817                              nth_pp,
818                              wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n,
819                              wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c,
820                              tot);
821             }
822         }
823     }
824     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
825     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
826     fprintf(fplog, "%s\n", hline);
827     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
828     fprintf(fplog, "%s\n", hline);
829
830     if (npme > 0)
831     {
832         fprintf(fplog,
833                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
834                 "    twice the total reported, but the cycle count total and %% are correct.\n"
835                 "%s\n",
836                 hline);
837     }
838
839     if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
840     {
841         // A workaround to avoid printing the breakdown when no sub-counters were recorded.
842         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
843         std::vector<WallCycleCounter> validPmeSubcounterIndices;
844         for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
845              key != iter.end();
846              ++key)
847         {
848             if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0)
849             {
850                 validPmeSubcounterIndices.push_back(*key);
851             }
852         }
853
854         if (!validPmeSubcounterIndices.empty())
855         {
856             fprintf(fplog, " Breakdown of PME mesh computation\n");
857             fprintf(fplog, "%s\n", hline);
858             for (auto i : validPmeSubcounterIndices)
859             {
860                 print_cycles(fplog,
861                              npme > 0 ? c2t_pme : c2t_pp,
862                              enumValuetoString(i),
863                              npme > 0 ? npme : npp,
864                              nth_pme,
865                              wc->wcc[i].n,
866                              cyc_sum[static_cast<int>(i)],
867                              tot);
868             }
869             fprintf(fplog, "%s\n", hline);
870         }
871     }
872
873     if (sc_useCycleSubcounters)
874     {
875         fprintf(fplog, " Breakdown of PP computation\n");
876         fprintf(fplog, "%s\n", hline);
877         for (auto key : keysOf(wc->wcsc))
878         {
879             print_cycles(fplog,
880                          c2t_pp,
881                          enumValuetoString(key),
882                          npp,
883                          nth_pp,
884                          wc->wcsc[key].n,
885                          cyc_sum[sc_numWallCycleCounters + static_cast<int>(key)],
886                          tot);
887         }
888         fprintf(fplog, "%s\n", hline);
889     }
890
891     /* print GPU timing summary */
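    /* tot_gpu accumulates the PME GPU stage timings, the nbnxn transfer and kernel
     * timings, and the pruning kernel; tot_cpu_overlap is the CPU force (plus PME
     * mesh) time that this GPU work overlaps with. */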
892     double tot_gpu = 0.0;
893     if (gpu_pme_t)
894     {
895         for (auto key : keysOf(gpu_pme_t->timing))
896         {
897             tot_gpu += gpu_pme_t->timing[key].t;
898         }
899     }
900     if (gpu_nbnxn_t)
901     {
902         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
903                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
904         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
905
906         /* add up the kernel timings */
907         for (int i = 0; i < 2; i++)
908         {
909             for (int j = 0; j < 2; j++)
910             {
911                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
912             }
913         }
914         tot_gpu += gpu_nbnxn_t->pruneTime.t;
915
916         tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c;
917         if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
918         {
919             tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c;
920         }
921         tot_cpu_overlap *= realtime * 1000 / tot; /* convert cycles to ms */
922
923         fprintf(fplog, "\n GPU timings\n%s\n", hline);
924         fprintf(fplog,
925                 " Computing:                         Count  Wall t (s)      ms/step       %c\n",
926                 '%');
927         fprintf(fplog, "%s\n", hline);
928         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
929         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
930
931         for (int i = 0; i < 2; i++)
932         {
933             for (int j = 0; j < 2; j++)
934             {
935                 if (gpu_nbnxn_t->ktime[i][j].c)
936                 {
937                     print_gputimes(fplog,
938                                    k_log_str[i][j],
939                                    gpu_nbnxn_t->ktime[i][j].c,
940                                    gpu_nbnxn_t->ktime[i][j].t,
941                                    tot_gpu);
942                 }
943             }
944         }
945         if (gpu_pme_t)
946         {
947             for (auto key : keysOf(gpu_pme_t->timing))
948             {
949                 if (gpu_pme_t->timing[key].c)
950                 {
951                     print_gputimes(fplog,
952                                    enumValuetoString(key),
953                                    gpu_pme_t->timing[key].c,
954                                    gpu_pme_t->timing[key].t,
955                                    tot_gpu);
956                 }
957             }
958         }
959         if (gpu_nbnxn_t->pruneTime.c)
960         {
961             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
962         }
963         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
964         fprintf(fplog, "%s\n", hline);
965         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
966         fprintf(fplog, "%s\n", hline);
967         if (gpu_nbnxn_t->dynamicPruneTime.c)
968         {
969             /* We print the dynamic pruning kernel timings after a separator
970              * and avoid adding it to tot_gpu as this is not in the force
971              * overlap. We print the fraction as relative to the rest.
972              */
973             print_gputimes(fplog,
974                            "*Dynamic pruning",
975                            gpu_nbnxn_t->dynamicPruneTime.c,
976                            gpu_nbnxn_t->dynamicPruneTime.t,
977                            tot_gpu);
978             fprintf(fplog, "%s\n", hline);
979         }
980         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
981         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0)
982         {
983             fprintf(fplog,
984                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
985                     "%.3f\n",
986                     tot_gpu / gpu_nbnxn_t->nb_c,
987                     tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n,
988                     gpu_cpu_ratio);
989         }
990
991         /* only print notes related to CPU-GPU load balance with PME */
992         if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
993         {
994             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
995
996             /* print note if the imbalance is high with PME case in which
997              * CPU-GPU load balancing is possible */
998             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
999             {
1000                 /* Only the sim master calls this function, so always print to stderr */
1001                 if (gpu_cpu_ratio < 0.8)
1002                 {
1003                     if (npp > 1)
1004                     {
1005                         /* The user could have used -notunepme,
1006                          * but we currently can't check that here.
1007                          */
1008                         GMX_LOG(mdlog.warning)
1009                                 .asParagraph()
1010                                 .appendText(
1011                                         "NOTE: The CPU has >25% more load than the GPU. This "
1012                                         "imbalance wastes\n"
1013                                         "      GPU resources. Maybe the domain decomposition "
1014                                         "limits the PME tuning.\n"
1015                                         "      In that case, try setting the DD grid manually "
1016                                         "(-dd) or lowering -dds.");
1017                     }
1018                     else
1019                     {
1020                         /* We should not end up here, unless the box is
1021                          * too small for increasing the cut-off for PME tuning.
1022                          */
1023                         GMX_LOG(mdlog.warning)
1024                                 .asParagraph()
1025                                 .appendText(
1026                                         "NOTE: The CPU has >25% more load than the GPU. This "
1027                                         "imbalance wastes\n"
1028                                         "      GPU resources.");
1029                     }
1030                 }
1031                 if (gpu_cpu_ratio > 1.25)
1032                 {
1033                     GMX_LOG(mdlog.warning)
1034                             .asParagraph()
1035                             .appendText(
1036                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1037                                     "wastes\n"
1038                                     "      CPU resources.");
1039                 }
1040             }
1041         }
1042     }
1043
1044     if (wc->wc_barrier)
1045     {
1046         GMX_LOG(mdlog.warning)
1047                 .asParagraph()
1048                 .appendText(
1049                         "MPI_Barrier was called before each cycle start/stop\n"
1050                         "call, so timings are not those of real runs.");
1051     }
1052
1053     if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0
1054         && (cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] > tot * 0.1
1055             || cyc_sum[static_cast<int>(WallCycleCounter::NS)] > tot * 0.1))
1056     {
1057         /* Only the sim master calls this function, so always print to stderr */
1058         if (wc->wcc[WallCycleCounter::Domdec].n == 0)
1059         {
1060             GMX_LOG(mdlog.warning)
1061                     .asParagraph()
1062                     .appendTextFormatted(
1063                             "NOTE: %d %% of the run time was spent in pair search,\n"
1064                             "      you might want to increase nstlist (this has no effect on "
1065                             "accuracy)\n",
1066                             gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
1067         }
1068         else
1069         {
1070             GMX_LOG(mdlog.warning)
1071                     .asParagraph()
1072                     .appendTextFormatted(
1073                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1074                             "      %d %% of the run time was spent in pair search,\n"
1075                             "      you might want to increase nstlist (this has no effect on "
1076                             "accuracy)\n",
1077                             gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] / tot),
1078                             gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
1079         }
1080     }
1081
1082     if (cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] > tot * 0.05)
1083     {
1084         GMX_LOG(mdlog.warning)
1085                 .asParagraph()
1086                 .appendTextFormatted(
1087                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1088                         "      you might want to increase some nst* mdp options\n",
1089                         gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] / tot));
1090     }
1091 }
1092
1093 int64_t wcycle_get_reset_counters(gmx_wallcycle* wc)
1094 {
1095     if (wc == nullptr)
1096     {
1097         return -1;
1098     }
1099     return wc->reset_counters;
1100 }
1101
1102 void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters)
1103 {
1104     if (wc == nullptr)
1105     {
1106         return;
1107     }
1108     wc->reset_counters = reset_counters;
1109 }