src/gromacs/timing/wallcycle.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2008, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "wallcycle.h"
41
42 #include "config.h"
43
44 #include <cstdlib>
45
46 #include <array>
47 #include <memory>
48 #include <vector>
49
50 #include "gromacs/math/functions.h"
51 #include "gromacs/mdtypes/commrec.h"
52 #include "gromacs/timing/cyclecounter.h"
53 #include "gromacs/timing/gpu_timing.h"
54 #include "gromacs/timing/wallcyclereporting.h"
55 #include "gromacs/utility/arrayref.h"
56 #include "gromacs/utility/cstringutil.h"
57 #include "gromacs/utility/enumerationhelpers.h"
58 #include "gromacs/utility/gmxassert.h"
59 #include "gromacs/utility/gmxmpi.h"
60 #include "gromacs/utility/logger.h"
61 #include "gromacs/utility/smalloc.h"
62 #include "gromacs/utility/snprintf.h"
63 #include "gromacs/utility/stringutil.h"
64
65 //! Whether wallcycle debugging is enabled
66 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
67 //! True if only the master rank should print debugging output
68 constexpr bool gmx_unused onlyMasterDebugPrints = true;
69 //! True if cycle counter nesting depth debugging prints are enabled
70 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
71
72 #if DEBUG_WCYCLE
73 #    include "gromacs/utility/fatalerror.h"
74 #endif
75
76 /* Each name should not exceed 19 printing characters
77    (i.e. the terminating null can be the twentieth) */
78 static const char* enumValuetoString(WallCycleCounter enumValue)
79 {
80     constexpr gmx::EnumerationArray<WallCycleCounter, const char*> wallCycleCounterNames = {
81         "Run",
82         "Step",
83         "PP during PME",
84         "Domain decomp.",
85         "DD comm. load",
86         "DD comm. bounds",
87         "Vsite constr.",
88         "Send X to PME",
89         "Neighbor search",
90         "Launch GPU ops.",
91         "Comm. coord.",
92         "Force",
93         "Wait + Comm. F",
94         "PME mesh",
95         "PME redist. X/F",
96         "PME spread",
97         "PME gather",
98         "PME 3D-FFT",
99         "PME 3D-FFT Comm.",
100         "PME solve LJ",
101         "PME solve Elec",
102         "PME wait for PP",
103         "Wait + Recv. PME F",
104         "Wait PME GPU spread",
105         "PME 3D-FFT",
106         "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
107         "Wait PME GPU gather",
108         "Wait Bonded GPU",
109         "Reduce GPU PME F",
110         "Wait GPU NB nonloc.",
111         "Wait GPU NB local",
112         "Wait GPU state copy",
113         "NB X/F buffer ops.",
114         "Vsite spread",
115         "COM pull force",
116         "AWH",
117         "Write traj.",
118         "Update",
119         "Constraints",
120         "Comm. energies",
121         "Enforced rotation",
122         "Add rot. forces",
123         "Position swapping",
124         "IMD",
125         "Test"
126     };
127     return wallCycleCounterNames[enumValue];
128 }
129
130 static const char* enumValuetoString(WallCycleSubCounter enumValue)
131 {
132     constexpr gmx::EnumerationArray<WallCycleSubCounter, const char*> wallCycleSubCounterNames = {
133         "DD redist.",
134         "DD NS grid + sort",
135         "DD setup comm.",
136         "DD make top.",
137         "DD make constr.",
138         "DD top. other",
139         "DD GPU ops.",
140         "NS grid local",
141         "NS grid non-loc.",
142         "NS search local",
143         "NS search non-loc.",
144         "Bonded F",
145         "Bonded-FEP F",
146         "Restraints F",
147         "Listed buffer ops.",
148         "Nonbonded pruning",
149         "Nonbonded F kernel",
150         "Nonbonded F clear",
151         "Nonbonded FEP",
152         "Nonbonded FEP reduction",
153         "Launch NB GPU tasks",
154         "Launch Bonded GPU tasks",
155         "Launch PME GPU tasks",
156         "Launch state copy",
157         "Ewald F correction",
158         "NB X buffer ops.",
159         "NB F buffer ops.",
160         "Clear force buffer",
161         "Launch GPU NB X buffer ops.",
162         "Launch GPU NB F buffer ops.",
163         "Launch GPU Comm. coord.",
164         "Launch GPU Comm. force.",
165         "Launch GPU update",
166         "Test subcounter"
167     };
168     return wallCycleSubCounterNames[enumValue];
169 }
170
171 /* PME GPU timing events' names - correspond to the enum in gpu_timing.h */
172 static const char* enumValuetoString(PmeStage enumValue)
173 {
174     constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
175         "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
176         "PME solve",  "PME 3D-FFT c2r", "PME gather"
177     };
178     return pmeStageNames[enumValue];
179 }
180
181 bool wallcycle_have_counter()
182 {
183     return gmx_cycles_have_counter();
184 }
185
186 std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
187 {
188     std::unique_ptr<gmx_wallcycle> wc;
189
190
191     if (!wallcycle_have_counter())
192     {
193         return wc;
194     }
195
196     wc = std::make_unique<gmx_wallcycle>();
197
198     wc->haveInvalidCount = false;
199     wc->wc_barrier       = false;
200     wc->wc_depth         = 0;
201     wc->ewc_prev         = WallCycleCounter::Count;
202     wc->reset_counters   = resetstep;
203     wc->cr               = cr;
204
205
206 #if GMX_MPI
207     if (cr != nullptr && PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
208     {
209         if (fplog)
210         {
211             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
212         }
213         wc->wc_barrier = true;
214     }
215 #endif
216
217     if (getenv("GMX_CYCLE_ALL") != nullptr)
218     {
219         if (fplog)
220         {
221             fprintf(fplog, "\nWill time all the code during the run\n\n");
222         }
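        /* wcc_all is a flattened Count x Count matrix with one entry per ordered
         * pair of main counters; it is only allocated when GMX_CYCLE_ALL is set
         * and is reported as per-pair cycle counts in wallcycle_print(). */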
223         wc->wcc_all.resize(sc_numWallCycleCountersSquared);
224     }
225
226 #if DEBUG_WCYCLE
227     wc->count_depth  = 0;
228     wc->isMasterRank = MASTER(cr);
229 #endif
230
231     return wc;
232 }
233
234 #if DEBUG_WCYCLE
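/* With DEBUG_WCYCLE, these helpers maintain a small stack of the currently
 * open counters so that mismatched or too deeply nested start/stop calls
 * trigger gmx_fatal() instead of silently producing corrupted timings. */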
235 static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc)
236 {
237     if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth)
238     {
239         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
240     }
241     wc->counterlist[wc->count_depth] = ewc;
242     wc->count_depth++;
243
244     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
245     {
246         std::string indentStr(4 * wc->count_depth, ' ');
247         fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
248     }
249 }
250
251 static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc)
252 {
253     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
254     {
255         std::string indentStr(4 * wc->count_depth, ' ');
256         fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
257     }
258
259     wc->count_depth--;
260
261     if (wc->count_depth < 0)
262     {
263         gmx_fatal(FARGS,
264                   "wallcycle counter depth out of range when stopping %s: %d",
265                   enumValuetoString(ewc),
266                   wc->count_depth);
267     }
268     if (wc->counterlist[wc->count_depth] != ewc)
269     {
270         gmx_fatal(FARGS,
271                   "wallcycle mismatch at stop, start %s, stop %s",
272                   enumValuetoString(wc->counterlist[wc->count_depth]),
273                   enumValuetoString(ewc));
274     }
275 }
276 #endif
277
278 void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c)
279 {
280     *n = wc->wcc[ewc].n;
281     *c = static_cast<double>(wc->wcc[ewc].c);
282 }
283
284 void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c)
285 {
286     if (sc_useCycleSubcounters && wc != nullptr)
287     {
288         *n = wc->wcsc[ewcs].n;
289         *c = static_cast<double>(wc->wcsc[ewcs].c);
290     }
291 }
292
293 void wallcycle_reset_all(gmx_wallcycle* wc)
294 {
295     if (wc == nullptr)
296     {
297         return;
298     }
299
300     for (auto& counter : wc->wcc)
301     {
302         counter.n = 0;
303         counter.c = 0;
304     }
305     wc->haveInvalidCount = false;
306
307     if (!wc->wcc_all.empty())
308     {
309         for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
310         {
311             wc->wcc_all[i].n = 0;
312             wc->wcc_all[i].c = 0;
313         }
314     }
315     for (auto& counter : wc->wcsc)
316     {
317         counter.n = 0;
318         counter.c = 0;
319     }
320 }
321
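/* These range checks rely on the declaration order of WallCycleCounter:
 * counters from PmeMesh up to and including PmeWaitComm are the ones timed on
 * PME ranks, and those from PmeRedistXF up to (but excluding) PmeWaitComm are
 * the sub-parts of the PME mesh work. */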
322 static bool is_pme_counter(WallCycleCounter ewc)
323 {
324     return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm);
325 }
326
327 static bool is_pme_subcounter(WallCycleCounter ewc)
328 {
329     return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm);
330 }
331
332 void wallcycleBarrier(gmx_wallcycle* wc)
333 {
334 #if GMX_MPI
335     if (wc->wc_barrier)
336     {
337         MPI_Barrier(wc->cr->mpi_comm_mygroup);
338     }
339 #else
340     GMX_UNUSED_VALUE(wc);
341 #endif
342 }
343
344 /* Subtract the cycles of counter ewc_sub that were timed inside the timing block of ewc_main */
345 // NOLINTNEXTLINE(google-runtime-references)
346 static void subtract_cycles(gmx::EnumerationArray<WallCycleCounter, wallcc_t>& wcc,
347                             WallCycleCounter                                   ewc_main,
348                             WallCycleCounter                                   ewc_sub)
349 {
350     if (wcc[ewc_sub].n > 0)
351     {
352         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
353         {
354             wcc[ewc_main].c -= wcc[ewc_sub].c;
355         }
356         else
357         {
358             /* Something is wrong with the cycle counting */
359             wcc[ewc_main].c = 0;
360         }
361     }
362 }
363
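/* Scale the accumulated cycle counts by the number of OpenMP threads per rank,
 * so that the sums formed later account for all threads: PME counters (and the
 * Run counter on PME-only ranks) are scaled by nthreads_pme, everything else by
 * nthreads_pp; the subcounters are likewise scaled by nthreads_pp on PP ranks. */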
364 void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
365 {
366     if (wc == nullptr)
367     {
368         return;
369     }
370
371     for (auto key : keysOf(wc->wcc))
372     {
373         if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank))
374         {
375             wc->wcc[key].c *= nthreads_pme;
376
377             if (!wc->wcc_all.empty())
378             {
379                 const int current = static_cast<int>(key);
380                 for (int j = 0; j < sc_numWallCycleCounters; j++)
381                 {
382                     wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme;
383                 }
384             }
385         }
386         else
387         {
388             wc->wcc[key].c *= nthreads_pp;
389
390             if (!wc->wcc_all.empty())
391             {
392                 const int current = static_cast<int>(key);
393                 for (int j = 0; j < sc_numWallCycleCounters; j++)
394                 {
395                     wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp;
396                 }
397             }
398         }
399     }
400     if (sc_useCycleSubcounters && !isPmeRank)
401     {
402         for (auto& counter : wc->wcsc)
403         {
404             counter.c *= nthreads_pp;
405         }
406     }
407 }
408
409 /* TODO Make an object for this function to return, containing some
410  * vectors of something like wallcc_t for the summed wcc, wcc_all and
411  * wcsc, AND the original wcc for rank 0.
412  *
413  * The GPU timing is reported only for rank 0, so we want to preserve
414  * the original wcycle on that rank. Rank 0 also reports the global
415  * counts before that, so needs something to contain the global data
416  * without over-writing the rank-0 data. The current implementation
417  * uses cycles_sum to manage this, which works OK now because wcsc and
418  * wcc_all are unused by the GPU reporting, but it is not satisfactory
419  * for the future. Also, there's no need for MPI_Allreduce, since
420  * only MASTERRANK uses any of the results. */
421 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc)
422 {
423     WallcycleCounts                                    cycles_sum;
424     gmx::EnumerationArray<WallCycleCounter, double>    cyclesMain;
425     gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSub;
426 #if GMX_MPI
427     gmx::EnumerationArray<WallCycleCounter, double>    cyclesMainOnNode;
428     gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSubOnNode;
429 #endif
430
431     if (wc == nullptr)
432     {
433         /* Default construction of std::array of non-class T can leave
434            the values indeterminate, just like a C array */
435         cycles_sum.fill(0);
436         return cycles_sum;
437     }
438
439     auto& wcc = wc->wcc;
440
441     subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad);
442     subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound);
443
444     subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm);
445
446     if (cr->npmenodes == 0)
447     {
448         /* All nodes do PME (or no PME at all) */
449         subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh);
450     }
451     else
452     {
453         /* There are PME-only nodes */
454         if (wcc[WallCycleCounter::PmeMesh].n > 0)
455         {
456             /* This must be a PME-only node; calculate the Wait + Comm. time */
457             GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c,
458                        "Total run ticks must be at least as large as the PME-only ticks");
459             wcc[WallCycleCounter::PmeWaitComm].c =
460                     wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c;
461         }
462     }
463
464     /* Store the cycles in a double buffer for summing */
465     for (auto key : keysOf(wcc))
466     {
467 #if GMX_MPI
468         cyclesMainOnNode[key] = static_cast<double>(wcc[key].n);
469 #endif
470         cyclesMain[key] = static_cast<double>(wcc[key].c);
471     }
472     if (sc_useCycleSubcounters)
473     {
474         for (auto key : keysOf(wc->wcsc))
475         {
476 #if GMX_MPI
477             cyclesSubOnNode[key] = static_cast<double>(wc->wcsc[key].n);
478 #endif
479             cyclesSub[key] = static_cast<double>(wc->wcsc[key].c);
480         }
481     }
482
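    /* With more than one rank, reduce over all ranks of the simulation:
     * call counts and the invalid-count flag take the maximum across ranks,
     * while cycle counts are summed into cycles_sum (main counters first,
     * followed by the subcounters when those are enabled). */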
483 #if GMX_MPI
484     if (cr->nnodes > 1)
485     {
486         gmx::EnumerationArray<WallCycleCounter, double>    bufMain;
487         gmx::EnumerationArray<WallCycleSubCounter, double> bufSub;
488
489         // TODO this code is used only at the end of the run, so we
490         // can just do a simple reduce of haveInvalidCount in
491         // wallcycle_print, and avoid bugs
492         double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0);
493         // TODO Use MPI_Reduce
494         MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
495         if (sc_useCycleSubcounters)
496         {
497             MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
498         }
499         MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
500         for (auto key : keysOf(wcc))
501         {
502             wcc[key].n = gmx::roundToInt(bufMain[key]);
503         }
504         wc->haveInvalidCount = (haveInvalidCount > 0);
505         if (sc_useCycleSubcounters)
506         {
507             for (auto key : keysOf(wc->wcsc))
508             {
509                 wc->wcsc[key].n = gmx::roundToInt(bufSub[key]);
510             }
511         }
512
513         // TODO Use MPI_Reduce
514         MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
515         if (sc_useCycleSubcounters)
516         {
517             MPI_Allreduce(cyclesSub.data(),
518                           cycles_sum.data() + sc_numWallCycleCounters,
519                           cyclesSub.size(),
520                           MPI_DOUBLE,
521                           MPI_SUM,
522                           cr->mpi_comm_mysim);
523         }
524
525         if (!wc->wcc_all.empty())
526         {
527             std::array<double, sc_numWallCycleCountersSquared> cyc_all;
528             std::array<double, sc_numWallCycleCountersSquared> buf_all;
529
530             for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
531             {
532                 cyc_all[i] = wc->wcc_all[i].c;
533             }
534             // TODO Use MPI_Reduce
535             MPI_Allreduce(cyc_all.data(),
536                           buf_all.data(),
537                           sc_numWallCycleCountersSquared,
538                           MPI_DOUBLE,
539                           MPI_SUM,
540                           cr->mpi_comm_mysim);
541             for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
542             {
543                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
544             }
545         }
546     }
547     else
548 #endif
549     {
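        /* Single rank (or no MPI): copy the local cycle counts straight into
         * the output array, main counters first, then the subcounters. */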
550         for (auto key : keysOf(cyclesMain))
551         {
552             cycles_sum[static_cast<int>(key)] = cyclesMain[key];
553         }
554         if (sc_useCycleSubcounters)
555         {
556             for (auto key : keysOf(cyclesSub))
557             {
558                 const int offset   = static_cast<int>(key) + sc_numWallCycleCounters;
559                 cycles_sum[offset] = cyclesSub[key];
560             }
561         }
562     }
563
564     return cycles_sum;
565 }
566
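/* Print one row of the cycle-accounting table. c2t converts summed cycles to
 * seconds for this task, negative nnodes/nthreads print as "N/A", and tot is
 * the total cycle count used for the percentage column. Rows with zero cycles
 * are skipped. */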
567 static void
568 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
569 {
570     char   nnodes_str[STRLEN];
571     char   nthreads_str[STRLEN];
572     char   ncalls_str[STRLEN];
573     double wallt;
574     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
575
576     if (c_sum > 0)
577     {
578         if (ncalls > 0)
579         {
580             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
581             if (nnodes < 0)
582             {
583                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
584             }
585             else
586             {
587                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
588             }
589             if (nthreads < 0)
590             {
591                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
592             }
593             else
594             {
595                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
596             }
597         }
598         else
599         {
600             nnodes_str[0]   = 0;
601             nthreads_str[0] = 0;
602             ncalls_str[0]   = 0;
603         }
604         /* Convert the cycle count to wallclock time for this task */
605         wallt = c_sum * c2t;
606
607         fprintf(fplog,
608                 " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
609                 name,
610                 nnodes_str,
611                 nthreads_str,
612                 ncalls_str,
613                 wallt,
614                 c_sum * 1e-9,
615                 percentage);
616     }
617 }
618
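/* Print one row of the GPU timing table. t and tot_t are in ms; the row shows
 * the call count, the wall time in seconds (t/1000), the average ms per call,
 * and the percentage of tot_t. */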
619 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
620 {
621     char num[11];
622     char avg_perf[11];
623
624     if (n > 0)
625     {
626         snprintf(num, sizeof(num), "%10d", n);
627         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
628     }
629     else
630     {
631         sprintf(num, "          ");
632         sprintf(avg_perf, "          ");
633     }
634     if (t != tot_t && tot_t > 0)
635     {
636         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
637     }
638     else
639     {
640         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
641     }
642 }
643
644 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
645 {
646     int nrank_tot = nrank_pp + nrank_pme;
647     if (0 == nrank_pme)
648     {
649         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
650         if (nth_pp > 1)
651         {
652             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
653         }
654         /* Don't report doing PP+PME, because we can't tell here if
655          * this is RF, etc. */
656     }
657     else
658     {
659         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
660         if (nth_pp > 1)
661         {
662             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
663         }
664         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
665         if (nth_pme > 1)
666         {
667             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
668         }
669     }
670
671     fprintf(fplog, "\n\n");
672     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
673     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
674 }
675
676
677 void wallcycle_print(FILE*                            fplog,
678                      const gmx::MDLogger&             mdlog,
679                      int                              nnodes,
680                      int                              npme,
681                      int                              nth_pp,
682                      int                              nth_pme,
683                      double                           realtime,
684                      gmx_wallcycle*                   wc,
685                      const WallcycleCounts&           cyc_sum,
686                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
687                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
688 {
689     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
690     double      c2t, c2t_pp, c2t_pme = 0;
691     int         npp, nth_tot;
692     char        buf[STRLEN];
693     const char* hline =
694             "-----------------------------------------------------------------------------";
695
696     if (wc == nullptr)
697     {
698         return;
699     }
700
701     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
702     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
703     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
704     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
705     npp = nnodes - npme;
706     /* npme is the number of PME-only ranks used, and we always do PP work */
707     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
708
709     nth_tot = npp * nth_pp + npme * nth_pme;
710
711     /* When using PME-only nodes, the next line is valid for both
712        PP-only and PME-only nodes because they started WallCycleCounter::Run
713        at the same time. */
714     tot        = cyc_sum[static_cast<int>(WallCycleCounter::Run)];
715     tot_for_pp = 0;
716
717     if (tot <= 0.0)
718     {
719         /* TODO This is heavy handed, but until someone reworks the
720            code so that it is provably robust with respect to
721            non-positive values for all possible timer and cycle
722            counters, there is less value gained from printing whatever
723            timing data might still be sensible for some non-Jenkins
724            run, than is lost from diagnosing Jenkins FP exceptions on
725            runs about whose execution time we don't care. */
726         GMX_LOG(mdlog.warning)
727                 .asParagraph()
728                 .appendTextFormatted(
729                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
730                         "time accounting",
731                         tot);
732         return;
733     }
734
735     if (wc->haveInvalidCount)
736     {
737         GMX_LOG(mdlog.warning)
738                 .asParagraph()
739                 .appendText(
740                         "NOTE: Detected invalid cycle counts, probably because threads moved "
741                         "between CPU cores that do not have synchronized cycle counters. Will not "
742                         "print the cycle accounting.");
743         return;
744     }
745
746
747     /* Conversion factor from cycles to seconds */
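    /* c2t_pp and c2t_pme additionally rescale by the fraction of all threads
     * that belong to the PP or PME ranks, since the summed cycle counts for
     * those counters only include the threads of the respective rank group. */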
748     c2t    = realtime / tot;
749     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
750     if (npme > 0)
751     {
752         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
753     }
754     else
755     {
756         c2t_pme = 0;
757     }
758
759     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
760
761     print_header(fplog, npp, nth_pp, npme, nth_pme);
762
763     fprintf(fplog, "%s\n", hline);
764     gmx::EnumerationWrapper<WallCycleCounter> iter;
765     for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
766          key != iter.end();
767          ++key)
768     {
769
770         if (is_pme_subcounter(*key))
771         {
772             /* Do not count these at all */
773         }
774         else if (npme > 0 && is_pme_counter(*key))
775         {
776             /* Print timing information for PME-only nodes, but add an
777              * asterisk so the reader of the table can know that the
778              * walltimes are not meant to add up. The asterisk still
779              * fits in the required maximum of 19 characters. */
780             std::string message = gmx::formatString("%s *", enumValuetoString(*key));
781             print_cycles(fplog,
782                          c2t_pme,
783                          message.c_str(),
784                          npme,
785                          nth_pme,
786                          wc->wcc[*key].n,
787                          cyc_sum[static_cast<int>(*key)],
788                          tot);
789         }
790         else
791         {
792             /* Print timing information when it is for a PP or PP+PME
793                node */
794             print_cycles(fplog,
795                          c2t_pp,
796                          enumValuetoString(*key),
797                          npp,
798                          nth_pp,
799                          wc->wcc[*key].n,
800                          cyc_sum[static_cast<int>(*key)],
801                          tot);
802             tot_for_pp += cyc_sum[static_cast<int>(*key)];
803         }
804     }
805     if (!wc->wcc_all.empty())
806     {
807         for (auto i : keysOf(wc->wcc))
808         {
809             const int countI = static_cast<int>(i);
810             for (auto j : keysOf(wc->wcc))
811             {
812                 const int countJ = static_cast<int>(j);
813                 snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j));
814                 print_cycles(fplog,
815                              c2t_pp,
816                              buf,
817                              npp,
818                              nth_pp,
819                              wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n,
820                              wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c,
821                              tot);
822             }
823         }
824     }
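    /* "Rest" is the PP share of the total run time that is not covered by any
     * of the counters printed above. */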
825     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
826     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
827     fprintf(fplog, "%s\n", hline);
828     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
829     fprintf(fplog, "%s\n", hline);
830
831     if (npme > 0)
832     {
833         fprintf(fplog,
834                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
835                 "    twice the total reported, but the cycle count total and %% are correct.\n"
836                 "%s\n",
837                 hline);
838     }
839
840     if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
841     {
842         // A workaround to avoid printing the breakdown when no subcounters were recorded.
843         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
844         std::vector<WallCycleCounter> validPmeSubcounterIndices;
845         for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
846              key != iter.end();
847              key++)
848         {
849             if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0)
850             {
851                 validPmeSubcounterIndices.push_back(*key);
852             }
853         }
854
855         if (!validPmeSubcounterIndices.empty())
856         {
857             fprintf(fplog, " Breakdown of PME mesh computation\n");
858             fprintf(fplog, "%s\n", hline);
859             for (auto i : validPmeSubcounterIndices)
860             {
861                 print_cycles(fplog,
862                              npme > 0 ? c2t_pme : c2t_pp,
863                              enumValuetoString(i),
864                              npme > 0 ? npme : npp,
865                              nth_pme,
866                              wc->wcc[i].n,
867                              cyc_sum[static_cast<int>(i)],
868                              tot);
869             }
870             fprintf(fplog, "%s\n", hline);
871         }
872     }
873
874     if (sc_useCycleSubcounters)
875     {
876         fprintf(fplog, " Breakdown of PP computation\n");
877         fprintf(fplog, "%s\n", hline);
878         for (auto key : keysOf(wc->wcsc))
879         {
880             print_cycles(fplog,
881                          c2t_pp,
882                          enumValuetoString(key),
883                          npp,
884                          nth_pp,
885                          wc->wcsc[key].n,
886                          cyc_sum[sc_numWallCycleCounters + static_cast<int>(key)],
887                          tot);
888         }
889         fprintf(fplog, "%s\n", hline);
890     }
891
892     /* print GPU timing summary */
893     double tot_gpu = 0.0;
894     if (gpu_pme_t)
895     {
896         for (auto key : keysOf(gpu_pme_t->timing))
897         {
898             tot_gpu += gpu_pme_t->timing[key].t;
899         }
900     }
901     if (gpu_nbnxn_t)
902     {
903         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
904                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
905         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
906
907         /* add up the kernel timings */
908         for (int i = 0; i < 2; i++)
909         {
910             for (int j = 0; j < 2; j++)
911             {
912                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
913             }
914         }
915         tot_gpu += gpu_nbnxn_t->pruneTime.t;
916
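        /* The CPU work that can overlap with the GPU consists of the Force
         * counter, plus PME mesh when it ran on the CPU; converted below from
         * cycles to ms for comparison with the GPU timings. */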
917         tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c;
918         if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
919         {
920             tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c;
921         }
922         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
923
924         fprintf(fplog, "\n GPU timings\n%s\n", hline);
925         fprintf(fplog,
926                 " Computing:                         Count  Wall t (s)      ms/step       %c\n",
927                 '%');
928         fprintf(fplog, "%s\n", hline);
929         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
930         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
931
932         for (int i = 0; i < 2; i++)
933         {
934             for (int j = 0; j < 2; j++)
935             {
936                 if (gpu_nbnxn_t->ktime[i][j].c)
937                 {
938                     print_gputimes(fplog,
939                                    k_log_str[i][j],
940                                    gpu_nbnxn_t->ktime[i][j].c,
941                                    gpu_nbnxn_t->ktime[i][j].t,
942                                    tot_gpu);
943                 }
944             }
945         }
946         if (gpu_pme_t)
947         {
948             for (auto key : keysOf(gpu_pme_t->timing))
949             {
950                 if (gpu_pme_t->timing[key].c)
951                 {
952                     print_gputimes(fplog,
953                                    enumValuetoString(key),
954                                    gpu_pme_t->timing[key].c,
955                                    gpu_pme_t->timing[key].t,
956                                    tot_gpu);
957                 }
958             }
959         }
960         if (gpu_nbnxn_t->pruneTime.c)
961         {
962             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
963         }
964         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
965         fprintf(fplog, "%s\n", hline);
966         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
967         fprintf(fplog, "%s\n", hline);
968         if (gpu_nbnxn_t->dynamicPruneTime.c)
969         {
970             /* We print the dynamic pruning kernel timings after a separator
971              * and avoid adding them to tot_gpu, as this time is not part of the
972              * force overlap. We print the fraction relative to the rest.
973              */
974             print_gputimes(fplog,
975                            "*Dynamic pruning",
976                            gpu_nbnxn_t->dynamicPruneTime.c,
977                            gpu_nbnxn_t->dynamicPruneTime.t,
978                            tot_gpu);
979             fprintf(fplog, "%s\n", hline);
980         }
981         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
982         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0)
983         {
984             fprintf(fplog,
985                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
986                     "%.3f\n",
987                     tot_gpu / gpu_nbnxn_t->nb_c,
988                     tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n,
989                     gpu_cpu_ratio);
990         }
991
992         /* only print notes related to CPU-GPU load balance with PME */
993         if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
994         {
995             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
996
997             /* Print a note if the imbalance is high in the PME case, where
998              * CPU-GPU load balancing is possible */
999             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1000             {
1001                 /* Only the sim master calls this function, so always print to stderr */
1002                 if (gpu_cpu_ratio < 0.8)
1003                 {
1004                     if (npp > 1)
1005                     {
1006                         /* The user could have used -notunepme,
1007                          * but we currently can't check that here.
1008                          */
1009                         GMX_LOG(mdlog.warning)
1010                                 .asParagraph()
1011                                 .appendText(
1012                                         "NOTE: The CPU has >25% more load than the GPU. This "
1013                                         "imbalance wastes\n"
1014                                         "      GPU resources. Maybe the domain decomposition "
1015                                         "limits the PME tuning.\n"
1016                                         "      In that case, try setting the DD grid manually "
1017                                         "(-dd) or lowering -dds.");
1018                     }
1019                     else
1020                     {
1021                         /* We should not end up here, unless the box is
1022                          * too small for increasing the cut-off for PME tuning.
1023                          */
1024                         GMX_LOG(mdlog.warning)
1025                                 .asParagraph()
1026                                 .appendText(
1027                                         "NOTE: The CPU has >25% more load than the GPU. This "
1028                                         "imbalance wastes\n"
1029                                         "      GPU resources.");
1030                     }
1031                 }
1032                 if (gpu_cpu_ratio > 1.25)
1033                 {
1034                     GMX_LOG(mdlog.warning)
1035                             .asParagraph()
1036                             .appendText(
1037                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1038                                     "wastes\n"
1039                                     "      CPU resources.");
1040                 }
1041             }
1042         }
1043     }
1044
1045     if (wc->wc_barrier)
1046     {
1047         GMX_LOG(mdlog.warning)
1048                 .asParagraph()
1049                 .appendText(
1050                         "MPI_Barrier was called before each cycle start/stop\n"
1051                         "call, so timings are not those of real runs.");
1052     }
1053
1054     if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0
1055         && (cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] > tot * 0.1
1056             || cyc_sum[static_cast<int>(WallCycleCounter::NS)] > tot * 0.1))
1057     {
1058         /* Only the sim master calls this function, so always print to stderr */
1059         if (wc->wcc[WallCycleCounter::Domdec].n == 0)
1060         {
1061             GMX_LOG(mdlog.warning)
1062                     .asParagraph()
1063                     .appendTextFormatted(
1064                             "NOTE: %d %% of the run time was spent in pair search,\n"
1065                             "      you might want to increase nstlist (this has no effect on "
1066                             "accuracy)\n",
1067                             gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
1068         }
1069         else
1070         {
1071             GMX_LOG(mdlog.warning)
1072                     .asParagraph()
1073                     .appendTextFormatted(
1074                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1075                             "      %d %% of the run time was spent in pair search,\n"
1076                             "      you might want to increase nstlist (this has no effect on "
1077                             "accuracy)\n",
1078                             gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] / tot),
1079                             gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
1080         }
1081     }
1082
1083     if (cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] > tot * 0.05)
1084     {
1085         GMX_LOG(mdlog.warning)
1086                 .asParagraph()
1087                 .appendTextFormatted(
1088                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1089                         "      you might want to increase some nst* mdp options\n",
1090                         gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] / tot));
1091     }
1092 }
1093
1094 int64_t wcycle_get_reset_counters(gmx_wallcycle* wc)
1095 {
1096     if (wc == nullptr)
1097     {
1098         return -1;
1099     }
1100     return wc->reset_counters;
1101 }
1102
1103 void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters)
1104 {
1105     if (wc == nullptr)
1106     {
1107         return;
1108     }
1109     wc->reset_counters = reset_counters;
1110 }