src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "wallcycle.h"
  40
  41 #include "config.h"
  42
  43 #include <cstdlib>
  44
  45 #include <array>
  46 #include <vector>
  47
  48 #include "gromacs/math/functions.h"
  49 #include "gromacs/mdtypes/commrec.h"
  50 #include "gromacs/timing/cyclecounter.h"
  51 #include "gromacs/timing/gpu_timing.h"
  52 #include "gromacs/timing/wallcyclereporting.h"
  53 #include "gromacs/utility/cstringutil.h"
  54 #include "gromacs/utility/gmxassert.h"
  55 #include "gromacs/utility/gmxmpi.h"
  56 #include "gromacs/utility/logger.h"
  57 #include "gromacs/utility/smalloc.h"
  58 #include "gromacs/utility/snprintf.h"
  59
  60 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  61
  62 /* DEBUG_WCYCLE adds consistency checking for the counters.
  63  * It checks if you stop a counter different from the last
  64  * one that was opened and if you do nest too deep.
  65  */
  66 /* #define DEBUG_WCYCLE */
  67
  68 #ifdef DEBUG_WCYCLE
  69 #    include "gromacs/utility/fatalerror.h"
  70 #endif
  71
  72 typedef struct
  73 {
  74     int          n;
  75     gmx_cycles_t c;
  76     gmx_cycles_t start;
  77 } wallcc_t;
  78
  79 struct gmx_wallcycle
  80 {
  81     wallcc_t* wcc;
  82     /* did we detect one or more invalid cycle counts */
  83     gmx_bool haveInvalidCount;
  84     /* variables for testing/debugging */
  85     gmx_bool  wc_barrier;
  86     wallcc_t* wcc_all;
  87     int       wc_depth;
  88 #ifdef DEBUG_WCYCLE
  89 #    define DEPTH_MAX 6
  90     int counterlist[DEPTH_MAX];
  91     int count_depth;
  92 #endif
  93     int          ewc_prev;
  94     gmx_cycles_t cycle_prev;
  95     int64_t      reset_counters;
  96 #if GMX_MPI
  97     MPI_Comm mpi_comm_mygroup;
  98 #endif
  99     wallcc_t* wcsc;
 100 };
 101
 102 /* Each name should not exceed 19 printing characters
 103    (ie. terminating null can be twentieth) */
 104 static const char* wcn[ewcNR] = { "Run",
 105                                   "Step",
 106                                   "PP during PME",
 107                                   "Domain decomp.",
 108                                   "DD comm. load",
 109                                   "DD comm. bounds",
 110                                   "Vsite constr.",
 111                                   "Send X to PME",
 112                                   "Neighbor search",
 113                                   "Launch GPU ops.",
 114                                   "Comm. coord.",
 115                                   "Force",
 116                                   "Wait + Comm. F",
 117                                   "PME mesh",
 118                                   "PME redist. X/F",
 119                                   "PME spread",
 120                                   "PME gather",
 121                                   "PME 3D-FFT",
 122                                   "PME 3D-FFT Comm.",
 123                                   "PME solve LJ",
 124                                   "PME solve Elec",
 125                                   "PME wait for PP",
 126                                   "Wait + Recv. PME F",
 127                                   "Wait PME GPU spread",
 128                                   "PME 3D-FFT",
 129                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 130                                   "Wait PME GPU gather",
 131                                   "Wait Bonded GPU",
 132                                   "Reduce GPU PME F",
 133                                   "Wait GPU NB nonloc.",
 134                                   "Wait GPU NB local",
 135                                   "Wait GPU state copy",
 136                                   "NB X/F buffer ops.",
 137                                   "Vsite spread",
 138                                   "COM pull force",
 139                                   "AWH",
 140                                   "Write traj.",
 141                                   "Update",
 142                                   "Constraints",
 143                                   "Comm. energies",
 144                                   "Enforced rotation",
 145                                   "Add rot. forces",
 146                                   "Position swapping",
 147                                   "IMD",
 148                                   "Test" };
 149
 150 static const char* wcsn[ewcsNR] = {
 151     "DD redist.",
 152     "DD NS grid + sort",
 153     "DD setup comm.",
 154     "DD make top.",
 155     "DD make constr.",
 156     "DD top. other",
 157     "NS grid local",
 158     "NS grid non-loc.",
 159     "NS search local",
 160     "NS search non-loc.",
 161     "Bonded F",
 162     "Bonded-FEP F",
 163     "Restraints F",
 164     "Listed buffer ops.",
 165     "Nonbonded pruning",
 166     "Nonbonded F kernel",
 167     "Nonbonded F clear",
 168     "Nonbonded FEP",
 169     "Launch NB GPU tasks",
 170     "Launch Bonded GPU tasks",
 171     "Launch PME GPU tasks",
 172     "Launch state copy",
 173     "Ewald F correction",
 174     "NB X buffer ops.",
 175     "NB F buffer ops.",
 176     "Clear force buffer",
 177     "Test subcounter",
 178 };
 179
 180 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 181 static const char* PMEStageNames[] = {
 182     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 183     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 184 };
 185
 186 gmx_bool wallcycle_have_counter()
 187 {
 188     return gmx_cycles_have_counter();
 189 }
 190
 191 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 192 {
 193     gmx_wallcycle_t wc;
 194
 195
 196     if (!wallcycle_have_counter())
 197     {
 198         return nullptr;
 199     }
 200
 201     snew(wc, 1);
 202
 203     wc->haveInvalidCount = FALSE;
 204     wc->wc_barrier       = FALSE;
 205     wc->wcc_all          = nullptr;
 206     wc->wc_depth         = 0;
 207     wc->ewc_prev         = -1;
 208     wc->reset_counters   = resetstep;
 209
 210 #if GMX_MPI
 211     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 212     {
 213         if (fplog)
 214         {
 215             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 216         }
 217         wc->wc_barrier       = TRUE;
 218         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 219     }
 220 #endif
 221
 222     snew(wc->wcc, ewcNR);
 223     if (getenv("GMX_CYCLE_ALL") != nullptr)
 224     {
 225         if (fplog)
 226         {
 227             fprintf(fplog, "\nWill time all the code during the run\n\n");
 228         }
 229         snew(wc->wcc_all, ewcNR * ewcNR);
 230     }
 231
 232     if (useCycleSubcounters)
 233     {
 234         snew(wc->wcsc, ewcsNR);
 235     }
 236
 237 #ifdef DEBUG_WCYCLE
 238     wc->count_depth = 0;
 239 #endif
 240
 241     return wc;
 242 }
 243
 244 void wallcycle_destroy(gmx_wallcycle_t wc)
 245 {
 246     if (wc == nullptr)
 247     {
 248         return;
 249     }
 250
 251     if (wc->wcc != nullptr)
 252     {
 253         sfree(wc->wcc);
 254     }
 255     if (wc->wcc_all != nullptr)
 256     {
 257         sfree(wc->wcc_all);
 258     }
 259     if (wc->wcsc != nullptr)
 260     {
 261         sfree(wc->wcsc);
 262     }
 263     sfree(wc);
 264 }
 265
 266 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 267 {
 268     wc->ewc_prev   = ewc;
 269     wc->cycle_prev = cycle;
 270 }
 271
 272 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 273 {
 274     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 275     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 276 }
 277
 278
 279 #ifdef DEBUG_WCYCLE
 280 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 281 {
 282     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 283
 284     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 285     {
 286         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
 287     }
 288     wc->counterlist[wc->count_depth] = ewc;
 289     wc->count_depth++;
 290 }
 291
 292 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 293 {
 294     wc->count_depth--;
 295
 296     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 297
 298     if (wc->count_depth < 0)
 299     {
 300         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
 301                   wc->count_depth);
 302     }
 303     if (wc->counterlist[wc->count_depth] != ewc)
 304     {
 305         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 306                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 307     }
 308 }
 309 #endif
 310
 311 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 312 {
 313     gmx_cycles_t cycle;
 314
 315     if (wc == nullptr)
 316     {
 317         return;
 318     }
 319
 320 #if GMX_MPI
 321     if (wc->wc_barrier)
 322     {
 323         MPI_Barrier(wc->mpi_comm_mygroup);
 324     }
 325 #endif
 326
 327 #ifdef DEBUG_WCYCLE
 328     debug_start_check(wc, ewc);
 329 #endif
 330
 331     cycle              = gmx_cycles_read();
 332     wc->wcc[ewc].start = cycle;
 333     if (wc->wcc_all != nullptr)
 334     {
 335         wc->wc_depth++;
 336         if (ewc == ewcRUN)
 337         {
 338             wallcycle_all_start(wc, ewc, cycle);
 339         }
 340         else if (wc->wc_depth == 3)
 341         {
 342             wallcycle_all_stop(wc, ewc, cycle);
 343         }
 344     }
 345 }
 346
 347 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 348 {
 349     if (wc == nullptr)
 350     {
 351         return;
 352     }
 353     wc->wcc[ewc].n++;
 354 }
 355
 356 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 357 {
 358     if (wc == nullptr)
 359     {
 360         return;
 361     }
 362
 363     wallcycle_start(wc, ewc);
 364     wc->wcc[ewc].n--;
 365 }
 366
 367 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 368 {
 369     gmx_cycles_t cycle, last;
 370
 371     if (wc == nullptr)
 372     {
 373         return 0;
 374     }
 375
 376 #if GMX_MPI
 377     if (wc->wc_barrier)
 378     {
 379         MPI_Barrier(wc->mpi_comm_mygroup);
 380     }
 381 #endif
 382
 383 #ifdef DEBUG_WCYCLE
 384     debug_stop_check(wc, ewc);
 385 #endif
 386
 387     /* When processes or threads migrate between cores, the cycle counting
 388      * can get messed up if the cycle counter on different cores are not
 389      * synchronized. When this happens we expect both large negative and
 390      * positive cycle differences. We can detect negative cycle differences.
 391      * Detecting too large positive counts if difficult, since count can be
 392      * large, especially for ewcRUN. If we detect a negative count,
 393      * we will not print the cycle accounting table.
 394      */
 395     cycle = gmx_cycles_read();
 396     if (cycle >= wc->wcc[ewc].start)
 397     {
 398         last = cycle - wc->wcc[ewc].start;
 399     }
 400     else
 401     {
 402         last                 = 0;
 403         wc->haveInvalidCount = TRUE;
 404     }
 405     wc->wcc[ewc].c += last;
 406     wc->wcc[ewc].n++;
 407     if (wc->wcc_all)
 408     {
 409         wc->wc_depth--;
 410         if (ewc == ewcRUN)
 411         {
 412             wallcycle_all_stop(wc, ewc, cycle);
 413         }
 414         else if (wc->wc_depth == 2)
 415         {
 416             wallcycle_all_start(wc, ewc, cycle);
 417         }
 418     }
 419
 420     return last;
 421 }
 422
 423 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 424 {
 425     *n = wc->wcc[ewc].n;
 426     *c = static_cast<double>(wc->wcc[ewc].c);
 427 }
 428
 429 void wallcycle_reset_all(gmx_wallcycle_t wc)
 430 {
 431     int i;
 432
 433     if (wc == nullptr)
 434     {
 435         return;
 436     }
 437
 438     for (i = 0; i < ewcNR; i++)
 439     {
 440         wc->wcc[i].n = 0;
 441         wc->wcc[i].c = 0;
 442     }
 443     wc->haveInvalidCount = FALSE;
 444
 445     if (wc->wcc_all)
 446     {
 447         for (i = 0; i < ewcNR * ewcNR; i++)
 448         {
 449             wc->wcc_all[i].n = 0;
 450             wc->wcc_all[i].c = 0;
 451         }
 452     }
 453     if (wc->wcsc)
 454     {
 455         for (i = 0; i < ewcsNR; i++)
 456         {
 457             wc->wcsc[i].n = 0;
 458             wc->wcsc[i].c = 0;
 459         }
 460     }
 461 }
 462
 463 static gmx_bool is_pme_counter(int ewc)
 464 {
 465     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 466 }
 467
 468 static gmx_bool is_pme_subcounter(int ewc)
 469 {
 470     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 471 }
 472
 473 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 474 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 475 {
 476     if (wcc[ewc_sub].n > 0)
 477     {
 478         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 479         {
 480             wcc[ewc_main].c -= wcc[ewc_sub].c;
 481         }
 482         else
 483         {
 484             /* Something is wrong with the cycle counting */
 485             wcc[ewc_main].c = 0;
 486         }
 487     }
 488 }
 489
 490 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 491 {
 492     if (wc == nullptr)
 493     {
 494         return;
 495     }
 496
 497     for (int i = 0; i < ewcNR; i++)
 498     {
 499         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 500         {
 501             wc->wcc[i].c *= nthreads_pme;
 502
 503             if (wc->wcc_all)
 504             {
 505                 for (int j = 0; j < ewcNR; j++)
 506                 {
 507                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 508                 }
 509             }
 510         }
 511         else
 512         {
 513             wc->wcc[i].c *= nthreads_pp;
 514
 515             if (wc->wcc_all)
 516             {
 517                 for (int j = 0; j < ewcNR; j++)
 518                 {
 519                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 520                 }
 521             }
 522         }
 523     }
 524     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 525     {
 526         for (int i = 0; i < ewcsNR; i++)
 527         {
 528             wc->wcsc[i].c *= nthreads_pp;
 529         }
 530     }
 531 }
 532
 533 /* TODO Make an object for this function to return, containing some
 534  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 535  * wcsc, AND the original wcc for rank 0.
 536  *
 537  * The GPU timing is reported only for rank 0, so we want to preserve
 538  * the original wcycle on that rank. Rank 0 also reports the global
 539  * counts before that, so needs something to contain the global data
 540  * without over-writing the rank-0 data. The current implementation
 541  * uses cycles_sum to manage this, which works OK now because wcsc and
 542  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 543  * for the future. Also, there's no need for MPI_Allreduce, since
 544  * only MASTERRANK uses any of the results. */
 545 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 546 {
 547     WallcycleCounts cycles_sum;
 548     wallcc_t*       wcc;
 549     double          cycles[ewcNR + ewcsNR];
 550 #if GMX_MPI
 551     double cycles_n[ewcNR + ewcsNR + 1];
 552 #endif
 553     int i;
 554     int nsum;
 555
 556     if (wc == nullptr)
 557     {
 558         /* Default construction of std::array of non-class T can leave
 559            the values indeterminate, just like a C array, and icc
 560            warns about it. */
 561         cycles_sum.fill(0);
 562         return cycles_sum;
 563     }
 564
 565     wcc = wc->wcc;
 566
 567     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 568     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 569
 570     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 571
 572     if (cr->npmenodes == 0)
 573     {
 574         /* All nodes do PME (or no PME at all) */
 575         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 576     }
 577     else
 578     {
 579         /* The are PME-only nodes */
 580         if (wcc[ewcPMEMESH].n > 0)
 581         {
 582             /* This must be a PME only node, calculate the Wait + Comm. time */
 583             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 584                        "Total run ticks must be greater than PME-only ticks");
 585             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 586         }
 587     }
 588
 589     /* Store the cycles in a double buffer for summing */
 590     for (i = 0; i < ewcNR; i++)
 591     {
 592 #if GMX_MPI
 593         cycles_n[i] = static_cast<double>(wcc[i].n);
 594 #endif
 595         cycles[i] = static_cast<double>(wcc[i].c);
 596     }
 597     nsum = ewcNR;
 598     if (wc->wcsc)
 599     {
 600         for (i = 0; i < ewcsNR; i++)
 601         {
 602 #if GMX_MPI
 603             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 604 #endif
 605             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 606         }
 607         nsum += ewcsNR;
 608     }
 609
 610 #if GMX_MPI
 611     if (cr->nnodes > 1)
 612     {
 613         double buf[ewcNR + ewcsNR + 1];
 614
 615         // TODO this code is used only at the end of the run, so we
 616         // can just do a simple reduce of haveInvalidCount in
 617         // wallcycle_print, and avoid bugs
 618         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 619         // TODO Use MPI_Reduce
 620         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 621         for (i = 0; i < ewcNR; i++)
 622         {
 623             wcc[i].n = gmx::roundToInt(buf[i]);
 624         }
 625         wc->haveInvalidCount = (buf[nsum] > 0);
 626         if (wc->wcsc)
 627         {
 628             for (i = 0; i < ewcsNR; i++)
 629             {
 630                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 631             }
 632         }
 633
 634         // TODO Use MPI_Reduce
 635         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 636
 637         if (wc->wcc_all != nullptr)
 638         {
 639             double *buf_all, *cyc_all;
 640
 641             snew(cyc_all, ewcNR * ewcNR);
 642             snew(buf_all, ewcNR * ewcNR);
 643             for (i = 0; i < ewcNR * ewcNR; i++)
 644             {
 645                 cyc_all[i] = wc->wcc_all[i].c;
 646             }
 647             // TODO Use MPI_Reduce
 648             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 649             for (i = 0; i < ewcNR * ewcNR; i++)
 650             {
 651                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 652             }
 653             sfree(buf_all);
 654             sfree(cyc_all);
 655         }
 656     }
 657     else
 658 #endif
 659     {
 660         for (i = 0; i < nsum; i++)
 661         {
 662             cycles_sum[i] = cycles[i];
 663         }
 664     }
 665
 666     return cycles_sum;
 667 }
 668
 669 static void
 670 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 671 {
 672     char   nnodes_str[STRLEN];
 673     char   nthreads_str[STRLEN];
 674     char   ncalls_str[STRLEN];
 675     double wallt;
 676     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 677
 678     if (c_sum > 0)
 679     {
 680         if (ncalls > 0)
 681         {
 682             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 683             if (nnodes < 0)
 684             {
 685                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 686             }
 687             else
 688             {
 689                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 690             }
 691             if (nthreads < 0)
 692             {
 693                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 694             }
 695             else
 696             {
 697                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 698             }
 699         }
 700         else
 701         {
 702             nnodes_str[0]   = 0;
 703             nthreads_str[0] = 0;
 704             ncalls_str[0]   = 0;
 705         }
 706         /* Convert the cycle count to wallclock time for this task */
 707         wallt = c_sum * c2t;
 708
 709         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
 710                 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
 711     }
 712 }
 713
 714 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 715 {
 716     char num[11];
 717     char avg_perf[11];
 718
 719     if (n > 0)
 720     {
 721         snprintf(num, sizeof(num), "%10d", n);
 722         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 723     }
 724     else
 725     {
 726         sprintf(num, "          ");
 727         sprintf(avg_perf, "          ");
 728     }
 729     if (t != tot_t && tot_t > 0)
 730     {
 731         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 732     }
 733     else
 734     {
 735         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 736     }
 737 }
 738
 739 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 740 {
 741     int nrank_tot = nrank_pp + nrank_pme;
 742     if (0 == nrank_pme)
 743     {
 744         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 745         if (nth_pp > 1)
 746         {
 747             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 748         }
 749         /* Don't report doing PP+PME, because we can't tell here if
 750          * this is RF, etc. */
 751     }
 752     else
 753     {
 754         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 755         if (nth_pp > 1)
 756         {
 757             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 758         }
 759         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 760         if (nth_pme > 1)
 761         {
 762             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 763         }
 764     }
 765
 766     fprintf(fplog, "\n\n");
 767     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 768     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 769 }
 770
 771
 772 void wallcycle_print(FILE*                            fplog,
 773                      const gmx::MDLogger&             mdlog,
 774                      int                              nnodes,
 775                      int                              npme,
 776                      int                              nth_pp,
 777                      int                              nth_pme,
 778                      double                           realtime,
 779                      gmx_wallcycle_t                  wc,
 780                      const WallcycleCounts&           cyc_sum,
 781                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 782                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 783 {
 784     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 785     double      c2t, c2t_pp, c2t_pme = 0;
 786     int         i, j, npp, nth_tot;
 787     char        buf[STRLEN];
 788     const char* hline =
 789             "-----------------------------------------------------------------------------";
 790
 791     if (wc == nullptr)
 792     {
 793         return;
 794     }
 795
 796     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 797     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 798     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 799     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 800     npp = nnodes - npme;
 801     /* npme is the number of PME-only ranks used, and we always do PP work */
 802     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 803
 804     nth_tot = npp * nth_pp + npme * nth_pme;
 805
 806     /* When using PME-only nodes, the next line is valid for both
 807        PP-only and PME-only nodes because they started ewcRUN at the
 808        same time. */
 809     tot        = cyc_sum[ewcRUN];
 810     tot_for_pp = 0;
 811
 812     if (tot <= 0.0)
 813     {
 814         /* TODO This is heavy handed, but until someone reworks the
 815            code so that it is provably robust with respect to
 816            non-positive values for all possible timer and cycle
 817            counters, there is less value gained from printing whatever
 818            timing data might still be sensible for some non-Jenkins
 819            run, than is lost from diagnosing Jenkins FP exceptions on
 820            runs about whose execution time we don't care. */
 821         GMX_LOG(mdlog.warning)
 822                 .asParagraph()
 823                 .appendTextFormatted(
 824                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 825                         "time accounting",
 826                         tot);
 827         return;
 828     }
 829
 830     if (wc->haveInvalidCount)
 831     {
 832         GMX_LOG(mdlog.warning)
 833                 .asParagraph()
 834                 .appendText(
 835                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 836                         "between CPU cores that do not have synchronized cycle counters. Will not "
 837                         "print the cycle accounting.");
 838         return;
 839     }
 840
 841
 842     /* Conversion factor from cycles to seconds */
 843     c2t    = realtime / tot;
 844     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 845     if (npme > 0)
 846     {
 847         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 848     }
 849     else
 850     {
 851         c2t_pme = 0;
 852     }
 853
 854     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 855
 856     print_header(fplog, npp, nth_pp, npme, nth_pme);
 857
 858     fprintf(fplog, "%s\n", hline);
 859     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 860     {
 861         if (is_pme_subcounter(i))
 862         {
 863             /* Do not count these at all */
 864         }
 865         else if (npme > 0 && is_pme_counter(i))
 866         {
 867             /* Print timing information for PME-only nodes, but add an
 868              * asterisk so the reader of the table can know that the
 869              * walltimes are not meant to add up. The asterisk still
 870              * fits in the required maximum of 19 characters. */
 871             char buffer[STRLEN];
 872             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 873             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 874         }
 875         else
 876         {
 877             /* Print timing information when it is for a PP or PP+PME
 878                node */
 879             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 880             tot_for_pp += cyc_sum[i];
 881         }
 882     }
 883     if (wc->wcc_all != nullptr)
 884     {
 885         for (i = 0; i < ewcNR; i++)
 886         {
 887             for (j = 0; j < ewcNR; j++)
 888             {
 889                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 890                 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
 891                              wc->wcc_all[i * ewcNR + j].c, tot);
 892             }
 893         }
 894     }
 895     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 896     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 897     fprintf(fplog, "%s\n", hline);
 898     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 899     fprintf(fplog, "%s\n", hline);
 900
 901     if (npme > 0)
 902     {
 903         fprintf(fplog,
 904                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 905                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 906                 "%s\n",
 907                 hline);
 908     }
 909
 910     if (wc->wcc[ewcPMEMESH].n > 0)
 911     {
 912         // A workaround to not print breakdown when no subcounters were recorded.
 913         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 914         std::vector<int> validPmeSubcounterIndices;
 915         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 916         {
 917             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 918             {
 919                 validPmeSubcounterIndices.push_back(i);
 920             }
 921         }
 922
 923         if (!validPmeSubcounterIndices.empty())
 924         {
 925             fprintf(fplog, " Breakdown of PME mesh computation\n");
 926             fprintf(fplog, "%s\n", hline);
 927             for (auto i : validPmeSubcounterIndices)
 928             {
 929                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
 930                              nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 931             }
 932             fprintf(fplog, "%s\n", hline);
 933         }
 934     }
 935
 936     if (useCycleSubcounters && wc->wcsc)
 937     {
 938         fprintf(fplog, " Breakdown of PP computation\n");
 939         fprintf(fplog, "%s\n", hline);
 940         for (i = 0; i < ewcsNR; i++)
 941         {
 942             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 943         }
 944         fprintf(fplog, "%s\n", hline);
 945     }
 946
 947     /* print GPU timing summary */
 948     double tot_gpu = 0.0;
 949     if (gpu_pme_t)
 950     {
 951         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 952         {
 953             tot_gpu += gpu_pme_t->timing[k].t;
 954         }
 955     }
 956     if (gpu_nbnxn_t)
 957     {
 958         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 959                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 960         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 961
 962         /* add up the kernel timings */
 963         for (i = 0; i < 2; i++)
 964         {
 965             for (j = 0; j < 2; j++)
 966             {
 967                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 968             }
 969         }
 970         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 971
 972         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 973         if (wc->wcc[ewcPMEMESH].n > 0)
 974         {
 975             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 976         }
 977         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 978
 979         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 980         fprintf(fplog,
 981                 " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 982         fprintf(fplog, "%s\n", hline);
 983         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 984         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 985
 986         for (i = 0; i < 2; i++)
 987         {
 988             for (j = 0; j < 2; j++)
 989             {
 990                 if (gpu_nbnxn_t->ktime[i][j].c)
 991                 {
 992                     print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
 993                                    gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 994                 }
 995             }
 996         }
 997         if (gpu_pme_t)
 998         {
 999             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1000             {
1001                 if (gpu_pme_t->timing[k].c)
1002                 {
1003                     print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1004                                    gpu_pme_t->timing[k].t, tot_gpu);
1005                 }
1006             }
1007         }
1008         if (gpu_nbnxn_t->pruneTime.c)
1009         {
1010             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1011                            gpu_nbnxn_t->pruneTime.t, tot_gpu);
1012         }
1013         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1014         fprintf(fplog, "%s\n", hline);
1015         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1016         fprintf(fplog, "%s\n", hline);
1017         if (gpu_nbnxn_t->dynamicPruneTime.c)
1018         {
1019             /* We print the dynamic pruning kernel timings after a separator
1020              * and avoid adding it to tot_gpu as this is not in the force
1021              * overlap. We print the fraction as relative to the rest.
1022              */
1023             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1024                            gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1025             fprintf(fplog, "%s\n", hline);
1026         }
1027         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1028         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1029         {
1030             fprintf(fplog,
1031                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1032                     "%.3f\n",
1033                     tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1034         }
1035
1036         /* only print notes related to CPU-GPU load balance with PME */
1037         if (wc->wcc[ewcPMEMESH].n > 0)
1038         {
1039             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1040
1041             /* print note if the imbalance is high with PME case in which
1042              * CPU-GPU load balancing is possible */
1043             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1044             {
1045                 /* Only the sim master calls this function, so always print to stderr */
1046                 if (gpu_cpu_ratio < 0.8)
1047                 {
1048                     if (npp > 1)
1049                     {
1050                         /* The user could have used -notunepme,
1051                          * but we currently can't check that here.
1052                          */
1053                         GMX_LOG(mdlog.warning)
1054                                 .asParagraph()
1055                                 .appendText(
1056                                         "NOTE: The CPU has >25% more load than the GPU. This "
1057                                         "imbalance wastes\n"
1058                                         "      GPU resources. Maybe the domain decomposition "
1059                                         "limits the PME tuning.\n"
1060                                         "      In that case, try setting the DD grid manually "
1061                                         "(-dd) or lowering -dds.");
1062                     }
1063                     else
1064                     {
1065                         /* We should not end up here, unless the box is
1066                          * too small for increasing the cut-off for PME tuning.
1067                          */
1068                         GMX_LOG(mdlog.warning)
1069                                 .asParagraph()
1070                                 .appendText(
1071                                         "NOTE: The CPU has >25% more load than the GPU. This "
1072                                         "imbalance wastes\n"
1073                                         "      GPU resources.");
1074                     }
1075                 }
1076                 if (gpu_cpu_ratio > 1.25)
1077                 {
1078                     GMX_LOG(mdlog.warning)
1079                             .asParagraph()
1080                             .appendText(
1081                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1082                                     "wastes\n"
1083                                     "      CPU resources.");
1084                 }
1085             }
1086         }
1087     }
1088
1089     if (wc->wc_barrier)
1090     {
1091         GMX_LOG(mdlog.warning)
1092                 .asParagraph()
1093                 .appendText(
1094                         "MPI_Barrier was called before each cycle start/stop\n"
1095                         "call, so timings are not those of real runs.");
1096     }
1097
1098     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1099     {
1100         /* Only the sim master calls this function, so always print to stderr */
1101         if (wc->wcc[ewcDOMDEC].n == 0)
1102         {
1103             GMX_LOG(mdlog.warning)
1104                     .asParagraph()
1105                     .appendTextFormatted(
1106                             "NOTE: %d %% of the run time was spent in pair search,\n"
1107                             "      you might want to increase nstlist (this has no effect on "
1108                             "accuracy)\n",
1109                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1110         }
1111         else
1112         {
1113             GMX_LOG(mdlog.warning)
1114                     .asParagraph()
1115                     .appendTextFormatted(
1116                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1117                             "      %d %% of the run time was spent in pair search,\n"
1118                             "      you might want to increase nstlist (this has no effect on "
1119                             "accuracy)\n",
1120                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1121                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1122         }
1123     }
1124
1125     if (cyc_sum[ewcMoveE] > tot * 0.05)
1126     {
1127         GMX_LOG(mdlog.warning)
1128                 .asParagraph()
1129                 .appendTextFormatted(
1130                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1131                         "      you might want to increase some nst* mdp options\n",
1132                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1133     }
1134 }
1135
1136 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1137 {
1138     if (wc == nullptr)
1139     {
1140         return -1;
1141     }
1142
1143     return wc->reset_counters;
1144 }
1145
1146 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1147 {
1148     if (wc == nullptr)
1149     {
1150         return;
1151     }
1152
1153     wc->reset_counters = reset_counters;
1154 }
1155
1156 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1157 {
1158     if (useCycleSubcounters && wc != nullptr)
1159     {
1160         wc->wcsc[ewcs].start = gmx_cycles_read();
1161     }
1162 }
1163
1164 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1165 {
1166     if (useCycleSubcounters && wc != nullptr)
1167     {
1168         wallcycle_sub_start(wc, ewcs);
1169         wc->wcsc[ewcs].n--;
1170     }
1171 }
1172
1173 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1174 {
1175     if (useCycleSubcounters && wc != nullptr)
1176     {
1177         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1178         wc->wcsc[ewcs].n++;
1179     }
1180 }