src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  62
  63 /* DEBUG_WCYCLE adds consistency checking for the counters.
  64  * It checks if you stop a counter different from the last
  65  * one that was opened and if you do nest too deep.
  66  */
  67 /* #define DEBUG_WCYCLE */
  68
  69 #ifdef DEBUG_WCYCLE
  70 #    include "gromacs/utility/fatalerror.h"
  71 #endif
  72
  73 typedef struct
  74 {
  75     int          n;
  76     gmx_cycles_t c;
  77     gmx_cycles_t start;
  78 } wallcc_t;
  79
  80 struct gmx_wallcycle
  81 {
  82     wallcc_t* wcc;
  83     /* did we detect one or more invalid cycle counts */
  84     gmx_bool haveInvalidCount;
  85     /* variables for testing/debugging */
  86     gmx_bool  wc_barrier;
  87     wallcc_t* wcc_all;
  88     int       wc_depth;
  89 #ifdef DEBUG_WCYCLE
  90 #    define DEPTH_MAX 6
  91     int counterlist[DEPTH_MAX];
  92     int count_depth;
  93 #endif
  94     int          ewc_prev;
  95     gmx_cycles_t cycle_prev;
  96     int64_t      reset_counters;
  97 #if GMX_MPI
  98     MPI_Comm mpi_comm_mygroup;
  99 #endif
 100     wallcc_t* wcsc;
 101 };
 102
 103 /* Each name should not exceed 19 printing characters
 104    (ie. terminating null can be twentieth) */
 105 static const char* wcn[ewcNR] = { "Run",
 106                                   "Step",
 107                                   "PP during PME",
 108                                   "Domain decomp.",
 109                                   "DD comm. load",
 110                                   "DD comm. bounds",
 111                                   "Vsite constr.",
 112                                   "Send X to PME",
 113                                   "Neighbor search",
 114                                   "Launch GPU ops.",
 115                                   "Comm. coord.",
 116                                   "Force",
 117                                   "Wait + Comm. F",
 118                                   "PME mesh",
 119                                   "PME redist. X/F",
 120                                   "PME spread",
 121                                   "PME gather",
 122                                   "PME 3D-FFT",
 123                                   "PME 3D-FFT Comm.",
 124                                   "PME solve LJ",
 125                                   "PME solve Elec",
 126                                   "PME wait for PP",
 127                                   "Wait + Recv. PME F",
 128                                   "Wait PME GPU spread",
 129                                   "PME 3D-FFT",
 130                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 131                                   "Wait PME GPU gather",
 132                                   "Wait Bonded GPU",
 133                                   "Reduce GPU PME F",
 134                                   "Wait GPU NB nonloc.",
 135                                   "Wait GPU NB local",
 136                                   "Wait GPU state copy",
 137                                   "NB X/F buffer ops.",
 138                                   "Vsite spread",
 139                                   "COM pull force",
 140                                   "AWH",
 141                                   "Write traj.",
 142                                   "Update",
 143                                   "Constraints",
 144                                   "Comm. energies",
 145                                   "Enforced rotation",
 146                                   "Add rot. forces",
 147                                   "Position swapping",
 148                                   "IMD",
 149                                   "Test" };
 150
 151 static const char* wcsn[ewcsNR] = {
 152     "DD redist.",
 153     "DD NS grid + sort",
 154     "DD setup comm.",
 155     "DD make top.",
 156     "DD make constr.",
 157     "DD top. other",
 158     "NS grid local",
 159     "NS grid non-loc.",
 160     "NS search local",
 161     "NS search non-loc.",
 162     "Bonded F",
 163     "Bonded-FEP F",
 164     "Restraints F",
 165     "Listed buffer ops.",
 166     "Nonbonded pruning",
 167     "Nonbonded F kernel",
 168     "Nonbonded F clear",
 169     "Nonbonded FEP",
 170     "Launch NB GPU tasks",
 171     "Launch Bonded GPU tasks",
 172     "Launch PME GPU tasks",
 173     "Launch state copy",
 174     "Ewald F correction",
 175     "NB X buffer ops.",
 176     "NB F buffer ops.",
 177     "Clear force buffer",
 178     "Launch GPU NB X buffer ops.",
 179     "Launch GPU NB F buffer ops.",
 180     "Launch GPU Comm. coord.",
 181     "Launch GPU Comm. force.",
 182     "Test subcounter",
 183 };
 184
 185 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 186 static const char* PMEStageNames[] = {
 187     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 188     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 189 };
 190
 191 gmx_bool wallcycle_have_counter()
 192 {
 193     return gmx_cycles_have_counter();
 194 }
 195
 196 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 197 {
 198     gmx_wallcycle_t wc;
 199
 200
 201     if (!wallcycle_have_counter())
 202     {
 203         return nullptr;
 204     }
 205
 206     snew(wc, 1);
 207
 208     wc->haveInvalidCount = FALSE;
 209     wc->wc_barrier       = FALSE;
 210     wc->wcc_all          = nullptr;
 211     wc->wc_depth         = 0;
 212     wc->ewc_prev         = -1;
 213     wc->reset_counters   = resetstep;
 214
 215 #if GMX_MPI
 216     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 217     {
 218         if (fplog)
 219         {
 220             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 221         }
 222         wc->wc_barrier       = TRUE;
 223         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 224     }
 225 #endif
 226
 227     snew(wc->wcc, ewcNR);
 228     if (getenv("GMX_CYCLE_ALL") != nullptr)
 229     {
 230         if (fplog)
 231         {
 232             fprintf(fplog, "\nWill time all the code during the run\n\n");
 233         }
 234         snew(wc->wcc_all, ewcNR * ewcNR);
 235     }
 236
 237     if (useCycleSubcounters)
 238     {
 239         snew(wc->wcsc, ewcsNR);
 240     }
 241
 242 #ifdef DEBUG_WCYCLE
 243     wc->count_depth = 0;
 244 #endif
 245
 246     return wc;
 247 }
 248
 249 void wallcycle_destroy(gmx_wallcycle_t wc)
 250 {
 251     if (wc == nullptr)
 252     {
 253         return;
 254     }
 255
 256     if (wc->wcc != nullptr)
 257     {
 258         sfree(wc->wcc);
 259     }
 260     if (wc->wcc_all != nullptr)
 261     {
 262         sfree(wc->wcc_all);
 263     }
 264     if (wc->wcsc != nullptr)
 265     {
 266         sfree(wc->wcsc);
 267     }
 268     sfree(wc);
 269 }
 270
 271 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 272 {
 273     wc->ewc_prev   = ewc;
 274     wc->cycle_prev = cycle;
 275 }
 276
 277 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 278 {
 279     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 280     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 281 }
 282
 283
 284 #ifdef DEBUG_WCYCLE
 285 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 286 {
 287     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 288
 289     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 290     {
 291         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
 292     }
 293     wc->counterlist[wc->count_depth] = ewc;
 294     wc->count_depth++;
 295 }
 296
 297 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 298 {
 299     wc->count_depth--;
 300
 301     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 302
 303     if (wc->count_depth < 0)
 304     {
 305         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
 306                   wc->count_depth);
 307     }
 308     if (wc->counterlist[wc->count_depth] != ewc)
 309     {
 310         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 311                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 312     }
 313 }
 314 #endif
 315
 316 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 317 {
 318     gmx_cycles_t cycle;
 319
 320     if (wc == nullptr)
 321     {
 322         return;
 323     }
 324
 325 #if GMX_MPI
 326     if (wc->wc_barrier)
 327     {
 328         MPI_Barrier(wc->mpi_comm_mygroup);
 329     }
 330 #endif
 331
 332 #ifdef DEBUG_WCYCLE
 333     debug_start_check(wc, ewc);
 334 #endif
 335
 336     cycle              = gmx_cycles_read();
 337     wc->wcc[ewc].start = cycle;
 338     if (wc->wcc_all != nullptr)
 339     {
 340         wc->wc_depth++;
 341         if (ewc == ewcRUN)
 342         {
 343             wallcycle_all_start(wc, ewc, cycle);
 344         }
 345         else if (wc->wc_depth == 3)
 346         {
 347             wallcycle_all_stop(wc, ewc, cycle);
 348         }
 349     }
 350 }
 351
 352 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 353 {
 354     if (wc == nullptr)
 355     {
 356         return;
 357     }
 358     wc->wcc[ewc].n++;
 359 }
 360
 361 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 362 {
 363     if (wc == nullptr)
 364     {
 365         return;
 366     }
 367
 368     wallcycle_start(wc, ewc);
 369     wc->wcc[ewc].n--;
 370 }
 371
 372 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 373 {
 374     gmx_cycles_t cycle, last;
 375
 376     if (wc == nullptr)
 377     {
 378         return 0;
 379     }
 380
 381 #if GMX_MPI
 382     if (wc->wc_barrier)
 383     {
 384         MPI_Barrier(wc->mpi_comm_mygroup);
 385     }
 386 #endif
 387
 388 #ifdef DEBUG_WCYCLE
 389     debug_stop_check(wc, ewc);
 390 #endif
 391
 392     /* When processes or threads migrate between cores, the cycle counting
 393      * can get messed up if the cycle counter on different cores are not
 394      * synchronized. When this happens we expect both large negative and
 395      * positive cycle differences. We can detect negative cycle differences.
 396      * Detecting too large positive counts if difficult, since count can be
 397      * large, especially for ewcRUN. If we detect a negative count,
 398      * we will not print the cycle accounting table.
 399      */
 400     cycle = gmx_cycles_read();
 401     if (cycle >= wc->wcc[ewc].start)
 402     {
 403         last = cycle - wc->wcc[ewc].start;
 404     }
 405     else
 406     {
 407         last                 = 0;
 408         wc->haveInvalidCount = TRUE;
 409     }
 410     wc->wcc[ewc].c += last;
 411     wc->wcc[ewc].n++;
 412     if (wc->wcc_all)
 413     {
 414         wc->wc_depth--;
 415         if (ewc == ewcRUN)
 416         {
 417             wallcycle_all_stop(wc, ewc, cycle);
 418         }
 419         else if (wc->wc_depth == 2)
 420         {
 421             wallcycle_all_start(wc, ewc, cycle);
 422         }
 423     }
 424
 425     return last;
 426 }
 427
 428 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 429 {
 430     *n = wc->wcc[ewc].n;
 431     *c = static_cast<double>(wc->wcc[ewc].c);
 432 }
 433
 434 void wallcycle_reset_all(gmx_wallcycle_t wc)
 435 {
 436     int i;
 437
 438     if (wc == nullptr)
 439     {
 440         return;
 441     }
 442
 443     for (i = 0; i < ewcNR; i++)
 444     {
 445         wc->wcc[i].n = 0;
 446         wc->wcc[i].c = 0;
 447     }
 448     wc->haveInvalidCount = FALSE;
 449
 450     if (wc->wcc_all)
 451     {
 452         for (i = 0; i < ewcNR * ewcNR; i++)
 453         {
 454             wc->wcc_all[i].n = 0;
 455             wc->wcc_all[i].c = 0;
 456         }
 457     }
 458     if (wc->wcsc)
 459     {
 460         for (i = 0; i < ewcsNR; i++)
 461         {
 462             wc->wcsc[i].n = 0;
 463             wc->wcsc[i].c = 0;
 464         }
 465     }
 466 }
 467
 468 static gmx_bool is_pme_counter(int ewc)
 469 {
 470     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 471 }
 472
 473 static gmx_bool is_pme_subcounter(int ewc)
 474 {
 475     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 476 }
 477
 478 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 479 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 480 {
 481     if (wcc[ewc_sub].n > 0)
 482     {
 483         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 484         {
 485             wcc[ewc_main].c -= wcc[ewc_sub].c;
 486         }
 487         else
 488         {
 489             /* Something is wrong with the cycle counting */
 490             wcc[ewc_main].c = 0;
 491         }
 492     }
 493 }
 494
 495 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 496 {
 497     if (wc == nullptr)
 498     {
 499         return;
 500     }
 501
 502     for (int i = 0; i < ewcNR; i++)
 503     {
 504         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 505         {
 506             wc->wcc[i].c *= nthreads_pme;
 507
 508             if (wc->wcc_all)
 509             {
 510                 for (int j = 0; j < ewcNR; j++)
 511                 {
 512                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 513                 }
 514             }
 515         }
 516         else
 517         {
 518             wc->wcc[i].c *= nthreads_pp;
 519
 520             if (wc->wcc_all)
 521             {
 522                 for (int j = 0; j < ewcNR; j++)
 523                 {
 524                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 525                 }
 526             }
 527         }
 528     }
 529     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 530     {
 531         for (int i = 0; i < ewcsNR; i++)
 532         {
 533             wc->wcsc[i].c *= nthreads_pp;
 534         }
 535     }
 536 }
 537
 538 /* TODO Make an object for this function to return, containing some
 539  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 540  * wcsc, AND the original wcc for rank 0.
 541  *
 542  * The GPU timing is reported only for rank 0, so we want to preserve
 543  * the original wcycle on that rank. Rank 0 also reports the global
 544  * counts before that, so needs something to contain the global data
 545  * without over-writing the rank-0 data. The current implementation
 546  * uses cycles_sum to manage this, which works OK now because wcsc and
 547  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 548  * for the future. Also, there's no need for MPI_Allreduce, since
 549  * only MASTERRANK uses any of the results. */
 550 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 551 {
 552     WallcycleCounts cycles_sum;
 553     wallcc_t*       wcc;
 554     double          cycles[int(ewcNR) + int(ewcsNR)];
 555 #if GMX_MPI
 556     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 557 #endif
 558     int i;
 559     int nsum;
 560
 561     if (wc == nullptr)
 562     {
 563         /* Default construction of std::array of non-class T can leave
 564            the values indeterminate, just like a C array, and icc
 565            warns about it. */
 566         cycles_sum.fill(0);
 567         return cycles_sum;
 568     }
 569
 570     wcc = wc->wcc;
 571
 572     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 573     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 574
 575     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 576
 577     if (cr->npmenodes == 0)
 578     {
 579         /* All nodes do PME (or no PME at all) */
 580         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 581     }
 582     else
 583     {
 584         /* The are PME-only nodes */
 585         if (wcc[ewcPMEMESH].n > 0)
 586         {
 587             /* This must be a PME only node, calculate the Wait + Comm. time */
 588             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 589                        "Total run ticks must be greater than PME-only ticks");
 590             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 591         }
 592     }
 593
 594     /* Store the cycles in a double buffer for summing */
 595     for (i = 0; i < ewcNR; i++)
 596     {
 597 #if GMX_MPI
 598         cycles_n[i] = static_cast<double>(wcc[i].n);
 599 #endif
 600         cycles[i] = static_cast<double>(wcc[i].c);
 601     }
 602     nsum = ewcNR;
 603     if (wc->wcsc)
 604     {
 605         for (i = 0; i < ewcsNR; i++)
 606         {
 607 #if GMX_MPI
 608             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 609 #endif
 610             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 611         }
 612         nsum += ewcsNR;
 613     }
 614
 615 #if GMX_MPI
 616     if (cr->nnodes > 1)
 617     {
 618         double buf[int(ewcNR) + int(ewcsNR) + 1];
 619
 620         // TODO this code is used only at the end of the run, so we
 621         // can just do a simple reduce of haveInvalidCount in
 622         // wallcycle_print, and avoid bugs
 623         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 624         // TODO Use MPI_Reduce
 625         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 626         for (i = 0; i < ewcNR; i++)
 627         {
 628             wcc[i].n = gmx::roundToInt(buf[i]);
 629         }
 630         wc->haveInvalidCount = (buf[nsum] > 0);
 631         if (wc->wcsc)
 632         {
 633             for (i = 0; i < ewcsNR; i++)
 634             {
 635                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 636             }
 637         }
 638
 639         // TODO Use MPI_Reduce
 640         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 641
 642         if (wc->wcc_all != nullptr)
 643         {
 644             double *buf_all, *cyc_all;
 645
 646             snew(cyc_all, ewcNR * ewcNR);
 647             snew(buf_all, ewcNR * ewcNR);
 648             for (i = 0; i < ewcNR * ewcNR; i++)
 649             {
 650                 cyc_all[i] = wc->wcc_all[i].c;
 651             }
 652             // TODO Use MPI_Reduce
 653             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 654             for (i = 0; i < ewcNR * ewcNR; i++)
 655             {
 656                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 657             }
 658             sfree(buf_all);
 659             sfree(cyc_all);
 660         }
 661     }
 662     else
 663 #endif
 664     {
 665         for (i = 0; i < nsum; i++)
 666         {
 667             cycles_sum[i] = cycles[i];
 668         }
 669     }
 670
 671     return cycles_sum;
 672 }
 673
 674 static void
 675 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 676 {
 677     char   nnodes_str[STRLEN];
 678     char   nthreads_str[STRLEN];
 679     char   ncalls_str[STRLEN];
 680     double wallt;
 681     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 682
 683     if (c_sum > 0)
 684     {
 685         if (ncalls > 0)
 686         {
 687             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 688             if (nnodes < 0)
 689             {
 690                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 691             }
 692             else
 693             {
 694                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 695             }
 696             if (nthreads < 0)
 697             {
 698                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 699             }
 700             else
 701             {
 702                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 703             }
 704         }
 705         else
 706         {
 707             nnodes_str[0]   = 0;
 708             nthreads_str[0] = 0;
 709             ncalls_str[0]   = 0;
 710         }
 711         /* Convert the cycle count to wallclock time for this task */
 712         wallt = c_sum * c2t;
 713
 714         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
 715                 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
 716     }
 717 }
 718
 719 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 720 {
 721     char num[11];
 722     char avg_perf[11];
 723
 724     if (n > 0)
 725     {
 726         snprintf(num, sizeof(num), "%10d", n);
 727         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 728     }
 729     else
 730     {
 731         sprintf(num, "          ");
 732         sprintf(avg_perf, "          ");
 733     }
 734     if (t != tot_t && tot_t > 0)
 735     {
 736         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 737     }
 738     else
 739     {
 740         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 741     }
 742 }
 743
 744 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 745 {
 746     int nrank_tot = nrank_pp + nrank_pme;
 747     if (0 == nrank_pme)
 748     {
 749         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 750         if (nth_pp > 1)
 751         {
 752             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 753         }
 754         /* Don't report doing PP+PME, because we can't tell here if
 755          * this is RF, etc. */
 756     }
 757     else
 758     {
 759         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 760         if (nth_pp > 1)
 761         {
 762             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 763         }
 764         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 765         if (nth_pme > 1)
 766         {
 767             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 768         }
 769     }
 770
 771     fprintf(fplog, "\n\n");
 772     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 773     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 774 }
 775
 776
 777 void wallcycle_print(FILE*                            fplog,
 778                      const gmx::MDLogger&             mdlog,
 779                      int                              nnodes,
 780                      int                              npme,
 781                      int                              nth_pp,
 782                      int                              nth_pme,
 783                      double                           realtime,
 784                      gmx_wallcycle_t                  wc,
 785                      const WallcycleCounts&           cyc_sum,
 786                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 787                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 788 {
 789     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 790     double      c2t, c2t_pp, c2t_pme = 0;
 791     int         i, j, npp, nth_tot;
 792     char        buf[STRLEN];
 793     const char* hline =
 794             "-----------------------------------------------------------------------------";
 795
 796     if (wc == nullptr)
 797     {
 798         return;
 799     }
 800
 801     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 802     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 803     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 804     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 805     npp = nnodes - npme;
 806     /* npme is the number of PME-only ranks used, and we always do PP work */
 807     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 808
 809     nth_tot = npp * nth_pp + npme * nth_pme;
 810
 811     /* When using PME-only nodes, the next line is valid for both
 812        PP-only and PME-only nodes because they started ewcRUN at the
 813        same time. */
 814     tot        = cyc_sum[ewcRUN];
 815     tot_for_pp = 0;
 816
 817     if (tot <= 0.0)
 818     {
 819         /* TODO This is heavy handed, but until someone reworks the
 820            code so that it is provably robust with respect to
 821            non-positive values for all possible timer and cycle
 822            counters, there is less value gained from printing whatever
 823            timing data might still be sensible for some non-Jenkins
 824            run, than is lost from diagnosing Jenkins FP exceptions on
 825            runs about whose execution time we don't care. */
 826         GMX_LOG(mdlog.warning)
 827                 .asParagraph()
 828                 .appendTextFormatted(
 829                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 830                         "time accounting",
 831                         tot);
 832         return;
 833     }
 834
 835     if (wc->haveInvalidCount)
 836     {
 837         GMX_LOG(mdlog.warning)
 838                 .asParagraph()
 839                 .appendText(
 840                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 841                         "between CPU cores that do not have synchronized cycle counters. Will not "
 842                         "print the cycle accounting.");
 843         return;
 844     }
 845
 846
 847     /* Conversion factor from cycles to seconds */
 848     c2t    = realtime / tot;
 849     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 850     if (npme > 0)
 851     {
 852         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 853     }
 854     else
 855     {
 856         c2t_pme = 0;
 857     }
 858
 859     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 860
 861     print_header(fplog, npp, nth_pp, npme, nth_pme);
 862
 863     fprintf(fplog, "%s\n", hline);
 864     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 865     {
 866         if (is_pme_subcounter(i))
 867         {
 868             /* Do not count these at all */
 869         }
 870         else if (npme > 0 && is_pme_counter(i))
 871         {
 872             /* Print timing information for PME-only nodes, but add an
 873              * asterisk so the reader of the table can know that the
 874              * walltimes are not meant to add up. The asterisk still
 875              * fits in the required maximum of 19 characters. */
 876             char buffer[STRLEN];
 877             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 878             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 879         }
 880         else
 881         {
 882             /* Print timing information when it is for a PP or PP+PME
 883                node */
 884             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 885             tot_for_pp += cyc_sum[i];
 886         }
 887     }
 888     if (wc->wcc_all != nullptr)
 889     {
 890         for (i = 0; i < ewcNR; i++)
 891         {
 892             for (j = 0; j < ewcNR; j++)
 893             {
 894                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 895                 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
 896                              wc->wcc_all[i * ewcNR + j].c, tot);
 897             }
 898         }
 899     }
 900     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 901     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 902     fprintf(fplog, "%s\n", hline);
 903     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 904     fprintf(fplog, "%s\n", hline);
 905
 906     if (npme > 0)
 907     {
 908         fprintf(fplog,
 909                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 910                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 911                 "%s\n",
 912                 hline);
 913     }
 914
 915     if (wc->wcc[ewcPMEMESH].n > 0)
 916     {
 917         // A workaround to not print breakdown when no subcounters were recorded.
 918         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 919         std::vector<int> validPmeSubcounterIndices;
 920         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 921         {
 922             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 923             {
 924                 validPmeSubcounterIndices.push_back(i);
 925             }
 926         }
 927
 928         if (!validPmeSubcounterIndices.empty())
 929         {
 930             fprintf(fplog, " Breakdown of PME mesh computation\n");
 931             fprintf(fplog, "%s\n", hline);
 932             for (auto i : validPmeSubcounterIndices)
 933             {
 934                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
 935                              nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 936             }
 937             fprintf(fplog, "%s\n", hline);
 938         }
 939     }
 940
 941     if (useCycleSubcounters && wc->wcsc)
 942     {
 943         fprintf(fplog, " Breakdown of PP computation\n");
 944         fprintf(fplog, "%s\n", hline);
 945         for (i = 0; i < ewcsNR; i++)
 946         {
 947             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 948         }
 949         fprintf(fplog, "%s\n", hline);
 950     }
 951
 952     /* print GPU timing summary */
 953     double tot_gpu = 0.0;
 954     if (gpu_pme_t)
 955     {
 956         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 957         {
 958             tot_gpu += gpu_pme_t->timing[k].t;
 959         }
 960     }
 961     if (gpu_nbnxn_t)
 962     {
 963         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 964                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 965         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 966
 967         /* add up the kernel timings */
 968         for (i = 0; i < 2; i++)
 969         {
 970             for (j = 0; j < 2; j++)
 971             {
 972                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 973             }
 974         }
 975         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 976
 977         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 978         if (wc->wcc[ewcPMEMESH].n > 0)
 979         {
 980             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 981         }
 982         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 983
 984         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 985         fprintf(fplog,
 986                 " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 987         fprintf(fplog, "%s\n", hline);
 988         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 989         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 990
 991         for (i = 0; i < 2; i++)
 992         {
 993             for (j = 0; j < 2; j++)
 994             {
 995                 if (gpu_nbnxn_t->ktime[i][j].c)
 996                 {
 997                     print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
 998                                    gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 999                 }
1000             }
1001         }
1002         if (gpu_pme_t)
1003         {
1004             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1005             {
1006                 if (gpu_pme_t->timing[k].c)
1007                 {
1008                     print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1009                                    gpu_pme_t->timing[k].t, tot_gpu);
1010                 }
1011             }
1012         }
1013         if (gpu_nbnxn_t->pruneTime.c)
1014         {
1015             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1016                            gpu_nbnxn_t->pruneTime.t, tot_gpu);
1017         }
1018         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1019         fprintf(fplog, "%s\n", hline);
1020         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1021         fprintf(fplog, "%s\n", hline);
1022         if (gpu_nbnxn_t->dynamicPruneTime.c)
1023         {
1024             /* We print the dynamic pruning kernel timings after a separator
1025              * and avoid adding it to tot_gpu as this is not in the force
1026              * overlap. We print the fraction as relative to the rest.
1027              */
1028             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1029                            gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1030             fprintf(fplog, "%s\n", hline);
1031         }
1032         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1033         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1034         {
1035             fprintf(fplog,
1036                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1037                     "%.3f\n",
1038                     tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1039         }
1040
1041         /* only print notes related to CPU-GPU load balance with PME */
1042         if (wc->wcc[ewcPMEMESH].n > 0)
1043         {
1044             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1045
1046             /* print note if the imbalance is high with PME case in which
1047              * CPU-GPU load balancing is possible */
1048             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1049             {
1050                 /* Only the sim master calls this function, so always print to stderr */
1051                 if (gpu_cpu_ratio < 0.8)
1052                 {
1053                     if (npp > 1)
1054                     {
1055                         /* The user could have used -notunepme,
1056                          * but we currently can't check that here.
1057                          */
1058                         GMX_LOG(mdlog.warning)
1059                                 .asParagraph()
1060                                 .appendText(
1061                                         "NOTE: The CPU has >25% more load than the GPU. This "
1062                                         "imbalance wastes\n"
1063                                         "      GPU resources. Maybe the domain decomposition "
1064                                         "limits the PME tuning.\n"
1065                                         "      In that case, try setting the DD grid manually "
1066                                         "(-dd) or lowering -dds.");
1067                     }
1068                     else
1069                     {
1070                         /* We should not end up here, unless the box is
1071                          * too small for increasing the cut-off for PME tuning.
1072                          */
1073                         GMX_LOG(mdlog.warning)
1074                                 .asParagraph()
1075                                 .appendText(
1076                                         "NOTE: The CPU has >25% more load than the GPU. This "
1077                                         "imbalance wastes\n"
1078                                         "      GPU resources.");
1079                     }
1080                 }
1081                 if (gpu_cpu_ratio > 1.25)
1082                 {
1083                     GMX_LOG(mdlog.warning)
1084                             .asParagraph()
1085                             .appendText(
1086                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1087                                     "wastes\n"
1088                                     "      CPU resources.");
1089                 }
1090             }
1091         }
1092     }
1093
1094     if (wc->wc_barrier)
1095     {
1096         GMX_LOG(mdlog.warning)
1097                 .asParagraph()
1098                 .appendText(
1099                         "MPI_Barrier was called before each cycle start/stop\n"
1100                         "call, so timings are not those of real runs.");
1101     }
1102
1103     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1104     {
1105         /* Only the sim master calls this function, so always print to stderr */
1106         if (wc->wcc[ewcDOMDEC].n == 0)
1107         {
1108             GMX_LOG(mdlog.warning)
1109                     .asParagraph()
1110                     .appendTextFormatted(
1111                             "NOTE: %d %% of the run time was spent in pair search,\n"
1112                             "      you might want to increase nstlist (this has no effect on "
1113                             "accuracy)\n",
1114                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1115         }
1116         else
1117         {
1118             GMX_LOG(mdlog.warning)
1119                     .asParagraph()
1120                     .appendTextFormatted(
1121                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1122                             "      %d %% of the run time was spent in pair search,\n"
1123                             "      you might want to increase nstlist (this has no effect on "
1124                             "accuracy)\n",
1125                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1126                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1127         }
1128     }
1129
1130     if (cyc_sum[ewcMoveE] > tot * 0.05)
1131     {
1132         GMX_LOG(mdlog.warning)
1133                 .asParagraph()
1134                 .appendTextFormatted(
1135                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1136                         "      you might want to increase some nst* mdp options\n",
1137                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1138     }
1139 }
1140
1141 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1142 {
1143     if (wc == nullptr)
1144     {
1145         return -1;
1146     }
1147
1148     return wc->reset_counters;
1149 }
1150
1151 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1152 {
1153     if (wc == nullptr)
1154     {
1155         return;
1156     }
1157
1158     wc->reset_counters = reset_counters;
1159 }
1160
1161 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1162 {
1163     if (useCycleSubcounters && wc != nullptr)
1164     {
1165         wc->wcsc[ewcs].start = gmx_cycles_read();
1166     }
1167 }
1168
1169 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1170 {
1171     if (useCycleSubcounters && wc != nullptr)
1172     {
1173         wallcycle_sub_start(wc, ewcs);
1174         wc->wcsc[ewcs].n--;
1175     }
1176 }
1177
1178 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1179 {
1180     if (useCycleSubcounters && wc != nullptr)
1181     {
1182         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1183         wc->wcsc[ewcs].n++;
1184     }
1185 }