src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  62
  63 /* DEBUG_WCYCLE adds consistency checking for the counters.
  64  * It checks if you stop a counter different from the last
  65  * one that was opened and if you do nest too deep.
  66  */
  67 /* #define DEBUG_WCYCLE */
  68
  69 #ifdef DEBUG_WCYCLE
  70 #    include "gromacs/utility/fatalerror.h"
  71 #endif
  72
  73 typedef struct
  74 {
  75     int          n;
  76     gmx_cycles_t c;
  77     gmx_cycles_t start;
  78 } wallcc_t;
  79
  80 struct gmx_wallcycle
  81 {
  82     wallcc_t* wcc;
  83     /* did we detect one or more invalid cycle counts */
  84     gmx_bool haveInvalidCount;
  85     /* variables for testing/debugging */
  86     gmx_bool  wc_barrier;
  87     wallcc_t* wcc_all;
  88     int       wc_depth;
  89 #ifdef DEBUG_WCYCLE
  90 #    define DEPTH_MAX 6
  91     int counterlist[DEPTH_MAX];
  92     int count_depth;
  93 #endif
  94     int          ewc_prev;
  95     gmx_cycles_t cycle_prev;
  96     int64_t      reset_counters;
  97 #if GMX_MPI
  98     MPI_Comm mpi_comm_mygroup;
  99 #endif
 100     wallcc_t* wcsc;
 101 };
 102
 103 /* Each name should not exceed 19 printing characters
 104    (ie. terminating null can be twentieth) */
 105 static const char* wcn[ewcNR] = { "Run",
 106                                   "Step",
 107                                   "PP during PME",
 108                                   "Domain decomp.",
 109                                   "DD comm. load",
 110                                   "DD comm. bounds",
 111                                   "Vsite constr.",
 112                                   "Send X to PME",
 113                                   "Neighbor search",
 114                                   "Launch GPU ops.",
 115                                   "Comm. coord.",
 116                                   "Force",
 117                                   "Wait + Comm. F",
 118                                   "PME mesh",
 119                                   "PME redist. X/F",
 120                                   "PME spread",
 121                                   "PME gather",
 122                                   "PME 3D-FFT",
 123                                   "PME 3D-FFT Comm.",
 124                                   "PME solve LJ",
 125                                   "PME solve Elec",
 126                                   "PME wait for PP",
 127                                   "Wait + Recv. PME F",
 128                                   "Wait PME GPU spread",
 129                                   "PME 3D-FFT",
 130                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 131                                   "Wait PME GPU gather",
 132                                   "Wait Bonded GPU",
 133                                   "Reduce GPU PME F",
 134                                   "Wait GPU NB nonloc.",
 135                                   "Wait GPU NB local",
 136                                   "Wait GPU state copy",
 137                                   "NB X/F buffer ops.",
 138                                   "Vsite spread",
 139                                   "COM pull force",
 140                                   "AWH",
 141                                   "Write traj.",
 142                                   "Update",
 143                                   "Constraints",
 144                                   "Comm. energies",
 145                                   "Enforced rotation",
 146                                   "Add rot. forces",
 147                                   "Position swapping",
 148                                   "IMD",
 149                                   "Test" };
 150
 151 static const char* wcsn[ewcsNR] = {
 152     "DD redist.",
 153     "DD NS grid + sort",
 154     "DD setup comm.",
 155     "DD make top.",
 156     "DD make constr.",
 157     "DD top. other",
 158     "DD GPU ops.",
 159     "NS grid local",
 160     "NS grid non-loc.",
 161     "NS search local",
 162     "NS search non-loc.",
 163     "Bonded F",
 164     "Bonded-FEP F",
 165     "Restraints F",
 166     "Listed buffer ops.",
 167     "Nonbonded pruning",
 168     "Nonbonded F kernel",
 169     "Nonbonded F clear",
 170     "Nonbonded FEP",
 171     "Launch NB GPU tasks",
 172     "Launch Bonded GPU tasks",
 173     "Launch PME GPU tasks",
 174     "Launch state copy",
 175     "Ewald F correction",
 176     "NB X buffer ops.",
 177     "NB F buffer ops.",
 178     "Clear force buffer",
 179     "Launch GPU NB X buffer ops.",
 180     "Launch GPU NB F buffer ops.",
 181     "Launch GPU Comm. coord.",
 182     "Launch GPU Comm. force.",
 183     "Launch GPU update",
 184     "Test subcounter",
 185 };
 186
 187 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 188 static const char* PMEStageNames[] = {
 189     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 190     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 191 };
 192
 193 gmx_bool wallcycle_have_counter()
 194 {
 195     return gmx_cycles_have_counter();
 196 }
 197
 198 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 199 {
 200     gmx_wallcycle_t wc;
 201
 202
 203     if (!wallcycle_have_counter())
 204     {
 205         return nullptr;
 206     }
 207
 208     snew(wc, 1);
 209
 210     wc->haveInvalidCount = FALSE;
 211     wc->wc_barrier       = FALSE;
 212     wc->wcc_all          = nullptr;
 213     wc->wc_depth         = 0;
 214     wc->ewc_prev         = -1;
 215     wc->reset_counters   = resetstep;
 216
 217 #if GMX_MPI
 218     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 219     {
 220         if (fplog)
 221         {
 222             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 223         }
 224         wc->wc_barrier       = TRUE;
 225         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 226     }
 227 #endif
 228
 229     snew(wc->wcc, ewcNR);
 230     if (getenv("GMX_CYCLE_ALL") != nullptr)
 231     {
 232         if (fplog)
 233         {
 234             fprintf(fplog, "\nWill time all the code during the run\n\n");
 235         }
 236         snew(wc->wcc_all, ewcNR * ewcNR);
 237     }
 238
 239     if (useCycleSubcounters)
 240     {
 241         snew(wc->wcsc, ewcsNR);
 242     }
 243
 244 #ifdef DEBUG_WCYCLE
 245     wc->count_depth = 0;
 246 #endif
 247
 248     return wc;
 249 }
 250
 251 void wallcycle_destroy(gmx_wallcycle_t wc)
 252 {
 253     if (wc == nullptr)
 254     {
 255         return;
 256     }
 257
 258     if (wc->wcc != nullptr)
 259     {
 260         sfree(wc->wcc);
 261     }
 262     if (wc->wcc_all != nullptr)
 263     {
 264         sfree(wc->wcc_all);
 265     }
 266     if (wc->wcsc != nullptr)
 267     {
 268         sfree(wc->wcsc);
 269     }
 270     sfree(wc);
 271 }
 272
 273 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 274 {
 275     wc->ewc_prev   = ewc;
 276     wc->cycle_prev = cycle;
 277 }
 278
 279 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 280 {
 281     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 282     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 283 }
 284
 285
 286 #ifdef DEBUG_WCYCLE
 287 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 288 {
 289     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 290
 291     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 292     {
 293         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
 294     }
 295     wc->counterlist[wc->count_depth] = ewc;
 296     wc->count_depth++;
 297 }
 298
 299 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 300 {
 301     wc->count_depth--;
 302
 303     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 304
 305     if (wc->count_depth < 0)
 306     {
 307         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
 308                   wc->count_depth);
 309     }
 310     if (wc->counterlist[wc->count_depth] != ewc)
 311     {
 312         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 313                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 314     }
 315 }
 316 #endif
 317
 318 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 319 {
 320     gmx_cycles_t cycle;
 321
 322     if (wc == nullptr)
 323     {
 324         return;
 325     }
 326
 327 #if GMX_MPI
 328     if (wc->wc_barrier)
 329     {
 330         MPI_Barrier(wc->mpi_comm_mygroup);
 331     }
 332 #endif
 333
 334 #ifdef DEBUG_WCYCLE
 335     debug_start_check(wc, ewc);
 336 #endif
 337
 338     cycle              = gmx_cycles_read();
 339     wc->wcc[ewc].start = cycle;
 340     if (wc->wcc_all != nullptr)
 341     {
 342         wc->wc_depth++;
 343         if (ewc == ewcRUN)
 344         {
 345             wallcycle_all_start(wc, ewc, cycle);
 346         }
 347         else if (wc->wc_depth == 3)
 348         {
 349             wallcycle_all_stop(wc, ewc, cycle);
 350         }
 351     }
 352 }
 353
 354 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 355 {
 356     if (wc == nullptr)
 357     {
 358         return;
 359     }
 360     wc->wcc[ewc].n++;
 361 }
 362
 363 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 364 {
 365     if (wc == nullptr)
 366     {
 367         return;
 368     }
 369
 370     wallcycle_start(wc, ewc);
 371     wc->wcc[ewc].n--;
 372 }
 373
 374 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 375 {
 376     gmx_cycles_t cycle, last;
 377
 378     if (wc == nullptr)
 379     {
 380         return 0;
 381     }
 382
 383 #if GMX_MPI
 384     if (wc->wc_barrier)
 385     {
 386         MPI_Barrier(wc->mpi_comm_mygroup);
 387     }
 388 #endif
 389
 390 #ifdef DEBUG_WCYCLE
 391     debug_stop_check(wc, ewc);
 392 #endif
 393
 394     /* When processes or threads migrate between cores, the cycle counting
 395      * can get messed up if the cycle counter on different cores are not
 396      * synchronized. When this happens we expect both large negative and
 397      * positive cycle differences. We can detect negative cycle differences.
 398      * Detecting too large positive counts if difficult, since count can be
 399      * large, especially for ewcRUN. If we detect a negative count,
 400      * we will not print the cycle accounting table.
 401      */
 402     cycle = gmx_cycles_read();
 403     if (cycle >= wc->wcc[ewc].start)
 404     {
 405         last = cycle - wc->wcc[ewc].start;
 406     }
 407     else
 408     {
 409         last                 = 0;
 410         wc->haveInvalidCount = TRUE;
 411     }
 412     wc->wcc[ewc].c += last;
 413     wc->wcc[ewc].n++;
 414     if (wc->wcc_all)
 415     {
 416         wc->wc_depth--;
 417         if (ewc == ewcRUN)
 418         {
 419             wallcycle_all_stop(wc, ewc, cycle);
 420         }
 421         else if (wc->wc_depth == 2)
 422         {
 423             wallcycle_all_start(wc, ewc, cycle);
 424         }
 425     }
 426
 427     return last;
 428 }
 429
 430 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 431 {
 432     *n = wc->wcc[ewc].n;
 433     *c = static_cast<double>(wc->wcc[ewc].c);
 434 }
 435
 436 void wallcycle_reset_all(gmx_wallcycle_t wc)
 437 {
 438     int i;
 439
 440     if (wc == nullptr)
 441     {
 442         return;
 443     }
 444
 445     for (i = 0; i < ewcNR; i++)
 446     {
 447         wc->wcc[i].n = 0;
 448         wc->wcc[i].c = 0;
 449     }
 450     wc->haveInvalidCount = FALSE;
 451
 452     if (wc->wcc_all)
 453     {
 454         for (i = 0; i < ewcNR * ewcNR; i++)
 455         {
 456             wc->wcc_all[i].n = 0;
 457             wc->wcc_all[i].c = 0;
 458         }
 459     }
 460     if (wc->wcsc)
 461     {
 462         for (i = 0; i < ewcsNR; i++)
 463         {
 464             wc->wcsc[i].n = 0;
 465             wc->wcsc[i].c = 0;
 466         }
 467     }
 468 }
 469
 470 static gmx_bool is_pme_counter(int ewc)
 471 {
 472     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 473 }
 474
 475 static gmx_bool is_pme_subcounter(int ewc)
 476 {
 477     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 478 }
 479
 480 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 481 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 482 {
 483     if (wcc[ewc_sub].n > 0)
 484     {
 485         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 486         {
 487             wcc[ewc_main].c -= wcc[ewc_sub].c;
 488         }
 489         else
 490         {
 491             /* Something is wrong with the cycle counting */
 492             wcc[ewc_main].c = 0;
 493         }
 494     }
 495 }
 496
 497 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 498 {
 499     if (wc == nullptr)
 500     {
 501         return;
 502     }
 503
 504     for (int i = 0; i < ewcNR; i++)
 505     {
 506         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 507         {
 508             wc->wcc[i].c *= nthreads_pme;
 509
 510             if (wc->wcc_all)
 511             {
 512                 for (int j = 0; j < ewcNR; j++)
 513                 {
 514                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 515                 }
 516             }
 517         }
 518         else
 519         {
 520             wc->wcc[i].c *= nthreads_pp;
 521
 522             if (wc->wcc_all)
 523             {
 524                 for (int j = 0; j < ewcNR; j++)
 525                 {
 526                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 527                 }
 528             }
 529         }
 530     }
 531     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 532     {
 533         for (int i = 0; i < ewcsNR; i++)
 534         {
 535             wc->wcsc[i].c *= nthreads_pp;
 536         }
 537     }
 538 }
 539
 540 /* TODO Make an object for this function to return, containing some
 541  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 542  * wcsc, AND the original wcc for rank 0.
 543  *
 544  * The GPU timing is reported only for rank 0, so we want to preserve
 545  * the original wcycle on that rank. Rank 0 also reports the global
 546  * counts before that, so needs something to contain the global data
 547  * without over-writing the rank-0 data. The current implementation
 548  * uses cycles_sum to manage this, which works OK now because wcsc and
 549  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 550  * for the future. Also, there's no need for MPI_Allreduce, since
 551  * only MASTERRANK uses any of the results. */
 552 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 553 {
 554     WallcycleCounts cycles_sum;
 555     wallcc_t*       wcc;
 556     double          cycles[int(ewcNR) + int(ewcsNR)];
 557 #if GMX_MPI
 558     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 559 #endif
 560     int i;
 561     int nsum;
 562
 563     if (wc == nullptr)
 564     {
 565         /* Default construction of std::array of non-class T can leave
 566            the values indeterminate, just like a C array, and icc
 567            warns about it. */
 568         cycles_sum.fill(0);
 569         return cycles_sum;
 570     }
 571
 572     wcc = wc->wcc;
 573
 574     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 575     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 576
 577     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 578
 579     if (cr->npmenodes == 0)
 580     {
 581         /* All nodes do PME (or no PME at all) */
 582         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 583     }
 584     else
 585     {
 586         /* The are PME-only nodes */
 587         if (wcc[ewcPMEMESH].n > 0)
 588         {
 589             /* This must be a PME only node, calculate the Wait + Comm. time */
 590             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 591                        "Total run ticks must be greater than PME-only ticks");
 592             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 593         }
 594     }
 595
 596     /* Store the cycles in a double buffer for summing */
 597     for (i = 0; i < ewcNR; i++)
 598     {
 599 #if GMX_MPI
 600         cycles_n[i] = static_cast<double>(wcc[i].n);
 601 #endif
 602         cycles[i] = static_cast<double>(wcc[i].c);
 603     }
 604     nsum = ewcNR;
 605     if (wc->wcsc)
 606     {
 607         for (i = 0; i < ewcsNR; i++)
 608         {
 609 #if GMX_MPI
 610             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 611 #endif
 612             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 613         }
 614         nsum += ewcsNR;
 615     }
 616
 617 #if GMX_MPI
 618     if (cr->nnodes > 1)
 619     {
 620         double buf[int(ewcNR) + int(ewcsNR) + 1];
 621
 622         // TODO this code is used only at the end of the run, so we
 623         // can just do a simple reduce of haveInvalidCount in
 624         // wallcycle_print, and avoid bugs
 625         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 626         // TODO Use MPI_Reduce
 627         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 628         for (i = 0; i < ewcNR; i++)
 629         {
 630             wcc[i].n = gmx::roundToInt(buf[i]);
 631         }
 632         wc->haveInvalidCount = (buf[nsum] > 0);
 633         if (wc->wcsc)
 634         {
 635             for (i = 0; i < ewcsNR; i++)
 636             {
 637                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 638             }
 639         }
 640
 641         // TODO Use MPI_Reduce
 642         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 643
 644         if (wc->wcc_all != nullptr)
 645         {
 646             double *buf_all, *cyc_all;
 647
 648             snew(cyc_all, ewcNR * ewcNR);
 649             snew(buf_all, ewcNR * ewcNR);
 650             for (i = 0; i < ewcNR * ewcNR; i++)
 651             {
 652                 cyc_all[i] = wc->wcc_all[i].c;
 653             }
 654             // TODO Use MPI_Reduce
 655             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 656             for (i = 0; i < ewcNR * ewcNR; i++)
 657             {
 658                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 659             }
 660             sfree(buf_all);
 661             sfree(cyc_all);
 662         }
 663     }
 664     else
 665 #endif
 666     {
 667         for (i = 0; i < nsum; i++)
 668         {
 669             cycles_sum[i] = cycles[i];
 670         }
 671     }
 672
 673     return cycles_sum;
 674 }
 675
 676 static void
 677 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 678 {
 679     char   nnodes_str[STRLEN];
 680     char   nthreads_str[STRLEN];
 681     char   ncalls_str[STRLEN];
 682     double wallt;
 683     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 684
 685     if (c_sum > 0)
 686     {
 687         if (ncalls > 0)
 688         {
 689             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 690             if (nnodes < 0)
 691             {
 692                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 693             }
 694             else
 695             {
 696                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 697             }
 698             if (nthreads < 0)
 699             {
 700                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 701             }
 702             else
 703             {
 704                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 705             }
 706         }
 707         else
 708         {
 709             nnodes_str[0]   = 0;
 710             nthreads_str[0] = 0;
 711             ncalls_str[0]   = 0;
 712         }
 713         /* Convert the cycle count to wallclock time for this task */
 714         wallt = c_sum * c2t;
 715
 716         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
 717                 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
 718     }
 719 }
 720
 721 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 722 {
 723     char num[11];
 724     char avg_perf[11];
 725
 726     if (n > 0)
 727     {
 728         snprintf(num, sizeof(num), "%10d", n);
 729         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 730     }
 731     else
 732     {
 733         sprintf(num, "          ");
 734         sprintf(avg_perf, "          ");
 735     }
 736     if (t != tot_t && tot_t > 0)
 737     {
 738         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 739     }
 740     else
 741     {
 742         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 743     }
 744 }
 745
 746 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 747 {
 748     int nrank_tot = nrank_pp + nrank_pme;
 749     if (0 == nrank_pme)
 750     {
 751         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 752         if (nth_pp > 1)
 753         {
 754             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 755         }
 756         /* Don't report doing PP+PME, because we can't tell here if
 757          * this is RF, etc. */
 758     }
 759     else
 760     {
 761         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 762         if (nth_pp > 1)
 763         {
 764             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 765         }
 766         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 767         if (nth_pme > 1)
 768         {
 769             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 770         }
 771     }
 772
 773     fprintf(fplog, "\n\n");
 774     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 775     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 776 }
 777
 778
 779 void wallcycle_print(FILE*                            fplog,
 780                      const gmx::MDLogger&             mdlog,
 781                      int                              nnodes,
 782                      int                              npme,
 783                      int                              nth_pp,
 784                      int                              nth_pme,
 785                      double                           realtime,
 786                      gmx_wallcycle_t                  wc,
 787                      const WallcycleCounts&           cyc_sum,
 788                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 789                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 790 {
 791     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 792     double      c2t, c2t_pp, c2t_pme = 0;
 793     int         i, j, npp, nth_tot;
 794     char        buf[STRLEN];
 795     const char* hline =
 796             "-----------------------------------------------------------------------------";
 797
 798     if (wc == nullptr)
 799     {
 800         return;
 801     }
 802
 803     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 804     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 805     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 806     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 807     npp = nnodes - npme;
 808     /* npme is the number of PME-only ranks used, and we always do PP work */
 809     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 810
 811     nth_tot = npp * nth_pp + npme * nth_pme;
 812
 813     /* When using PME-only nodes, the next line is valid for both
 814        PP-only and PME-only nodes because they started ewcRUN at the
 815        same time. */
 816     tot        = cyc_sum[ewcRUN];
 817     tot_for_pp = 0;
 818
 819     if (tot <= 0.0)
 820     {
 821         /* TODO This is heavy handed, but until someone reworks the
 822            code so that it is provably robust with respect to
 823            non-positive values for all possible timer and cycle
 824            counters, there is less value gained from printing whatever
 825            timing data might still be sensible for some non-Jenkins
 826            run, than is lost from diagnosing Jenkins FP exceptions on
 827            runs about whose execution time we don't care. */
 828         GMX_LOG(mdlog.warning)
 829                 .asParagraph()
 830                 .appendTextFormatted(
 831                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 832                         "time accounting",
 833                         tot);
 834         return;
 835     }
 836
 837     if (wc->haveInvalidCount)
 838     {
 839         GMX_LOG(mdlog.warning)
 840                 .asParagraph()
 841                 .appendText(
 842                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 843                         "between CPU cores that do not have synchronized cycle counters. Will not "
 844                         "print the cycle accounting.");
 845         return;
 846     }
 847
 848
 849     /* Conversion factor from cycles to seconds */
 850     c2t    = realtime / tot;
 851     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 852     if (npme > 0)
 853     {
 854         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 855     }
 856     else
 857     {
 858         c2t_pme = 0;
 859     }
 860
 861     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 862
 863     print_header(fplog, npp, nth_pp, npme, nth_pme);
 864
 865     fprintf(fplog, "%s\n", hline);
 866     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 867     {
 868         if (is_pme_subcounter(i))
 869         {
 870             /* Do not count these at all */
 871         }
 872         else if (npme > 0 && is_pme_counter(i))
 873         {
 874             /* Print timing information for PME-only nodes, but add an
 875              * asterisk so the reader of the table can know that the
 876              * walltimes are not meant to add up. The asterisk still
 877              * fits in the required maximum of 19 characters. */
 878             char buffer[STRLEN];
 879             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 880             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 881         }
 882         else
 883         {
 884             /* Print timing information when it is for a PP or PP+PME
 885                node */
 886             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 887             tot_for_pp += cyc_sum[i];
 888         }
 889     }
 890     if (wc->wcc_all != nullptr)
 891     {
 892         for (i = 0; i < ewcNR; i++)
 893         {
 894             for (j = 0; j < ewcNR; j++)
 895             {
 896                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 897                 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
 898                              wc->wcc_all[i * ewcNR + j].c, tot);
 899             }
 900         }
 901     }
 902     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 903     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 904     fprintf(fplog, "%s\n", hline);
 905     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 906     fprintf(fplog, "%s\n", hline);
 907
 908     if (npme > 0)
 909     {
 910         fprintf(fplog,
 911                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 912                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 913                 "%s\n",
 914                 hline);
 915     }
 916
 917     if (wc->wcc[ewcPMEMESH].n > 0)
 918     {
 919         // A workaround to not print breakdown when no subcounters were recorded.
 920         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 921         std::vector<int> validPmeSubcounterIndices;
 922         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 923         {
 924             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 925             {
 926                 validPmeSubcounterIndices.push_back(i);
 927             }
 928         }
 929
 930         if (!validPmeSubcounterIndices.empty())
 931         {
 932             fprintf(fplog, " Breakdown of PME mesh computation\n");
 933             fprintf(fplog, "%s\n", hline);
 934             for (auto i : validPmeSubcounterIndices)
 935             {
 936                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
 937                              nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 938             }
 939             fprintf(fplog, "%s\n", hline);
 940         }
 941     }
 942
 943     if (useCycleSubcounters && wc->wcsc)
 944     {
 945         fprintf(fplog, " Breakdown of PP computation\n");
 946         fprintf(fplog, "%s\n", hline);
 947         for (i = 0; i < ewcsNR; i++)
 948         {
 949             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 950         }
 951         fprintf(fplog, "%s\n", hline);
 952     }
 953
 954     /* print GPU timing summary */
 955     double tot_gpu = 0.0;
 956     if (gpu_pme_t)
 957     {
 958         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 959         {
 960             tot_gpu += gpu_pme_t->timing[k].t;
 961         }
 962     }
 963     if (gpu_nbnxn_t)
 964     {
 965         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 966                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 967         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 968
 969         /* add up the kernel timings */
 970         for (i = 0; i < 2; i++)
 971         {
 972             for (j = 0; j < 2; j++)
 973             {
 974                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 975             }
 976         }
 977         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 978
 979         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 980         if (wc->wcc[ewcPMEMESH].n > 0)
 981         {
 982             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 983         }
 984         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 985
 986         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 987         fprintf(fplog,
 988                 " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 989         fprintf(fplog, "%s\n", hline);
 990         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 991         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 992
 993         for (i = 0; i < 2; i++)
 994         {
 995             for (j = 0; j < 2; j++)
 996             {
 997                 if (gpu_nbnxn_t->ktime[i][j].c)
 998                 {
 999                     print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
1000                                    gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
1001                 }
1002             }
1003         }
1004         if (gpu_pme_t)
1005         {
1006             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1007             {
1008                 if (gpu_pme_t->timing[k].c)
1009                 {
1010                     print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1011                                    gpu_pme_t->timing[k].t, tot_gpu);
1012                 }
1013             }
1014         }
1015         if (gpu_nbnxn_t->pruneTime.c)
1016         {
1017             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1018                            gpu_nbnxn_t->pruneTime.t, tot_gpu);
1019         }
1020         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1021         fprintf(fplog, "%s\n", hline);
1022         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1023         fprintf(fplog, "%s\n", hline);
1024         if (gpu_nbnxn_t->dynamicPruneTime.c)
1025         {
1026             /* We print the dynamic pruning kernel timings after a separator
1027              * and avoid adding it to tot_gpu as this is not in the force
1028              * overlap. We print the fraction as relative to the rest.
1029              */
1030             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1031                            gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1032             fprintf(fplog, "%s\n", hline);
1033         }
1034         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1035         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1036         {
1037             fprintf(fplog,
1038                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1039                     "%.3f\n",
1040                     tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1041         }
1042
1043         /* only print notes related to CPU-GPU load balance with PME */
1044         if (wc->wcc[ewcPMEMESH].n > 0)
1045         {
1046             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1047
1048             /* print note if the imbalance is high with PME case in which
1049              * CPU-GPU load balancing is possible */
1050             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1051             {
1052                 /* Only the sim master calls this function, so always print to stderr */
1053                 if (gpu_cpu_ratio < 0.8)
1054                 {
1055                     if (npp > 1)
1056                     {
1057                         /* The user could have used -notunepme,
1058                          * but we currently can't check that here.
1059                          */
1060                         GMX_LOG(mdlog.warning)
1061                                 .asParagraph()
1062                                 .appendText(
1063                                         "NOTE: The CPU has >25% more load than the GPU. This "
1064                                         "imbalance wastes\n"
1065                                         "      GPU resources. Maybe the domain decomposition "
1066                                         "limits the PME tuning.\n"
1067                                         "      In that case, try setting the DD grid manually "
1068                                         "(-dd) or lowering -dds.");
1069                     }
1070                     else
1071                     {
1072                         /* We should not end up here, unless the box is
1073                          * too small for increasing the cut-off for PME tuning.
1074                          */
1075                         GMX_LOG(mdlog.warning)
1076                                 .asParagraph()
1077                                 .appendText(
1078                                         "NOTE: The CPU has >25% more load than the GPU. This "
1079                                         "imbalance wastes\n"
1080                                         "      GPU resources.");
1081                     }
1082                 }
1083                 if (gpu_cpu_ratio > 1.25)
1084                 {
1085                     GMX_LOG(mdlog.warning)
1086                             .asParagraph()
1087                             .appendText(
1088                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1089                                     "wastes\n"
1090                                     "      CPU resources.");
1091                 }
1092             }
1093         }
1094     }
1095
1096     if (wc->wc_barrier)
1097     {
1098         GMX_LOG(mdlog.warning)
1099                 .asParagraph()
1100                 .appendText(
1101                         "MPI_Barrier was called before each cycle start/stop\n"
1102                         "call, so timings are not those of real runs.");
1103     }
1104
1105     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1106     {
1107         /* Only the sim master calls this function, so always print to stderr */
1108         if (wc->wcc[ewcDOMDEC].n == 0)
1109         {
1110             GMX_LOG(mdlog.warning)
1111                     .asParagraph()
1112                     .appendTextFormatted(
1113                             "NOTE: %d %% of the run time was spent in pair search,\n"
1114                             "      you might want to increase nstlist (this has no effect on "
1115                             "accuracy)\n",
1116                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1117         }
1118         else
1119         {
1120             GMX_LOG(mdlog.warning)
1121                     .asParagraph()
1122                     .appendTextFormatted(
1123                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1124                             "      %d %% of the run time was spent in pair search,\n"
1125                             "      you might want to increase nstlist (this has no effect on "
1126                             "accuracy)\n",
1127                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1128                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1129         }
1130     }
1131
1132     if (cyc_sum[ewcMoveE] > tot * 0.05)
1133     {
1134         GMX_LOG(mdlog.warning)
1135                 .asParagraph()
1136                 .appendTextFormatted(
1137                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1138                         "      you might want to increase some nst* mdp options\n",
1139                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1140     }
1141 }
1142
1143 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1144 {
1145     if (wc == nullptr)
1146     {
1147         return -1;
1148     }
1149
1150     return wc->reset_counters;
1151 }
1152
1153 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1154 {
1155     if (wc == nullptr)
1156     {
1157         return;
1158     }
1159
1160     wc->reset_counters = reset_counters;
1161 }
1162
1163 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1164 {
1165     if (useCycleSubcounters && wc != nullptr)
1166     {
1167         wc->wcsc[ewcs].start = gmx_cycles_read();
1168     }
1169 }
1170
1171 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1172 {
1173     if (useCycleSubcounters && wc != nullptr)
1174     {
1175         wallcycle_sub_start(wc, ewcs);
1176         wc->wcsc[ewcs].n--;
1177     }
1178 }
1179
1180 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1181 {
1182     if (useCycleSubcounters && wc != nullptr)
1183     {
1184         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1185         wc->wcsc[ewcs].n++;
1186     }
1187 }