src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  62
  63 /* DEBUG_WCYCLE adds consistency checking for the counters.
  64  * It checks if you stop a counter different from the last
  65  * one that was opened and if you do nest too deep.
  66  */
  67 /* #define DEBUG_WCYCLE */
  68
  69 #ifdef DEBUG_WCYCLE
  70 #    include "gromacs/utility/fatalerror.h"
  71 #endif
  72
  73 typedef struct
  74 {
  75     int          n;
  76     gmx_cycles_t c;
  77     gmx_cycles_t start;
  78 } wallcc_t;
  79
  80 struct gmx_wallcycle
  81 {
  82     wallcc_t* wcc;
  83     /* did we detect one or more invalid cycle counts */
  84     gmx_bool haveInvalidCount;
  85     /* variables for testing/debugging */
  86     gmx_bool  wc_barrier;
  87     wallcc_t* wcc_all;
  88     int       wc_depth;
  89 #ifdef DEBUG_WCYCLE
  90 #    define DEPTH_MAX 6
  91     int counterlist[DEPTH_MAX];
  92     int count_depth;
  93 #endif
  94     int          ewc_prev;
  95     gmx_cycles_t cycle_prev;
  96     int64_t      reset_counters;
  97 #if GMX_MPI
  98     MPI_Comm mpi_comm_mygroup;
  99 #endif
 100     wallcc_t* wcsc;
 101 };
 102
 103 /* Each name should not exceed 19 printing characters
 104    (ie. terminating null can be twentieth) */
 105 static const char* wcn[ewcNR] = { "Run",
 106                                   "Step",
 107                                   "PP during PME",
 108                                   "Domain decomp.",
 109                                   "DD comm. load",
 110                                   "DD comm. bounds",
 111                                   "Vsite constr.",
 112                                   "Send X to PME",
 113                                   "Neighbor search",
 114                                   "Launch GPU ops.",
 115                                   "Comm. coord.",
 116                                   "Force",
 117                                   "Wait + Comm. F",
 118                                   "PME mesh",
 119                                   "PME redist. X/F",
 120                                   "PME spread",
 121                                   "PME gather",
 122                                   "PME 3D-FFT",
 123                                   "PME 3D-FFT Comm.",
 124                                   "PME solve LJ",
 125                                   "PME solve Elec",
 126                                   "PME wait for PP",
 127                                   "Wait + Recv. PME F",
 128                                   "Wait PME GPU spread",
 129                                   "PME 3D-FFT",
 130                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 131                                   "Wait PME GPU gather",
 132                                   "Wait Bonded GPU",
 133                                   "Reduce GPU PME F",
 134                                   "Wait GPU NB nonloc.",
 135                                   "Wait GPU NB local",
 136                                   "Wait GPU state copy",
 137                                   "NB X/F buffer ops.",
 138                                   "Vsite spread",
 139                                   "COM pull force",
 140                                   "AWH",
 141                                   "Write traj.",
 142                                   "Update",
 143                                   "Constraints",
 144                                   "Comm. energies",
 145                                   "Enforced rotation",
 146                                   "Add rot. forces",
 147                                   "Position swapping",
 148                                   "IMD",
 149                                   "Test" };
 150
 151 static const char* wcsn[ewcsNR] = {
 152     "DD redist.",
 153     "DD NS grid + sort",
 154     "DD setup comm.",
 155     "DD make top.",
 156     "DD make constr.",
 157     "DD top. other",
 158     "DD GPU ops.",
 159     "NS grid local",
 160     "NS grid non-loc.",
 161     "NS search local",
 162     "NS search non-loc.",
 163     "Bonded F",
 164     "Bonded-FEP F",
 165     "Restraints F",
 166     "Listed buffer ops.",
 167     "Nonbonded pruning",
 168     "Nonbonded F kernel",
 169     "Nonbonded F clear",
 170     "Nonbonded FEP",
 171     "Launch NB GPU tasks",
 172     "Launch Bonded GPU tasks",
 173     "Launch PME GPU tasks",
 174     "Launch state copy",
 175     "Ewald F correction",
 176     "NB X buffer ops.",
 177     "NB F buffer ops.",
 178     "Clear force buffer",
 179     "Launch GPU NB X buffer ops.",
 180     "Launch GPU NB F buffer ops.",
 181     "Launch GPU Comm. coord.",
 182     "Launch GPU Comm. force.",
 183     "Test subcounter",
 184 };
 185
 186 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 187 static const char* PMEStageNames[] = {
 188     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 189     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 190 };
 191
 192 gmx_bool wallcycle_have_counter()
 193 {
 194     return gmx_cycles_have_counter();
 195 }
 196
 197 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 198 {
 199     gmx_wallcycle_t wc;
 200
 201
 202     if (!wallcycle_have_counter())
 203     {
 204         return nullptr;
 205     }
 206
 207     snew(wc, 1);
 208
 209     wc->haveInvalidCount = FALSE;
 210     wc->wc_barrier       = FALSE;
 211     wc->wcc_all          = nullptr;
 212     wc->wc_depth         = 0;
 213     wc->ewc_prev         = -1;
 214     wc->reset_counters   = resetstep;
 215
 216 #if GMX_MPI
 217     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 218     {
 219         if (fplog)
 220         {
 221             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 222         }
 223         wc->wc_barrier       = TRUE;
 224         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 225     }
 226 #endif
 227
 228     snew(wc->wcc, ewcNR);
 229     if (getenv("GMX_CYCLE_ALL") != nullptr)
 230     {
 231         if (fplog)
 232         {
 233             fprintf(fplog, "\nWill time all the code during the run\n\n");
 234         }
 235         snew(wc->wcc_all, ewcNR * ewcNR);
 236     }
 237
 238     if (useCycleSubcounters)
 239     {
 240         snew(wc->wcsc, ewcsNR);
 241     }
 242
 243 #ifdef DEBUG_WCYCLE
 244     wc->count_depth = 0;
 245 #endif
 246
 247     return wc;
 248 }
 249
 250 void wallcycle_destroy(gmx_wallcycle_t wc)
 251 {
 252     if (wc == nullptr)
 253     {
 254         return;
 255     }
 256
 257     if (wc->wcc != nullptr)
 258     {
 259         sfree(wc->wcc);
 260     }
 261     if (wc->wcc_all != nullptr)
 262     {
 263         sfree(wc->wcc_all);
 264     }
 265     if (wc->wcsc != nullptr)
 266     {
 267         sfree(wc->wcsc);
 268     }
 269     sfree(wc);
 270 }
 271
 272 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 273 {
 274     wc->ewc_prev   = ewc;
 275     wc->cycle_prev = cycle;
 276 }
 277
 278 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 279 {
 280     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 281     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 282 }
 283
 284
 285 #ifdef DEBUG_WCYCLE
 286 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 287 {
 288     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 289
 290     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 291     {
 292         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
 293     }
 294     wc->counterlist[wc->count_depth] = ewc;
 295     wc->count_depth++;
 296 }
 297
 298 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 299 {
 300     wc->count_depth--;
 301
 302     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 303
 304     if (wc->count_depth < 0)
 305     {
 306         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
 307                   wc->count_depth);
 308     }
 309     if (wc->counterlist[wc->count_depth] != ewc)
 310     {
 311         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 312                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 313     }
 314 }
 315 #endif
 316
 317 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 318 {
 319     gmx_cycles_t cycle;
 320
 321     if (wc == nullptr)
 322     {
 323         return;
 324     }
 325
 326 #if GMX_MPI
 327     if (wc->wc_barrier)
 328     {
 329         MPI_Barrier(wc->mpi_comm_mygroup);
 330     }
 331 #endif
 332
 333 #ifdef DEBUG_WCYCLE
 334     debug_start_check(wc, ewc);
 335 #endif
 336
 337     cycle              = gmx_cycles_read();
 338     wc->wcc[ewc].start = cycle;
 339     if (wc->wcc_all != nullptr)
 340     {
 341         wc->wc_depth++;
 342         if (ewc == ewcRUN)
 343         {
 344             wallcycle_all_start(wc, ewc, cycle);
 345         }
 346         else if (wc->wc_depth == 3)
 347         {
 348             wallcycle_all_stop(wc, ewc, cycle);
 349         }
 350     }
 351 }
 352
 353 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 354 {
 355     if (wc == nullptr)
 356     {
 357         return;
 358     }
 359     wc->wcc[ewc].n++;
 360 }
 361
 362 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 363 {
 364     if (wc == nullptr)
 365     {
 366         return;
 367     }
 368
 369     wallcycle_start(wc, ewc);
 370     wc->wcc[ewc].n--;
 371 }
 372
 373 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 374 {
 375     gmx_cycles_t cycle, last;
 376
 377     if (wc == nullptr)
 378     {
 379         return 0;
 380     }
 381
 382 #if GMX_MPI
 383     if (wc->wc_barrier)
 384     {
 385         MPI_Barrier(wc->mpi_comm_mygroup);
 386     }
 387 #endif
 388
 389 #ifdef DEBUG_WCYCLE
 390     debug_stop_check(wc, ewc);
 391 #endif
 392
 393     /* When processes or threads migrate between cores, the cycle counting
 394      * can get messed up if the cycle counter on different cores are not
 395      * synchronized. When this happens we expect both large negative and
 396      * positive cycle differences. We can detect negative cycle differences.
 397      * Detecting too large positive counts if difficult, since count can be
 398      * large, especially for ewcRUN. If we detect a negative count,
 399      * we will not print the cycle accounting table.
 400      */
 401     cycle = gmx_cycles_read();
 402     if (cycle >= wc->wcc[ewc].start)
 403     {
 404         last = cycle - wc->wcc[ewc].start;
 405     }
 406     else
 407     {
 408         last                 = 0;
 409         wc->haveInvalidCount = TRUE;
 410     }
 411     wc->wcc[ewc].c += last;
 412     wc->wcc[ewc].n++;
 413     if (wc->wcc_all)
 414     {
 415         wc->wc_depth--;
 416         if (ewc == ewcRUN)
 417         {
 418             wallcycle_all_stop(wc, ewc, cycle);
 419         }
 420         else if (wc->wc_depth == 2)
 421         {
 422             wallcycle_all_start(wc, ewc, cycle);
 423         }
 424     }
 425
 426     return last;
 427 }
 428
 429 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 430 {
 431     *n = wc->wcc[ewc].n;
 432     *c = static_cast<double>(wc->wcc[ewc].c);
 433 }
 434
 435 void wallcycle_reset_all(gmx_wallcycle_t wc)
 436 {
 437     int i;
 438
 439     if (wc == nullptr)
 440     {
 441         return;
 442     }
 443
 444     for (i = 0; i < ewcNR; i++)
 445     {
 446         wc->wcc[i].n = 0;
 447         wc->wcc[i].c = 0;
 448     }
 449     wc->haveInvalidCount = FALSE;
 450
 451     if (wc->wcc_all)
 452     {
 453         for (i = 0; i < ewcNR * ewcNR; i++)
 454         {
 455             wc->wcc_all[i].n = 0;
 456             wc->wcc_all[i].c = 0;
 457         }
 458     }
 459     if (wc->wcsc)
 460     {
 461         for (i = 0; i < ewcsNR; i++)
 462         {
 463             wc->wcsc[i].n = 0;
 464             wc->wcsc[i].c = 0;
 465         }
 466     }
 467 }
 468
 469 static gmx_bool is_pme_counter(int ewc)
 470 {
 471     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 472 }
 473
 474 static gmx_bool is_pme_subcounter(int ewc)
 475 {
 476     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 477 }
 478
 479 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 480 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 481 {
 482     if (wcc[ewc_sub].n > 0)
 483     {
 484         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 485         {
 486             wcc[ewc_main].c -= wcc[ewc_sub].c;
 487         }
 488         else
 489         {
 490             /* Something is wrong with the cycle counting */
 491             wcc[ewc_main].c = 0;
 492         }
 493     }
 494 }
 495
 496 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 497 {
 498     if (wc == nullptr)
 499     {
 500         return;
 501     }
 502
 503     for (int i = 0; i < ewcNR; i++)
 504     {
 505         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 506         {
 507             wc->wcc[i].c *= nthreads_pme;
 508
 509             if (wc->wcc_all)
 510             {
 511                 for (int j = 0; j < ewcNR; j++)
 512                 {
 513                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 514                 }
 515             }
 516         }
 517         else
 518         {
 519             wc->wcc[i].c *= nthreads_pp;
 520
 521             if (wc->wcc_all)
 522             {
 523                 for (int j = 0; j < ewcNR; j++)
 524                 {
 525                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 526                 }
 527             }
 528         }
 529     }
 530     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 531     {
 532         for (int i = 0; i < ewcsNR; i++)
 533         {
 534             wc->wcsc[i].c *= nthreads_pp;
 535         }
 536     }
 537 }
 538
 539 /* TODO Make an object for this function to return, containing some
 540  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 541  * wcsc, AND the original wcc for rank 0.
 542  *
 543  * The GPU timing is reported only for rank 0, so we want to preserve
 544  * the original wcycle on that rank. Rank 0 also reports the global
 545  * counts before that, so needs something to contain the global data
 546  * without over-writing the rank-0 data. The current implementation
 547  * uses cycles_sum to manage this, which works OK now because wcsc and
 548  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 549  * for the future. Also, there's no need for MPI_Allreduce, since
 550  * only MASTERRANK uses any of the results. */
 551 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 552 {
 553     WallcycleCounts cycles_sum;
 554     wallcc_t*       wcc;
 555     double          cycles[int(ewcNR) + int(ewcsNR)];
 556 #if GMX_MPI
 557     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 558 #endif
 559     int i;
 560     int nsum;
 561
 562     if (wc == nullptr)
 563     {
 564         /* Default construction of std::array of non-class T can leave
 565            the values indeterminate, just like a C array, and icc
 566            warns about it. */
 567         cycles_sum.fill(0);
 568         return cycles_sum;
 569     }
 570
 571     wcc = wc->wcc;
 572
 573     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 574     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 575
 576     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 577
 578     if (cr->npmenodes == 0)
 579     {
 580         /* All nodes do PME (or no PME at all) */
 581         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 582     }
 583     else
 584     {
 585         /* The are PME-only nodes */
 586         if (wcc[ewcPMEMESH].n > 0)
 587         {
 588             /* This must be a PME only node, calculate the Wait + Comm. time */
 589             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 590                        "Total run ticks must be greater than PME-only ticks");
 591             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 592         }
 593     }
 594
 595     /* Store the cycles in a double buffer for summing */
 596     for (i = 0; i < ewcNR; i++)
 597     {
 598 #if GMX_MPI
 599         cycles_n[i] = static_cast<double>(wcc[i].n);
 600 #endif
 601         cycles[i] = static_cast<double>(wcc[i].c);
 602     }
 603     nsum = ewcNR;
 604     if (wc->wcsc)
 605     {
 606         for (i = 0; i < ewcsNR; i++)
 607         {
 608 #if GMX_MPI
 609             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 610 #endif
 611             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 612         }
 613         nsum += ewcsNR;
 614     }
 615
 616 #if GMX_MPI
 617     if (cr->nnodes > 1)
 618     {
 619         double buf[int(ewcNR) + int(ewcsNR) + 1];
 620
 621         // TODO this code is used only at the end of the run, so we
 622         // can just do a simple reduce of haveInvalidCount in
 623         // wallcycle_print, and avoid bugs
 624         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 625         // TODO Use MPI_Reduce
 626         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 627         for (i = 0; i < ewcNR; i++)
 628         {
 629             wcc[i].n = gmx::roundToInt(buf[i]);
 630         }
 631         wc->haveInvalidCount = (buf[nsum] > 0);
 632         if (wc->wcsc)
 633         {
 634             for (i = 0; i < ewcsNR; i++)
 635             {
 636                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 637             }
 638         }
 639
 640         // TODO Use MPI_Reduce
 641         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 642
 643         if (wc->wcc_all != nullptr)
 644         {
 645             double *buf_all, *cyc_all;
 646
 647             snew(cyc_all, ewcNR * ewcNR);
 648             snew(buf_all, ewcNR * ewcNR);
 649             for (i = 0; i < ewcNR * ewcNR; i++)
 650             {
 651                 cyc_all[i] = wc->wcc_all[i].c;
 652             }
 653             // TODO Use MPI_Reduce
 654             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 655             for (i = 0; i < ewcNR * ewcNR; i++)
 656             {
 657                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 658             }
 659             sfree(buf_all);
 660             sfree(cyc_all);
 661         }
 662     }
 663     else
 664 #endif
 665     {
 666         for (i = 0; i < nsum; i++)
 667         {
 668             cycles_sum[i] = cycles[i];
 669         }
 670     }
 671
 672     return cycles_sum;
 673 }
 674
 675 static void
 676 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 677 {
 678     char   nnodes_str[STRLEN];
 679     char   nthreads_str[STRLEN];
 680     char   ncalls_str[STRLEN];
 681     double wallt;
 682     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 683
 684     if (c_sum > 0)
 685     {
 686         if (ncalls > 0)
 687         {
 688             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 689             if (nnodes < 0)
 690             {
 691                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 692             }
 693             else
 694             {
 695                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 696             }
 697             if (nthreads < 0)
 698             {
 699                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 700             }
 701             else
 702             {
 703                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 704             }
 705         }
 706         else
 707         {
 708             nnodes_str[0]   = 0;
 709             nthreads_str[0] = 0;
 710             ncalls_str[0]   = 0;
 711         }
 712         /* Convert the cycle count to wallclock time for this task */
 713         wallt = c_sum * c2t;
 714
 715         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
 716                 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
 717     }
 718 }
 719
 720 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 721 {
 722     char num[11];
 723     char avg_perf[11];
 724
 725     if (n > 0)
 726     {
 727         snprintf(num, sizeof(num), "%10d", n);
 728         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 729     }
 730     else
 731     {
 732         sprintf(num, "          ");
 733         sprintf(avg_perf, "          ");
 734     }
 735     if (t != tot_t && tot_t > 0)
 736     {
 737         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 738     }
 739     else
 740     {
 741         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 742     }
 743 }
 744
 745 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 746 {
 747     int nrank_tot = nrank_pp + nrank_pme;
 748     if (0 == nrank_pme)
 749     {
 750         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 751         if (nth_pp > 1)
 752         {
 753             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 754         }
 755         /* Don't report doing PP+PME, because we can't tell here if
 756          * this is RF, etc. */
 757     }
 758     else
 759     {
 760         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 761         if (nth_pp > 1)
 762         {
 763             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 764         }
 765         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 766         if (nth_pme > 1)
 767         {
 768             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 769         }
 770     }
 771
 772     fprintf(fplog, "\n\n");
 773     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 774     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 775 }
 776
 777
 778 void wallcycle_print(FILE*                            fplog,
 779                      const gmx::MDLogger&             mdlog,
 780                      int                              nnodes,
 781                      int                              npme,
 782                      int                              nth_pp,
 783                      int                              nth_pme,
 784                      double                           realtime,
 785                      gmx_wallcycle_t                  wc,
 786                      const WallcycleCounts&           cyc_sum,
 787                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 788                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 789 {
 790     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 791     double      c2t, c2t_pp, c2t_pme = 0;
 792     int         i, j, npp, nth_tot;
 793     char        buf[STRLEN];
 794     const char* hline =
 795             "-----------------------------------------------------------------------------";
 796
 797     if (wc == nullptr)
 798     {
 799         return;
 800     }
 801
 802     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 803     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 804     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 805     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 806     npp = nnodes - npme;
 807     /* npme is the number of PME-only ranks used, and we always do PP work */
 808     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 809
 810     nth_tot = npp * nth_pp + npme * nth_pme;
 811
 812     /* When using PME-only nodes, the next line is valid for both
 813        PP-only and PME-only nodes because they started ewcRUN at the
 814        same time. */
 815     tot        = cyc_sum[ewcRUN];
 816     tot_for_pp = 0;
 817
 818     if (tot <= 0.0)
 819     {
 820         /* TODO This is heavy handed, but until someone reworks the
 821            code so that it is provably robust with respect to
 822            non-positive values for all possible timer and cycle
 823            counters, there is less value gained from printing whatever
 824            timing data might still be sensible for some non-Jenkins
 825            run, than is lost from diagnosing Jenkins FP exceptions on
 826            runs about whose execution time we don't care. */
 827         GMX_LOG(mdlog.warning)
 828                 .asParagraph()
 829                 .appendTextFormatted(
 830                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 831                         "time accounting",
 832                         tot);
 833         return;
 834     }
 835
 836     if (wc->haveInvalidCount)
 837     {
 838         GMX_LOG(mdlog.warning)
 839                 .asParagraph()
 840                 .appendText(
 841                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 842                         "between CPU cores that do not have synchronized cycle counters. Will not "
 843                         "print the cycle accounting.");
 844         return;
 845     }
 846
 847
 848     /* Conversion factor from cycles to seconds */
 849     c2t    = realtime / tot;
 850     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 851     if (npme > 0)
 852     {
 853         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 854     }
 855     else
 856     {
 857         c2t_pme = 0;
 858     }
 859
 860     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 861
 862     print_header(fplog, npp, nth_pp, npme, nth_pme);
 863
 864     fprintf(fplog, "%s\n", hline);
 865     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 866     {
 867         if (is_pme_subcounter(i))
 868         {
 869             /* Do not count these at all */
 870         }
 871         else if (npme > 0 && is_pme_counter(i))
 872         {
 873             /* Print timing information for PME-only nodes, but add an
 874              * asterisk so the reader of the table can know that the
 875              * walltimes are not meant to add up. The asterisk still
 876              * fits in the required maximum of 19 characters. */
 877             char buffer[STRLEN];
 878             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 879             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 880         }
 881         else
 882         {
 883             /* Print timing information when it is for a PP or PP+PME
 884                node */
 885             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 886             tot_for_pp += cyc_sum[i];
 887         }
 888     }
 889     if (wc->wcc_all != nullptr)
 890     {
 891         for (i = 0; i < ewcNR; i++)
 892         {
 893             for (j = 0; j < ewcNR; j++)
 894             {
 895                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 896                 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
 897                              wc->wcc_all[i * ewcNR + j].c, tot);
 898             }
 899         }
 900     }
 901     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 902     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 903     fprintf(fplog, "%s\n", hline);
 904     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 905     fprintf(fplog, "%s\n", hline);
 906
 907     if (npme > 0)
 908     {
 909         fprintf(fplog,
 910                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 911                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 912                 "%s\n",
 913                 hline);
 914     }
 915
 916     if (wc->wcc[ewcPMEMESH].n > 0)
 917     {
 918         // A workaround to not print breakdown when no subcounters were recorded.
 919         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 920         std::vector<int> validPmeSubcounterIndices;
 921         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 922         {
 923             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 924             {
 925                 validPmeSubcounterIndices.push_back(i);
 926             }
 927         }
 928
 929         if (!validPmeSubcounterIndices.empty())
 930         {
 931             fprintf(fplog, " Breakdown of PME mesh computation\n");
 932             fprintf(fplog, "%s\n", hline);
 933             for (auto i : validPmeSubcounterIndices)
 934             {
 935                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
 936                              nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 937             }
 938             fprintf(fplog, "%s\n", hline);
 939         }
 940     }
 941
 942     if (useCycleSubcounters && wc->wcsc)
 943     {
 944         fprintf(fplog, " Breakdown of PP computation\n");
 945         fprintf(fplog, "%s\n", hline);
 946         for (i = 0; i < ewcsNR; i++)
 947         {
 948             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 949         }
 950         fprintf(fplog, "%s\n", hline);
 951     }
 952
 953     /* print GPU timing summary */
 954     double tot_gpu = 0.0;
 955     if (gpu_pme_t)
 956     {
 957         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 958         {
 959             tot_gpu += gpu_pme_t->timing[k].t;
 960         }
 961     }
 962     if (gpu_nbnxn_t)
 963     {
 964         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 965                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 966         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 967
 968         /* add up the kernel timings */
 969         for (i = 0; i < 2; i++)
 970         {
 971             for (j = 0; j < 2; j++)
 972             {
 973                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 974             }
 975         }
 976         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 977
 978         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 979         if (wc->wcc[ewcPMEMESH].n > 0)
 980         {
 981             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 982         }
 983         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 984
 985         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 986         fprintf(fplog,
 987                 " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 988         fprintf(fplog, "%s\n", hline);
 989         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 990         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 991
 992         for (i = 0; i < 2; i++)
 993         {
 994             for (j = 0; j < 2; j++)
 995             {
 996                 if (gpu_nbnxn_t->ktime[i][j].c)
 997                 {
 998                     print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
 999                                    gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
1000                 }
1001             }
1002         }
1003         if (gpu_pme_t)
1004         {
1005             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1006             {
1007                 if (gpu_pme_t->timing[k].c)
1008                 {
1009                     print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1010                                    gpu_pme_t->timing[k].t, tot_gpu);
1011                 }
1012             }
1013         }
1014         if (gpu_nbnxn_t->pruneTime.c)
1015         {
1016             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1017                            gpu_nbnxn_t->pruneTime.t, tot_gpu);
1018         }
1019         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1020         fprintf(fplog, "%s\n", hline);
1021         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1022         fprintf(fplog, "%s\n", hline);
1023         if (gpu_nbnxn_t->dynamicPruneTime.c)
1024         {
1025             /* We print the dynamic pruning kernel timings after a separator
1026              * and avoid adding it to tot_gpu as this is not in the force
1027              * overlap. We print the fraction as relative to the rest.
1028              */
1029             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1030                            gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1031             fprintf(fplog, "%s\n", hline);
1032         }
1033         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1034         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1035         {
1036             fprintf(fplog,
1037                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1038                     "%.3f\n",
1039                     tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1040         }
1041
1042         /* only print notes related to CPU-GPU load balance with PME */
1043         if (wc->wcc[ewcPMEMESH].n > 0)
1044         {
1045             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1046
1047             /* print note if the imbalance is high with PME case in which
1048              * CPU-GPU load balancing is possible */
1049             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1050             {
1051                 /* Only the sim master calls this function, so always print to stderr */
1052                 if (gpu_cpu_ratio < 0.8)
1053                 {
1054                     if (npp > 1)
1055                     {
1056                         /* The user could have used -notunepme,
1057                          * but we currently can't check that here.
1058                          */
1059                         GMX_LOG(mdlog.warning)
1060                                 .asParagraph()
1061                                 .appendText(
1062                                         "NOTE: The CPU has >25% more load than the GPU. This "
1063                                         "imbalance wastes\n"
1064                                         "      GPU resources. Maybe the domain decomposition "
1065                                         "limits the PME tuning.\n"
1066                                         "      In that case, try setting the DD grid manually "
1067                                         "(-dd) or lowering -dds.");
1068                     }
1069                     else
1070                     {
1071                         /* We should not end up here, unless the box is
1072                          * too small for increasing the cut-off for PME tuning.
1073                          */
1074                         GMX_LOG(mdlog.warning)
1075                                 .asParagraph()
1076                                 .appendText(
1077                                         "NOTE: The CPU has >25% more load than the GPU. This "
1078                                         "imbalance wastes\n"
1079                                         "      GPU resources.");
1080                     }
1081                 }
1082                 if (gpu_cpu_ratio > 1.25)
1083                 {
1084                     GMX_LOG(mdlog.warning)
1085                             .asParagraph()
1086                             .appendText(
1087                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1088                                     "wastes\n"
1089                                     "      CPU resources.");
1090                 }
1091             }
1092         }
1093     }
1094
1095     if (wc->wc_barrier)
1096     {
1097         GMX_LOG(mdlog.warning)
1098                 .asParagraph()
1099                 .appendText(
1100                         "MPI_Barrier was called before each cycle start/stop\n"
1101                         "call, so timings are not those of real runs.");
1102     }
1103
1104     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1105     {
1106         /* Only the sim master calls this function, so always print to stderr */
1107         if (wc->wcc[ewcDOMDEC].n == 0)
1108         {
1109             GMX_LOG(mdlog.warning)
1110                     .asParagraph()
1111                     .appendTextFormatted(
1112                             "NOTE: %d %% of the run time was spent in pair search,\n"
1113                             "      you might want to increase nstlist (this has no effect on "
1114                             "accuracy)\n",
1115                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1116         }
1117         else
1118         {
1119             GMX_LOG(mdlog.warning)
1120                     .asParagraph()
1121                     .appendTextFormatted(
1122                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1123                             "      %d %% of the run time was spent in pair search,\n"
1124                             "      you might want to increase nstlist (this has no effect on "
1125                             "accuracy)\n",
1126                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1127                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1128         }
1129     }
1130
1131     if (cyc_sum[ewcMoveE] > tot * 0.05)
1132     {
1133         GMX_LOG(mdlog.warning)
1134                 .asParagraph()
1135                 .appendTextFormatted(
1136                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1137                         "      you might want to increase some nst* mdp options\n",
1138                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1139     }
1140 }
1141
1142 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1143 {
1144     if (wc == nullptr)
1145     {
1146         return -1;
1147     }
1148
1149     return wc->reset_counters;
1150 }
1151
1152 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1153 {
1154     if (wc == nullptr)
1155     {
1156         return;
1157     }
1158
1159     wc->reset_counters = reset_counters;
1160 }
1161
1162 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1163 {
1164     if (useCycleSubcounters && wc != nullptr)
1165     {
1166         wc->wcsc[ewcs].start = gmx_cycles_read();
1167     }
1168 }
1169
1170 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1171 {
1172     if (useCycleSubcounters && wc != nullptr)
1173     {
1174         wallcycle_sub_start(wc, ewcs);
1175         wc->wcsc[ewcs].n--;
1176     }
1177 }
1178
1179 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1180 {
1181     if (useCycleSubcounters && wc != nullptr)
1182     {
1183         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1184         wc->wcsc[ewcs].n++;
1185     }
1186 }