src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  62
  63 /* DEBUG_WCYCLE adds consistency checking for the counters.
  64  * It checks if you stop a counter different from the last
  65  * one that was opened and if you do nest too deep.
  66  */
  67 /* #define DEBUG_WCYCLE */
  68
  69 #ifdef DEBUG_WCYCLE
  70 #    include "gromacs/utility/fatalerror.h"
  71 #endif
  72
  73 typedef struct
  74 {
  75     int          n;
  76     gmx_cycles_t c;
  77     gmx_cycles_t start;
  78 } wallcc_t;
  79
  80 struct gmx_wallcycle
  81 {
  82     wallcc_t* wcc;
  83     /* did we detect one or more invalid cycle counts */
  84     gmx_bool haveInvalidCount;
  85     /* variables for testing/debugging */
  86     gmx_bool  wc_barrier;
  87     wallcc_t* wcc_all;
  88     int       wc_depth;
  89 #ifdef DEBUG_WCYCLE
  90 #    define DEPTH_MAX 6
  91     int counterlist[DEPTH_MAX];
  92     int count_depth;
  93 #endif
  94     int          ewc_prev;
  95     gmx_cycles_t cycle_prev;
  96     int64_t      reset_counters;
  97 #if GMX_MPI
  98     MPI_Comm mpi_comm_mygroup;
  99 #endif
 100     wallcc_t* wcsc;
 101 };
 102
 103 /* Each name should not exceed 19 printing characters
 104    (ie. terminating null can be twentieth) */
 105 static const char* wcn[ewcNR] = { "Run",
 106                                   "Step",
 107                                   "PP during PME",
 108                                   "Domain decomp.",
 109                                   "DD comm. load",
 110                                   "DD comm. bounds",
 111                                   "Vsite constr.",
 112                                   "Send X to PME",
 113                                   "Neighbor search",
 114                                   "Launch GPU ops.",
 115                                   "Comm. coord.",
 116                                   "Force",
 117                                   "Wait + Comm. F",
 118                                   "PME mesh",
 119                                   "PME redist. X/F",
 120                                   "PME spread",
 121                                   "PME gather",
 122                                   "PME 3D-FFT",
 123                                   "PME 3D-FFT Comm.",
 124                                   "PME solve LJ",
 125                                   "PME solve Elec",
 126                                   "PME wait for PP",
 127                                   "Wait + Recv. PME F",
 128                                   "Wait PME GPU spread",
 129                                   "PME 3D-FFT",
 130                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 131                                   "Wait PME GPU gather",
 132                                   "Wait Bonded GPU",
 133                                   "Reduce GPU PME F",
 134                                   "Wait GPU NB nonloc.",
 135                                   "Wait GPU NB local",
 136                                   "Wait GPU state copy",
 137                                   "NB X/F buffer ops.",
 138                                   "Vsite spread",
 139                                   "COM pull force",
 140                                   "AWH",
 141                                   "Write traj.",
 142                                   "Update",
 143                                   "Constraints",
 144                                   "Comm. energies",
 145                                   "Enforced rotation",
 146                                   "Add rot. forces",
 147                                   "Position swapping",
 148                                   "IMD",
 149                                   "Test" };
 150
 151 static const char* wcsn[ewcsNR] = {
 152     "DD redist.",
 153     "DD NS grid + sort",
 154     "DD setup comm.",
 155     "DD make top.",
 156     "DD make constr.",
 157     "DD top. other",
 158     "NS grid local",
 159     "NS grid non-loc.",
 160     "NS search local",
 161     "NS search non-loc.",
 162     "Bonded F",
 163     "Bonded-FEP F",
 164     "Restraints F",
 165     "Listed buffer ops.",
 166     "Nonbonded pruning",
 167     "Nonbonded F kernel",
 168     "Nonbonded F clear",
 169     "Nonbonded FEP",
 170     "Launch NB GPU tasks",
 171     "Launch Bonded GPU tasks",
 172     "Launch PME GPU tasks",
 173     "Launch state copy",
 174     "Ewald F correction",
 175     "NB X buffer ops.",
 176     "NB F buffer ops.",
 177     "Clear force buffer",
 178     "Test subcounter",
 179 };
 180
 181 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 182 static const char* PMEStageNames[] = {
 183     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 184     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 185 };
 186
 187 gmx_bool wallcycle_have_counter()
 188 {
 189     return gmx_cycles_have_counter();
 190 }
 191
 192 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 193 {
 194     gmx_wallcycle_t wc;
 195
 196
 197     if (!wallcycle_have_counter())
 198     {
 199         return nullptr;
 200     }
 201
 202     snew(wc, 1);
 203
 204     wc->haveInvalidCount = FALSE;
 205     wc->wc_barrier       = FALSE;
 206     wc->wcc_all          = nullptr;
 207     wc->wc_depth         = 0;
 208     wc->ewc_prev         = -1;
 209     wc->reset_counters   = resetstep;
 210
 211 #if GMX_MPI
 212     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 213     {
 214         if (fplog)
 215         {
 216             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 217         }
 218         wc->wc_barrier       = TRUE;
 219         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 220     }
 221 #endif
 222
 223     snew(wc->wcc, ewcNR);
 224     if (getenv("GMX_CYCLE_ALL") != nullptr)
 225     {
 226         if (fplog)
 227         {
 228             fprintf(fplog, "\nWill time all the code during the run\n\n");
 229         }
 230         snew(wc->wcc_all, ewcNR * ewcNR);
 231     }
 232
 233     if (useCycleSubcounters)
 234     {
 235         snew(wc->wcsc, ewcsNR);
 236     }
 237
 238 #ifdef DEBUG_WCYCLE
 239     wc->count_depth = 0;
 240 #endif
 241
 242     return wc;
 243 }
 244
 245 void wallcycle_destroy(gmx_wallcycle_t wc)
 246 {
 247     if (wc == nullptr)
 248     {
 249         return;
 250     }
 251
 252     if (wc->wcc != nullptr)
 253     {
 254         sfree(wc->wcc);
 255     }
 256     if (wc->wcc_all != nullptr)
 257     {
 258         sfree(wc->wcc_all);
 259     }
 260     if (wc->wcsc != nullptr)
 261     {
 262         sfree(wc->wcsc);
 263     }
 264     sfree(wc);
 265 }
 266
 267 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 268 {
 269     wc->ewc_prev   = ewc;
 270     wc->cycle_prev = cycle;
 271 }
 272
 273 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 274 {
 275     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 276     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 277 }
 278
 279
 280 #ifdef DEBUG_WCYCLE
 281 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 282 {
 283     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 284
 285     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 286     {
 287         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
 288     }
 289     wc->counterlist[wc->count_depth] = ewc;
 290     wc->count_depth++;
 291 }
 292
 293 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 294 {
 295     wc->count_depth--;
 296
 297     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 298
 299     if (wc->count_depth < 0)
 300     {
 301         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
 302                   wc->count_depth);
 303     }
 304     if (wc->counterlist[wc->count_depth] != ewc)
 305     {
 306         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 307                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 308     }
 309 }
 310 #endif
 311
 312 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 313 {
 314     gmx_cycles_t cycle;
 315
 316     if (wc == nullptr)
 317     {
 318         return;
 319     }
 320
 321 #if GMX_MPI
 322     if (wc->wc_barrier)
 323     {
 324         MPI_Barrier(wc->mpi_comm_mygroup);
 325     }
 326 #endif
 327
 328 #ifdef DEBUG_WCYCLE
 329     debug_start_check(wc, ewc);
 330 #endif
 331
 332     cycle              = gmx_cycles_read();
 333     wc->wcc[ewc].start = cycle;
 334     if (wc->wcc_all != nullptr)
 335     {
 336         wc->wc_depth++;
 337         if (ewc == ewcRUN)
 338         {
 339             wallcycle_all_start(wc, ewc, cycle);
 340         }
 341         else if (wc->wc_depth == 3)
 342         {
 343             wallcycle_all_stop(wc, ewc, cycle);
 344         }
 345     }
 346 }
 347
 348 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 349 {
 350     if (wc == nullptr)
 351     {
 352         return;
 353     }
 354     wc->wcc[ewc].n++;
 355 }
 356
 357 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 358 {
 359     if (wc == nullptr)
 360     {
 361         return;
 362     }
 363
 364     wallcycle_start(wc, ewc);
 365     wc->wcc[ewc].n--;
 366 }
 367
 368 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 369 {
 370     gmx_cycles_t cycle, last;
 371
 372     if (wc == nullptr)
 373     {
 374         return 0;
 375     }
 376
 377 #if GMX_MPI
 378     if (wc->wc_barrier)
 379     {
 380         MPI_Barrier(wc->mpi_comm_mygroup);
 381     }
 382 #endif
 383
 384 #ifdef DEBUG_WCYCLE
 385     debug_stop_check(wc, ewc);
 386 #endif
 387
 388     /* When processes or threads migrate between cores, the cycle counting
 389      * can get messed up if the cycle counter on different cores are not
 390      * synchronized. When this happens we expect both large negative and
 391      * positive cycle differences. We can detect negative cycle differences.
 392      * Detecting too large positive counts if difficult, since count can be
 393      * large, especially for ewcRUN. If we detect a negative count,
 394      * we will not print the cycle accounting table.
 395      */
 396     cycle = gmx_cycles_read();
 397     if (cycle >= wc->wcc[ewc].start)
 398     {
 399         last = cycle - wc->wcc[ewc].start;
 400     }
 401     else
 402     {
 403         last                 = 0;
 404         wc->haveInvalidCount = TRUE;
 405     }
 406     wc->wcc[ewc].c += last;
 407     wc->wcc[ewc].n++;
 408     if (wc->wcc_all)
 409     {
 410         wc->wc_depth--;
 411         if (ewc == ewcRUN)
 412         {
 413             wallcycle_all_stop(wc, ewc, cycle);
 414         }
 415         else if (wc->wc_depth == 2)
 416         {
 417             wallcycle_all_start(wc, ewc, cycle);
 418         }
 419     }
 420
 421     return last;
 422 }
 423
 424 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 425 {
 426     *n = wc->wcc[ewc].n;
 427     *c = static_cast<double>(wc->wcc[ewc].c);
 428 }
 429
 430 void wallcycle_reset_all(gmx_wallcycle_t wc)
 431 {
 432     int i;
 433
 434     if (wc == nullptr)
 435     {
 436         return;
 437     }
 438
 439     for (i = 0; i < ewcNR; i++)
 440     {
 441         wc->wcc[i].n = 0;
 442         wc->wcc[i].c = 0;
 443     }
 444     wc->haveInvalidCount = FALSE;
 445
 446     if (wc->wcc_all)
 447     {
 448         for (i = 0; i < ewcNR * ewcNR; i++)
 449         {
 450             wc->wcc_all[i].n = 0;
 451             wc->wcc_all[i].c = 0;
 452         }
 453     }
 454     if (wc->wcsc)
 455     {
 456         for (i = 0; i < ewcsNR; i++)
 457         {
 458             wc->wcsc[i].n = 0;
 459             wc->wcsc[i].c = 0;
 460         }
 461     }
 462 }
 463
 464 static gmx_bool is_pme_counter(int ewc)
 465 {
 466     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 467 }
 468
 469 static gmx_bool is_pme_subcounter(int ewc)
 470 {
 471     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 472 }
 473
 474 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 475 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 476 {
 477     if (wcc[ewc_sub].n > 0)
 478     {
 479         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 480         {
 481             wcc[ewc_main].c -= wcc[ewc_sub].c;
 482         }
 483         else
 484         {
 485             /* Something is wrong with the cycle counting */
 486             wcc[ewc_main].c = 0;
 487         }
 488     }
 489 }
 490
 491 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 492 {
 493     if (wc == nullptr)
 494     {
 495         return;
 496     }
 497
 498     for (int i = 0; i < ewcNR; i++)
 499     {
 500         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 501         {
 502             wc->wcc[i].c *= nthreads_pme;
 503
 504             if (wc->wcc_all)
 505             {
 506                 for (int j = 0; j < ewcNR; j++)
 507                 {
 508                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 509                 }
 510             }
 511         }
 512         else
 513         {
 514             wc->wcc[i].c *= nthreads_pp;
 515
 516             if (wc->wcc_all)
 517             {
 518                 for (int j = 0; j < ewcNR; j++)
 519                 {
 520                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 521                 }
 522             }
 523         }
 524     }
 525     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 526     {
 527         for (int i = 0; i < ewcsNR; i++)
 528         {
 529             wc->wcsc[i].c *= nthreads_pp;
 530         }
 531     }
 532 }
 533
 534 /* TODO Make an object for this function to return, containing some
 535  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 536  * wcsc, AND the original wcc for rank 0.
 537  *
 538  * The GPU timing is reported only for rank 0, so we want to preserve
 539  * the original wcycle on that rank. Rank 0 also reports the global
 540  * counts before that, so needs something to contain the global data
 541  * without over-writing the rank-0 data. The current implementation
 542  * uses cycles_sum to manage this, which works OK now because wcsc and
 543  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 544  * for the future. Also, there's no need for MPI_Allreduce, since
 545  * only MASTERRANK uses any of the results. */
 546 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 547 {
 548     WallcycleCounts cycles_sum;
 549     wallcc_t*       wcc;
 550     double          cycles[ewcNR + ewcsNR];
 551 #if GMX_MPI
 552     double cycles_n[ewcNR + ewcsNR + 1];
 553 #endif
 554     int i;
 555     int nsum;
 556
 557     if (wc == nullptr)
 558     {
 559         /* Default construction of std::array of non-class T can leave
 560            the values indeterminate, just like a C array, and icc
 561            warns about it. */
 562         cycles_sum.fill(0);
 563         return cycles_sum;
 564     }
 565
 566     wcc = wc->wcc;
 567
 568     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 569     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 570
 571     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 572
 573     if (cr->npmenodes == 0)
 574     {
 575         /* All nodes do PME (or no PME at all) */
 576         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 577     }
 578     else
 579     {
 580         /* The are PME-only nodes */
 581         if (wcc[ewcPMEMESH].n > 0)
 582         {
 583             /* This must be a PME only node, calculate the Wait + Comm. time */
 584             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 585                        "Total run ticks must be greater than PME-only ticks");
 586             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 587         }
 588     }
 589
 590     /* Store the cycles in a double buffer for summing */
 591     for (i = 0; i < ewcNR; i++)
 592     {
 593 #if GMX_MPI
 594         cycles_n[i] = static_cast<double>(wcc[i].n);
 595 #endif
 596         cycles[i] = static_cast<double>(wcc[i].c);
 597     }
 598     nsum = ewcNR;
 599     if (wc->wcsc)
 600     {
 601         for (i = 0; i < ewcsNR; i++)
 602         {
 603 #if GMX_MPI
 604             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 605 #endif
 606             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 607         }
 608         nsum += ewcsNR;
 609     }
 610
 611 #if GMX_MPI
 612     if (cr->nnodes > 1)
 613     {
 614         double buf[ewcNR + ewcsNR + 1];
 615
 616         // TODO this code is used only at the end of the run, so we
 617         // can just do a simple reduce of haveInvalidCount in
 618         // wallcycle_print, and avoid bugs
 619         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 620         // TODO Use MPI_Reduce
 621         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 622         for (i = 0; i < ewcNR; i++)
 623         {
 624             wcc[i].n = gmx::roundToInt(buf[i]);
 625         }
 626         wc->haveInvalidCount = (buf[nsum] > 0);
 627         if (wc->wcsc)
 628         {
 629             for (i = 0; i < ewcsNR; i++)
 630             {
 631                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 632             }
 633         }
 634
 635         // TODO Use MPI_Reduce
 636         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 637
 638         if (wc->wcc_all != nullptr)
 639         {
 640             double *buf_all, *cyc_all;
 641
 642             snew(cyc_all, ewcNR * ewcNR);
 643             snew(buf_all, ewcNR * ewcNR);
 644             for (i = 0; i < ewcNR * ewcNR; i++)
 645             {
 646                 cyc_all[i] = wc->wcc_all[i].c;
 647             }
 648             // TODO Use MPI_Reduce
 649             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 650             for (i = 0; i < ewcNR * ewcNR; i++)
 651             {
 652                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 653             }
 654             sfree(buf_all);
 655             sfree(cyc_all);
 656         }
 657     }
 658     else
 659 #endif
 660     {
 661         for (i = 0; i < nsum; i++)
 662         {
 663             cycles_sum[i] = cycles[i];
 664         }
 665     }
 666
 667     return cycles_sum;
 668 }
 669
 670 static void
 671 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 672 {
 673     char   nnodes_str[STRLEN];
 674     char   nthreads_str[STRLEN];
 675     char   ncalls_str[STRLEN];
 676     double wallt;
 677     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 678
 679     if (c_sum > 0)
 680     {
 681         if (ncalls > 0)
 682         {
 683             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 684             if (nnodes < 0)
 685             {
 686                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 687             }
 688             else
 689             {
 690                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 691             }
 692             if (nthreads < 0)
 693             {
 694                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 695             }
 696             else
 697             {
 698                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 699             }
 700         }
 701         else
 702         {
 703             nnodes_str[0]   = 0;
 704             nthreads_str[0] = 0;
 705             ncalls_str[0]   = 0;
 706         }
 707         /* Convert the cycle count to wallclock time for this task */
 708         wallt = c_sum * c2t;
 709
 710         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
 711                 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
 712     }
 713 }
 714
 715 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 716 {
 717     char num[11];
 718     char avg_perf[11];
 719
 720     if (n > 0)
 721     {
 722         snprintf(num, sizeof(num), "%10d", n);
 723         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 724     }
 725     else
 726     {
 727         sprintf(num, "          ");
 728         sprintf(avg_perf, "          ");
 729     }
 730     if (t != tot_t && tot_t > 0)
 731     {
 732         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 733     }
 734     else
 735     {
 736         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 737     }
 738 }
 739
 740 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 741 {
 742     int nrank_tot = nrank_pp + nrank_pme;
 743     if (0 == nrank_pme)
 744     {
 745         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 746         if (nth_pp > 1)
 747         {
 748             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 749         }
 750         /* Don't report doing PP+PME, because we can't tell here if
 751          * this is RF, etc. */
 752     }
 753     else
 754     {
 755         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 756         if (nth_pp > 1)
 757         {
 758             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 759         }
 760         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 761         if (nth_pme > 1)
 762         {
 763             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 764         }
 765     }
 766
 767     fprintf(fplog, "\n\n");
 768     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 769     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 770 }
 771
 772
 773 void wallcycle_print(FILE*                            fplog,
 774                      const gmx::MDLogger&             mdlog,
 775                      int                              nnodes,
 776                      int                              npme,
 777                      int                              nth_pp,
 778                      int                              nth_pme,
 779                      double                           realtime,
 780                      gmx_wallcycle_t                  wc,
 781                      const WallcycleCounts&           cyc_sum,
 782                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 783                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 784 {
 785     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 786     double      c2t, c2t_pp, c2t_pme = 0;
 787     int         i, j, npp, nth_tot;
 788     char        buf[STRLEN];
 789     const char* hline =
 790             "-----------------------------------------------------------------------------";
 791
 792     if (wc == nullptr)
 793     {
 794         return;
 795     }
 796
 797     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 798     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 799     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 800     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 801     npp = nnodes - npme;
 802     /* npme is the number of PME-only ranks used, and we always do PP work */
 803     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 804
 805     nth_tot = npp * nth_pp + npme * nth_pme;
 806
 807     /* When using PME-only nodes, the next line is valid for both
 808        PP-only and PME-only nodes because they started ewcRUN at the
 809        same time. */
 810     tot        = cyc_sum[ewcRUN];
 811     tot_for_pp = 0;
 812
 813     if (tot <= 0.0)
 814     {
 815         /* TODO This is heavy handed, but until someone reworks the
 816            code so that it is provably robust with respect to
 817            non-positive values for all possible timer and cycle
 818            counters, there is less value gained from printing whatever
 819            timing data might still be sensible for some non-Jenkins
 820            run, than is lost from diagnosing Jenkins FP exceptions on
 821            runs about whose execution time we don't care. */
 822         GMX_LOG(mdlog.warning)
 823                 .asParagraph()
 824                 .appendTextFormatted(
 825                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 826                         "time accounting",
 827                         tot);
 828         return;
 829     }
 830
 831     if (wc->haveInvalidCount)
 832     {
 833         GMX_LOG(mdlog.warning)
 834                 .asParagraph()
 835                 .appendText(
 836                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 837                         "between CPU cores that do not have synchronized cycle counters. Will not "
 838                         "print the cycle accounting.");
 839         return;
 840     }
 841
 842
 843     /* Conversion factor from cycles to seconds */
 844     c2t    = realtime / tot;
 845     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 846     if (npme > 0)
 847     {
 848         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 849     }
 850     else
 851     {
 852         c2t_pme = 0;
 853     }
 854
 855     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 856
 857     print_header(fplog, npp, nth_pp, npme, nth_pme);
 858
 859     fprintf(fplog, "%s\n", hline);
 860     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 861     {
 862         if (is_pme_subcounter(i))
 863         {
 864             /* Do not count these at all */
 865         }
 866         else if (npme > 0 && is_pme_counter(i))
 867         {
 868             /* Print timing information for PME-only nodes, but add an
 869              * asterisk so the reader of the table can know that the
 870              * walltimes are not meant to add up. The asterisk still
 871              * fits in the required maximum of 19 characters. */
 872             char buffer[STRLEN];
 873             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 874             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 875         }
 876         else
 877         {
 878             /* Print timing information when it is for a PP or PP+PME
 879                node */
 880             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 881             tot_for_pp += cyc_sum[i];
 882         }
 883     }
 884     if (wc->wcc_all != nullptr)
 885     {
 886         for (i = 0; i < ewcNR; i++)
 887         {
 888             for (j = 0; j < ewcNR; j++)
 889             {
 890                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 891                 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
 892                              wc->wcc_all[i * ewcNR + j].c, tot);
 893             }
 894         }
 895     }
 896     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 897     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 898     fprintf(fplog, "%s\n", hline);
 899     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 900     fprintf(fplog, "%s\n", hline);
 901
 902     if (npme > 0)
 903     {
 904         fprintf(fplog,
 905                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 906                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 907                 "%s\n",
 908                 hline);
 909     }
 910
 911     if (wc->wcc[ewcPMEMESH].n > 0)
 912     {
 913         // A workaround to not print breakdown when no subcounters were recorded.
 914         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 915         std::vector<int> validPmeSubcounterIndices;
 916         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 917         {
 918             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 919             {
 920                 validPmeSubcounterIndices.push_back(i);
 921             }
 922         }
 923
 924         if (!validPmeSubcounterIndices.empty())
 925         {
 926             fprintf(fplog, " Breakdown of PME mesh computation\n");
 927             fprintf(fplog, "%s\n", hline);
 928             for (auto i : validPmeSubcounterIndices)
 929             {
 930                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
 931                              nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 932             }
 933             fprintf(fplog, "%s\n", hline);
 934         }
 935     }
 936
 937     if (useCycleSubcounters && wc->wcsc)
 938     {
 939         fprintf(fplog, " Breakdown of PP computation\n");
 940         fprintf(fplog, "%s\n", hline);
 941         for (i = 0; i < ewcsNR; i++)
 942         {
 943             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 944         }
 945         fprintf(fplog, "%s\n", hline);
 946     }
 947
 948     /* print GPU timing summary */
 949     double tot_gpu = 0.0;
 950     if (gpu_pme_t)
 951     {
 952         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 953         {
 954             tot_gpu += gpu_pme_t->timing[k].t;
 955         }
 956     }
 957     if (gpu_nbnxn_t)
 958     {
 959         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 960                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 961         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 962
 963         /* add up the kernel timings */
 964         for (i = 0; i < 2; i++)
 965         {
 966             for (j = 0; j < 2; j++)
 967             {
 968                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 969             }
 970         }
 971         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 972
 973         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 974         if (wc->wcc[ewcPMEMESH].n > 0)
 975         {
 976             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 977         }
 978         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 979
 980         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 981         fprintf(fplog,
 982                 " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 983         fprintf(fplog, "%s\n", hline);
 984         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 985         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 986
 987         for (i = 0; i < 2; i++)
 988         {
 989             for (j = 0; j < 2; j++)
 990             {
 991                 if (gpu_nbnxn_t->ktime[i][j].c)
 992                 {
 993                     print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
 994                                    gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 995                 }
 996             }
 997         }
 998         if (gpu_pme_t)
 999         {
1000             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1001             {
1002                 if (gpu_pme_t->timing[k].c)
1003                 {
1004                     print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1005                                    gpu_pme_t->timing[k].t, tot_gpu);
1006                 }
1007             }
1008         }
1009         if (gpu_nbnxn_t->pruneTime.c)
1010         {
1011             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1012                            gpu_nbnxn_t->pruneTime.t, tot_gpu);
1013         }
1014         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1015         fprintf(fplog, "%s\n", hline);
1016         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1017         fprintf(fplog, "%s\n", hline);
1018         if (gpu_nbnxn_t->dynamicPruneTime.c)
1019         {
1020             /* We print the dynamic pruning kernel timings after a separator
1021              * and avoid adding it to tot_gpu as this is not in the force
1022              * overlap. We print the fraction as relative to the rest.
1023              */
1024             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1025                            gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1026             fprintf(fplog, "%s\n", hline);
1027         }
1028         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1029         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1030         {
1031             fprintf(fplog,
1032                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1033                     "%.3f\n",
1034                     tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1035         }
1036
1037         /* only print notes related to CPU-GPU load balance with PME */
1038         if (wc->wcc[ewcPMEMESH].n > 0)
1039         {
1040             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1041
1042             /* print note if the imbalance is high with PME case in which
1043              * CPU-GPU load balancing is possible */
1044             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1045             {
1046                 /* Only the sim master calls this function, so always print to stderr */
1047                 if (gpu_cpu_ratio < 0.8)
1048                 {
1049                     if (npp > 1)
1050                     {
1051                         /* The user could have used -notunepme,
1052                          * but we currently can't check that here.
1053                          */
1054                         GMX_LOG(mdlog.warning)
1055                                 .asParagraph()
1056                                 .appendText(
1057                                         "NOTE: The CPU has >25% more load than the GPU. This "
1058                                         "imbalance wastes\n"
1059                                         "      GPU resources. Maybe the domain decomposition "
1060                                         "limits the PME tuning.\n"
1061                                         "      In that case, try setting the DD grid manually "
1062                                         "(-dd) or lowering -dds.");
1063                     }
1064                     else
1065                     {
1066                         /* We should not end up here, unless the box is
1067                          * too small for increasing the cut-off for PME tuning.
1068                          */
1069                         GMX_LOG(mdlog.warning)
1070                                 .asParagraph()
1071                                 .appendText(
1072                                         "NOTE: The CPU has >25% more load than the GPU. This "
1073                                         "imbalance wastes\n"
1074                                         "      GPU resources.");
1075                     }
1076                 }
1077                 if (gpu_cpu_ratio > 1.25)
1078                 {
1079                     GMX_LOG(mdlog.warning)
1080                             .asParagraph()
1081                             .appendText(
1082                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1083                                     "wastes\n"
1084                                     "      CPU resources.");
1085                 }
1086             }
1087         }
1088     }
1089
1090     if (wc->wc_barrier)
1091     {
1092         GMX_LOG(mdlog.warning)
1093                 .asParagraph()
1094                 .appendText(
1095                         "MPI_Barrier was called before each cycle start/stop\n"
1096                         "call, so timings are not those of real runs.");
1097     }
1098
1099     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1100     {
1101         /* Only the sim master calls this function, so always print to stderr */
1102         if (wc->wcc[ewcDOMDEC].n == 0)
1103         {
1104             GMX_LOG(mdlog.warning)
1105                     .asParagraph()
1106                     .appendTextFormatted(
1107                             "NOTE: %d %% of the run time was spent in pair search,\n"
1108                             "      you might want to increase nstlist (this has no effect on "
1109                             "accuracy)\n",
1110                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1111         }
1112         else
1113         {
1114             GMX_LOG(mdlog.warning)
1115                     .asParagraph()
1116                     .appendTextFormatted(
1117                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1118                             "      %d %% of the run time was spent in pair search,\n"
1119                             "      you might want to increase nstlist (this has no effect on "
1120                             "accuracy)\n",
1121                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1122                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1123         }
1124     }
1125
1126     if (cyc_sum[ewcMoveE] > tot * 0.05)
1127     {
1128         GMX_LOG(mdlog.warning)
1129                 .asParagraph()
1130                 .appendTextFormatted(
1131                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1132                         "      you might want to increase some nst* mdp options\n",
1133                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1134     }
1135 }
1136
1137 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1138 {
1139     if (wc == nullptr)
1140     {
1141         return -1;
1142     }
1143
1144     return wc->reset_counters;
1145 }
1146
1147 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1148 {
1149     if (wc == nullptr)
1150     {
1151         return;
1152     }
1153
1154     wc->reset_counters = reset_counters;
1155 }
1156
1157 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1158 {
1159     if (useCycleSubcounters && wc != nullptr)
1160     {
1161         wc->wcsc[ewcs].start = gmx_cycles_read();
1162     }
1163 }
1164
1165 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1166 {
1167     if (useCycleSubcounters && wc != nullptr)
1168     {
1169         wallcycle_sub_start(wc, ewcs);
1170         wc->wcsc[ewcs].n--;
1171     }
1172 }
1173
1174 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1175 {
1176     if (useCycleSubcounters && wc != nullptr)
1177     {
1178         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1179         wc->wcsc[ewcs].n++;
1180     }
1181 }