src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  62
  63 #ifndef DEBUG_WCYCLE
  64 /*! \brief Enables consistency checking for the counters.
  65  *
  66  * If the macro is set to 1, code checks if you stop a counter different from the last
  67  * one that was opened and if you do nest too deep.
  68  */
  69 #    define DEBUG_WCYCLE 0
  70 #endif
  71 //! Whether wallcycle debugging is enabled
  72 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
  73 //! True if only the master rank should print debugging output
  74 constexpr bool gmx_unused onlyMasterDebugPrints = true;
  75 //! True if cycle counter nesting depth debuggin prints are enabled
  76 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
  77
  78 #if DEBUG_WCYCLE
  79 #    include "gromacs/utility/fatalerror.h"
  80 #endif
  81
  82 typedef struct
  83 {
  84     int          n;
  85     gmx_cycles_t c;
  86     gmx_cycles_t start;
  87 } wallcc_t;
  88
  89 struct gmx_wallcycle
  90 {
  91     wallcc_t* wcc;
  92     /* did we detect one or more invalid cycle counts */
  93     gmx_bool haveInvalidCount;
  94     /* variables for testing/debugging */
  95     gmx_bool  wc_barrier;
  96     wallcc_t* wcc_all;
  97     int       wc_depth;
  98 #if DEBUG_WCYCLE
  99 #    define DEPTH_MAX 6
 100     int  counterlist[DEPTH_MAX];
 101     int  count_depth;
 102     bool isMasterRank;
 103 #endif
 104     int          ewc_prev;
 105     gmx_cycles_t cycle_prev;
 106     int64_t      reset_counters;
 107 #if GMX_MPI
 108     MPI_Comm mpi_comm_mygroup;
 109 #endif
 110     wallcc_t* wcsc;
 111 };
 112
 113 /* Each name should not exceed 19 printing characters
 114    (ie. terminating null can be twentieth) */
 115 static const char* wcn[ewcNR] = { "Run",
 116                                   "Step",
 117                                   "PP during PME",
 118                                   "Domain decomp.",
 119                                   "DD comm. load",
 120                                   "DD comm. bounds",
 121                                   "Vsite constr.",
 122                                   "Send X to PME",
 123                                   "Neighbor search",
 124                                   "Launch GPU ops.",
 125                                   "Comm. coord.",
 126                                   "Force",
 127                                   "Wait + Comm. F",
 128                                   "PME mesh",
 129                                   "PME redist. X/F",
 130                                   "PME spread",
 131                                   "PME gather",
 132                                   "PME 3D-FFT",
 133                                   "PME 3D-FFT Comm.",
 134                                   "PME solve LJ",
 135                                   "PME solve Elec",
 136                                   "PME wait for PP",
 137                                   "Wait + Recv. PME F",
 138                                   "Wait PME GPU spread",
 139                                   "PME 3D-FFT",
 140                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 141                                   "Wait PME GPU gather",
 142                                   "Wait Bonded GPU",
 143                                   "Reduce GPU PME F",
 144                                   "Wait GPU NB nonloc.",
 145                                   "Wait GPU NB local",
 146                                   "Wait GPU state copy",
 147                                   "NB X/F buffer ops.",
 148                                   "Vsite spread",
 149                                   "COM pull force",
 150                                   "AWH",
 151                                   "Write traj.",
 152                                   "Update",
 153                                   "Constraints",
 154                                   "Comm. energies",
 155                                   "Enforced rotation",
 156                                   "Add rot. forces",
 157                                   "Position swapping",
 158                                   "IMD",
 159                                   "Test" };
 160
 161 static const char* wcsn[ewcsNR] = {
 162     "DD redist.",
 163     "DD NS grid + sort",
 164     "DD setup comm.",
 165     "DD make top.",
 166     "DD make constr.",
 167     "DD top. other",
 168     "DD GPU ops.",
 169     "NS grid local",
 170     "NS grid non-loc.",
 171     "NS search local",
 172     "NS search non-loc.",
 173     "Bonded F",
 174     "Bonded-FEP F",
 175     "Restraints F",
 176     "Listed buffer ops.",
 177     "Nonbonded pruning",
 178     "Nonbonded F kernel",
 179     "Nonbonded F clear",
 180     "Nonbonded FEP",
 181     "Launch NB GPU tasks",
 182     "Launch Bonded GPU tasks",
 183     "Launch PME GPU tasks",
 184     "Launch state copy",
 185     "Ewald F correction",
 186     "NB X buffer ops.",
 187     "NB F buffer ops.",
 188     "Clear force buffer",
 189     "Launch GPU NB X buffer ops.",
 190     "Launch GPU NB F buffer ops.",
 191     "Launch GPU Comm. coord.",
 192     "Launch GPU Comm. force.",
 193     "Launch GPU update",
 194     "Test subcounter",
 195 };
 196
 197 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 198 static const char* PMEStageNames[] = {
 199     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 200     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 201 };
 202
 203 gmx_bool wallcycle_have_counter()
 204 {
 205     return gmx_cycles_have_counter();
 206 }
 207
 208 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 209 {
 210     gmx_wallcycle_t wc;
 211
 212
 213     if (!wallcycle_have_counter())
 214     {
 215         return nullptr;
 216     }
 217
 218     snew(wc, 1);
 219
 220     wc->haveInvalidCount = FALSE;
 221     wc->wc_barrier       = FALSE;
 222     wc->wcc_all          = nullptr;
 223     wc->wc_depth         = 0;
 224     wc->ewc_prev         = -1;
 225     wc->reset_counters   = resetstep;
 226
 227
 228 #if GMX_MPI
 229     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 230     {
 231         if (fplog)
 232         {
 233             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 234         }
 235         wc->wc_barrier       = TRUE;
 236         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 237     }
 238 #endif
 239
 240     snew(wc->wcc, ewcNR);
 241     if (getenv("GMX_CYCLE_ALL") != nullptr)
 242     {
 243         if (fplog)
 244         {
 245             fprintf(fplog, "\nWill time all the code during the run\n\n");
 246         }
 247         snew(wc->wcc_all, ewcNR * ewcNR);
 248     }
 249
 250     if (useCycleSubcounters)
 251     {
 252         snew(wc->wcsc, ewcsNR);
 253     }
 254
 255 #if DEBUG_WCYCLE
 256     wc->count_depth  = 0;
 257     wc->isMasterRank = MASTER(cr);
 258 #endif
 259
 260     return wc;
 261 }
 262
 263 void wallcycle_destroy(gmx_wallcycle_t wc)
 264 {
 265     if (wc == nullptr)
 266     {
 267         return;
 268     }
 269
 270     if (wc->wcc != nullptr)
 271     {
 272         sfree(wc->wcc);
 273     }
 274     if (wc->wcc_all != nullptr)
 275     {
 276         sfree(wc->wcc_all);
 277     }
 278     if (wc->wcsc != nullptr)
 279     {
 280         sfree(wc->wcsc);
 281     }
 282     sfree(wc);
 283 }
 284
 285 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 286 {
 287     wc->ewc_prev   = ewc;
 288     wc->cycle_prev = cycle;
 289 }
 290
 291 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 292 {
 293     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 294     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 295 }
 296
 297
 298 #if DEBUG_WCYCLE
 299 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 300 {
 301     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 302     {
 303         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
 304     }
 305     wc->counterlist[wc->count_depth] = ewc;
 306     wc->count_depth++;
 307
 308     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 309     {
 310         std::string indentStr(4 * wc->count_depth, ' ');
 311         fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 312     }
 313 }
 314
 315 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 316 {
 317     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 318     {
 319         std::string indentStr(4 * wc->count_depth, ' ');
 320         fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 321     }
 322
 323     wc->count_depth--;
 324
 325     if (wc->count_depth < 0)
 326     {
 327         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 328     }
 329     if (wc->counterlist[wc->count_depth] != ewc)
 330     {
 331         gmx_fatal(FARGS,
 332                   "wallcycle mismatch at stop, start %s, stop %s",
 333                   wcn[wc->counterlist[wc->count_depth]],
 334                   wcn[ewc]);
 335     }
 336 }
 337 #endif
 338
 339 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 340 {
 341     gmx_cycles_t cycle;
 342
 343     if (wc == nullptr)
 344     {
 345         return;
 346     }
 347
 348 #if GMX_MPI
 349     if (wc->wc_barrier)
 350     {
 351         MPI_Barrier(wc->mpi_comm_mygroup);
 352     }
 353 #endif
 354
 355 #if DEBUG_WCYCLE
 356     debug_start_check(wc, ewc);
 357 #endif
 358
 359     cycle              = gmx_cycles_read();
 360     wc->wcc[ewc].start = cycle;
 361     if (wc->wcc_all != nullptr)
 362     {
 363         wc->wc_depth++;
 364         if (ewc == ewcRUN)
 365         {
 366             wallcycle_all_start(wc, ewc, cycle);
 367         }
 368         else if (wc->wc_depth == 3)
 369         {
 370             wallcycle_all_stop(wc, ewc, cycle);
 371         }
 372     }
 373 }
 374
 375 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 376 {
 377     if (wc == nullptr)
 378     {
 379         return;
 380     }
 381     wc->wcc[ewc].n++;
 382 }
 383
 384 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 385 {
 386     if (wc == nullptr)
 387     {
 388         return;
 389     }
 390
 391     wallcycle_start(wc, ewc);
 392     wc->wcc[ewc].n--;
 393 }
 394
 395 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 396 {
 397     gmx_cycles_t cycle, last;
 398
 399     if (wc == nullptr)
 400     {
 401         return 0;
 402     }
 403
 404 #if GMX_MPI
 405     if (wc->wc_barrier)
 406     {
 407         MPI_Barrier(wc->mpi_comm_mygroup);
 408     }
 409 #endif
 410
 411 #if DEBUG_WCYCLE
 412     debug_stop_check(wc, ewc);
 413 #endif
 414
 415     /* When processes or threads migrate between cores, the cycle counting
 416      * can get messed up if the cycle counter on different cores are not
 417      * synchronized. When this happens we expect both large negative and
 418      * positive cycle differences. We can detect negative cycle differences.
 419      * Detecting too large positive counts if difficult, since count can be
 420      * large, especially for ewcRUN. If we detect a negative count,
 421      * we will not print the cycle accounting table.
 422      */
 423     cycle = gmx_cycles_read();
 424     if (cycle >= wc->wcc[ewc].start)
 425     {
 426         last = cycle - wc->wcc[ewc].start;
 427     }
 428     else
 429     {
 430         last                 = 0;
 431         wc->haveInvalidCount = TRUE;
 432     }
 433     wc->wcc[ewc].c += last;
 434     wc->wcc[ewc].n++;
 435     if (wc->wcc_all)
 436     {
 437         wc->wc_depth--;
 438         if (ewc == ewcRUN)
 439         {
 440             wallcycle_all_stop(wc, ewc, cycle);
 441         }
 442         else if (wc->wc_depth == 2)
 443         {
 444             wallcycle_all_start(wc, ewc, cycle);
 445         }
 446     }
 447
 448     return last;
 449 }
 450
 451 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 452 {
 453     *n = wc->wcc[ewc].n;
 454     *c = static_cast<double>(wc->wcc[ewc].c);
 455 }
 456
 457 void wallcycle_reset_all(gmx_wallcycle_t wc)
 458 {
 459     int i;
 460
 461     if (wc == nullptr)
 462     {
 463         return;
 464     }
 465
 466     for (i = 0; i < ewcNR; i++)
 467     {
 468         wc->wcc[i].n = 0;
 469         wc->wcc[i].c = 0;
 470     }
 471     wc->haveInvalidCount = FALSE;
 472
 473     if (wc->wcc_all)
 474     {
 475         for (i = 0; i < ewcNR * ewcNR; i++)
 476         {
 477             wc->wcc_all[i].n = 0;
 478             wc->wcc_all[i].c = 0;
 479         }
 480     }
 481     if (wc->wcsc)
 482     {
 483         for (i = 0; i < ewcsNR; i++)
 484         {
 485             wc->wcsc[i].n = 0;
 486             wc->wcsc[i].c = 0;
 487         }
 488     }
 489 }
 490
 491 static gmx_bool is_pme_counter(int ewc)
 492 {
 493     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 494 }
 495
 496 static gmx_bool is_pme_subcounter(int ewc)
 497 {
 498     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 499 }
 500
 501 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 502 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 503 {
 504     if (wcc[ewc_sub].n > 0)
 505     {
 506         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 507         {
 508             wcc[ewc_main].c -= wcc[ewc_sub].c;
 509         }
 510         else
 511         {
 512             /* Something is wrong with the cycle counting */
 513             wcc[ewc_main].c = 0;
 514         }
 515     }
 516 }
 517
 518 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 519 {
 520     if (wc == nullptr)
 521     {
 522         return;
 523     }
 524
 525     for (int i = 0; i < ewcNR; i++)
 526     {
 527         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 528         {
 529             wc->wcc[i].c *= nthreads_pme;
 530
 531             if (wc->wcc_all)
 532             {
 533                 for (int j = 0; j < ewcNR; j++)
 534                 {
 535                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 536                 }
 537             }
 538         }
 539         else
 540         {
 541             wc->wcc[i].c *= nthreads_pp;
 542
 543             if (wc->wcc_all)
 544             {
 545                 for (int j = 0; j < ewcNR; j++)
 546                 {
 547                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 548                 }
 549             }
 550         }
 551     }
 552     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 553     {
 554         for (int i = 0; i < ewcsNR; i++)
 555         {
 556             wc->wcsc[i].c *= nthreads_pp;
 557         }
 558     }
 559 }
 560
 561 /* TODO Make an object for this function to return, containing some
 562  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 563  * wcsc, AND the original wcc for rank 0.
 564  *
 565  * The GPU timing is reported only for rank 0, so we want to preserve
 566  * the original wcycle on that rank. Rank 0 also reports the global
 567  * counts before that, so needs something to contain the global data
 568  * without over-writing the rank-0 data. The current implementation
 569  * uses cycles_sum to manage this, which works OK now because wcsc and
 570  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 571  * for the future. Also, there's no need for MPI_Allreduce, since
 572  * only MASTERRANK uses any of the results. */
 573 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 574 {
 575     WallcycleCounts cycles_sum;
 576     wallcc_t*       wcc;
 577     double          cycles[int(ewcNR) + int(ewcsNR)];
 578 #if GMX_MPI
 579     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 580 #endif
 581     int i;
 582     int nsum;
 583
 584     if (wc == nullptr)
 585     {
 586         /* Default construction of std::array of non-class T can leave
 587            the values indeterminate, just like a C array, and icc
 588            warns about it. */
 589         cycles_sum.fill(0);
 590         return cycles_sum;
 591     }
 592
 593     wcc = wc->wcc;
 594
 595     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 596     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 597
 598     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 599
 600     if (cr->npmenodes == 0)
 601     {
 602         /* All nodes do PME (or no PME at all) */
 603         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 604     }
 605     else
 606     {
 607         /* The are PME-only nodes */
 608         if (wcc[ewcPMEMESH].n > 0)
 609         {
 610             /* This must be a PME only node, calculate the Wait + Comm. time */
 611             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 612                        "Total run ticks must be greater than PME-only ticks");
 613             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 614         }
 615     }
 616
 617     /* Store the cycles in a double buffer for summing */
 618     for (i = 0; i < ewcNR; i++)
 619     {
 620 #if GMX_MPI
 621         cycles_n[i] = static_cast<double>(wcc[i].n);
 622 #endif
 623         cycles[i] = static_cast<double>(wcc[i].c);
 624     }
 625     nsum = ewcNR;
 626     if (wc->wcsc)
 627     {
 628         for (i = 0; i < ewcsNR; i++)
 629         {
 630 #if GMX_MPI
 631             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 632 #endif
 633             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 634         }
 635         nsum += ewcsNR;
 636     }
 637
 638 #if GMX_MPI
 639     if (cr->nnodes > 1)
 640     {
 641         double buf[int(ewcNR) + int(ewcsNR) + 1];
 642
 643         // TODO this code is used only at the end of the run, so we
 644         // can just do a simple reduce of haveInvalidCount in
 645         // wallcycle_print, and avoid bugs
 646         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 647         // TODO Use MPI_Reduce
 648         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 649         for (i = 0; i < ewcNR; i++)
 650         {
 651             wcc[i].n = gmx::roundToInt(buf[i]);
 652         }
 653         wc->haveInvalidCount = (buf[nsum] > 0);
 654         if (wc->wcsc)
 655         {
 656             for (i = 0; i < ewcsNR; i++)
 657             {
 658                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 659             }
 660         }
 661
 662         // TODO Use MPI_Reduce
 663         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 664
 665         if (wc->wcc_all != nullptr)
 666         {
 667             double *buf_all, *cyc_all;
 668
 669             snew(cyc_all, ewcNR * ewcNR);
 670             snew(buf_all, ewcNR * ewcNR);
 671             for (i = 0; i < ewcNR * ewcNR; i++)
 672             {
 673                 cyc_all[i] = wc->wcc_all[i].c;
 674             }
 675             // TODO Use MPI_Reduce
 676             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 677             for (i = 0; i < ewcNR * ewcNR; i++)
 678             {
 679                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 680             }
 681             sfree(buf_all);
 682             sfree(cyc_all);
 683         }
 684     }
 685     else
 686 #endif
 687     {
 688         for (i = 0; i < nsum; i++)
 689         {
 690             cycles_sum[i] = cycles[i];
 691         }
 692     }
 693
 694     return cycles_sum;
 695 }
 696
 697 static void
 698 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 699 {
 700     char   nnodes_str[STRLEN];
 701     char   nthreads_str[STRLEN];
 702     char   ncalls_str[STRLEN];
 703     double wallt;
 704     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 705
 706     if (c_sum > 0)
 707     {
 708         if (ncalls > 0)
 709         {
 710             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 711             if (nnodes < 0)
 712             {
 713                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 714             }
 715             else
 716             {
 717                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 718             }
 719             if (nthreads < 0)
 720             {
 721                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 722             }
 723             else
 724             {
 725                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 726             }
 727         }
 728         else
 729         {
 730             nnodes_str[0]   = 0;
 731             nthreads_str[0] = 0;
 732             ncalls_str[0]   = 0;
 733         }
 734         /* Convert the cycle count to wallclock time for this task */
 735         wallt = c_sum * c2t;
 736
 737         fprintf(fplog,
 738                 " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 739                 name,
 740                 nnodes_str,
 741                 nthreads_str,
 742                 ncalls_str,
 743                 wallt,
 744                 c_sum * 1e-9,
 745                 percentage);
 746     }
 747 }
 748
 749 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 750 {
 751     char num[11];
 752     char avg_perf[11];
 753
 754     if (n > 0)
 755     {
 756         snprintf(num, sizeof(num), "%10d", n);
 757         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 758     }
 759     else
 760     {
 761         sprintf(num, "          ");
 762         sprintf(avg_perf, "          ");
 763     }
 764     if (t != tot_t && tot_t > 0)
 765     {
 766         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 767     }
 768     else
 769     {
 770         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 771     }
 772 }
 773
 774 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 775 {
 776     int nrank_tot = nrank_pp + nrank_pme;
 777     if (0 == nrank_pme)
 778     {
 779         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 780         if (nth_pp > 1)
 781         {
 782             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 783         }
 784         /* Don't report doing PP+PME, because we can't tell here if
 785          * this is RF, etc. */
 786     }
 787     else
 788     {
 789         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 790         if (nth_pp > 1)
 791         {
 792             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 793         }
 794         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 795         if (nth_pme > 1)
 796         {
 797             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 798         }
 799     }
 800
 801     fprintf(fplog, "\n\n");
 802     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 803     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 804 }
 805
 806
 807 void wallcycle_print(FILE*                            fplog,
 808                      const gmx::MDLogger&             mdlog,
 809                      int                              nnodes,
 810                      int                              npme,
 811                      int                              nth_pp,
 812                      int                              nth_pme,
 813                      double                           realtime,
 814                      gmx_wallcycle_t                  wc,
 815                      const WallcycleCounts&           cyc_sum,
 816                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 817                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 818 {
 819     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 820     double      c2t, c2t_pp, c2t_pme = 0;
 821     int         i, j, npp, nth_tot;
 822     char        buf[STRLEN];
 823     const char* hline =
 824             "-----------------------------------------------------------------------------";
 825
 826     if (wc == nullptr)
 827     {
 828         return;
 829     }
 830
 831     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 832     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 833     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 834     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 835     npp = nnodes - npme;
 836     /* npme is the number of PME-only ranks used, and we always do PP work */
 837     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 838
 839     nth_tot = npp * nth_pp + npme * nth_pme;
 840
 841     /* When using PME-only nodes, the next line is valid for both
 842        PP-only and PME-only nodes because they started ewcRUN at the
 843        same time. */
 844     tot        = cyc_sum[ewcRUN];
 845     tot_for_pp = 0;
 846
 847     if (tot <= 0.0)
 848     {
 849         /* TODO This is heavy handed, but until someone reworks the
 850            code so that it is provably robust with respect to
 851            non-positive values for all possible timer and cycle
 852            counters, there is less value gained from printing whatever
 853            timing data might still be sensible for some non-Jenkins
 854            run, than is lost from diagnosing Jenkins FP exceptions on
 855            runs about whose execution time we don't care. */
 856         GMX_LOG(mdlog.warning)
 857                 .asParagraph()
 858                 .appendTextFormatted(
 859                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 860                         "time accounting",
 861                         tot);
 862         return;
 863     }
 864
 865     if (wc->haveInvalidCount)
 866     {
 867         GMX_LOG(mdlog.warning)
 868                 .asParagraph()
 869                 .appendText(
 870                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 871                         "between CPU cores that do not have synchronized cycle counters. Will not "
 872                         "print the cycle accounting.");
 873         return;
 874     }
 875
 876
 877     /* Conversion factor from cycles to seconds */
 878     c2t    = realtime / tot;
 879     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 880     if (npme > 0)
 881     {
 882         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 883     }
 884     else
 885     {
 886         c2t_pme = 0;
 887     }
 888
 889     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 890
 891     print_header(fplog, npp, nth_pp, npme, nth_pme);
 892
 893     fprintf(fplog, "%s\n", hline);
 894     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 895     {
 896         if (is_pme_subcounter(i))
 897         {
 898             /* Do not count these at all */
 899         }
 900         else if (npme > 0 && is_pme_counter(i))
 901         {
 902             /* Print timing information for PME-only nodes, but add an
 903              * asterisk so the reader of the table can know that the
 904              * walltimes are not meant to add up. The asterisk still
 905              * fits in the required maximum of 19 characters. */
 906             char buffer[STRLEN];
 907             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 908             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 909         }
 910         else
 911         {
 912             /* Print timing information when it is for a PP or PP+PME
 913                node */
 914             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 915             tot_for_pp += cyc_sum[i];
 916         }
 917     }
 918     if (wc->wcc_all != nullptr)
 919     {
 920         for (i = 0; i < ewcNR; i++)
 921         {
 922             for (j = 0; j < ewcNR; j++)
 923             {
 924                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 925                 print_cycles(fplog,
 926                              c2t_pp,
 927                              buf,
 928                              npp,
 929                              nth_pp,
 930                              wc->wcc_all[i * ewcNR + j].n,
 931                              wc->wcc_all[i * ewcNR + j].c,
 932                              tot);
 933             }
 934         }
 935     }
 936     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 937     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 938     fprintf(fplog, "%s\n", hline);
 939     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 940     fprintf(fplog, "%s\n", hline);
 941
 942     if (npme > 0)
 943     {
 944         fprintf(fplog,
 945                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 946                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 947                 "%s\n",
 948                 hline);
 949     }
 950
 951     if (wc->wcc[ewcPMEMESH].n > 0)
 952     {
 953         // A workaround to not print breakdown when no subcounters were recorded.
 954         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 955         std::vector<int> validPmeSubcounterIndices;
 956         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 957         {
 958             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 959             {
 960                 validPmeSubcounterIndices.push_back(i);
 961             }
 962         }
 963
 964         if (!validPmeSubcounterIndices.empty())
 965         {
 966             fprintf(fplog, " Breakdown of PME mesh computation\n");
 967             fprintf(fplog, "%s\n", hline);
 968             for (auto i : validPmeSubcounterIndices)
 969             {
 970                 print_cycles(fplog,
 971                              npme > 0 ? c2t_pme : c2t_pp,
 972                              wcn[i],
 973                              npme > 0 ? npme : npp,
 974                              nth_pme,
 975                              wc->wcc[i].n,
 976                              cyc_sum[i],
 977                              tot);
 978             }
 979             fprintf(fplog, "%s\n", hline);
 980         }
 981     }
 982
 983     if (useCycleSubcounters && wc->wcsc)
 984     {
 985         fprintf(fplog, " Breakdown of PP computation\n");
 986         fprintf(fplog, "%s\n", hline);
 987         for (i = 0; i < ewcsNR; i++)
 988         {
 989             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 990         }
 991         fprintf(fplog, "%s\n", hline);
 992     }
 993
 994     /* print GPU timing summary */
 995     double tot_gpu = 0.0;
 996     if (gpu_pme_t)
 997     {
 998         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 999         {
1000             tot_gpu += gpu_pme_t->timing[k].t;
1001         }
1002     }
1003     if (gpu_nbnxn_t)
1004     {
1005         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
1006                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
1007         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
1008
1009         /* add up the kernel timings */
1010         for (i = 0; i < 2; i++)
1011         {
1012             for (j = 0; j < 2; j++)
1013             {
1014                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
1015             }
1016         }
1017         tot_gpu += gpu_nbnxn_t->pruneTime.t;
1018
1019         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
1020         if (wc->wcc[ewcPMEMESH].n > 0)
1021         {
1022             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
1023         }
1024         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
1025
1026         fprintf(fplog, "\n GPU timings\n%s\n", hline);
1027         fprintf(fplog,
1028                 " Computing:                         Count  Wall t (s)      ms/step       %c\n",
1029                 '%');
1030         fprintf(fplog, "%s\n", hline);
1031         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
1032         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
1033
1034         for (i = 0; i < 2; i++)
1035         {
1036             for (j = 0; j < 2; j++)
1037             {
1038                 if (gpu_nbnxn_t->ktime[i][j].c)
1039                 {
1040                     print_gputimes(fplog,
1041                                    k_log_str[i][j],
1042                                    gpu_nbnxn_t->ktime[i][j].c,
1043                                    gpu_nbnxn_t->ktime[i][j].t,
1044                                    tot_gpu);
1045                 }
1046             }
1047         }
1048         if (gpu_pme_t)
1049         {
1050             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1051             {
1052                 if (gpu_pme_t->timing[k].c)
1053                 {
1054                     print_gputimes(
1055                             fplog, PMEStageNames[k], gpu_pme_t->timing[k].c, gpu_pme_t->timing[k].t, tot_gpu);
1056                 }
1057             }
1058         }
1059         if (gpu_nbnxn_t->pruneTime.c)
1060         {
1061             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
1062         }
1063         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1064         fprintf(fplog, "%s\n", hline);
1065         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1066         fprintf(fplog, "%s\n", hline);
1067         if (gpu_nbnxn_t->dynamicPruneTime.c)
1068         {
1069             /* We print the dynamic pruning kernel timings after a separator
1070              * and avoid adding it to tot_gpu as this is not in the force
1071              * overlap. We print the fraction as relative to the rest.
1072              */
1073             print_gputimes(fplog,
1074                            "*Dynamic pruning",
1075                            gpu_nbnxn_t->dynamicPruneTime.c,
1076                            gpu_nbnxn_t->dynamicPruneTime.t,
1077                            tot_gpu);
1078             fprintf(fplog, "%s\n", hline);
1079         }
1080         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1081         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1082         {
1083             fprintf(fplog,
1084                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1085                     "%.3f\n",
1086                     tot_gpu / gpu_nbnxn_t->nb_c,
1087                     tot_cpu_overlap / wc->wcc[ewcFORCE].n,
1088                     gpu_cpu_ratio);
1089         }
1090
1091         /* only print notes related to CPU-GPU load balance with PME */
1092         if (wc->wcc[ewcPMEMESH].n > 0)
1093         {
1094             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1095
1096             /* print note if the imbalance is high with PME case in which
1097              * CPU-GPU load balancing is possible */
1098             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1099             {
1100                 /* Only the sim master calls this function, so always print to stderr */
1101                 if (gpu_cpu_ratio < 0.8)
1102                 {
1103                     if (npp > 1)
1104                     {
1105                         /* The user could have used -notunepme,
1106                          * but we currently can't check that here.
1107                          */
1108                         GMX_LOG(mdlog.warning)
1109                                 .asParagraph()
1110                                 .appendText(
1111                                         "NOTE: The CPU has >25% more load than the GPU. This "
1112                                         "imbalance wastes\n"
1113                                         "      GPU resources. Maybe the domain decomposition "
1114                                         "limits the PME tuning.\n"
1115                                         "      In that case, try setting the DD grid manually "
1116                                         "(-dd) or lowering -dds.");
1117                     }
1118                     else
1119                     {
1120                         /* We should not end up here, unless the box is
1121                          * too small for increasing the cut-off for PME tuning.
1122                          */
1123                         GMX_LOG(mdlog.warning)
1124                                 .asParagraph()
1125                                 .appendText(
1126                                         "NOTE: The CPU has >25% more load than the GPU. This "
1127                                         "imbalance wastes\n"
1128                                         "      GPU resources.");
1129                     }
1130                 }
1131                 if (gpu_cpu_ratio > 1.25)
1132                 {
1133                     GMX_LOG(mdlog.warning)
1134                             .asParagraph()
1135                             .appendText(
1136                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1137                                     "wastes\n"
1138                                     "      CPU resources.");
1139                 }
1140             }
1141         }
1142     }
1143
1144     if (wc->wc_barrier)
1145     {
1146         GMX_LOG(mdlog.warning)
1147                 .asParagraph()
1148                 .appendText(
1149                         "MPI_Barrier was called before each cycle start/stop\n"
1150                         "call, so timings are not those of real runs.");
1151     }
1152
1153     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1154     {
1155         /* Only the sim master calls this function, so always print to stderr */
1156         if (wc->wcc[ewcDOMDEC].n == 0)
1157         {
1158             GMX_LOG(mdlog.warning)
1159                     .asParagraph()
1160                     .appendTextFormatted(
1161                             "NOTE: %d %% of the run time was spent in pair search,\n"
1162                             "      you might want to increase nstlist (this has no effect on "
1163                             "accuracy)\n",
1164                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1165         }
1166         else
1167         {
1168             GMX_LOG(mdlog.warning)
1169                     .asParagraph()
1170                     .appendTextFormatted(
1171                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1172                             "      %d %% of the run time was spent in pair search,\n"
1173                             "      you might want to increase nstlist (this has no effect on "
1174                             "accuracy)\n",
1175                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1176                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1177         }
1178     }
1179
1180     if (cyc_sum[ewcMoveE] > tot * 0.05)
1181     {
1182         GMX_LOG(mdlog.warning)
1183                 .asParagraph()
1184                 .appendTextFormatted(
1185                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1186                         "      you might want to increase some nst* mdp options\n",
1187                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1188     }
1189 }
1190
1191 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1192 {
1193     if (wc == nullptr)
1194     {
1195         return -1;
1196     }
1197
1198     return wc->reset_counters;
1199 }
1200
1201 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1202 {
1203     if (wc == nullptr)
1204     {
1205         return;
1206     }
1207
1208     wc->reset_counters = reset_counters;
1209 }
1210
1211 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1212 {
1213     if (useCycleSubcounters && wc != nullptr)
1214     {
1215         wc->wcsc[ewcs].start = gmx_cycles_read();
1216     }
1217 }
1218
1219 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1220 {
1221     if (useCycleSubcounters && wc != nullptr)
1222     {
1223         wallcycle_sub_start(wc, ewcs);
1224         wc->wcsc[ewcs].n--;
1225     }
1226 }
1227
1228 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1229 {
1230     if (useCycleSubcounters && wc != nullptr)
1231     {
1232         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1233         wc->wcsc[ewcs].n++;
1234     }
1235 }