src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  62
  63 #ifndef DEBUG_WCYCLE
  64 /*! \brief Enables consistency checking for the counters.
  65  *
  66  * If the macro is set to 1, code checks if you stop a counter different from the last
  67  * one that was opened and if you do nest too deep.
  68  */
  69 #    define DEBUG_WCYCLE 0
  70 #endif
  71 //! Whether wallcycle debugging is enabled
  72 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
  73 //! True if only the master rank should print debugging output
  74 constexpr bool gmx_unused onlyMasterDebugPrints = true;
  75 //! True if cycle counter nesting depth debuggin prints are enabled
  76 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
  77
  78 #if DEBUG_WCYCLE
  79 #    include "gromacs/utility/fatalerror.h"
  80 #endif
  81
  82 typedef struct
  83 {
  84     int          n;
  85     gmx_cycles_t c;
  86     gmx_cycles_t start;
  87 } wallcc_t;
  88
  89 struct gmx_wallcycle
  90 {
  91     wallcc_t* wcc;
  92     /* did we detect one or more invalid cycle counts */
  93     gmx_bool haveInvalidCount;
  94     /* variables for testing/debugging */
  95     gmx_bool  wc_barrier;
  96     wallcc_t* wcc_all;
  97     int       wc_depth;
  98 #if DEBUG_WCYCLE
  99 #    define DEPTH_MAX 6
 100     int  counterlist[DEPTH_MAX];
 101     int  count_depth;
 102     bool isMasterRank;
 103 #endif
 104     int          ewc_prev;
 105     gmx_cycles_t cycle_prev;
 106     int64_t      reset_counters;
 107 #if GMX_MPI
 108     MPI_Comm mpi_comm_mygroup;
 109 #endif
 110     wallcc_t* wcsc;
 111 };
 112
 113 /* Each name should not exceed 19 printing characters
 114    (ie. terminating null can be twentieth) */
 115 static const char* wcn[ewcNR] = { "Run",
 116                                   "Step",
 117                                   "PP during PME",
 118                                   "Domain decomp.",
 119                                   "DD comm. load",
 120                                   "DD comm. bounds",
 121                                   "Vsite constr.",
 122                                   "Send X to PME",
 123                                   "Neighbor search",
 124                                   "Launch GPU ops.",
 125                                   "Comm. coord.",
 126                                   "Force",
 127                                   "Wait + Comm. F",
 128                                   "PME mesh",
 129                                   "PME redist. X/F",
 130                                   "PME spread",
 131                                   "PME gather",
 132                                   "PME 3D-FFT",
 133                                   "PME 3D-FFT Comm.",
 134                                   "PME solve LJ",
 135                                   "PME solve Elec",
 136                                   "PME wait for PP",
 137                                   "Wait + Recv. PME F",
 138                                   "Wait PME GPU spread",
 139                                   "PME 3D-FFT",
 140                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 141                                   "Wait PME GPU gather",
 142                                   "Wait Bonded GPU",
 143                                   "Reduce GPU PME F",
 144                                   "Wait GPU NB nonloc.",
 145                                   "Wait GPU NB local",
 146                                   "Wait GPU state copy",
 147                                   "NB X/F buffer ops.",
 148                                   "Vsite spread",
 149                                   "COM pull force",
 150                                   "AWH",
 151                                   "Write traj.",
 152                                   "Update",
 153                                   "Constraints",
 154                                   "Comm. energies",
 155                                   "Enforced rotation",
 156                                   "Add rot. forces",
 157                                   "Position swapping",
 158                                   "IMD",
 159                                   "Test" };
 160
 161 static const char* wcsn[ewcsNR] = {
 162     "DD redist.",
 163     "DD NS grid + sort",
 164     "DD setup comm.",
 165     "DD make top.",
 166     "DD make constr.",
 167     "DD top. other",
 168     "DD GPU ops.",
 169     "NS grid local",
 170     "NS grid non-loc.",
 171     "NS search local",
 172     "NS search non-loc.",
 173     "Bonded F",
 174     "Bonded-FEP F",
 175     "Restraints F",
 176     "Listed buffer ops.",
 177     "Nonbonded pruning",
 178     "Nonbonded F kernel",
 179     "Nonbonded F clear",
 180     "Nonbonded FEP",
 181     "Launch NB GPU tasks",
 182     "Launch Bonded GPU tasks",
 183     "Launch PME GPU tasks",
 184     "Launch state copy",
 185     "Ewald F correction",
 186     "NB X buffer ops.",
 187     "NB F buffer ops.",
 188     "Clear force buffer",
 189     "Launch GPU NB X buffer ops.",
 190     "Launch GPU NB F buffer ops.",
 191     "Launch GPU Comm. coord.",
 192     "Launch GPU Comm. force.",
 193     "Launch GPU update",
 194     "Test subcounter",
 195 };
 196
 197 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 198 static const char* enumValuetoString(PmeStage enumValue)
 199 {
 200     constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
 201         "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 202         "PME solve",  "PME 3D-FFT c2r", "PME gather"
 203     };
 204     return pmeStageNames[enumValue];
 205 };
 206
 207 gmx_bool wallcycle_have_counter()
 208 {
 209     return gmx_cycles_have_counter();
 210 }
 211
 212 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 213 {
 214     gmx_wallcycle_t wc;
 215
 216
 217     if (!wallcycle_have_counter())
 218     {
 219         return nullptr;
 220     }
 221
 222     snew(wc, 1);
 223
 224     wc->haveInvalidCount = FALSE;
 225     wc->wc_barrier       = FALSE;
 226     wc->wcc_all          = nullptr;
 227     wc->wc_depth         = 0;
 228     wc->ewc_prev         = -1;
 229     wc->reset_counters   = resetstep;
 230
 231
 232 #if GMX_MPI
 233     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 234     {
 235         if (fplog)
 236         {
 237             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 238         }
 239         wc->wc_barrier       = TRUE;
 240         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 241     }
 242 #endif
 243
 244     snew(wc->wcc, ewcNR);
 245     if (getenv("GMX_CYCLE_ALL") != nullptr)
 246     {
 247         if (fplog)
 248         {
 249             fprintf(fplog, "\nWill time all the code during the run\n\n");
 250         }
 251         snew(wc->wcc_all, ewcNR * ewcNR);
 252     }
 253
 254     if (useCycleSubcounters)
 255     {
 256         snew(wc->wcsc, ewcsNR);
 257     }
 258
 259 #if DEBUG_WCYCLE
 260     wc->count_depth  = 0;
 261     wc->isMasterRank = MASTER(cr);
 262 #endif
 263
 264     return wc;
 265 }
 266
 267 void wallcycle_destroy(gmx_wallcycle_t wc)
 268 {
 269     if (wc == nullptr)
 270     {
 271         return;
 272     }
 273
 274     if (wc->wcc != nullptr)
 275     {
 276         sfree(wc->wcc);
 277     }
 278     if (wc->wcc_all != nullptr)
 279     {
 280         sfree(wc->wcc_all);
 281     }
 282     if (wc->wcsc != nullptr)
 283     {
 284         sfree(wc->wcsc);
 285     }
 286     sfree(wc);
 287 }
 288
 289 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 290 {
 291     wc->ewc_prev   = ewc;
 292     wc->cycle_prev = cycle;
 293 }
 294
 295 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 296 {
 297     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 298     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 299 }
 300
 301
 302 #if DEBUG_WCYCLE
 303 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 304 {
 305     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 306     {
 307         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
 308     }
 309     wc->counterlist[wc->count_depth] = ewc;
 310     wc->count_depth++;
 311
 312     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 313     {
 314         std::string indentStr(4 * wc->count_depth, ' ');
 315         fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 316     }
 317 }
 318
 319 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 320 {
 321     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 322     {
 323         std::string indentStr(4 * wc->count_depth, ' ');
 324         fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 325     }
 326
 327     wc->count_depth--;
 328
 329     if (wc->count_depth < 0)
 330     {
 331         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 332     }
 333     if (wc->counterlist[wc->count_depth] != ewc)
 334     {
 335         gmx_fatal(FARGS,
 336                   "wallcycle mismatch at stop, start %s, stop %s",
 337                   wcn[wc->counterlist[wc->count_depth]],
 338                   wcn[ewc]);
 339     }
 340 }
 341 #endif
 342
 343 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 344 {
 345     gmx_cycles_t cycle;
 346
 347     if (wc == nullptr)
 348     {
 349         return;
 350     }
 351
 352 #if GMX_MPI
 353     if (wc->wc_barrier)
 354     {
 355         MPI_Barrier(wc->mpi_comm_mygroup);
 356     }
 357 #endif
 358
 359 #if DEBUG_WCYCLE
 360     debug_start_check(wc, ewc);
 361 #endif
 362
 363     cycle              = gmx_cycles_read();
 364     wc->wcc[ewc].start = cycle;
 365     if (wc->wcc_all != nullptr)
 366     {
 367         wc->wc_depth++;
 368         if (ewc == ewcRUN)
 369         {
 370             wallcycle_all_start(wc, ewc, cycle);
 371         }
 372         else if (wc->wc_depth == 3)
 373         {
 374             wallcycle_all_stop(wc, ewc, cycle);
 375         }
 376     }
 377 }
 378
 379 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 380 {
 381     if (wc == nullptr)
 382     {
 383         return;
 384     }
 385     wc->wcc[ewc].n++;
 386 }
 387
 388 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 389 {
 390     if (wc == nullptr)
 391     {
 392         return;
 393     }
 394
 395     wallcycle_start(wc, ewc);
 396     wc->wcc[ewc].n--;
 397 }
 398
 399 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 400 {
 401     gmx_cycles_t cycle, last;
 402
 403     if (wc == nullptr)
 404     {
 405         return 0;
 406     }
 407
 408 #if GMX_MPI
 409     if (wc->wc_barrier)
 410     {
 411         MPI_Barrier(wc->mpi_comm_mygroup);
 412     }
 413 #endif
 414
 415 #if DEBUG_WCYCLE
 416     debug_stop_check(wc, ewc);
 417 #endif
 418
 419     /* When processes or threads migrate between cores, the cycle counting
 420      * can get messed up if the cycle counter on different cores are not
 421      * synchronized. When this happens we expect both large negative and
 422      * positive cycle differences. We can detect negative cycle differences.
 423      * Detecting too large positive counts if difficult, since count can be
 424      * large, especially for ewcRUN. If we detect a negative count,
 425      * we will not print the cycle accounting table.
 426      */
 427     cycle = gmx_cycles_read();
 428     if (cycle >= wc->wcc[ewc].start)
 429     {
 430         last = cycle - wc->wcc[ewc].start;
 431     }
 432     else
 433     {
 434         last                 = 0;
 435         wc->haveInvalidCount = TRUE;
 436     }
 437     wc->wcc[ewc].c += last;
 438     wc->wcc[ewc].n++;
 439     if (wc->wcc_all)
 440     {
 441         wc->wc_depth--;
 442         if (ewc == ewcRUN)
 443         {
 444             wallcycle_all_stop(wc, ewc, cycle);
 445         }
 446         else if (wc->wc_depth == 2)
 447         {
 448             wallcycle_all_start(wc, ewc, cycle);
 449         }
 450     }
 451
 452     return last;
 453 }
 454
 455 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 456 {
 457     *n = wc->wcc[ewc].n;
 458     *c = static_cast<double>(wc->wcc[ewc].c);
 459 }
 460
 461 void wallcycle_reset_all(gmx_wallcycle_t wc)
 462 {
 463     int i;
 464
 465     if (wc == nullptr)
 466     {
 467         return;
 468     }
 469
 470     for (i = 0; i < ewcNR; i++)
 471     {
 472         wc->wcc[i].n = 0;
 473         wc->wcc[i].c = 0;
 474     }
 475     wc->haveInvalidCount = FALSE;
 476
 477     if (wc->wcc_all)
 478     {
 479         for (i = 0; i < ewcNR * ewcNR; i++)
 480         {
 481             wc->wcc_all[i].n = 0;
 482             wc->wcc_all[i].c = 0;
 483         }
 484     }
 485     if (wc->wcsc)
 486     {
 487         for (i = 0; i < ewcsNR; i++)
 488         {
 489             wc->wcsc[i].n = 0;
 490             wc->wcsc[i].c = 0;
 491         }
 492     }
 493 }
 494
 495 static gmx_bool is_pme_counter(int ewc)
 496 {
 497     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 498 }
 499
 500 static gmx_bool is_pme_subcounter(int ewc)
 501 {
 502     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 503 }
 504
 505 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 506 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 507 {
 508     if (wcc[ewc_sub].n > 0)
 509     {
 510         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 511         {
 512             wcc[ewc_main].c -= wcc[ewc_sub].c;
 513         }
 514         else
 515         {
 516             /* Something is wrong with the cycle counting */
 517             wcc[ewc_main].c = 0;
 518         }
 519     }
 520 }
 521
 522 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 523 {
 524     if (wc == nullptr)
 525     {
 526         return;
 527     }
 528
 529     for (int i = 0; i < ewcNR; i++)
 530     {
 531         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 532         {
 533             wc->wcc[i].c *= nthreads_pme;
 534
 535             if (wc->wcc_all)
 536             {
 537                 for (int j = 0; j < ewcNR; j++)
 538                 {
 539                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 540                 }
 541             }
 542         }
 543         else
 544         {
 545             wc->wcc[i].c *= nthreads_pp;
 546
 547             if (wc->wcc_all)
 548             {
 549                 for (int j = 0; j < ewcNR; j++)
 550                 {
 551                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 552                 }
 553             }
 554         }
 555     }
 556     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 557     {
 558         for (int i = 0; i < ewcsNR; i++)
 559         {
 560             wc->wcsc[i].c *= nthreads_pp;
 561         }
 562     }
 563 }
 564
 565 /* TODO Make an object for this function to return, containing some
 566  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 567  * wcsc, AND the original wcc for rank 0.
 568  *
 569  * The GPU timing is reported only for rank 0, so we want to preserve
 570  * the original wcycle on that rank. Rank 0 also reports the global
 571  * counts before that, so needs something to contain the global data
 572  * without over-writing the rank-0 data. The current implementation
 573  * uses cycles_sum to manage this, which works OK now because wcsc and
 574  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 575  * for the future. Also, there's no need for MPI_Allreduce, since
 576  * only MASTERRANK uses any of the results. */
 577 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 578 {
 579     WallcycleCounts cycles_sum;
 580     wallcc_t*       wcc;
 581     double          cycles[int(ewcNR) + int(ewcsNR)];
 582 #if GMX_MPI
 583     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 584 #endif
 585     int i;
 586     int nsum;
 587
 588     if (wc == nullptr)
 589     {
 590         /* Default construction of std::array of non-class T can leave
 591            the values indeterminate, just like a C array */
 592         cycles_sum.fill(0);
 593         return cycles_sum;
 594     }
 595
 596     wcc = wc->wcc;
 597
 598     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 599     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 600
 601     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 602
 603     if (cr->npmenodes == 0)
 604     {
 605         /* All nodes do PME (or no PME at all) */
 606         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 607     }
 608     else
 609     {
 610         /* The are PME-only nodes */
 611         if (wcc[ewcPMEMESH].n > 0)
 612         {
 613             /* This must be a PME only node, calculate the Wait + Comm. time */
 614             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 615                        "Total run ticks must be greater than PME-only ticks");
 616             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 617         }
 618     }
 619
 620     /* Store the cycles in a double buffer for summing */
 621     for (i = 0; i < ewcNR; i++)
 622     {
 623 #if GMX_MPI
 624         cycles_n[i] = static_cast<double>(wcc[i].n);
 625 #endif
 626         cycles[i] = static_cast<double>(wcc[i].c);
 627     }
 628     nsum = ewcNR;
 629     if (wc->wcsc)
 630     {
 631         for (i = 0; i < ewcsNR; i++)
 632         {
 633 #if GMX_MPI
 634             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 635 #endif
 636             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 637         }
 638         nsum += ewcsNR;
 639     }
 640
 641 #if GMX_MPI
 642     if (cr->nnodes > 1)
 643     {
 644         double buf[int(ewcNR) + int(ewcsNR) + 1];
 645
 646         // TODO this code is used only at the end of the run, so we
 647         // can just do a simple reduce of haveInvalidCount in
 648         // wallcycle_print, and avoid bugs
 649         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 650         // TODO Use MPI_Reduce
 651         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 652         for (i = 0; i < ewcNR; i++)
 653         {
 654             wcc[i].n = gmx::roundToInt(buf[i]);
 655         }
 656         wc->haveInvalidCount = (buf[nsum] > 0);
 657         if (wc->wcsc)
 658         {
 659             for (i = 0; i < ewcsNR; i++)
 660             {
 661                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 662             }
 663         }
 664
 665         // TODO Use MPI_Reduce
 666         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 667
 668         if (wc->wcc_all != nullptr)
 669         {
 670             double *buf_all, *cyc_all;
 671
 672             snew(cyc_all, ewcNR * ewcNR);
 673             snew(buf_all, ewcNR * ewcNR);
 674             for (i = 0; i < ewcNR * ewcNR; i++)
 675             {
 676                 cyc_all[i] = wc->wcc_all[i].c;
 677             }
 678             // TODO Use MPI_Reduce
 679             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 680             for (i = 0; i < ewcNR * ewcNR; i++)
 681             {
 682                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 683             }
 684             sfree(buf_all);
 685             sfree(cyc_all);
 686         }
 687     }
 688     else
 689 #endif
 690     {
 691         for (i = 0; i < nsum; i++)
 692         {
 693             cycles_sum[i] = cycles[i];
 694         }
 695     }
 696
 697     return cycles_sum;
 698 }
 699
 700 static void
 701 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 702 {
 703     char   nnodes_str[STRLEN];
 704     char   nthreads_str[STRLEN];
 705     char   ncalls_str[STRLEN];
 706     double wallt;
 707     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 708
 709     if (c_sum > 0)
 710     {
 711         if (ncalls > 0)
 712         {
 713             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 714             if (nnodes < 0)
 715             {
 716                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 717             }
 718             else
 719             {
 720                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 721             }
 722             if (nthreads < 0)
 723             {
 724                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 725             }
 726             else
 727             {
 728                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 729             }
 730         }
 731         else
 732         {
 733             nnodes_str[0]   = 0;
 734             nthreads_str[0] = 0;
 735             ncalls_str[0]   = 0;
 736         }
 737         /* Convert the cycle count to wallclock time for this task */
 738         wallt = c_sum * c2t;
 739
 740         fprintf(fplog,
 741                 " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 742                 name,
 743                 nnodes_str,
 744                 nthreads_str,
 745                 ncalls_str,
 746                 wallt,
 747                 c_sum * 1e-9,
 748                 percentage);
 749     }
 750 }
 751
 752 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 753 {
 754     char num[11];
 755     char avg_perf[11];
 756
 757     if (n > 0)
 758     {
 759         snprintf(num, sizeof(num), "%10d", n);
 760         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 761     }
 762     else
 763     {
 764         sprintf(num, "          ");
 765         sprintf(avg_perf, "          ");
 766     }
 767     if (t != tot_t && tot_t > 0)
 768     {
 769         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 770     }
 771     else
 772     {
 773         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 774     }
 775 }
 776
 777 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 778 {
 779     int nrank_tot = nrank_pp + nrank_pme;
 780     if (0 == nrank_pme)
 781     {
 782         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 783         if (nth_pp > 1)
 784         {
 785             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 786         }
 787         /* Don't report doing PP+PME, because we can't tell here if
 788          * this is RF, etc. */
 789     }
 790     else
 791     {
 792         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 793         if (nth_pp > 1)
 794         {
 795             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 796         }
 797         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 798         if (nth_pme > 1)
 799         {
 800             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 801         }
 802     }
 803
 804     fprintf(fplog, "\n\n");
 805     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 806     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 807 }
 808
 809
 810 void wallcycle_print(FILE*                            fplog,
 811                      const gmx::MDLogger&             mdlog,
 812                      int                              nnodes,
 813                      int                              npme,
 814                      int                              nth_pp,
 815                      int                              nth_pme,
 816                      double                           realtime,
 817                      gmx_wallcycle_t                  wc,
 818                      const WallcycleCounts&           cyc_sum,
 819                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 820                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 821 {
 822     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 823     double      c2t, c2t_pp, c2t_pme = 0;
 824     int         i, j, npp, nth_tot;
 825     char        buf[STRLEN];
 826     const char* hline =
 827             "-----------------------------------------------------------------------------";
 828
 829     if (wc == nullptr)
 830     {
 831         return;
 832     }
 833
 834     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 835     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 836     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 837     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 838     npp = nnodes - npme;
 839     /* npme is the number of PME-only ranks used, and we always do PP work */
 840     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 841
 842     nth_tot = npp * nth_pp + npme * nth_pme;
 843
 844     /* When using PME-only nodes, the next line is valid for both
 845        PP-only and PME-only nodes because they started ewcRUN at the
 846        same time. */
 847     tot        = cyc_sum[ewcRUN];
 848     tot_for_pp = 0;
 849
 850     if (tot <= 0.0)
 851     {
 852         /* TODO This is heavy handed, but until someone reworks the
 853            code so that it is provably robust with respect to
 854            non-positive values for all possible timer and cycle
 855            counters, there is less value gained from printing whatever
 856            timing data might still be sensible for some non-Jenkins
 857            run, than is lost from diagnosing Jenkins FP exceptions on
 858            runs about whose execution time we don't care. */
 859         GMX_LOG(mdlog.warning)
 860                 .asParagraph()
 861                 .appendTextFormatted(
 862                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 863                         "time accounting",
 864                         tot);
 865         return;
 866     }
 867
 868     if (wc->haveInvalidCount)
 869     {
 870         GMX_LOG(mdlog.warning)
 871                 .asParagraph()
 872                 .appendText(
 873                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 874                         "between CPU cores that do not have synchronized cycle counters. Will not "
 875                         "print the cycle accounting.");
 876         return;
 877     }
 878
 879
 880     /* Conversion factor from cycles to seconds */
 881     c2t    = realtime / tot;
 882     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 883     if (npme > 0)
 884     {
 885         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 886     }
 887     else
 888     {
 889         c2t_pme = 0;
 890     }
 891
 892     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 893
 894     print_header(fplog, npp, nth_pp, npme, nth_pme);
 895
 896     fprintf(fplog, "%s\n", hline);
 897     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 898     {
 899         if (is_pme_subcounter(i))
 900         {
 901             /* Do not count these at all */
 902         }
 903         else if (npme > 0 && is_pme_counter(i))
 904         {
 905             /* Print timing information for PME-only nodes, but add an
 906              * asterisk so the reader of the table can know that the
 907              * walltimes are not meant to add up. The asterisk still
 908              * fits in the required maximum of 19 characters. */
 909             char buffer[STRLEN];
 910             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 911             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 912         }
 913         else
 914         {
 915             /* Print timing information when it is for a PP or PP+PME
 916                node */
 917             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 918             tot_for_pp += cyc_sum[i];
 919         }
 920     }
 921     if (wc->wcc_all != nullptr)
 922     {
 923         for (i = 0; i < ewcNR; i++)
 924         {
 925             for (j = 0; j < ewcNR; j++)
 926             {
 927                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 928                 print_cycles(fplog,
 929                              c2t_pp,
 930                              buf,
 931                              npp,
 932                              nth_pp,
 933                              wc->wcc_all[i * ewcNR + j].n,
 934                              wc->wcc_all[i * ewcNR + j].c,
 935                              tot);
 936             }
 937         }
 938     }
 939     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 940     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 941     fprintf(fplog, "%s\n", hline);
 942     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 943     fprintf(fplog, "%s\n", hline);
 944
 945     if (npme > 0)
 946     {
 947         fprintf(fplog,
 948                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 949                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 950                 "%s\n",
 951                 hline);
 952     }
 953
 954     if (wc->wcc[ewcPMEMESH].n > 0)
 955     {
 956         // A workaround to not print breakdown when no subcounters were recorded.
 957         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 958         std::vector<int> validPmeSubcounterIndices;
 959         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 960         {
 961             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 962             {
 963                 validPmeSubcounterIndices.push_back(i);
 964             }
 965         }
 966
 967         if (!validPmeSubcounterIndices.empty())
 968         {
 969             fprintf(fplog, " Breakdown of PME mesh computation\n");
 970             fprintf(fplog, "%s\n", hline);
 971             for (auto i : validPmeSubcounterIndices)
 972             {
 973                 print_cycles(fplog,
 974                              npme > 0 ? c2t_pme : c2t_pp,
 975                              wcn[i],
 976                              npme > 0 ? npme : npp,
 977                              nth_pme,
 978                              wc->wcc[i].n,
 979                              cyc_sum[i],
 980                              tot);
 981             }
 982             fprintf(fplog, "%s\n", hline);
 983         }
 984     }
 985
 986     if (useCycleSubcounters && wc->wcsc)
 987     {
 988         fprintf(fplog, " Breakdown of PP computation\n");
 989         fprintf(fplog, "%s\n", hline);
 990         for (i = 0; i < ewcsNR; i++)
 991         {
 992             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 993         }
 994         fprintf(fplog, "%s\n", hline);
 995     }
 996
 997     /* print GPU timing summary */
 998     double tot_gpu = 0.0;
 999     if (gpu_pme_t)
1000     {
1001         for (auto key : keysOf(gpu_pme_t->timing))
1002         {
1003             tot_gpu += gpu_pme_t->timing[key].t;
1004         }
1005     }
1006     if (gpu_nbnxn_t)
1007     {
1008         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
1009                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
1010         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
1011
1012         /* add up the kernel timings */
1013         for (i = 0; i < 2; i++)
1014         {
1015             for (j = 0; j < 2; j++)
1016             {
1017                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
1018             }
1019         }
1020         tot_gpu += gpu_nbnxn_t->pruneTime.t;
1021
1022         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
1023         if (wc->wcc[ewcPMEMESH].n > 0)
1024         {
1025             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
1026         }
1027         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
1028
1029         fprintf(fplog, "\n GPU timings\n%s\n", hline);
1030         fprintf(fplog,
1031                 " Computing:                         Count  Wall t (s)      ms/step       %c\n",
1032                 '%');
1033         fprintf(fplog, "%s\n", hline);
1034         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
1035         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
1036
1037         for (i = 0; i < 2; i++)
1038         {
1039             for (j = 0; j < 2; j++)
1040             {
1041                 if (gpu_nbnxn_t->ktime[i][j].c)
1042                 {
1043                     print_gputimes(fplog,
1044                                    k_log_str[i][j],
1045                                    gpu_nbnxn_t->ktime[i][j].c,
1046                                    gpu_nbnxn_t->ktime[i][j].t,
1047                                    tot_gpu);
1048                 }
1049             }
1050         }
1051         if (gpu_pme_t)
1052         {
1053             for (auto key : keysOf(gpu_pme_t->timing))
1054             {
1055                 if (gpu_pme_t->timing[key].c)
1056                 {
1057                     print_gputimes(fplog,
1058                                    enumValuetoString(key),
1059                                    gpu_pme_t->timing[key].c,
1060                                    gpu_pme_t->timing[key].t,
1061                                    tot_gpu);
1062                 }
1063             }
1064         }
1065         if (gpu_nbnxn_t->pruneTime.c)
1066         {
1067             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
1068         }
1069         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1070         fprintf(fplog, "%s\n", hline);
1071         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1072         fprintf(fplog, "%s\n", hline);
1073         if (gpu_nbnxn_t->dynamicPruneTime.c)
1074         {
1075             /* We print the dynamic pruning kernel timings after a separator
1076              * and avoid adding it to tot_gpu as this is not in the force
1077              * overlap. We print the fraction as relative to the rest.
1078              */
1079             print_gputimes(fplog,
1080                            "*Dynamic pruning",
1081                            gpu_nbnxn_t->dynamicPruneTime.c,
1082                            gpu_nbnxn_t->dynamicPruneTime.t,
1083                            tot_gpu);
1084             fprintf(fplog, "%s\n", hline);
1085         }
1086         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1087         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1088         {
1089             fprintf(fplog,
1090                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1091                     "%.3f\n",
1092                     tot_gpu / gpu_nbnxn_t->nb_c,
1093                     tot_cpu_overlap / wc->wcc[ewcFORCE].n,
1094                     gpu_cpu_ratio);
1095         }
1096
1097         /* only print notes related to CPU-GPU load balance with PME */
1098         if (wc->wcc[ewcPMEMESH].n > 0)
1099         {
1100             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1101
1102             /* print note if the imbalance is high with PME case in which
1103              * CPU-GPU load balancing is possible */
1104             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1105             {
1106                 /* Only the sim master calls this function, so always print to stderr */
1107                 if (gpu_cpu_ratio < 0.8)
1108                 {
1109                     if (npp > 1)
1110                     {
1111                         /* The user could have used -notunepme,
1112                          * but we currently can't check that here.
1113                          */
1114                         GMX_LOG(mdlog.warning)
1115                                 .asParagraph()
1116                                 .appendText(
1117                                         "NOTE: The CPU has >25% more load than the GPU. This "
1118                                         "imbalance wastes\n"
1119                                         "      GPU resources. Maybe the domain decomposition "
1120                                         "limits the PME tuning.\n"
1121                                         "      In that case, try setting the DD grid manually "
1122                                         "(-dd) or lowering -dds.");
1123                     }
1124                     else
1125                     {
1126                         /* We should not end up here, unless the box is
1127                          * too small for increasing the cut-off for PME tuning.
1128                          */
1129                         GMX_LOG(mdlog.warning)
1130                                 .asParagraph()
1131                                 .appendText(
1132                                         "NOTE: The CPU has >25% more load than the GPU. This "
1133                                         "imbalance wastes\n"
1134                                         "      GPU resources.");
1135                     }
1136                 }
1137                 if (gpu_cpu_ratio > 1.25)
1138                 {
1139                     GMX_LOG(mdlog.warning)
1140                             .asParagraph()
1141                             .appendText(
1142                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1143                                     "wastes\n"
1144                                     "      CPU resources.");
1145                 }
1146             }
1147         }
1148     }
1149
1150     if (wc->wc_barrier)
1151     {
1152         GMX_LOG(mdlog.warning)
1153                 .asParagraph()
1154                 .appendText(
1155                         "MPI_Barrier was called before each cycle start/stop\n"
1156                         "call, so timings are not those of real runs.");
1157     }
1158
1159     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1160     {
1161         /* Only the sim master calls this function, so always print to stderr */
1162         if (wc->wcc[ewcDOMDEC].n == 0)
1163         {
1164             GMX_LOG(mdlog.warning)
1165                     .asParagraph()
1166                     .appendTextFormatted(
1167                             "NOTE: %d %% of the run time was spent in pair search,\n"
1168                             "      you might want to increase nstlist (this has no effect on "
1169                             "accuracy)\n",
1170                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1171         }
1172         else
1173         {
1174             GMX_LOG(mdlog.warning)
1175                     .asParagraph()
1176                     .appendTextFormatted(
1177                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1178                             "      %d %% of the run time was spent in pair search,\n"
1179                             "      you might want to increase nstlist (this has no effect on "
1180                             "accuracy)\n",
1181                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1182                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1183         }
1184     }
1185
1186     if (cyc_sum[ewcMoveE] > tot * 0.05)
1187     {
1188         GMX_LOG(mdlog.warning)
1189                 .asParagraph()
1190                 .appendTextFormatted(
1191                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1192                         "      you might want to increase some nst* mdp options\n",
1193                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1194     }
1195 }
1196
1197 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1198 {
1199     if (wc == nullptr)
1200     {
1201         return -1;
1202     }
1203
1204     return wc->reset_counters;
1205 }
1206
1207 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1208 {
1209     if (wc == nullptr)
1210     {
1211         return;
1212     }
1213
1214     wc->reset_counters = reset_counters;
1215 }
1216
1217 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1218 {
1219     if (useCycleSubcounters && wc != nullptr)
1220     {
1221         wc->wcsc[ewcs].start = gmx_cycles_read();
1222     }
1223 }
1224
1225 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1226 {
1227     if (useCycleSubcounters && wc != nullptr)
1228     {
1229         wallcycle_sub_start(wc, ewcs);
1230         wc->wcsc[ewcs].n--;
1231     }
1232 }
1233
1234 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1235 {
1236     if (useCycleSubcounters && wc != nullptr)
1237     {
1238         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1239         wc->wcsc[ewcs].n++;
1240     }
1241 }