src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "wallcycle.h"
  40
  41 #include "config.h"
  42
  43 #include <cstdlib>
  44
  45 #include <array>
  46 #include <vector>
  47
  48 #include "gromacs/math/functions.h"
  49 #include "gromacs/mdtypes/commrec.h"
  50 #include "gromacs/timing/cyclecounter.h"
  51 #include "gromacs/timing/gpu_timing.h"
  52 #include "gromacs/timing/wallcyclereporting.h"
  53 #include "gromacs/utility/cstringutil.h"
  54 #include "gromacs/utility/gmxassert.h"
  55 #include "gromacs/utility/gmxmpi.h"
  56 #include "gromacs/utility/logger.h"
  57 #include "gromacs/utility/smalloc.h"
  58 #include "gromacs/utility/snprintf.h"
  59
  60 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  61
  62 /* DEBUG_WCYCLE adds consistency checking for the counters.
  63  * It checks if you stop a counter different from the last
  64  * one that was opened and if you do nest too deep.
  65  */
  66 /* #define DEBUG_WCYCLE */
  67
  68 #ifdef DEBUG_WCYCLE
  69 #    include "gromacs/utility/fatalerror.h"
  70 #endif
  71
  72 typedef struct
  73 {
  74     int          n;
  75     gmx_cycles_t c;
  76     gmx_cycles_t start;
  77 } wallcc_t;
  78
  79 struct gmx_wallcycle
  80 {
  81     wallcc_t* wcc;
  82     /* did we detect one or more invalid cycle counts */
  83     gmx_bool haveInvalidCount;
  84     /* variables for testing/debugging */
  85     gmx_bool  wc_barrier;
  86     wallcc_t* wcc_all;
  87     int       wc_depth;
  88 #ifdef DEBUG_WCYCLE
  89 #    define DEPTH_MAX 6
  90     int counterlist[DEPTH_MAX];
  91     int count_depth;
  92 #endif
  93     int          ewc_prev;
  94     gmx_cycles_t cycle_prev;
  95     int64_t      reset_counters;
  96 #if GMX_MPI
  97     MPI_Comm mpi_comm_mygroup;
  98 #endif
  99     wallcc_t* wcsc;
 100 };
 101
 102 /* Each name should not exceed 19 printing characters
 103    (ie. terminating null can be twentieth) */
 104 static const char* wcn[ewcNR] = { "Run",
 105                                   "Step",
 106                                   "PP during PME",
 107                                   "Domain decomp.",
 108                                   "DD comm. load",
 109                                   "DD comm. bounds",
 110                                   "Vsite constr.",
 111                                   "Send X to PME",
 112                                   "Neighbor search",
 113                                   "Launch GPU ops.",
 114                                   "Comm. coord.",
 115                                   "Force",
 116                                   "Wait + Comm. F",
 117                                   "PME mesh",
 118                                   "PME redist. X/F",
 119                                   "PME spread",
 120                                   "PME gather",
 121                                   "PME 3D-FFT",
 122                                   "PME 3D-FFT Comm.",
 123                                   "PME solve LJ",
 124                                   "PME solve Elec",
 125                                   "PME wait for PP",
 126                                   "Wait + Recv. PME F",
 127                                   "Wait PME GPU spread",
 128                                   "PME 3D-FFT",
 129                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 130                                   "Wait PME GPU gather",
 131                                   "Wait Bonded GPU",
 132                                   "Reduce GPU PME F",
 133                                   "Wait GPU NB nonloc.",
 134                                   "Wait GPU NB local",
 135                                   "NB X/F buffer ops.",
 136                                   "Vsite spread",
 137                                   "COM pull force",
 138                                   "AWH",
 139                                   "Write traj.",
 140                                   "Update",
 141                                   "Constraints",
 142                                   "Comm. energies",
 143                                   "Enforced rotation",
 144                                   "Add rot. forces",
 145                                   "Position swapping",
 146                                   "IMD",
 147                                   "Test" };
 148
 149 static const char* wcsn[ewcsNR] = {
 150     "DD redist.",
 151     "DD NS grid + sort",
 152     "DD setup comm.",
 153     "DD make top.",
 154     "DD make constr.",
 155     "DD top. other",
 156     "NS grid local",
 157     "NS grid non-loc.",
 158     "NS search local",
 159     "NS search non-loc.",
 160     "Bonded F",
 161     "Bonded-FEP F",
 162     "Restraints F",
 163     "Listed buffer ops.",
 164     "Nonbonded pruning",
 165     "Nonbonded F kernel",
 166     "Nonbonded F clear",
 167     "Nonbonded FEP",
 168     "Launch NB GPU tasks",
 169     "Launch Bonded GPU tasks",
 170     "Launch PME GPU tasks",
 171     "Ewald F correction",
 172     "NB X buffer ops.",
 173     "NB F buffer ops.",
 174     "Clear force buffer",
 175     "Test subcounter",
 176 };
 177
 178 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 179 static const char* PMEStageNames[] = {
 180     "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 181     "PME solve",  "PME 3D-FFT c2r", "PME gather",
 182 };
 183
 184 gmx_bool wallcycle_have_counter()
 185 {
 186     return gmx_cycles_have_counter();
 187 }
 188
 189 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
 190 {
 191     gmx_wallcycle_t wc;
 192
 193
 194     if (!wallcycle_have_counter())
 195     {
 196         return nullptr;
 197     }
 198
 199     snew(wc, 1);
 200
 201     wc->haveInvalidCount = FALSE;
 202     wc->wc_barrier       = FALSE;
 203     wc->wcc_all          = nullptr;
 204     wc->wc_depth         = 0;
 205     wc->ewc_prev         = -1;
 206     wc->reset_counters   = resetstep;
 207
 208 #if GMX_MPI
 209     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 210     {
 211         if (fplog)
 212         {
 213             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 214         }
 215         wc->wc_barrier       = TRUE;
 216         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 217     }
 218 #endif
 219
 220     snew(wc->wcc, ewcNR);
 221     if (getenv("GMX_CYCLE_ALL") != nullptr)
 222     {
 223         if (fplog)
 224         {
 225             fprintf(fplog, "\nWill time all the code during the run\n\n");
 226         }
 227         snew(wc->wcc_all, ewcNR * ewcNR);
 228     }
 229
 230     if (useCycleSubcounters)
 231     {
 232         snew(wc->wcsc, ewcsNR);
 233     }
 234
 235 #ifdef DEBUG_WCYCLE
 236     wc->count_depth = 0;
 237 #endif
 238
 239     return wc;
 240 }
 241
 242 void wallcycle_destroy(gmx_wallcycle_t wc)
 243 {
 244     if (wc == nullptr)
 245     {
 246         return;
 247     }
 248
 249     if (wc->wcc != nullptr)
 250     {
 251         sfree(wc->wcc);
 252     }
 253     if (wc->wcc_all != nullptr)
 254     {
 255         sfree(wc->wcc_all);
 256     }
 257     if (wc->wcsc != nullptr)
 258     {
 259         sfree(wc->wcsc);
 260     }
 261     sfree(wc);
 262 }
 263
 264 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 265 {
 266     wc->ewc_prev   = ewc;
 267     wc->cycle_prev = cycle;
 268 }
 269
 270 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 271 {
 272     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
 273     wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
 274 }
 275
 276
 277 #ifdef DEBUG_WCYCLE
 278 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 279 {
 280     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 281
 282     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 283     {
 284         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
 285     }
 286     wc->counterlist[wc->count_depth] = ewc;
 287     wc->count_depth++;
 288 }
 289
 290 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 291 {
 292     wc->count_depth--;
 293
 294     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 295
 296     if (wc->count_depth < 0)
 297     {
 298         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
 299                   wc->count_depth);
 300     }
 301     if (wc->counterlist[wc->count_depth] != ewc)
 302     {
 303         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 304                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 305     }
 306 }
 307 #endif
 308
 309 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 310 {
 311     gmx_cycles_t cycle;
 312
 313     if (wc == nullptr)
 314     {
 315         return;
 316     }
 317
 318 #if GMX_MPI
 319     if (wc->wc_barrier)
 320     {
 321         MPI_Barrier(wc->mpi_comm_mygroup);
 322     }
 323 #endif
 324
 325 #ifdef DEBUG_WCYCLE
 326     debug_start_check(wc, ewc);
 327 #endif
 328
 329     cycle              = gmx_cycles_read();
 330     wc->wcc[ewc].start = cycle;
 331     if (wc->wcc_all != nullptr)
 332     {
 333         wc->wc_depth++;
 334         if (ewc == ewcRUN)
 335         {
 336             wallcycle_all_start(wc, ewc, cycle);
 337         }
 338         else if (wc->wc_depth == 3)
 339         {
 340             wallcycle_all_stop(wc, ewc, cycle);
 341         }
 342     }
 343 }
 344
 345 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
 346 {
 347     if (wc == nullptr)
 348     {
 349         return;
 350     }
 351     wc->wcc[ewc].n++;
 352 }
 353
 354 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 355 {
 356     if (wc == nullptr)
 357     {
 358         return;
 359     }
 360
 361     wallcycle_start(wc, ewc);
 362     wc->wcc[ewc].n--;
 363 }
 364
 365 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 366 {
 367     gmx_cycles_t cycle, last;
 368
 369     if (wc == nullptr)
 370     {
 371         return 0;
 372     }
 373
 374 #if GMX_MPI
 375     if (wc->wc_barrier)
 376     {
 377         MPI_Barrier(wc->mpi_comm_mygroup);
 378     }
 379 #endif
 380
 381 #ifdef DEBUG_WCYCLE
 382     debug_stop_check(wc, ewc);
 383 #endif
 384
 385     /* When processes or threads migrate between cores, the cycle counting
 386      * can get messed up if the cycle counter on different cores are not
 387      * synchronized. When this happens we expect both large negative and
 388      * positive cycle differences. We can detect negative cycle differences.
 389      * Detecting too large positive counts if difficult, since count can be
 390      * large, especially for ewcRUN. If we detect a negative count,
 391      * we will not print the cycle accounting table.
 392      */
 393     cycle = gmx_cycles_read();
 394     if (cycle >= wc->wcc[ewc].start)
 395     {
 396         last = cycle - wc->wcc[ewc].start;
 397     }
 398     else
 399     {
 400         last                 = 0;
 401         wc->haveInvalidCount = TRUE;
 402     }
 403     wc->wcc[ewc].c += last;
 404     wc->wcc[ewc].n++;
 405     if (wc->wcc_all)
 406     {
 407         wc->wc_depth--;
 408         if (ewc == ewcRUN)
 409         {
 410             wallcycle_all_stop(wc, ewc, cycle);
 411         }
 412         else if (wc->wc_depth == 2)
 413         {
 414             wallcycle_all_start(wc, ewc, cycle);
 415         }
 416     }
 417
 418     return last;
 419 }
 420
 421 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 422 {
 423     *n = wc->wcc[ewc].n;
 424     *c = static_cast<double>(wc->wcc[ewc].c);
 425 }
 426
 427 void wallcycle_reset_all(gmx_wallcycle_t wc)
 428 {
 429     int i;
 430
 431     if (wc == nullptr)
 432     {
 433         return;
 434     }
 435
 436     for (i = 0; i < ewcNR; i++)
 437     {
 438         wc->wcc[i].n = 0;
 439         wc->wcc[i].c = 0;
 440     }
 441     wc->haveInvalidCount = FALSE;
 442
 443     if (wc->wcc_all)
 444     {
 445         for (i = 0; i < ewcNR * ewcNR; i++)
 446         {
 447             wc->wcc_all[i].n = 0;
 448             wc->wcc_all[i].c = 0;
 449         }
 450     }
 451     if (wc->wcsc)
 452     {
 453         for (i = 0; i < ewcsNR; i++)
 454         {
 455             wc->wcsc[i].n = 0;
 456             wc->wcsc[i].c = 0;
 457         }
 458     }
 459 }
 460
 461 static gmx_bool is_pme_counter(int ewc)
 462 {
 463     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 464 }
 465
 466 static gmx_bool is_pme_subcounter(int ewc)
 467 {
 468     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 469 }
 470
 471 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 472 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 473 {
 474     if (wcc[ewc_sub].n > 0)
 475     {
 476         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 477         {
 478             wcc[ewc_main].c -= wcc[ewc_sub].c;
 479         }
 480         else
 481         {
 482             /* Something is wrong with the cycle counting */
 483             wcc[ewc_main].c = 0;
 484         }
 485     }
 486 }
 487
 488 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 489 {
 490     if (wc == nullptr)
 491     {
 492         return;
 493     }
 494
 495     for (int i = 0; i < ewcNR; i++)
 496     {
 497         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 498         {
 499             wc->wcc[i].c *= nthreads_pme;
 500
 501             if (wc->wcc_all)
 502             {
 503                 for (int j = 0; j < ewcNR; j++)
 504                 {
 505                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 506                 }
 507             }
 508         }
 509         else
 510         {
 511             wc->wcc[i].c *= nthreads_pp;
 512
 513             if (wc->wcc_all)
 514             {
 515                 for (int j = 0; j < ewcNR; j++)
 516                 {
 517                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 518                 }
 519             }
 520         }
 521     }
 522     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 523     {
 524         for (int i = 0; i < ewcsNR; i++)
 525         {
 526             wc->wcsc[i].c *= nthreads_pp;
 527         }
 528     }
 529 }
 530
 531 /* TODO Make an object for this function to return, containing some
 532  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 533  * wcsc, AND the original wcc for rank 0.
 534  *
 535  * The GPU timing is reported only for rank 0, so we want to preserve
 536  * the original wcycle on that rank. Rank 0 also reports the global
 537  * counts before that, so needs something to contain the global data
 538  * without over-writing the rank-0 data. The current implementation
 539  * uses cycles_sum to manage this, which works OK now because wcsc and
 540  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 541  * for the future. Also, there's no need for MPI_Allreduce, since
 542  * only MASTERRANK uses any of the results. */
 543 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 544 {
 545     WallcycleCounts cycles_sum;
 546     wallcc_t*       wcc;
 547     double          cycles[ewcNR + ewcsNR];
 548 #if GMX_MPI
 549     double cycles_n[ewcNR + ewcsNR + 1];
 550 #endif
 551     int i;
 552     int nsum;
 553
 554     if (wc == nullptr)
 555     {
 556         /* Default construction of std::array of non-class T can leave
 557            the values indeterminate, just like a C array, and icc
 558            warns about it. */
 559         cycles_sum.fill(0);
 560         return cycles_sum;
 561     }
 562
 563     wcc = wc->wcc;
 564
 565     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 566     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 567
 568     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 569
 570     if (cr->npmenodes == 0)
 571     {
 572         /* All nodes do PME (or no PME at all) */
 573         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 574     }
 575     else
 576     {
 577         /* The are PME-only nodes */
 578         if (wcc[ewcPMEMESH].n > 0)
 579         {
 580             /* This must be a PME only node, calculate the Wait + Comm. time */
 581             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 582                        "Total run ticks must be greater than PME-only ticks");
 583             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 584         }
 585     }
 586
 587     /* Store the cycles in a double buffer for summing */
 588     for (i = 0; i < ewcNR; i++)
 589     {
 590 #if GMX_MPI
 591         cycles_n[i] = static_cast<double>(wcc[i].n);
 592 #endif
 593         cycles[i] = static_cast<double>(wcc[i].c);
 594     }
 595     nsum = ewcNR;
 596     if (wc->wcsc)
 597     {
 598         for (i = 0; i < ewcsNR; i++)
 599         {
 600 #if GMX_MPI
 601             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 602 #endif
 603             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 604         }
 605         nsum += ewcsNR;
 606     }
 607
 608 #if GMX_MPI
 609     if (cr->nnodes > 1)
 610     {
 611         double buf[ewcNR + ewcsNR + 1];
 612
 613         // TODO this code is used only at the end of the run, so we
 614         // can just do a simple reduce of haveInvalidCount in
 615         // wallcycle_print, and avoid bugs
 616         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 617         // TODO Use MPI_Reduce
 618         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 619         for (i = 0; i < ewcNR; i++)
 620         {
 621             wcc[i].n = gmx::roundToInt(buf[i]);
 622         }
 623         wc->haveInvalidCount = (buf[nsum] > 0);
 624         if (wc->wcsc)
 625         {
 626             for (i = 0; i < ewcsNR; i++)
 627             {
 628                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 629             }
 630         }
 631
 632         // TODO Use MPI_Reduce
 633         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 634
 635         if (wc->wcc_all != nullptr)
 636         {
 637             double *buf_all, *cyc_all;
 638
 639             snew(cyc_all, ewcNR * ewcNR);
 640             snew(buf_all, ewcNR * ewcNR);
 641             for (i = 0; i < ewcNR * ewcNR; i++)
 642             {
 643                 cyc_all[i] = wc->wcc_all[i].c;
 644             }
 645             // TODO Use MPI_Reduce
 646             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 647             for (i = 0; i < ewcNR * ewcNR; i++)
 648             {
 649                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 650             }
 651             sfree(buf_all);
 652             sfree(cyc_all);
 653         }
 654     }
 655     else
 656 #endif
 657     {
 658         for (i = 0; i < nsum; i++)
 659         {
 660             cycles_sum[i] = cycles[i];
 661         }
 662     }
 663
 664     return cycles_sum;
 665 }
 666
 667 static void
 668 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 669 {
 670     char   nnodes_str[STRLEN];
 671     char   nthreads_str[STRLEN];
 672     char   ncalls_str[STRLEN];
 673     double wallt;
 674     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 675
 676     if (c_sum > 0)
 677     {
 678         if (ncalls > 0)
 679         {
 680             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 681             if (nnodes < 0)
 682             {
 683                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 684             }
 685             else
 686             {
 687                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 688             }
 689             if (nthreads < 0)
 690             {
 691                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 692             }
 693             else
 694             {
 695                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 696             }
 697         }
 698         else
 699         {
 700             nnodes_str[0]   = 0;
 701             nthreads_str[0] = 0;
 702             ncalls_str[0]   = 0;
 703         }
 704         /* Convert the cycle count to wallclock time for this task */
 705         wallt = c_sum * c2t;
 706
 707         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
 708                 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
 709     }
 710 }
 711
 712 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 713 {
 714     char num[11];
 715     char avg_perf[11];
 716
 717     if (n > 0)
 718     {
 719         snprintf(num, sizeof(num), "%10d", n);
 720         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 721     }
 722     else
 723     {
 724         sprintf(num, "          ");
 725         sprintf(avg_perf, "          ");
 726     }
 727     if (t != tot_t && tot_t > 0)
 728     {
 729         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 730     }
 731     else
 732     {
 733         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 734     }
 735 }
 736
 737 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 738 {
 739     int nrank_tot = nrank_pp + nrank_pme;
 740     if (0 == nrank_pme)
 741     {
 742         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 743         if (nth_pp > 1)
 744         {
 745             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 746         }
 747         /* Don't report doing PP+PME, because we can't tell here if
 748          * this is RF, etc. */
 749     }
 750     else
 751     {
 752         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 753         if (nth_pp > 1)
 754         {
 755             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 756         }
 757         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 758         if (nth_pme > 1)
 759         {
 760             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 761         }
 762     }
 763
 764     fprintf(fplog, "\n\n");
 765     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 766     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 767 }
 768
 769
 770 void wallcycle_print(FILE*                            fplog,
 771                      const gmx::MDLogger&             mdlog,
 772                      int                              nnodes,
 773                      int                              npme,
 774                      int                              nth_pp,
 775                      int                              nth_pme,
 776                      double                           realtime,
 777                      gmx_wallcycle_t                  wc,
 778                      const WallcycleCounts&           cyc_sum,
 779                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 780                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 781 {
 782     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 783     double      c2t, c2t_pp, c2t_pme = 0;
 784     int         i, j, npp, nth_tot;
 785     char        buf[STRLEN];
 786     const char* hline =
 787             "-----------------------------------------------------------------------------";
 788
 789     if (wc == nullptr)
 790     {
 791         return;
 792     }
 793
 794     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 795     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 796     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 797     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 798     npp = nnodes - npme;
 799     /* npme is the number of PME-only ranks used, and we always do PP work */
 800     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 801
 802     nth_tot = npp * nth_pp + npme * nth_pme;
 803
 804     /* When using PME-only nodes, the next line is valid for both
 805        PP-only and PME-only nodes because they started ewcRUN at the
 806        same time. */
 807     tot        = cyc_sum[ewcRUN];
 808     tot_for_pp = 0;
 809
 810     if (tot <= 0.0)
 811     {
 812         /* TODO This is heavy handed, but until someone reworks the
 813            code so that it is provably robust with respect to
 814            non-positive values for all possible timer and cycle
 815            counters, there is less value gained from printing whatever
 816            timing data might still be sensible for some non-Jenkins
 817            run, than is lost from diagnosing Jenkins FP exceptions on
 818            runs about whose execution time we don't care. */
 819         GMX_LOG(mdlog.warning)
 820                 .asParagraph()
 821                 .appendTextFormatted(
 822                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 823                         "time accounting",
 824                         tot);
 825         return;
 826     }
 827
 828     if (wc->haveInvalidCount)
 829     {
 830         GMX_LOG(mdlog.warning)
 831                 .asParagraph()
 832                 .appendText(
 833                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 834                         "between CPU cores that do not have synchronized cycle counters. Will not "
 835                         "print the cycle accounting.");
 836         return;
 837     }
 838
 839
 840     /* Conversion factor from cycles to seconds */
 841     c2t    = realtime / tot;
 842     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 843     if (npme > 0)
 844     {
 845         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 846     }
 847     else
 848     {
 849         c2t_pme = 0;
 850     }
 851
 852     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 853
 854     print_header(fplog, npp, nth_pp, npme, nth_pme);
 855
 856     fprintf(fplog, "%s\n", hline);
 857     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 858     {
 859         if (is_pme_subcounter(i))
 860         {
 861             /* Do not count these at all */
 862         }
 863         else if (npme > 0 && is_pme_counter(i))
 864         {
 865             /* Print timing information for PME-only nodes, but add an
 866              * asterisk so the reader of the table can know that the
 867              * walltimes are not meant to add up. The asterisk still
 868              * fits in the required maximum of 19 characters. */
 869             char buffer[STRLEN];
 870             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 871             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 872         }
 873         else
 874         {
 875             /* Print timing information when it is for a PP or PP+PME
 876                node */
 877             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 878             tot_for_pp += cyc_sum[i];
 879         }
 880     }
 881     if (wc->wcc_all != nullptr)
 882     {
 883         for (i = 0; i < ewcNR; i++)
 884         {
 885             for (j = 0; j < ewcNR; j++)
 886             {
 887                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 888                 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
 889                              wc->wcc_all[i * ewcNR + j].c, tot);
 890             }
 891         }
 892     }
 893     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 894     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 895     fprintf(fplog, "%s\n", hline);
 896     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 897     fprintf(fplog, "%s\n", hline);
 898
 899     if (npme > 0)
 900     {
 901         fprintf(fplog,
 902                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 903                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 904                 "%s\n",
 905                 hline);
 906     }
 907
 908     if (wc->wcc[ewcPMEMESH].n > 0)
 909     {
 910         // A workaround to not print breakdown when no subcounters were recorded.
 911         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 912         std::vector<int> validPmeSubcounterIndices;
 913         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 914         {
 915             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 916             {
 917                 validPmeSubcounterIndices.push_back(i);
 918             }
 919         }
 920
 921         if (!validPmeSubcounterIndices.empty())
 922         {
 923             fprintf(fplog, " Breakdown of PME mesh computation\n");
 924             fprintf(fplog, "%s\n", hline);
 925             for (auto i : validPmeSubcounterIndices)
 926             {
 927                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
 928                              nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 929             }
 930             fprintf(fplog, "%s\n", hline);
 931         }
 932     }
 933
 934     if (useCycleSubcounters && wc->wcsc)
 935     {
 936         fprintf(fplog, " Breakdown of PP computation\n");
 937         fprintf(fplog, "%s\n", hline);
 938         for (i = 0; i < ewcsNR; i++)
 939         {
 940             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 941         }
 942         fprintf(fplog, "%s\n", hline);
 943     }
 944
 945     /* print GPU timing summary */
 946     double tot_gpu = 0.0;
 947     if (gpu_pme_t)
 948     {
 949         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 950         {
 951             tot_gpu += gpu_pme_t->timing[k].t;
 952         }
 953     }
 954     if (gpu_nbnxn_t)
 955     {
 956         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 957                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 958         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 959
 960         /* add up the kernel timings */
 961         for (i = 0; i < 2; i++)
 962         {
 963             for (j = 0; j < 2; j++)
 964             {
 965                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 966             }
 967         }
 968         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 969
 970         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 971         if (wc->wcc[ewcPMEMESH].n > 0)
 972         {
 973             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 974         }
 975         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 976
 977         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 978         fprintf(fplog,
 979                 " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 980         fprintf(fplog, "%s\n", hline);
 981         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 982         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 983
 984         for (i = 0; i < 2; i++)
 985         {
 986             for (j = 0; j < 2; j++)
 987             {
 988                 if (gpu_nbnxn_t->ktime[i][j].c)
 989                 {
 990                     print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
 991                                    gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 992                 }
 993             }
 994         }
 995         if (gpu_pme_t)
 996         {
 997             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 998             {
 999                 if (gpu_pme_t->timing[k].c)
1000                 {
1001                     print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1002                                    gpu_pme_t->timing[k].t, tot_gpu);
1003                 }
1004             }
1005         }
1006         if (gpu_nbnxn_t->pruneTime.c)
1007         {
1008             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1009                            gpu_nbnxn_t->pruneTime.t, tot_gpu);
1010         }
1011         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1012         fprintf(fplog, "%s\n", hline);
1013         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1014         fprintf(fplog, "%s\n", hline);
1015         if (gpu_nbnxn_t->dynamicPruneTime.c)
1016         {
1017             /* We print the dynamic pruning kernel timings after a separator
1018              * and avoid adding it to tot_gpu as this is not in the force
1019              * overlap. We print the fraction as relative to the rest.
1020              */
1021             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1022                            gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1023             fprintf(fplog, "%s\n", hline);
1024         }
1025         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1026         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1027         {
1028             fprintf(fplog,
1029                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1030                     "%.3f\n",
1031                     tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1032         }
1033
1034         /* only print notes related to CPU-GPU load balance with PME */
1035         if (wc->wcc[ewcPMEMESH].n > 0)
1036         {
1037             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1038
1039             /* print note if the imbalance is high with PME case in which
1040              * CPU-GPU load balancing is possible */
1041             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1042             {
1043                 /* Only the sim master calls this function, so always print to stderr */
1044                 if (gpu_cpu_ratio < 0.8)
1045                 {
1046                     if (npp > 1)
1047                     {
1048                         /* The user could have used -notunepme,
1049                          * but we currently can't check that here.
1050                          */
1051                         GMX_LOG(mdlog.warning)
1052                                 .asParagraph()
1053                                 .appendText(
1054                                         "NOTE: The CPU has >25% more load than the GPU. This "
1055                                         "imbalance wastes\n"
1056                                         "      GPU resources. Maybe the domain decomposition "
1057                                         "limits the PME tuning.\n"
1058                                         "      In that case, try setting the DD grid manually "
1059                                         "(-dd) or lowering -dds.");
1060                     }
1061                     else
1062                     {
1063                         /* We should not end up here, unless the box is
1064                          * too small for increasing the cut-off for PME tuning.
1065                          */
1066                         GMX_LOG(mdlog.warning)
1067                                 .asParagraph()
1068                                 .appendText(
1069                                         "NOTE: The CPU has >25% more load than the GPU. This "
1070                                         "imbalance wastes\n"
1071                                         "      GPU resources.");
1072                     }
1073                 }
1074                 if (gpu_cpu_ratio > 1.25)
1075                 {
1076                     GMX_LOG(mdlog.warning)
1077                             .asParagraph()
1078                             .appendText(
1079                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1080                                     "wastes\n"
1081                                     "      CPU resources.");
1082                 }
1083             }
1084         }
1085     }
1086
1087     if (wc->wc_barrier)
1088     {
1089         GMX_LOG(mdlog.warning)
1090                 .asParagraph()
1091                 .appendText(
1092                         "MPI_Barrier was called before each cycle start/stop\n"
1093                         "call, so timings are not those of real runs.");
1094     }
1095
1096     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1097     {
1098         /* Only the sim master calls this function, so always print to stderr */
1099         if (wc->wcc[ewcDOMDEC].n == 0)
1100         {
1101             GMX_LOG(mdlog.warning)
1102                     .asParagraph()
1103                     .appendTextFormatted(
1104                             "NOTE: %d %% of the run time was spent in pair search,\n"
1105                             "      you might want to increase nstlist (this has no effect on "
1106                             "accuracy)\n",
1107                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1108         }
1109         else
1110         {
1111             GMX_LOG(mdlog.warning)
1112                     .asParagraph()
1113                     .appendTextFormatted(
1114                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1115                             "      %d %% of the run time was spent in pair search,\n"
1116                             "      you might want to increase nstlist (this has no effect on "
1117                             "accuracy)\n",
1118                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1119                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1120         }
1121     }
1122
1123     if (cyc_sum[ewcMoveE] > tot * 0.05)
1124     {
1125         GMX_LOG(mdlog.warning)
1126                 .asParagraph()
1127                 .appendTextFormatted(
1128                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1129                         "      you might want to increase some nst* mdp options\n",
1130                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1131     }
1132 }
1133
1134 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1135 {
1136     if (wc == nullptr)
1137     {
1138         return -1;
1139     }
1140
1141     return wc->reset_counters;
1142 }
1143
1144 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1145 {
1146     if (wc == nullptr)
1147     {
1148         return;
1149     }
1150
1151     wc->reset_counters = reset_counters;
1152 }
1153
1154 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1155 {
1156     if (useCycleSubcounters && wc != nullptr)
1157     {
1158         wc->wcsc[ewcs].start = gmx_cycles_read();
1159     }
1160 }
1161
1162 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1163 {
1164     if (useCycleSubcounters && wc != nullptr)
1165     {
1166         wallcycle_sub_start(wc, ewcs);
1167         wc->wcsc[ewcs].n--;
1168     }
1169 }
1170
1171 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1172 {
1173     if (useCycleSubcounters && wc != nullptr)
1174     {
1175         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1176         wc->wcsc[ewcs].n++;
1177     }
1178 }