src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 //! Whether wallcycle debugging is enabled
  62 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
  63 //! True if only the master rank should print debugging output
  64 constexpr bool gmx_unused onlyMasterDebugPrints = true;
  65 //! True if cycle counter nesting depth debuggin prints are enabled
  66 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
  67
  68 #if DEBUG_WCYCLE
  69 #    include "gromacs/utility/fatalerror.h"
  70 #endif
  71
  72 /* Each name should not exceed 19 printing characters
  73    (ie. terminating null can be twentieth) */
  74 static const char* wcn[ewcNR] = { "Run",
  75                                   "Step",
  76                                   "PP during PME",
  77                                   "Domain decomp.",
  78                                   "DD comm. load",
  79                                   "DD comm. bounds",
  80                                   "Vsite constr.",
  81                                   "Send X to PME",
  82                                   "Neighbor search",
  83                                   "Launch GPU ops.",
  84                                   "Comm. coord.",
  85                                   "Force",
  86                                   "Wait + Comm. F",
  87                                   "PME mesh",
  88                                   "PME redist. X/F",
  89                                   "PME spread",
  90                                   "PME gather",
  91                                   "PME 3D-FFT",
  92                                   "PME 3D-FFT Comm.",
  93                                   "PME solve LJ",
  94                                   "PME solve Elec",
  95                                   "PME wait for PP",
  96                                   "Wait + Recv. PME F",
  97                                   "Wait PME GPU spread",
  98                                   "PME 3D-FFT",
  99                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 100                                   "Wait PME GPU gather",
 101                                   "Wait Bonded GPU",
 102                                   "Reduce GPU PME F",
 103                                   "Wait GPU NB nonloc.",
 104                                   "Wait GPU NB local",
 105                                   "Wait GPU state copy",
 106                                   "NB X/F buffer ops.",
 107                                   "Vsite spread",
 108                                   "COM pull force",
 109                                   "AWH",
 110                                   "Write traj.",
 111                                   "Update",
 112                                   "Constraints",
 113                                   "Comm. energies",
 114                                   "Enforced rotation",
 115                                   "Add rot. forces",
 116                                   "Position swapping",
 117                                   "IMD",
 118                                   "Test" };
 119
 120 static const char* wcsn[ewcsNR] = {
 121     "DD redist.",
 122     "DD NS grid + sort",
 123     "DD setup comm.",
 124     "DD make top.",
 125     "DD make constr.",
 126     "DD top. other",
 127     "DD GPU ops.",
 128     "NS grid local",
 129     "NS grid non-loc.",
 130     "NS search local",
 131     "NS search non-loc.",
 132     "Bonded F",
 133     "Bonded-FEP F",
 134     "Restraints F",
 135     "Listed buffer ops.",
 136     "Nonbonded pruning",
 137     "Nonbonded F kernel",
 138     "Nonbonded F clear",
 139     "Nonbonded FEP",
 140     "Launch NB GPU tasks",
 141     "Launch Bonded GPU tasks",
 142     "Launch PME GPU tasks",
 143     "Launch state copy",
 144     "Ewald F correction",
 145     "NB X buffer ops.",
 146     "NB F buffer ops.",
 147     "Clear force buffer",
 148     "Launch GPU NB X buffer ops.",
 149     "Launch GPU NB F buffer ops.",
 150     "Launch GPU Comm. coord.",
 151     "Launch GPU Comm. force.",
 152     "Launch GPU update",
 153     "Test subcounter",
 154 };
 155
 156 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 157 static const char* enumValuetoString(PmeStage enumValue)
 158 {
 159     constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
 160         "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 161         "PME solve",  "PME 3D-FFT c2r", "PME gather"
 162     };
 163     return pmeStageNames[enumValue];
 164 };
 165
 166 bool wallcycle_have_counter()
 167 {
 168     return gmx_cycles_have_counter();
 169 }
 170
 171 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
 172 {
 173     gmx_wallcycle_t wc;
 174
 175
 176     if (!wallcycle_have_counter())
 177     {
 178         return nullptr;
 179     }
 180
 181     snew(wc, 1);
 182
 183     wc->haveInvalidCount = FALSE;
 184     wc->wc_barrier       = FALSE;
 185     wc->wcc_all          = nullptr;
 186     wc->wc_depth         = 0;
 187     wc->ewc_prev         = -1;
 188     wc->reset_counters   = resetstep;
 189     wc->cr               = cr;
 190
 191
 192 #if GMX_MPI
 193     if (cr != nullptr && PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 194     {
 195         if (fplog)
 196         {
 197             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 198         }
 199         wc->wc_barrier = TRUE;
 200     }
 201 #endif
 202
 203     snew(wc->wcc, ewcNR);
 204     if (getenv("GMX_CYCLE_ALL") != nullptr)
 205     {
 206         if (fplog)
 207         {
 208             fprintf(fplog, "\nWill time all the code during the run\n\n");
 209         }
 210         snew(wc->wcc_all, ewcNR * ewcNR);
 211     }
 212
 213     if (sc_useCycleSubcounters)
 214     {
 215         snew(wc->wcsc, ewcsNR);
 216     }
 217
 218 #if DEBUG_WCYCLE
 219     wc->count_depth  = 0;
 220     wc->isMasterRank = MASTER(cr);
 221 #endif
 222
 223     return wc;
 224 }
 225
 226 void wallcycle_destroy(gmx_wallcycle_t wc)
 227 {
 228     if (wc == nullptr)
 229     {
 230         return;
 231     }
 232
 233     if (wc->wcc != nullptr)
 234     {
 235         sfree(wc->wcc);
 236     }
 237     if (wc->wcc_all != nullptr)
 238     {
 239         sfree(wc->wcc_all);
 240     }
 241     if (wc->wcsc != nullptr)
 242     {
 243         sfree(wc->wcsc);
 244     }
 245     sfree(wc);
 246 }
 247
 248 #if DEBUG_WCYCLE
 249 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 250 {
 251     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 252     {
 253         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
 254     }
 255     wc->counterlist[wc->count_depth] = ewc;
 256     wc->count_depth++;
 257
 258     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 259     {
 260         std::string indentStr(4 * wc->count_depth, ' ');
 261         fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 262     }
 263 }
 264
 265 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 266 {
 267     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 268     {
 269         std::string indentStr(4 * wc->count_depth, ' ');
 270         fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 271     }
 272
 273     wc->count_depth--;
 274
 275     if (wc->count_depth < 0)
 276     {
 277         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 278     }
 279     if (wc->counterlist[wc->count_depth] != ewc)
 280     {
 281         gmx_fatal(FARGS,
 282                   "wallcycle mismatch at stop, start %s, stop %s",
 283                   wcn[wc->counterlist[wc->count_depth]],
 284                   wcn[ewc]);
 285     }
 286 }
 287 #endif
 288
 289 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 290 {
 291     *n = wc->wcc[ewc].n;
 292     *c = static_cast<double>(wc->wcc[ewc].c);
 293 }
 294
 295 void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c)
 296 {
 297     if (sc_useCycleSubcounters && wc != nullptr)
 298     {
 299         *n = wc->wcsc[ewcs].n;
 300         *c = static_cast<double>(wc->wcsc[ewcs].c);
 301     }
 302 }
 303
 304 void wallcycle_reset_all(gmx_wallcycle_t wc)
 305 {
 306     int i;
 307
 308     if (wc == nullptr)
 309     {
 310         return;
 311     }
 312
 313     for (i = 0; i < ewcNR; i++)
 314     {
 315         wc->wcc[i].n = 0;
 316         wc->wcc[i].c = 0;
 317     }
 318     wc->haveInvalidCount = FALSE;
 319
 320     if (wc->wcc_all)
 321     {
 322         for (i = 0; i < ewcNR * ewcNR; i++)
 323         {
 324             wc->wcc_all[i].n = 0;
 325             wc->wcc_all[i].c = 0;
 326         }
 327     }
 328     if (wc->wcsc)
 329     {
 330         for (i = 0; i < ewcsNR; i++)
 331         {
 332             wc->wcsc[i].n = 0;
 333             wc->wcsc[i].c = 0;
 334         }
 335     }
 336 }
 337
 338 static gmx_bool is_pme_counter(int ewc)
 339 {
 340     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 341 }
 342
 343 static gmx_bool is_pme_subcounter(int ewc)
 344 {
 345     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 346 }
 347
 348 void wallcycleBarrier(gmx_wallcycle* wc)
 349 {
 350 #if GMX_MPI
 351     if (wc->wc_barrier)
 352     {
 353         MPI_Barrier(wc->cr->mpi_comm_mygroup);
 354     }
 355 #endif
 356 }
 357
 358 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 359 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 360 {
 361     if (wcc[ewc_sub].n > 0)
 362     {
 363         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 364         {
 365             wcc[ewc_main].c -= wcc[ewc_sub].c;
 366         }
 367         else
 368         {
 369             /* Something is wrong with the cycle counting */
 370             wcc[ewc_main].c = 0;
 371         }
 372     }
 373 }
 374
 375 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 376 {
 377     if (wc == nullptr)
 378     {
 379         return;
 380     }
 381
 382     for (int i = 0; i < ewcNR; i++)
 383     {
 384         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 385         {
 386             wc->wcc[i].c *= nthreads_pme;
 387
 388             if (wc->wcc_all)
 389             {
 390                 for (int j = 0; j < ewcNR; j++)
 391                 {
 392                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 393                 }
 394             }
 395         }
 396         else
 397         {
 398             wc->wcc[i].c *= nthreads_pp;
 399
 400             if (wc->wcc_all)
 401             {
 402                 for (int j = 0; j < ewcNR; j++)
 403                 {
 404                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 405                 }
 406             }
 407         }
 408     }
 409     if (sc_useCycleSubcounters && wc->wcsc && !isPmeRank)
 410     {
 411         for (int i = 0; i < ewcsNR; i++)
 412         {
 413             wc->wcsc[i].c *= nthreads_pp;
 414         }
 415     }
 416 }
 417
 418 /* TODO Make an object for this function to return, containing some
 419  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 420  * wcsc, AND the original wcc for rank 0.
 421  *
 422  * The GPU timing is reported only for rank 0, so we want to preserve
 423  * the original wcycle on that rank. Rank 0 also reports the global
 424  * counts before that, so needs something to contain the global data
 425  * without over-writing the rank-0 data. The current implementation
 426  * uses cycles_sum to manage this, which works OK now because wcsc and
 427  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 428  * for the future. Also, there's no need for MPI_Allreduce, since
 429  * only MASTERRANK uses any of the results. */
 430 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 431 {
 432     WallcycleCounts cycles_sum;
 433     wallcc_t*       wcc;
 434     double          cycles[int(ewcNR) + int(ewcsNR)];
 435 #if GMX_MPI
 436     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 437 #endif
 438     int i;
 439     int nsum;
 440
 441     if (wc == nullptr)
 442     {
 443         /* Default construction of std::array of non-class T can leave
 444            the values indeterminate, just like a C array */
 445         cycles_sum.fill(0);
 446         return cycles_sum;
 447     }
 448
 449     wcc = wc->wcc;
 450
 451     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 452     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 453
 454     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 455
 456     if (cr->npmenodes == 0)
 457     {
 458         /* All nodes do PME (or no PME at all) */
 459         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 460     }
 461     else
 462     {
 463         /* The are PME-only nodes */
 464         if (wcc[ewcPMEMESH].n > 0)
 465         {
 466             /* This must be a PME only node, calculate the Wait + Comm. time */
 467             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 468                        "Total run ticks must be greater than PME-only ticks");
 469             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 470         }
 471     }
 472
 473     /* Store the cycles in a double buffer for summing */
 474     for (i = 0; i < ewcNR; i++)
 475     {
 476 #if GMX_MPI
 477         cycles_n[i] = static_cast<double>(wcc[i].n);
 478 #endif
 479         cycles[i] = static_cast<double>(wcc[i].c);
 480     }
 481     nsum = ewcNR;
 482     if (wc->wcsc)
 483     {
 484         for (i = 0; i < ewcsNR; i++)
 485         {
 486 #if GMX_MPI
 487             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 488 #endif
 489             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 490         }
 491         nsum += ewcsNR;
 492     }
 493
 494 #if GMX_MPI
 495     if (cr->nnodes > 1)
 496     {
 497         double buf[int(ewcNR) + int(ewcsNR) + 1];
 498
 499         // TODO this code is used only at the end of the run, so we
 500         // can just do a simple reduce of haveInvalidCount in
 501         // wallcycle_print, and avoid bugs
 502         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 503         // TODO Use MPI_Reduce
 504         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 505         for (i = 0; i < ewcNR; i++)
 506         {
 507             wcc[i].n = gmx::roundToInt(buf[i]);
 508         }
 509         wc->haveInvalidCount = (buf[nsum] > 0);
 510         if (wc->wcsc)
 511         {
 512             for (i = 0; i < ewcsNR; i++)
 513             {
 514                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 515             }
 516         }
 517
 518         // TODO Use MPI_Reduce
 519         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 520
 521         if (wc->wcc_all != nullptr)
 522         {
 523             double *buf_all, *cyc_all;
 524
 525             snew(cyc_all, ewcNR * ewcNR);
 526             snew(buf_all, ewcNR * ewcNR);
 527             for (i = 0; i < ewcNR * ewcNR; i++)
 528             {
 529                 cyc_all[i] = wc->wcc_all[i].c;
 530             }
 531             // TODO Use MPI_Reduce
 532             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 533             for (i = 0; i < ewcNR * ewcNR; i++)
 534             {
 535                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 536             }
 537             sfree(buf_all);
 538             sfree(cyc_all);
 539         }
 540     }
 541     else
 542 #endif
 543     {
 544         for (i = 0; i < nsum; i++)
 545         {
 546             cycles_sum[i] = cycles[i];
 547         }
 548     }
 549
 550     return cycles_sum;
 551 }
 552
 553 static void
 554 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 555 {
 556     char   nnodes_str[STRLEN];
 557     char   nthreads_str[STRLEN];
 558     char   ncalls_str[STRLEN];
 559     double wallt;
 560     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 561
 562     if (c_sum > 0)
 563     {
 564         if (ncalls > 0)
 565         {
 566             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 567             if (nnodes < 0)
 568             {
 569                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 570             }
 571             else
 572             {
 573                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 574             }
 575             if (nthreads < 0)
 576             {
 577                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 578             }
 579             else
 580             {
 581                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 582             }
 583         }
 584         else
 585         {
 586             nnodes_str[0]   = 0;
 587             nthreads_str[0] = 0;
 588             ncalls_str[0]   = 0;
 589         }
 590         /* Convert the cycle count to wallclock time for this task */
 591         wallt = c_sum * c2t;
 592
 593         fprintf(fplog,
 594                 " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 595                 name,
 596                 nnodes_str,
 597                 nthreads_str,
 598                 ncalls_str,
 599                 wallt,
 600                 c_sum * 1e-9,
 601                 percentage);
 602     }
 603 }
 604
 605 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 606 {
 607     char num[11];
 608     char avg_perf[11];
 609
 610     if (n > 0)
 611     {
 612         snprintf(num, sizeof(num), "%10d", n);
 613         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 614     }
 615     else
 616     {
 617         sprintf(num, "          ");
 618         sprintf(avg_perf, "          ");
 619     }
 620     if (t != tot_t && tot_t > 0)
 621     {
 622         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 623     }
 624     else
 625     {
 626         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 627     }
 628 }
 629
 630 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 631 {
 632     int nrank_tot = nrank_pp + nrank_pme;
 633     if (0 == nrank_pme)
 634     {
 635         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 636         if (nth_pp > 1)
 637         {
 638             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 639         }
 640         /* Don't report doing PP+PME, because we can't tell here if
 641          * this is RF, etc. */
 642     }
 643     else
 644     {
 645         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 646         if (nth_pp > 1)
 647         {
 648             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 649         }
 650         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 651         if (nth_pme > 1)
 652         {
 653             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 654         }
 655     }
 656
 657     fprintf(fplog, "\n\n");
 658     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 659     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 660 }
 661
 662
 663 void wallcycle_print(FILE*                            fplog,
 664                      const gmx::MDLogger&             mdlog,
 665                      int                              nnodes,
 666                      int                              npme,
 667                      int                              nth_pp,
 668                      int                              nth_pme,
 669                      double                           realtime,
 670                      gmx_wallcycle_t                  wc,
 671                      const WallcycleCounts&           cyc_sum,
 672                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 673                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 674 {
 675     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 676     double      c2t, c2t_pp, c2t_pme = 0;
 677     int         i, j, npp, nth_tot;
 678     char        buf[STRLEN];
 679     const char* hline =
 680             "-----------------------------------------------------------------------------";
 681
 682     if (wc == nullptr)
 683     {
 684         return;
 685     }
 686
 687     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 688     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 689     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 690     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 691     npp = nnodes - npme;
 692     /* npme is the number of PME-only ranks used, and we always do PP work */
 693     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 694
 695     nth_tot = npp * nth_pp + npme * nth_pme;
 696
 697     /* When using PME-only nodes, the next line is valid for both
 698        PP-only and PME-only nodes because they started ewcRUN at the
 699        same time. */
 700     tot        = cyc_sum[ewcRUN];
 701     tot_for_pp = 0;
 702
 703     if (tot <= 0.0)
 704     {
 705         /* TODO This is heavy handed, but until someone reworks the
 706            code so that it is provably robust with respect to
 707            non-positive values for all possible timer and cycle
 708            counters, there is less value gained from printing whatever
 709            timing data might still be sensible for some non-Jenkins
 710            run, than is lost from diagnosing Jenkins FP exceptions on
 711            runs about whose execution time we don't care. */
 712         GMX_LOG(mdlog.warning)
 713                 .asParagraph()
 714                 .appendTextFormatted(
 715                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 716                         "time accounting",
 717                         tot);
 718         return;
 719     }
 720
 721     if (wc->haveInvalidCount)
 722     {
 723         GMX_LOG(mdlog.warning)
 724                 .asParagraph()
 725                 .appendText(
 726                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 727                         "between CPU cores that do not have synchronized cycle counters. Will not "
 728                         "print the cycle accounting.");
 729         return;
 730     }
 731
 732
 733     /* Conversion factor from cycles to seconds */
 734     c2t    = realtime / tot;
 735     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 736     if (npme > 0)
 737     {
 738         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 739     }
 740     else
 741     {
 742         c2t_pme = 0;
 743     }
 744
 745     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 746
 747     print_header(fplog, npp, nth_pp, npme, nth_pme);
 748
 749     fprintf(fplog, "%s\n", hline);
 750     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 751     {
 752         if (is_pme_subcounter(i))
 753         {
 754             /* Do not count these at all */
 755         }
 756         else if (npme > 0 && is_pme_counter(i))
 757         {
 758             /* Print timing information for PME-only nodes, but add an
 759              * asterisk so the reader of the table can know that the
 760              * walltimes are not meant to add up. The asterisk still
 761              * fits in the required maximum of 19 characters. */
 762             char buffer[STRLEN];
 763             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 764             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 765         }
 766         else
 767         {
 768             /* Print timing information when it is for a PP or PP+PME
 769                node */
 770             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 771             tot_for_pp += cyc_sum[i];
 772         }
 773     }
 774     if (wc->wcc_all != nullptr)
 775     {
 776         for (i = 0; i < ewcNR; i++)
 777         {
 778             for (j = 0; j < ewcNR; j++)
 779             {
 780                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 781                 print_cycles(fplog,
 782                              c2t_pp,
 783                              buf,
 784                              npp,
 785                              nth_pp,
 786                              wc->wcc_all[i * ewcNR + j].n,
 787                              wc->wcc_all[i * ewcNR + j].c,
 788                              tot);
 789             }
 790         }
 791     }
 792     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 793     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 794     fprintf(fplog, "%s\n", hline);
 795     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 796     fprintf(fplog, "%s\n", hline);
 797
 798     if (npme > 0)
 799     {
 800         fprintf(fplog,
 801                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 802                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 803                 "%s\n",
 804                 hline);
 805     }
 806
 807     if (wc->wcc[ewcPMEMESH].n > 0)
 808     {
 809         // A workaround to not print breakdown when no subcounters were recorded.
 810         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 811         std::vector<int> validPmeSubcounterIndices;
 812         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 813         {
 814             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 815             {
 816                 validPmeSubcounterIndices.push_back(i);
 817             }
 818         }
 819
 820         if (!validPmeSubcounterIndices.empty())
 821         {
 822             fprintf(fplog, " Breakdown of PME mesh computation\n");
 823             fprintf(fplog, "%s\n", hline);
 824             for (auto i : validPmeSubcounterIndices)
 825             {
 826                 print_cycles(fplog,
 827                              npme > 0 ? c2t_pme : c2t_pp,
 828                              wcn[i],
 829                              npme > 0 ? npme : npp,
 830                              nth_pme,
 831                              wc->wcc[i].n,
 832                              cyc_sum[i],
 833                              tot);
 834             }
 835             fprintf(fplog, "%s\n", hline);
 836         }
 837     }
 838
 839     if (sc_useCycleSubcounters && wc->wcsc)
 840     {
 841         fprintf(fplog, " Breakdown of PP computation\n");
 842         fprintf(fplog, "%s\n", hline);
 843         for (i = 0; i < ewcsNR; i++)
 844         {
 845             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 846         }
 847         fprintf(fplog, "%s\n", hline);
 848     }
 849
 850     /* print GPU timing summary */
 851     double tot_gpu = 0.0;
 852     if (gpu_pme_t)
 853     {
 854         for (auto key : keysOf(gpu_pme_t->timing))
 855         {
 856             tot_gpu += gpu_pme_t->timing[key].t;
 857         }
 858     }
 859     if (gpu_nbnxn_t)
 860     {
 861         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 862                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 863         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 864
 865         /* add up the kernel timings */
 866         for (i = 0; i < 2; i++)
 867         {
 868             for (j = 0; j < 2; j++)
 869             {
 870                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 871             }
 872         }
 873         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 874
 875         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 876         if (wc->wcc[ewcPMEMESH].n > 0)
 877         {
 878             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 879         }
 880         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 881
 882         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 883         fprintf(fplog,
 884                 " Computing:                         Count  Wall t (s)      ms/step       %c\n",
 885                 '%');
 886         fprintf(fplog, "%s\n", hline);
 887         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 888         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 889
 890         for (i = 0; i < 2; i++)
 891         {
 892             for (j = 0; j < 2; j++)
 893             {
 894                 if (gpu_nbnxn_t->ktime[i][j].c)
 895                 {
 896                     print_gputimes(fplog,
 897                                    k_log_str[i][j],
 898                                    gpu_nbnxn_t->ktime[i][j].c,
 899                                    gpu_nbnxn_t->ktime[i][j].t,
 900                                    tot_gpu);
 901                 }
 902             }
 903         }
 904         if (gpu_pme_t)
 905         {
 906             for (auto key : keysOf(gpu_pme_t->timing))
 907             {
 908                 if (gpu_pme_t->timing[key].c)
 909                 {
 910                     print_gputimes(fplog,
 911                                    enumValuetoString(key),
 912                                    gpu_pme_t->timing[key].c,
 913                                    gpu_pme_t->timing[key].t,
 914                                    tot_gpu);
 915                 }
 916             }
 917         }
 918         if (gpu_nbnxn_t->pruneTime.c)
 919         {
 920             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
 921         }
 922         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
 923         fprintf(fplog, "%s\n", hline);
 924         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
 925         fprintf(fplog, "%s\n", hline);
 926         if (gpu_nbnxn_t->dynamicPruneTime.c)
 927         {
 928             /* We print the dynamic pruning kernel timings after a separator
 929              * and avoid adding it to tot_gpu as this is not in the force
 930              * overlap. We print the fraction as relative to the rest.
 931              */
 932             print_gputimes(fplog,
 933                            "*Dynamic pruning",
 934                            gpu_nbnxn_t->dynamicPruneTime.c,
 935                            gpu_nbnxn_t->dynamicPruneTime.t,
 936                            tot_gpu);
 937             fprintf(fplog, "%s\n", hline);
 938         }
 939         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
 940         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
 941         {
 942             fprintf(fplog,
 943                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
 944                     "%.3f\n",
 945                     tot_gpu / gpu_nbnxn_t->nb_c,
 946                     tot_cpu_overlap / wc->wcc[ewcFORCE].n,
 947                     gpu_cpu_ratio);
 948         }
 949
 950         /* only print notes related to CPU-GPU load balance with PME */
 951         if (wc->wcc[ewcPMEMESH].n > 0)
 952         {
 953             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
 954
 955             /* print note if the imbalance is high with PME case in which
 956              * CPU-GPU load balancing is possible */
 957             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
 958             {
 959                 /* Only the sim master calls this function, so always print to stderr */
 960                 if (gpu_cpu_ratio < 0.8)
 961                 {
 962                     if (npp > 1)
 963                     {
 964                         /* The user could have used -notunepme,
 965                          * but we currently can't check that here.
 966                          */
 967                         GMX_LOG(mdlog.warning)
 968                                 .asParagraph()
 969                                 .appendText(
 970                                         "NOTE: The CPU has >25% more load than the GPU. This "
 971                                         "imbalance wastes\n"
 972                                         "      GPU resources. Maybe the domain decomposition "
 973                                         "limits the PME tuning.\n"
 974                                         "      In that case, try setting the DD grid manually "
 975                                         "(-dd) or lowering -dds.");
 976                     }
 977                     else
 978                     {
 979                         /* We should not end up here, unless the box is
 980                          * too small for increasing the cut-off for PME tuning.
 981                          */
 982                         GMX_LOG(mdlog.warning)
 983                                 .asParagraph()
 984                                 .appendText(
 985                                         "NOTE: The CPU has >25% more load than the GPU. This "
 986                                         "imbalance wastes\n"
 987                                         "      GPU resources.");
 988                     }
 989                 }
 990                 if (gpu_cpu_ratio > 1.25)
 991                 {
 992                     GMX_LOG(mdlog.warning)
 993                             .asParagraph()
 994                             .appendText(
 995                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
 996                                     "wastes\n"
 997                                     "      CPU resources.");
 998                 }
 999             }
1000         }
1001     }
1002
1003     if (wc->wc_barrier)
1004     {
1005         GMX_LOG(mdlog.warning)
1006                 .asParagraph()
1007                 .appendText(
1008                         "MPI_Barrier was called before each cycle start/stop\n"
1009                         "call, so timings are not those of real runs.");
1010     }
1011
1012     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1013     {
1014         /* Only the sim master calls this function, so always print to stderr */
1015         if (wc->wcc[ewcDOMDEC].n == 0)
1016         {
1017             GMX_LOG(mdlog.warning)
1018                     .asParagraph()
1019                     .appendTextFormatted(
1020                             "NOTE: %d %% of the run time was spent in pair search,\n"
1021                             "      you might want to increase nstlist (this has no effect on "
1022                             "accuracy)\n",
1023                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1024         }
1025         else
1026         {
1027             GMX_LOG(mdlog.warning)
1028                     .asParagraph()
1029                     .appendTextFormatted(
1030                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1031                             "      %d %% of the run time was spent in pair search,\n"
1032                             "      you might want to increase nstlist (this has no effect on "
1033                             "accuracy)\n",
1034                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1035                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1036         }
1037     }
1038
1039     if (cyc_sum[ewcMoveE] > tot * 0.05)
1040     {
1041         GMX_LOG(mdlog.warning)
1042                 .asParagraph()
1043                 .appendTextFormatted(
1044                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1045                         "      you might want to increase some nst* mdp options\n",
1046                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1047     }
1048 }
1049
1050 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1051 {
1052     if (wc == nullptr)
1053     {
1054         return -1;
1055     }
1056
1057     return wc->reset_counters;
1058 }
1059
1060 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1061 {
1062     if (wc == nullptr)
1063     {
1064         return;
1065     }
1066
1067     wc->reset_counters = reset_counters;
1068 }