src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "wallcycle.h"
  41
  42 #include "config.h"
  43
  44 #include <cstdlib>
  45
  46 #include <array>
  47 #include <vector>
  48
  49 #include "gromacs/math/functions.h"
  50 #include "gromacs/mdtypes/commrec.h"
  51 #include "gromacs/timing/cyclecounter.h"
  52 #include "gromacs/timing/gpu_timing.h"
  53 #include "gromacs/timing/wallcyclereporting.h"
  54 #include "gromacs/utility/cstringutil.h"
  55 #include "gromacs/utility/gmxassert.h"
  56 #include "gromacs/utility/gmxmpi.h"
  57 #include "gromacs/utility/logger.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/snprintf.h"
  60
  61 //! Whether wallcycle debugging is enabled
  62 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
  63 //! True if only the master rank should print debugging output
  64 constexpr bool gmx_unused onlyMasterDebugPrints = true;
  65 //! True if cycle counter nesting depth debuggin prints are enabled
  66 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
  67
  68 #if DEBUG_WCYCLE
  69 #    include "gromacs/utility/fatalerror.h"
  70 #endif
  71
  72 /* Each name should not exceed 19 printing characters
  73    (ie. terminating null can be twentieth) */
  74 static const char* wcn[ewcNR] = { "Run",
  75                                   "Step",
  76                                   "PP during PME",
  77                                   "Domain decomp.",
  78                                   "DD comm. load",
  79                                   "DD comm. bounds",
  80                                   "Vsite constr.",
  81                                   "Send X to PME",
  82                                   "Neighbor search",
  83                                   "Launch GPU ops.",
  84                                   "Comm. coord.",
  85                                   "Force",
  86                                   "Wait + Comm. F",
  87                                   "PME mesh",
  88                                   "PME redist. X/F",
  89                                   "PME spread",
  90                                   "PME gather",
  91                                   "PME 3D-FFT",
  92                                   "PME 3D-FFT Comm.",
  93                                   "PME solve LJ",
  94                                   "PME solve Elec",
  95                                   "PME wait for PP",
  96                                   "Wait + Recv. PME F",
  97                                   "Wait PME GPU spread",
  98                                   "PME 3D-FFT",
  99                                   "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 100                                   "Wait PME GPU gather",
 101                                   "Wait Bonded GPU",
 102                                   "Reduce GPU PME F",
 103                                   "Wait GPU NB nonloc.",
 104                                   "Wait GPU NB local",
 105                                   "Wait GPU state copy",
 106                                   "NB X/F buffer ops.",
 107                                   "Vsite spread",
 108                                   "COM pull force",
 109                                   "AWH",
 110                                   "Write traj.",
 111                                   "Update",
 112                                   "Constraints",
 113                                   "Comm. energies",
 114                                   "Enforced rotation",
 115                                   "Add rot. forces",
 116                                   "Position swapping",
 117                                   "IMD",
 118                                   "Test" };
 119
 120 static const char* wcsn[ewcsNR] = {
 121     "DD redist.",
 122     "DD NS grid + sort",
 123     "DD setup comm.",
 124     "DD make top.",
 125     "DD make constr.",
 126     "DD top. other",
 127     "DD GPU ops.",
 128     "NS grid local",
 129     "NS grid non-loc.",
 130     "NS search local",
 131     "NS search non-loc.",
 132     "Bonded F",
 133     "Bonded-FEP F",
 134     "Restraints F",
 135     "Listed buffer ops.",
 136     "Nonbonded pruning",
 137     "Nonbonded F kernel",
 138     "Nonbonded F clear",
 139     "Nonbonded FEP",
 140     "Launch NB GPU tasks",
 141     "Launch Bonded GPU tasks",
 142     "Launch PME GPU tasks",
 143     "Launch state copy",
 144     "Ewald F correction",
 145     "NB X buffer ops.",
 146     "NB F buffer ops.",
 147     "Clear force buffer",
 148     "Launch GPU NB X buffer ops.",
 149     "Launch GPU NB F buffer ops.",
 150     "Launch GPU Comm. coord.",
 151     "Launch GPU Comm. force.",
 152     "Launch GPU update",
 153     "Test subcounter",
 154 };
 155
 156 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 157 static const char* enumValuetoString(PmeStage enumValue)
 158 {
 159     constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
 160         "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
 161         "PME solve",  "PME 3D-FFT c2r", "PME gather"
 162     };
 163     return pmeStageNames[enumValue];
 164 };
 165
 166 bool wallcycle_have_counter()
 167 {
 168     return gmx_cycles_have_counter();
 169 }
 170
 171 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
 172 {
 173     gmx_wallcycle_t wc;
 174
 175
 176     if (!wallcycle_have_counter())
 177     {
 178         return nullptr;
 179     }
 180
 181     snew(wc, 1);
 182
 183     wc->haveInvalidCount = FALSE;
 184     wc->wc_barrier       = FALSE;
 185     wc->wcc_all          = nullptr;
 186     wc->wc_depth         = 0;
 187     wc->ewc_prev         = -1;
 188     wc->reset_counters   = resetstep;
 189     wc->cr               = cr;
 190
 191
 192 #if GMX_MPI
 193     if (cr != nullptr && PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 194     {
 195         if (fplog)
 196         {
 197             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 198         }
 199         wc->wc_barrier = TRUE;
 200     }
 201 #endif
 202
 203     snew(wc->wcc, ewcNR);
 204     if (getenv("GMX_CYCLE_ALL") != nullptr)
 205     {
 206         if (fplog)
 207         {
 208             fprintf(fplog, "\nWill time all the code during the run\n\n");
 209         }
 210         snew(wc->wcc_all, ewcNR * ewcNR);
 211     }
 212
 213     if (sc_useCycleSubcounters)
 214     {
 215         snew(wc->wcsc, ewcsNR);
 216     }
 217
 218 #if DEBUG_WCYCLE
 219     wc->count_depth  = 0;
 220     wc->isMasterRank = MASTER(cr);
 221 #endif
 222
 223     return wc;
 224 }
 225
 226 void wallcycle_destroy(gmx_wallcycle_t wc)
 227 {
 228     if (wc == nullptr)
 229     {
 230         return;
 231     }
 232
 233     if (wc->wcc != nullptr)
 234     {
 235         sfree(wc->wcc);
 236     }
 237     if (wc->wcc_all != nullptr)
 238     {
 239         sfree(wc->wcc_all);
 240     }
 241     if (wc->wcsc != nullptr)
 242     {
 243         sfree(wc->wcsc);
 244     }
 245     sfree(wc);
 246 }
 247
 248 #if DEBUG_WCYCLE
 249 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 250 {
 251     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 252     {
 253         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
 254     }
 255     wc->counterlist[wc->count_depth] = ewc;
 256     wc->count_depth++;
 257
 258     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 259     {
 260         std::string indentStr(4 * wc->count_depth, ' ');
 261         fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 262     }
 263 }
 264
 265 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 266 {
 267     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
 268     {
 269         std::string indentStr(4 * wc->count_depth, ' ');
 270         fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
 271     }
 272
 273     wc->count_depth--;
 274
 275     if (wc->count_depth < 0)
 276     {
 277         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 278     }
 279     if (wc->counterlist[wc->count_depth] != ewc)
 280     {
 281         gmx_fatal(FARGS,
 282                   "wallcycle mismatch at stop, start %s, stop %s",
 283                   wcn[wc->counterlist[wc->count_depth]],
 284                   wcn[ewc]);
 285     }
 286 }
 287 #endif
 288
 289 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
 290 {
 291     *n = wc->wcc[ewc].n;
 292     *c = static_cast<double>(wc->wcc[ewc].c);
 293 }
 294
 295 void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c)
 296 {
 297     if (sc_useCycleSubcounters && wc != nullptr)
 298     {
 299         *n = wc->wcsc[ewcs].n;
 300         *c = static_cast<double>(wc->wcsc[ewcs].c);
 301     }
 302 }
 303
 304 void wallcycle_reset_all(gmx_wallcycle_t wc)
 305 {
 306     int i;
 307
 308     if (wc == nullptr)
 309     {
 310         return;
 311     }
 312
 313     for (i = 0; i < ewcNR; i++)
 314     {
 315         wc->wcc[i].n = 0;
 316         wc->wcc[i].c = 0;
 317     }
 318     wc->haveInvalidCount = FALSE;
 319
 320     if (wc->wcc_all)
 321     {
 322         for (i = 0; i < ewcNR * ewcNR; i++)
 323         {
 324             wc->wcc_all[i].n = 0;
 325             wc->wcc_all[i].c = 0;
 326         }
 327     }
 328     if (wc->wcsc)
 329     {
 330         for (i = 0; i < ewcsNR; i++)
 331         {
 332             wc->wcsc[i].n = 0;
 333             wc->wcsc[i].c = 0;
 334         }
 335     }
 336 }
 337
 338 static gmx_bool is_pme_counter(int ewc)
 339 {
 340     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 341 }
 342
 343 static gmx_bool is_pme_subcounter(int ewc)
 344 {
 345     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 346 }
 347
 348 void wallcycleBarrier(gmx_wallcycle* wc)
 349 {
 350 #if GMX_MPI
 351     if (wc->wc_barrier)
 352     {
 353         MPI_Barrier(wc->cr->mpi_comm_mygroup);
 354     }
 355 #else
 356     GMX_UNUSED_VALUE(wc);
 357 #endif
 358 }
 359
 360 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 361 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
 362 {
 363     if (wcc[ewc_sub].n > 0)
 364     {
 365         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 366         {
 367             wcc[ewc_main].c -= wcc[ewc_sub].c;
 368         }
 369         else
 370         {
 371             /* Something is wrong with the cycle counting */
 372             wcc[ewc_main].c = 0;
 373         }
 374     }
 375 }
 376
 377 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 378 {
 379     if (wc == nullptr)
 380     {
 381         return;
 382     }
 383
 384     for (int i = 0; i < ewcNR; i++)
 385     {
 386         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 387         {
 388             wc->wcc[i].c *= nthreads_pme;
 389
 390             if (wc->wcc_all)
 391             {
 392                 for (int j = 0; j < ewcNR; j++)
 393                 {
 394                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
 395                 }
 396             }
 397         }
 398         else
 399         {
 400             wc->wcc[i].c *= nthreads_pp;
 401
 402             if (wc->wcc_all)
 403             {
 404                 for (int j = 0; j < ewcNR; j++)
 405                 {
 406                     wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
 407                 }
 408             }
 409         }
 410     }
 411     if (sc_useCycleSubcounters && wc->wcsc && !isPmeRank)
 412     {
 413         for (int i = 0; i < ewcsNR; i++)
 414         {
 415             wc->wcsc[i].c *= nthreads_pp;
 416         }
 417     }
 418 }
 419
 420 /* TODO Make an object for this function to return, containing some
 421  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 422  * wcsc, AND the original wcc for rank 0.
 423  *
 424  * The GPU timing is reported only for rank 0, so we want to preserve
 425  * the original wcycle on that rank. Rank 0 also reports the global
 426  * counts before that, so needs something to contain the global data
 427  * without over-writing the rank-0 data. The current implementation
 428  * uses cycles_sum to manage this, which works OK now because wcsc and
 429  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 430  * for the future. Also, there's no need for MPI_Allreduce, since
 431  * only MASTERRANK uses any of the results. */
 432 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
 433 {
 434     WallcycleCounts cycles_sum;
 435     wallcc_t*       wcc;
 436     double          cycles[int(ewcNR) + int(ewcsNR)];
 437 #if GMX_MPI
 438     double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
 439 #endif
 440     int i;
 441     int nsum;
 442
 443     if (wc == nullptr)
 444     {
 445         /* Default construction of std::array of non-class T can leave
 446            the values indeterminate, just like a C array */
 447         cycles_sum.fill(0);
 448         return cycles_sum;
 449     }
 450
 451     wcc = wc->wcc;
 452
 453     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 454     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 455
 456     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 457
 458     if (cr->npmenodes == 0)
 459     {
 460         /* All nodes do PME (or no PME at all) */
 461         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 462     }
 463     else
 464     {
 465         /* The are PME-only nodes */
 466         if (wcc[ewcPMEMESH].n > 0)
 467         {
 468             /* This must be a PME only node, calculate the Wait + Comm. time */
 469             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
 470                        "Total run ticks must be greater than PME-only ticks");
 471             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 472         }
 473     }
 474
 475     /* Store the cycles in a double buffer for summing */
 476     for (i = 0; i < ewcNR; i++)
 477     {
 478 #if GMX_MPI
 479         cycles_n[i] = static_cast<double>(wcc[i].n);
 480 #endif
 481         cycles[i] = static_cast<double>(wcc[i].c);
 482     }
 483     nsum = ewcNR;
 484     if (wc->wcsc)
 485     {
 486         for (i = 0; i < ewcsNR; i++)
 487         {
 488 #if GMX_MPI
 489             cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
 490 #endif
 491             cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
 492         }
 493         nsum += ewcsNR;
 494     }
 495
 496 #if GMX_MPI
 497     if (cr->nnodes > 1)
 498     {
 499         double buf[int(ewcNR) + int(ewcsNR) + 1];
 500
 501         // TODO this code is used only at the end of the run, so we
 502         // can just do a simple reduce of haveInvalidCount in
 503         // wallcycle_print, and avoid bugs
 504         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 505         // TODO Use MPI_Reduce
 506         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
 507         for (i = 0; i < ewcNR; i++)
 508         {
 509             wcc[i].n = gmx::roundToInt(buf[i]);
 510         }
 511         wc->haveInvalidCount = (buf[nsum] > 0);
 512         if (wc->wcsc)
 513         {
 514             for (i = 0; i < ewcsNR; i++)
 515             {
 516                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
 517             }
 518         }
 519
 520         // TODO Use MPI_Reduce
 521         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 522
 523         if (wc->wcc_all != nullptr)
 524         {
 525             double *buf_all, *cyc_all;
 526
 527             snew(cyc_all, ewcNR * ewcNR);
 528             snew(buf_all, ewcNR * ewcNR);
 529             for (i = 0; i < ewcNR * ewcNR; i++)
 530             {
 531                 cyc_all[i] = wc->wcc_all[i].c;
 532             }
 533             // TODO Use MPI_Reduce
 534             MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 535             for (i = 0; i < ewcNR * ewcNR; i++)
 536             {
 537                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 538             }
 539             sfree(buf_all);
 540             sfree(cyc_all);
 541         }
 542     }
 543     else
 544 #endif
 545     {
 546         for (i = 0; i < nsum; i++)
 547         {
 548             cycles_sum[i] = cycles[i];
 549         }
 550     }
 551
 552     return cycles_sum;
 553 }
 554
 555 static void
 556 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 557 {
 558     char   nnodes_str[STRLEN];
 559     char   nthreads_str[STRLEN];
 560     char   ncalls_str[STRLEN];
 561     double wallt;
 562     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 563
 564     if (c_sum > 0)
 565     {
 566         if (ncalls > 0)
 567         {
 568             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 569             if (nnodes < 0)
 570             {
 571                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 572             }
 573             else
 574             {
 575                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 576             }
 577             if (nthreads < 0)
 578             {
 579                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 580             }
 581             else
 582             {
 583                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 584             }
 585         }
 586         else
 587         {
 588             nnodes_str[0]   = 0;
 589             nthreads_str[0] = 0;
 590             ncalls_str[0]   = 0;
 591         }
 592         /* Convert the cycle count to wallclock time for this task */
 593         wallt = c_sum * c2t;
 594
 595         fprintf(fplog,
 596                 " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 597                 name,
 598                 nnodes_str,
 599                 nthreads_str,
 600                 ncalls_str,
 601                 wallt,
 602                 c_sum * 1e-9,
 603                 percentage);
 604     }
 605 }
 606
 607 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 608 {
 609     char num[11];
 610     char avg_perf[11];
 611
 612     if (n > 0)
 613     {
 614         snprintf(num, sizeof(num), "%10d", n);
 615         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
 616     }
 617     else
 618     {
 619         sprintf(num, "          ");
 620         sprintf(avg_perf, "          ");
 621     }
 622     if (t != tot_t && tot_t > 0)
 623     {
 624         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
 625     }
 626     else
 627     {
 628         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
 629     }
 630 }
 631
 632 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 633 {
 634     int nrank_tot = nrank_pp + nrank_pme;
 635     if (0 == nrank_pme)
 636     {
 637         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 638         if (nth_pp > 1)
 639         {
 640             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 641         }
 642         /* Don't report doing PP+PME, because we can't tell here if
 643          * this is RF, etc. */
 644     }
 645     else
 646     {
 647         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 648         if (nth_pp > 1)
 649         {
 650             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 651         }
 652         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 653         if (nth_pme > 1)
 654         {
 655             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 656         }
 657     }
 658
 659     fprintf(fplog, "\n\n");
 660     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 661     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 662 }
 663
 664
 665 void wallcycle_print(FILE*                            fplog,
 666                      const gmx::MDLogger&             mdlog,
 667                      int                              nnodes,
 668                      int                              npme,
 669                      int                              nth_pp,
 670                      int                              nth_pme,
 671                      double                           realtime,
 672                      gmx_wallcycle_t                  wc,
 673                      const WallcycleCounts&           cyc_sum,
 674                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
 675                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 676 {
 677     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 678     double      c2t, c2t_pp, c2t_pme = 0;
 679     int         i, j, npp, nth_tot;
 680     char        buf[STRLEN];
 681     const char* hline =
 682             "-----------------------------------------------------------------------------";
 683
 684     if (wc == nullptr)
 685     {
 686         return;
 687     }
 688
 689     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 690     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 691     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 692     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 693     npp = nnodes - npme;
 694     /* npme is the number of PME-only ranks used, and we always do PP work */
 695     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 696
 697     nth_tot = npp * nth_pp + npme * nth_pme;
 698
 699     /* When using PME-only nodes, the next line is valid for both
 700        PP-only and PME-only nodes because they started ewcRUN at the
 701        same time. */
 702     tot        = cyc_sum[ewcRUN];
 703     tot_for_pp = 0;
 704
 705     if (tot <= 0.0)
 706     {
 707         /* TODO This is heavy handed, but until someone reworks the
 708            code so that it is provably robust with respect to
 709            non-positive values for all possible timer and cycle
 710            counters, there is less value gained from printing whatever
 711            timing data might still be sensible for some non-Jenkins
 712            run, than is lost from diagnosing Jenkins FP exceptions on
 713            runs about whose execution time we don't care. */
 714         GMX_LOG(mdlog.warning)
 715                 .asParagraph()
 716                 .appendTextFormatted(
 717                         "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
 718                         "time accounting",
 719                         tot);
 720         return;
 721     }
 722
 723     if (wc->haveInvalidCount)
 724     {
 725         GMX_LOG(mdlog.warning)
 726                 .asParagraph()
 727                 .appendText(
 728                         "NOTE: Detected invalid cycle counts, probably because threads moved "
 729                         "between CPU cores that do not have synchronized cycle counters. Will not "
 730                         "print the cycle accounting.");
 731         return;
 732     }
 733
 734
 735     /* Conversion factor from cycles to seconds */
 736     c2t    = realtime / tot;
 737     c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
 738     if (npme > 0)
 739     {
 740         c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
 741     }
 742     else
 743     {
 744         c2t_pme = 0;
 745     }
 746
 747     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 748
 749     print_header(fplog, npp, nth_pp, npme, nth_pme);
 750
 751     fprintf(fplog, "%s\n", hline);
 752     for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 753     {
 754         if (is_pme_subcounter(i))
 755         {
 756             /* Do not count these at all */
 757         }
 758         else if (npme > 0 && is_pme_counter(i))
 759         {
 760             /* Print timing information for PME-only nodes, but add an
 761              * asterisk so the reader of the table can know that the
 762              * walltimes are not meant to add up. The asterisk still
 763              * fits in the required maximum of 19 characters. */
 764             char buffer[STRLEN];
 765             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 766             print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
 767         }
 768         else
 769         {
 770             /* Print timing information when it is for a PP or PP+PME
 771                node */
 772             print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
 773             tot_for_pp += cyc_sum[i];
 774         }
 775     }
 776     if (wc->wcc_all != nullptr)
 777     {
 778         for (i = 0; i < ewcNR; i++)
 779         {
 780             for (j = 0; j < ewcNR; j++)
 781             {
 782                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 783                 print_cycles(fplog,
 784                              c2t_pp,
 785                              buf,
 786                              npp,
 787                              nth_pp,
 788                              wc->wcc_all[i * ewcNR + j].n,
 789                              wc->wcc_all[i * ewcNR + j].c,
 790                              tot);
 791             }
 792         }
 793     }
 794     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 795     print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
 796     fprintf(fplog, "%s\n", hline);
 797     print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
 798     fprintf(fplog, "%s\n", hline);
 799
 800     if (npme > 0)
 801     {
 802         fprintf(fplog,
 803                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 804                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 805                 "%s\n",
 806                 hline);
 807     }
 808
 809     if (wc->wcc[ewcPMEMESH].n > 0)
 810     {
 811         // A workaround to not print breakdown when no subcounters were recorded.
 812         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 813         std::vector<int> validPmeSubcounterIndices;
 814         for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
 815         {
 816             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 817             {
 818                 validPmeSubcounterIndices.push_back(i);
 819             }
 820         }
 821
 822         if (!validPmeSubcounterIndices.empty())
 823         {
 824             fprintf(fplog, " Breakdown of PME mesh computation\n");
 825             fprintf(fplog, "%s\n", hline);
 826             for (auto i : validPmeSubcounterIndices)
 827             {
 828                 print_cycles(fplog,
 829                              npme > 0 ? c2t_pme : c2t_pp,
 830                              wcn[i],
 831                              npme > 0 ? npme : npp,
 832                              nth_pme,
 833                              wc->wcc[i].n,
 834                              cyc_sum[i],
 835                              tot);
 836             }
 837             fprintf(fplog, "%s\n", hline);
 838         }
 839     }
 840
 841     if (sc_useCycleSubcounters && wc->wcsc)
 842     {
 843         fprintf(fplog, " Breakdown of PP computation\n");
 844         fprintf(fplog, "%s\n", hline);
 845         for (i = 0; i < ewcsNR; i++)
 846         {
 847             print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
 848         }
 849         fprintf(fplog, "%s\n", hline);
 850     }
 851
 852     /* print GPU timing summary */
 853     double tot_gpu = 0.0;
 854     if (gpu_pme_t)
 855     {
 856         for (auto key : keysOf(gpu_pme_t->timing))
 857         {
 858             tot_gpu += gpu_pme_t->timing[key].t;
 859         }
 860     }
 861     if (gpu_nbnxn_t)
 862     {
 863         const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
 864                                         { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
 865         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 866
 867         /* add up the kernel timings */
 868         for (i = 0; i < 2; i++)
 869         {
 870             for (j = 0; j < 2; j++)
 871             {
 872                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 873             }
 874         }
 875         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 876
 877         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 878         if (wc->wcc[ewcPMEMESH].n > 0)
 879         {
 880             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 881         }
 882         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 883
 884         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 885         fprintf(fplog,
 886                 " Computing:                         Count  Wall t (s)      ms/step       %c\n",
 887                 '%');
 888         fprintf(fplog, "%s\n", hline);
 889         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 890         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 891
 892         for (i = 0; i < 2; i++)
 893         {
 894             for (j = 0; j < 2; j++)
 895             {
 896                 if (gpu_nbnxn_t->ktime[i][j].c)
 897                 {
 898                     print_gputimes(fplog,
 899                                    k_log_str[i][j],
 900                                    gpu_nbnxn_t->ktime[i][j].c,
 901                                    gpu_nbnxn_t->ktime[i][j].t,
 902                                    tot_gpu);
 903                 }
 904             }
 905         }
 906         if (gpu_pme_t)
 907         {
 908             for (auto key : keysOf(gpu_pme_t->timing))
 909             {
 910                 if (gpu_pme_t->timing[key].c)
 911                 {
 912                     print_gputimes(fplog,
 913                                    enumValuetoString(key),
 914                                    gpu_pme_t->timing[key].c,
 915                                    gpu_pme_t->timing[key].t,
 916                                    tot_gpu);
 917                 }
 918             }
 919         }
 920         if (gpu_nbnxn_t->pruneTime.c)
 921         {
 922             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
 923         }
 924         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
 925         fprintf(fplog, "%s\n", hline);
 926         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
 927         fprintf(fplog, "%s\n", hline);
 928         if (gpu_nbnxn_t->dynamicPruneTime.c)
 929         {
 930             /* We print the dynamic pruning kernel timings after a separator
 931              * and avoid adding it to tot_gpu as this is not in the force
 932              * overlap. We print the fraction as relative to the rest.
 933              */
 934             print_gputimes(fplog,
 935                            "*Dynamic pruning",
 936                            gpu_nbnxn_t->dynamicPruneTime.c,
 937                            gpu_nbnxn_t->dynamicPruneTime.t,
 938                            tot_gpu);
 939             fprintf(fplog, "%s\n", hline);
 940         }
 941         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
 942         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
 943         {
 944             fprintf(fplog,
 945                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
 946                     "%.3f\n",
 947                     tot_gpu / gpu_nbnxn_t->nb_c,
 948                     tot_cpu_overlap / wc->wcc[ewcFORCE].n,
 949                     gpu_cpu_ratio);
 950         }
 951
 952         /* only print notes related to CPU-GPU load balance with PME */
 953         if (wc->wcc[ewcPMEMESH].n > 0)
 954         {
 955             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
 956
 957             /* print note if the imbalance is high with PME case in which
 958              * CPU-GPU load balancing is possible */
 959             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
 960             {
 961                 /* Only the sim master calls this function, so always print to stderr */
 962                 if (gpu_cpu_ratio < 0.8)
 963                 {
 964                     if (npp > 1)
 965                     {
 966                         /* The user could have used -notunepme,
 967                          * but we currently can't check that here.
 968                          */
 969                         GMX_LOG(mdlog.warning)
 970                                 .asParagraph()
 971                                 .appendText(
 972                                         "NOTE: The CPU has >25% more load than the GPU. This "
 973                                         "imbalance wastes\n"
 974                                         "      GPU resources. Maybe the domain decomposition "
 975                                         "limits the PME tuning.\n"
 976                                         "      In that case, try setting the DD grid manually "
 977                                         "(-dd) or lowering -dds.");
 978                     }
 979                     else
 980                     {
 981                         /* We should not end up here, unless the box is
 982                          * too small for increasing the cut-off for PME tuning.
 983                          */
 984                         GMX_LOG(mdlog.warning)
 985                                 .asParagraph()
 986                                 .appendText(
 987                                         "NOTE: The CPU has >25% more load than the GPU. This "
 988                                         "imbalance wastes\n"
 989                                         "      GPU resources.");
 990                     }
 991                 }
 992                 if (gpu_cpu_ratio > 1.25)
 993                 {
 994                     GMX_LOG(mdlog.warning)
 995                             .asParagraph()
 996                             .appendText(
 997                                     "NOTE: The GPU has >25% more load than the CPU. This imbalance "
 998                                     "wastes\n"
 999                                     "      CPU resources.");
1000                 }
1001             }
1002         }
1003     }
1004
1005     if (wc->wc_barrier)
1006     {
1007         GMX_LOG(mdlog.warning)
1008                 .asParagraph()
1009                 .appendText(
1010                         "MPI_Barrier was called before each cycle start/stop\n"
1011                         "call, so timings are not those of real runs.");
1012     }
1013
1014     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1015     {
1016         /* Only the sim master calls this function, so always print to stderr */
1017         if (wc->wcc[ewcDOMDEC].n == 0)
1018         {
1019             GMX_LOG(mdlog.warning)
1020                     .asParagraph()
1021                     .appendTextFormatted(
1022                             "NOTE: %d %% of the run time was spent in pair search,\n"
1023                             "      you might want to increase nstlist (this has no effect on "
1024                             "accuracy)\n",
1025                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1026         }
1027         else
1028         {
1029             GMX_LOG(mdlog.warning)
1030                     .asParagraph()
1031                     .appendTextFormatted(
1032                             "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1033                             "      %d %% of the run time was spent in pair search,\n"
1034                             "      you might want to increase nstlist (this has no effect on "
1035                             "accuracy)\n",
1036                             gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1037                             gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1038         }
1039     }
1040
1041     if (cyc_sum[ewcMoveE] > tot * 0.05)
1042     {
1043         GMX_LOG(mdlog.warning)
1044                 .asParagraph()
1045                 .appendTextFormatted(
1046                         "NOTE: %d %% of the run time was spent communicating energies,\n"
1047                         "      you might want to increase some nst* mdp options\n",
1048                         gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1049     }
1050 }
1051
1052 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1053 {
1054     if (wc == nullptr)
1055     {
1056         return -1;
1057     }
1058
1059     return wc->reset_counters;
1060 }
1061
1062 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1063 {
1064     if (wc == nullptr)
1065     {
1066         return;
1067     }
1068
1069     wc->reset_counters = reset_counters;
1070 }