src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "wallcycle.h"
  40
  41 #include "config.h"
  42
  43 #include <cstdlib>
  44
  45 #include <array>
  46 #include <vector>
  47
  48 #include "gromacs/math/functions.h"
  49 #include "gromacs/mdtypes/commrec.h"
  50 #include "gromacs/timing/cyclecounter.h"
  51 #include "gromacs/timing/gpu_timing.h"
  52 #include "gromacs/timing/wallcyclereporting.h"
  53 #include "gromacs/utility/cstringutil.h"
  54 #include "gromacs/utility/gmxassert.h"
  55 #include "gromacs/utility/gmxmpi.h"
  56 #include "gromacs/utility/logger.h"
  57 #include "gromacs/utility/smalloc.h"
  58 #include "gromacs/utility/snprintf.h"
  59
  60 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  61
  62 /* DEBUG_WCYCLE adds consistency checking for the counters.
  63  * It checks if you stop a counter different from the last
  64  * one that was opened and if you do nest too deep.
  65  */
  66 /* #define DEBUG_WCYCLE */
  67
  68 #ifdef DEBUG_WCYCLE
  69 #include "gromacs/utility/fatalerror.h"
  70 #endif
  71
  72 typedef struct
  73 {
  74     int          n;
  75     gmx_cycles_t c;
  76     gmx_cycles_t start;
  77 } wallcc_t;
  78
  79 struct gmx_wallcycle
  80 {
  81     wallcc_t        *wcc;
  82     /* did we detect one or more invalid cycle counts */
  83     gmx_bool         haveInvalidCount;
  84     /* variables for testing/debugging */
  85     gmx_bool         wc_barrier;
  86     wallcc_t        *wcc_all;
  87     int              wc_depth;
  88 #ifdef DEBUG_WCYCLE
  89 #define DEPTH_MAX 6
  90     int               counterlist[DEPTH_MAX];
  91     int               count_depth;
  92 #endif
  93     int               ewc_prev;
  94     gmx_cycles_t      cycle_prev;
  95     int64_t           reset_counters;
  96 #if GMX_MPI
  97     MPI_Comm          mpi_comm_mygroup;
  98 #endif
  99     wallcc_t         *wcsc;
 100 };
 101
 102 /* Each name should not exceed 19 printing characters
 103    (ie. terminating null can be twentieth) */
 104 static const char *wcn[ewcNR] =
 105 {
 106     "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 107     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 108     "Comm. coord.", "Force", "Wait + Comm. F", "PME mesh",
 109     "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
 110     "PME wait for PP", "Wait + Recv. PME F",
 111     "Wait PME GPU spread", "PME 3D-FFT", "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 112     "Wait PME GPU gather", "Wait Bonded GPU", "Reduce GPU PME F",
 113     "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
 114     "Vsite spread", "COM pull force", "AWH",
 115     "Write traj.", "Update", "Constraints", "Comm. energies",
 116     "Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
 117 };
 118
 119 static const char *wcsn[ewcsNR] =
 120 {
 121     "DD redist.", "DD NS grid + sort", "DD setup comm.",
 122     "DD make top.", "DD make constr.", "DD top. other",
 123     "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 124     "Bonded F",
 125     "Bonded-FEP F",
 126     "Restraints F",
 127     "Listed buffer ops.",
 128     "Nonbonded pruning",
 129     "Nonbonded F",
 130     "Launch NB GPU tasks",
 131     "Launch Bonded GPU tasks",
 132     "Launch PME GPU tasks",
 133     "Ewald F correction",
 134     "NB X buffer ops.",
 135     "NB F buffer ops.",
 136 };
 137
 138 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 139 static const char *PMEStageNames[] =
 140 {
 141     "PME spline",
 142     "PME spread",
 143     "PME spline + spread",
 144     "PME 3D-FFT r2c",
 145     "PME solve",
 146     "PME 3D-FFT c2r",
 147     "PME gather",
 148 };
 149
 150 gmx_bool wallcycle_have_counter()
 151 {
 152     return gmx_cycles_have_counter();
 153 }
 154
 155 gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused *cr)
 156 {
 157     gmx_wallcycle_t wc;
 158
 159
 160     if (!wallcycle_have_counter())
 161     {
 162         return nullptr;
 163     }
 164
 165     snew(wc, 1);
 166
 167     wc->haveInvalidCount    = FALSE;
 168     wc->wc_barrier          = FALSE;
 169     wc->wcc_all             = nullptr;
 170     wc->wc_depth            = 0;
 171     wc->ewc_prev            = -1;
 172     wc->reset_counters      = resetstep;
 173
 174 #if GMX_MPI
 175     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 176     {
 177         if (fplog)
 178         {
 179             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 180         }
 181         wc->wc_barrier       = TRUE;
 182         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 183     }
 184 #endif
 185
 186     snew(wc->wcc, ewcNR);
 187     if (getenv("GMX_CYCLE_ALL") != nullptr)
 188     {
 189         if (fplog)
 190         {
 191             fprintf(fplog, "\nWill time all the code during the run\n\n");
 192         }
 193         snew(wc->wcc_all, ewcNR*ewcNR);
 194     }
 195
 196     if (useCycleSubcounters)
 197     {
 198         snew(wc->wcsc, ewcsNR);
 199     }
 200
 201 #ifdef DEBUG_WCYCLE
 202     wc->count_depth = 0;
 203 #endif
 204
 205     return wc;
 206 }
 207
 208 void wallcycle_destroy(gmx_wallcycle_t wc)
 209 {
 210     if (wc == nullptr)
 211     {
 212         return;
 213     }
 214
 215     if (wc->wcc != nullptr)
 216     {
 217         sfree(wc->wcc);
 218     }
 219     if (wc->wcc_all != nullptr)
 220     {
 221         sfree(wc->wcc_all);
 222     }
 223     if (wc->wcsc != nullptr)
 224     {
 225         sfree(wc->wcsc);
 226     }
 227     sfree(wc);
 228 }
 229
 230 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 231 {
 232     wc->ewc_prev   = ewc;
 233     wc->cycle_prev = cycle;
 234 }
 235
 236 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 237 {
 238     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 239     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 240 }
 241
 242
 243 #ifdef DEBUG_WCYCLE
 244 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 245 {
 246     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 247
 248     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 249     {
 250         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 251                   wc->count_depth);
 252     }
 253     wc->counterlist[wc->count_depth] = ewc;
 254     wc->count_depth++;
 255 }
 256
 257 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 258 {
 259     wc->count_depth--;
 260
 261     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 262
 263     if (wc->count_depth < 0)
 264     {
 265         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 266     }
 267     if (wc->counterlist[wc->count_depth] != ewc)
 268     {
 269         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 270                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 271     }
 272 }
 273 #endif
 274
 275 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 276 {
 277     gmx_cycles_t cycle;
 278
 279     if (wc == nullptr)
 280     {
 281         return;
 282     }
 283
 284 #if GMX_MPI
 285     if (wc->wc_barrier)
 286     {
 287         MPI_Barrier(wc->mpi_comm_mygroup);
 288     }
 289 #endif
 290
 291 #ifdef DEBUG_WCYCLE
 292     debug_start_check(wc, ewc);
 293 #endif
 294
 295     cycle              = gmx_cycles_read();
 296     wc->wcc[ewc].start = cycle;
 297     if (wc->wcc_all != nullptr)
 298     {
 299         wc->wc_depth++;
 300         if (ewc == ewcRUN)
 301         {
 302             wallcycle_all_start(wc, ewc, cycle);
 303         }
 304         else if (wc->wc_depth == 3)
 305         {
 306             wallcycle_all_stop(wc, ewc, cycle);
 307         }
 308     }
 309 }
 310
 311 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 312 {
 313     if (wc == nullptr)
 314     {
 315         return;
 316     }
 317
 318     wallcycle_start(wc, ewc);
 319     wc->wcc[ewc].n--;
 320 }
 321
 322 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 323 {
 324     gmx_cycles_t cycle, last;
 325
 326     if (wc == nullptr)
 327     {
 328         return 0;
 329     }
 330
 331 #if GMX_MPI
 332     if (wc->wc_barrier)
 333     {
 334         MPI_Barrier(wc->mpi_comm_mygroup);
 335     }
 336 #endif
 337
 338 #ifdef DEBUG_WCYCLE
 339     debug_stop_check(wc, ewc);
 340 #endif
 341
 342     /* When processes or threads migrate between cores, the cycle counting
 343      * can get messed up if the cycle counter on different cores are not
 344      * synchronized. When this happens we expect both large negative and
 345      * positive cycle differences. We can detect negative cycle differences.
 346      * Detecting too large positive counts if difficult, since count can be
 347      * large, especially for ewcRUN. If we detect a negative count,
 348      * we will not print the cycle accounting table.
 349      */
 350     cycle                    = gmx_cycles_read();
 351     if (cycle >= wc->wcc[ewc].start)
 352     {
 353         last                 = cycle - wc->wcc[ewc].start;
 354     }
 355     else
 356     {
 357         last                 = 0;
 358         wc->haveInvalidCount = TRUE;
 359     }
 360     wc->wcc[ewc].c          += last;
 361     wc->wcc[ewc].n++;
 362     if (wc->wcc_all)
 363     {
 364         wc->wc_depth--;
 365         if (ewc == ewcRUN)
 366         {
 367             wallcycle_all_stop(wc, ewc, cycle);
 368         }
 369         else if (wc->wc_depth == 2)
 370         {
 371             wallcycle_all_start(wc, ewc, cycle);
 372         }
 373     }
 374
 375     return last;
 376 }
 377
 378 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c)
 379 {
 380     *n = wc->wcc[ewc].n;
 381     *c = static_cast<double>(wc->wcc[ewc].c);
 382 }
 383
 384 void wallcycle_reset_all(gmx_wallcycle_t wc)
 385 {
 386     int i;
 387
 388     if (wc == nullptr)
 389     {
 390         return;
 391     }
 392
 393     for (i = 0; i < ewcNR; i++)
 394     {
 395         wc->wcc[i].n = 0;
 396         wc->wcc[i].c = 0;
 397     }
 398     wc->haveInvalidCount = FALSE;
 399
 400     if (wc->wcc_all)
 401     {
 402         for (i = 0; i < ewcNR*ewcNR; i++)
 403         {
 404             wc->wcc_all[i].n = 0;
 405             wc->wcc_all[i].c = 0;
 406         }
 407     }
 408     if (wc->wcsc)
 409     {
 410         for (i = 0; i < ewcsNR; i++)
 411         {
 412             wc->wcsc[i].n = 0;
 413             wc->wcsc[i].c = 0;
 414         }
 415     }
 416 }
 417
 418 static gmx_bool is_pme_counter(int ewc)
 419 {
 420     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 421 }
 422
 423 static gmx_bool is_pme_subcounter(int ewc)
 424 {
 425     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 426 }
 427
 428 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 429 static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
 430 {
 431     if (wcc[ewc_sub].n > 0)
 432     {
 433         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 434         {
 435             wcc[ewc_main].c -= wcc[ewc_sub].c;
 436         }
 437         else
 438         {
 439             /* Something is wrong with the cycle counting */
 440             wcc[ewc_main].c  = 0;
 441         }
 442     }
 443 }
 444
 445 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 446 {
 447     if (wc == nullptr)
 448     {
 449         return;
 450     }
 451
 452     for (int i = 0; i < ewcNR; i++)
 453     {
 454         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 455         {
 456             wc->wcc[i].c *= nthreads_pme;
 457
 458             if (wc->wcc_all)
 459             {
 460                 for (int j = 0; j < ewcNR; j++)
 461                 {
 462                     wc->wcc_all[i*ewcNR+j].c *= nthreads_pme;
 463                 }
 464             }
 465         }
 466         else
 467         {
 468             wc->wcc[i].c *= nthreads_pp;
 469
 470             if (wc->wcc_all)
 471             {
 472                 for (int j = 0; j < ewcNR; j++)
 473                 {
 474                     wc->wcc_all[i*ewcNR+j].c *= nthreads_pp;
 475                 }
 476             }
 477         }
 478     }
 479     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 480     {
 481         for (int i = 0; i < ewcsNR; i++)
 482         {
 483             wc->wcsc[i].c *= nthreads_pp;
 484         }
 485     }
 486 }
 487
 488 /* TODO Make an object for this function to return, containing some
 489  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 490  * wcsc, AND the original wcc for rank 0.
 491  *
 492  * The GPU timing is reported only for rank 0, so we want to preserve
 493  * the original wcycle on that rank. Rank 0 also reports the global
 494  * counts before that, so needs something to contain the global data
 495  * without over-writing the rank-0 data. The current implementation
 496  * uses cycles_sum to manage this, which works OK now because wcsc and
 497  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 498  * for the future. Also, there's no need for MPI_Allreduce, since
 499  * only MASTERRANK uses any of the results. */
 500 WallcycleCounts wallcycle_sum(const t_commrec *cr, gmx_wallcycle_t wc)
 501 {
 502     WallcycleCounts cycles_sum;
 503     wallcc_t       *wcc;
 504     double          cycles[ewcNR+ewcsNR];
 505 #if GMX_MPI
 506     double          cycles_n[ewcNR+ewcsNR+1];
 507 #endif
 508     int             i;
 509     int             nsum;
 510
 511     if (wc == nullptr)
 512     {
 513         /* Default construction of std::array of non-class T can leave
 514            the values indeterminate, just like a C array, and icc
 515            warns about it. */
 516         cycles_sum.fill(0);
 517         return cycles_sum;
 518     }
 519
 520     wcc = wc->wcc;
 521
 522     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 523     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 524
 525     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 526
 527     if (cr->npmenodes == 0)
 528     {
 529         /* All nodes do PME (or no PME at all) */
 530         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 531     }
 532     else
 533     {
 534         /* The are PME-only nodes */
 535         if (wcc[ewcPMEMESH].n > 0)
 536         {
 537             /* This must be a PME only node, calculate the Wait + Comm. time */
 538             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
 539             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 540         }
 541     }
 542
 543     /* Store the cycles in a double buffer for summing */
 544     for (i = 0; i < ewcNR; i++)
 545     {
 546 #if GMX_MPI
 547         cycles_n[i] = static_cast<double>(wcc[i].n);
 548 #endif
 549         cycles[i]   = static_cast<double>(wcc[i].c);
 550     }
 551     nsum = ewcNR;
 552     if (wc->wcsc)
 553     {
 554         for (i = 0; i < ewcsNR; i++)
 555         {
 556 #if GMX_MPI
 557             cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
 558 #endif
 559             cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
 560         }
 561         nsum += ewcsNR;
 562     }
 563
 564 #if GMX_MPI
 565     if (cr->nnodes > 1)
 566     {
 567         double buf[ewcNR+ewcsNR+1];
 568
 569         // TODO this code is used only at the end of the run, so we
 570         // can just do a simple reduce of haveInvalidCount in
 571         // wallcycle_print, and avoid bugs
 572         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 573         // TODO Use MPI_Reduce
 574         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
 575                       cr->mpi_comm_mysim);
 576         for (i = 0; i < ewcNR; i++)
 577         {
 578             wcc[i].n = gmx::roundToInt(buf[i]);
 579         }
 580         wc->haveInvalidCount = (buf[nsum] > 0);
 581         if (wc->wcsc)
 582         {
 583             for (i = 0; i < ewcsNR; i++)
 584             {
 585                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR+i]);
 586             }
 587         }
 588
 589         // TODO Use MPI_Reduce
 590         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM,
 591                       cr->mpi_comm_mysim);
 592
 593         if (wc->wcc_all != nullptr)
 594         {
 595             double *buf_all, *cyc_all;
 596
 597             snew(cyc_all, ewcNR*ewcNR);
 598             snew(buf_all, ewcNR*ewcNR);
 599             for (i = 0; i < ewcNR*ewcNR; i++)
 600             {
 601                 cyc_all[i] = wc->wcc_all[i].c;
 602             }
 603             // TODO Use MPI_Reduce
 604             MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 605                           cr->mpi_comm_mysim);
 606             for (i = 0; i < ewcNR*ewcNR; i++)
 607             {
 608                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 609             }
 610             sfree(buf_all);
 611             sfree(cyc_all);
 612         }
 613     }
 614     else
 615 #endif
 616     {
 617         for (i = 0; i < nsum; i++)
 618         {
 619             cycles_sum[i] = cycles[i];
 620         }
 621     }
 622
 623     return cycles_sum;
 624 }
 625
 626 static void print_cycles(FILE *fplog, double c2t, const char *name,
 627                          int nnodes, int nthreads,
 628                          int ncalls, double c_sum, double tot)
 629 {
 630     char   nnodes_str[STRLEN];
 631     char   nthreads_str[STRLEN];
 632     char   ncalls_str[STRLEN];
 633     double wallt;
 634     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 635
 636     if (c_sum > 0)
 637     {
 638         if (ncalls > 0)
 639         {
 640             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 641             if (nnodes < 0)
 642             {
 643                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 644             }
 645             else
 646             {
 647                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 648             }
 649             if (nthreads < 0)
 650             {
 651                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 652             }
 653             else
 654             {
 655                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 656             }
 657         }
 658         else
 659         {
 660             nnodes_str[0]   = 0;
 661             nthreads_str[0] = 0;
 662             ncalls_str[0]   = 0;
 663         }
 664         /* Convert the cycle count to wallclock time for this task */
 665         wallt = c_sum*c2t;
 666
 667         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 668                 name, nnodes_str, nthreads_str, ncalls_str, wallt,
 669                 c_sum*1e-9, percentage);
 670     }
 671 }
 672
 673 static void print_gputimes(FILE *fplog, const char *name,
 674                            int n, double t, double tot_t)
 675 {
 676     char num[11];
 677     char avg_perf[11];
 678
 679     if (n > 0)
 680     {
 681         snprintf(num, sizeof(num), "%10d", n);
 682         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 683     }
 684     else
 685     {
 686         sprintf(num, "          ");
 687         sprintf(avg_perf, "          ");
 688     }
 689     if (t != tot_t && tot_t > 0)
 690     {
 691         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 692                 name, num, t/1000, avg_perf, 100 * t/tot_t);
 693     }
 694     else
 695     {
 696         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 697                 name, "", t/1000, avg_perf, 100.0);
 698     }
 699 }
 700
 701 static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 702 {
 703     int nrank_tot = nrank_pp + nrank_pme;
 704     if (0 == nrank_pme)
 705     {
 706         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 707         if (nth_pp > 1)
 708         {
 709             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 710         }
 711         /* Don't report doing PP+PME, because we can't tell here if
 712          * this is RF, etc. */
 713     }
 714     else
 715     {
 716         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 717         if (nth_pp > 1)
 718         {
 719             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 720         }
 721         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 722         if (nth_pme > 1)
 723         {
 724             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 725         }
 726     }
 727
 728     fprintf(fplog, "\n\n");
 729     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 730     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 731 }
 732
 733
 734 void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
 735                      int nth_pp, int nth_pme, double realtime,
 736                      gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
 737                      const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
 738                      const gmx_wallclock_gpu_pme_t *gpu_pme_t)
 739 {
 740     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 741     double      c2t, c2t_pp, c2t_pme = 0;
 742     int         i, j, npp, nth_tot;
 743     char        buf[STRLEN];
 744     const char *hline = "-----------------------------------------------------------------------------";
 745
 746     if (wc == nullptr)
 747     {
 748         return;
 749     }
 750
 751     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 752     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 753     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 754     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 755     npp     = nnodes - npme;
 756     /* npme is the number of PME-only ranks used, and we always do PP work */
 757     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 758
 759     nth_tot = npp*nth_pp + npme*nth_pme;
 760
 761     /* When using PME-only nodes, the next line is valid for both
 762        PP-only and PME-only nodes because they started ewcRUN at the
 763        same time. */
 764     tot        = cyc_sum[ewcRUN];
 765     tot_for_pp = 0;
 766
 767     if (tot <= 0.0)
 768     {
 769         /* TODO This is heavy handed, but until someone reworks the
 770            code so that it is provably robust with respect to
 771            non-positive values for all possible timer and cycle
 772            counters, there is less value gained from printing whatever
 773            timing data might still be sensible for some non-Jenkins
 774            run, than is lost from diagnosing Jenkins FP exceptions on
 775            runs about whose execution time we don't care. */
 776         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 777                 "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting",
 778                 tot);
 779         return;
 780     }
 781
 782     if (wc->haveInvalidCount)
 783     {
 784         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
 785         return;
 786     }
 787
 788
 789     /* Conversion factor from cycles to seconds */
 790     c2t     = realtime/tot;
 791     c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
 792     if (npme > 0)
 793     {
 794         c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
 795     }
 796     else
 797     {
 798         c2t_pme = 0;
 799     }
 800
 801     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 802
 803     print_header(fplog, npp, nth_pp, npme, nth_pme);
 804
 805     fprintf(fplog, "%s\n", hline);
 806     for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 807     {
 808         if (is_pme_subcounter(i))
 809         {
 810             /* Do not count these at all */
 811         }
 812         else if (npme > 0 && is_pme_counter(i))
 813         {
 814             /* Print timing information for PME-only nodes, but add an
 815              * asterisk so the reader of the table can know that the
 816              * walltimes are not meant to add up. The asterisk still
 817              * fits in the required maximum of 19 characters. */
 818             char buffer[STRLEN];
 819             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 820             print_cycles(fplog, c2t_pme, buffer,
 821                          npme, nth_pme,
 822                          wc->wcc[i].n, cyc_sum[i], tot);
 823         }
 824         else
 825         {
 826             /* Print timing information when it is for a PP or PP+PME
 827                node */
 828             print_cycles(fplog, c2t_pp, wcn[i],
 829                          npp, nth_pp,
 830                          wc->wcc[i].n, cyc_sum[i], tot);
 831             tot_for_pp += cyc_sum[i];
 832         }
 833     }
 834     if (wc->wcc_all != nullptr)
 835     {
 836         for (i = 0; i < ewcNR; i++)
 837         {
 838             for (j = 0; j < ewcNR; j++)
 839             {
 840                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 841                 print_cycles(fplog, c2t_pp, buf,
 842                              npp, nth_pp,
 843                              wc->wcc_all[i*ewcNR+j].n,
 844                              wc->wcc_all[i*ewcNR+j].c,
 845                              tot);
 846             }
 847         }
 848     }
 849     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 850     print_cycles(fplog, c2t_pp, "Rest",
 851                  npp, nth_pp,
 852                  -1, tot_for_rest - tot_for_pp, tot);
 853     fprintf(fplog, "%s\n", hline);
 854     print_cycles(fplog, c2t, "Total",
 855                  npp, nth_pp,
 856                  -1, tot, tot);
 857     fprintf(fplog, "%s\n", hline);
 858
 859     if (npme > 0)
 860     {
 861         fprintf(fplog,
 862                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 863                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 864                 "%s\n", hline);
 865     }
 866
 867     if (wc->wcc[ewcPMEMESH].n > 0)
 868     {
 869         // A workaround to not print breakdown when no subcounters were recorded.
 870         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 871         std::vector<int> validPmeSubcounterIndices;
 872         for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 873         {
 874             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 875             {
 876                 validPmeSubcounterIndices.push_back(i);
 877             }
 878         }
 879
 880         if (!validPmeSubcounterIndices.empty())
 881         {
 882             fprintf(fplog, " Breakdown of PME mesh computation\n");
 883             fprintf(fplog, "%s\n", hline);
 884             for (auto i : validPmeSubcounterIndices)
 885             {
 886                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
 887                              npme > 0 ? npme : npp, nth_pme,
 888                              wc->wcc[i].n, cyc_sum[i], tot);
 889             }
 890             fprintf(fplog, "%s\n", hline);
 891         }
 892     }
 893
 894     if (useCycleSubcounters && wc->wcsc)
 895     {
 896         fprintf(fplog, " Breakdown of PP computation\n");
 897         fprintf(fplog, "%s\n", hline);
 898         for (i = 0; i < ewcsNR; i++)
 899         {
 900             print_cycles(fplog, c2t_pp, wcsn[i],
 901                          npp, nth_pp,
 902                          wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
 903         }
 904         fprintf(fplog, "%s\n", hline);
 905     }
 906
 907     /* print GPU timing summary */
 908     double tot_gpu = 0.0;
 909     if (gpu_pme_t)
 910     {
 911         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 912         {
 913             tot_gpu += gpu_pme_t->timing[k].t;
 914         }
 915     }
 916     if (gpu_nbnxn_t)
 917     {
 918         const char *k_log_str[2][2] = {
 919             {"Nonbonded F kernel", "Nonbonded F+ene k."},
 920             {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 921         };
 922         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 923
 924         /* add up the kernel timings */
 925         for (i = 0; i < 2; i++)
 926         {
 927             for (j = 0; j < 2; j++)
 928             {
 929                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 930             }
 931         }
 932         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 933
 934         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 935         if (wc->wcc[ewcPMEMESH].n > 0)
 936         {
 937             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 938         }
 939         tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */
 940
 941         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 942         fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 943         fprintf(fplog, "%s\n", hline);
 944         print_gputimes(fplog, "Pair list H2D",
 945                        gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 946         print_gputimes(fplog, "X / q H2D",
 947                        gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 948
 949         for (i = 0; i < 2; i++)
 950         {
 951             for (j = 0; j < 2; j++)
 952             {
 953                 if (gpu_nbnxn_t->ktime[i][j].c)
 954                 {
 955                     print_gputimes(fplog, k_log_str[i][j],
 956                                    gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 957                 }
 958             }
 959         }
 960         if (gpu_pme_t)
 961         {
 962             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 963             {
 964                 if (gpu_pme_t->timing[k].c)
 965                 {
 966                     print_gputimes(fplog, PMEStageNames[k],
 967                                    gpu_pme_t->timing[k].c,
 968                                    gpu_pme_t->timing[k].t,
 969                                    tot_gpu);
 970                 }
 971             }
 972         }
 973         if (gpu_nbnxn_t->pruneTime.c)
 974         {
 975             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
 976         }
 977         print_gputimes(fplog, "F D2H",  gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
 978         fprintf(fplog, "%s\n", hline);
 979         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
 980         fprintf(fplog, "%s\n", hline);
 981         if (gpu_nbnxn_t->dynamicPruneTime.c)
 982         {
 983             /* We print the dynamic pruning kernel timings after a separator
 984              * and avoid adding it to tot_gpu as this is not in the force
 985              * overlap. We print the fraction as relative to the rest.
 986              */
 987             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
 988             fprintf(fplog, "%s\n", hline);
 989         }
 990         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 991         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
 992         {
 993             fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n",
 994                     tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 995                     gpu_cpu_ratio);
 996         }
 997
 998         /* only print notes related to CPU-GPU load balance with PME */
 999         if (wc->wcc[ewcPMEMESH].n > 0)
1000         {
1001             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1002
1003             /* print note if the imbalance is high with PME case in which
1004              * CPU-GPU load balancing is possible */
1005             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1006             {
1007                 /* Only the sim master calls this function, so always print to stderr */
1008                 if (gpu_cpu_ratio < 0.8)
1009                 {
1010                     if (npp > 1)
1011                     {
1012                         /* The user could have used -notunepme,
1013                          * but we currently can't check that here.
1014                          */
1015                         GMX_LOG(mdlog.warning).asParagraph().appendText(
1016                                 "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
1017                                 "      GPU resources. Maybe the domain decomposition limits the PME tuning.\n"
1018                                 "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
1019                     }
1020                     else
1021                     {
1022                         /* We should not end up here, unless the box is
1023                          * too small for increasing the cut-off for PME tuning.
1024                          */
1025                         GMX_LOG(mdlog.warning).asParagraph().appendText(
1026                                 "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
1027                                 "      GPU resources.");
1028                     }
1029                 }
1030                 if (gpu_cpu_ratio > 1.25)
1031                 {
1032                     GMX_LOG(mdlog.warning).asParagraph().appendText(
1033                             "NOTE: The GPU has >25% more load than the CPU. This imbalance wastes\n"
1034                             "      CPU resources.");
1035                 }
1036             }
1037         }
1038     }
1039
1040     if (wc->wc_barrier)
1041     {
1042         GMX_LOG(mdlog.warning).asParagraph().appendText(
1043                 "MPI_Barrier was called before each cycle start/stop\n"
1044                 "call, so timings are not those of real runs.");
1045     }
1046
1047     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
1048         (cyc_sum[ewcDOMDEC] > tot*0.1 ||
1049          cyc_sum[ewcNS] > tot*0.1))
1050     {
1051         /* Only the sim master calls this function, so always print to stderr */
1052         if (wc->wcc[ewcDOMDEC].n == 0)
1053         {
1054             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1055                     "NOTE: %d %% of the run time was spent in pair search,\n"
1056                     "      you might want to increase nstlist (this has no effect on accuracy)\n",
1057                     gmx::roundToInt(100*cyc_sum[ewcNS]/tot));
1058         }
1059         else
1060         {
1061             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1062                     "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1063                     "      %d %% of the run time was spent in pair search,\n"
1064                     "      you might want to increase nstlist (this has no effect on accuracy)\n",
1065                     gmx::roundToInt(100*cyc_sum[ewcDOMDEC]/tot),
1066                     gmx::roundToInt(100*cyc_sum[ewcNS]/tot));
1067         }
1068     }
1069
1070     if (cyc_sum[ewcMoveE] > tot*0.05)
1071     {
1072         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1073                 "NOTE: %d %% of the run time was spent communicating energies,\n"
1074                 "      you might want to use the -gcom option of mdrun\n",
1075                 gmx::roundToInt(100*cyc_sum[ewcMoveE]/tot));
1076     }
1077 }
1078
1079 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1080 {
1081     if (wc == nullptr)
1082     {
1083         return -1;
1084     }
1085
1086     return wc->reset_counters;
1087 }
1088
1089 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1090 {
1091     if (wc == nullptr)
1092     {
1093         return;
1094     }
1095
1096     wc->reset_counters = reset_counters;
1097 }
1098
1099 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1100 {
1101     if (useCycleSubcounters && wc != nullptr)
1102     {
1103         wc->wcsc[ewcs].start = gmx_cycles_read();
1104     }
1105 }
1106
1107 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1108 {
1109     if (useCycleSubcounters && wc != nullptr)
1110     {
1111         wallcycle_sub_start(wc, ewcs);
1112         wc->wcsc[ewcs].n--;
1113     }
1114 }
1115
1116 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1117 {
1118     if (useCycleSubcounters && wc != nullptr)
1119     {
1120         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1121         wc->wcsc[ewcs].n++;
1122     }
1123 }