src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "wallcycle.h"
  40
  41 #include "config.h"
  42
  43 #include <cstdlib>
  44
  45 #include <array>
  46 #include <vector>
  47
  48 #include "gromacs/math/functions.h"
  49 #include "gromacs/mdtypes/commrec.h"
  50 #include "gromacs/timing/cyclecounter.h"
  51 #include "gromacs/timing/gpu_timing.h"
  52 #include "gromacs/timing/wallcyclereporting.h"
  53 #include "gromacs/utility/cstringutil.h"
  54 #include "gromacs/utility/gmxassert.h"
  55 #include "gromacs/utility/gmxmpi.h"
  56 #include "gromacs/utility/logger.h"
  57 #include "gromacs/utility/smalloc.h"
  58 #include "gromacs/utility/snprintf.h"
  59
  60 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  61
  62 /* DEBUG_WCYCLE adds consistency checking for the counters.
  63  * It checks if you stop a counter different from the last
  64  * one that was opened and if you do nest too deep.
  65  */
  66 /* #define DEBUG_WCYCLE */
  67
  68 #ifdef DEBUG_WCYCLE
  69 #include "gromacs/utility/fatalerror.h"
  70 #endif
  71
  72 typedef struct
  73 {
  74     int          n;
  75     gmx_cycles_t c;
  76     gmx_cycles_t start;
  77 } wallcc_t;
  78
  79 struct gmx_wallcycle
  80 {
  81     wallcc_t        *wcc;
  82     /* did we detect one or more invalid cycle counts */
  83     gmx_bool         haveInvalidCount;
  84     /* variables for testing/debugging */
  85     gmx_bool         wc_barrier;
  86     wallcc_t        *wcc_all;
  87     int              wc_depth;
  88 #ifdef DEBUG_WCYCLE
  89 #define DEPTH_MAX 6
  90     int               counterlist[DEPTH_MAX];
  91     int               count_depth;
  92 #endif
  93     int               ewc_prev;
  94     gmx_cycles_t      cycle_prev;
  95     int64_t           reset_counters;
  96 #if GMX_MPI
  97     MPI_Comm          mpi_comm_mygroup;
  98 #endif
  99     wallcc_t         *wcsc;
 100 };
 101
 102 /* Each name should not exceed 19 printing characters
 103    (ie. terminating null can be twentieth) */
 104 static const char *wcn[ewcNR] =
 105 {
 106     "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 107     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 108     "Comm. coord.", "Force", "Wait + Comm. F", "PME mesh",
 109     "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
 110     "PME wait for PP", "Wait + Recv. PME F",
 111     "Wait PME GPU spread", "PME 3D-FFT", "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 112     "Wait PME GPU gather", "Wait Bonded GPU", "Reduce GPU PME F",
 113     "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
 114     "Vsite spread", "COM pull force", "AWH",
 115     "Write traj.", "Update", "Constraints", "Comm. energies",
 116     "Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
 117 };
 118
 119 static const char *wcsn[ewcsNR] =
 120 {
 121     "DD redist.", "DD NS grid + sort", "DD setup comm.",
 122     "DD make top.", "DD make constr.", "DD top. other",
 123     "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 124     "Bonded F",
 125     "Bonded-FEP F",
 126     "Restraints F",
 127     "Listed buffer ops.",
 128     "Nonbonded pruning",
 129     "Nonbonded F",
 130     "Launch NB GPU tasks",
 131     "Launch Bonded GPU tasks",
 132     "Launch PME GPU tasks",
 133     "Ewald F correction",
 134     "NB X buffer ops.",
 135     "NB F buffer ops.",
 136     "Clear force buffer",
 137     "Test subcounter",
 138 };
 139
 140 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 141 static const char *PMEStageNames[] =
 142 {
 143     "PME spline",
 144     "PME spread",
 145     "PME spline + spread",
 146     "PME 3D-FFT r2c",
 147     "PME solve",
 148     "PME 3D-FFT c2r",
 149     "PME gather",
 150 };
 151
 152 gmx_bool wallcycle_have_counter()
 153 {
 154     return gmx_cycles_have_counter();
 155 }
 156
 157 gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused *cr)
 158 {
 159     gmx_wallcycle_t wc;
 160
 161
 162     if (!wallcycle_have_counter())
 163     {
 164         return nullptr;
 165     }
 166
 167     snew(wc, 1);
 168
 169     wc->haveInvalidCount    = FALSE;
 170     wc->wc_barrier          = FALSE;
 171     wc->wcc_all             = nullptr;
 172     wc->wc_depth            = 0;
 173     wc->ewc_prev            = -1;
 174     wc->reset_counters      = resetstep;
 175
 176 #if GMX_MPI
 177     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 178     {
 179         if (fplog)
 180         {
 181             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 182         }
 183         wc->wc_barrier       = TRUE;
 184         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 185     }
 186 #endif
 187
 188     snew(wc->wcc, ewcNR);
 189     if (getenv("GMX_CYCLE_ALL") != nullptr)
 190     {
 191         if (fplog)
 192         {
 193             fprintf(fplog, "\nWill time all the code during the run\n\n");
 194         }
 195         snew(wc->wcc_all, ewcNR*ewcNR);
 196     }
 197
 198     if (useCycleSubcounters)
 199     {
 200         snew(wc->wcsc, ewcsNR);
 201     }
 202
 203 #ifdef DEBUG_WCYCLE
 204     wc->count_depth = 0;
 205 #endif
 206
 207     return wc;
 208 }
 209
 210 void wallcycle_destroy(gmx_wallcycle_t wc)
 211 {
 212     if (wc == nullptr)
 213     {
 214         return;
 215     }
 216
 217     if (wc->wcc != nullptr)
 218     {
 219         sfree(wc->wcc);
 220     }
 221     if (wc->wcc_all != nullptr)
 222     {
 223         sfree(wc->wcc_all);
 224     }
 225     if (wc->wcsc != nullptr)
 226     {
 227         sfree(wc->wcsc);
 228     }
 229     sfree(wc);
 230 }
 231
 232 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 233 {
 234     wc->ewc_prev   = ewc;
 235     wc->cycle_prev = cycle;
 236 }
 237
 238 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 239 {
 240     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 241     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 242 }
 243
 244
 245 #ifdef DEBUG_WCYCLE
 246 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 247 {
 248     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 249
 250     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 251     {
 252         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 253                   wc->count_depth);
 254     }
 255     wc->counterlist[wc->count_depth] = ewc;
 256     wc->count_depth++;
 257 }
 258
 259 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 260 {
 261     wc->count_depth--;
 262
 263     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 264
 265     if (wc->count_depth < 0)
 266     {
 267         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 268     }
 269     if (wc->counterlist[wc->count_depth] != ewc)
 270     {
 271         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 272                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 273     }
 274 }
 275 #endif
 276
 277 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 278 {
 279     gmx_cycles_t cycle;
 280
 281     if (wc == nullptr)
 282     {
 283         return;
 284     }
 285
 286 #if GMX_MPI
 287     if (wc->wc_barrier)
 288     {
 289         MPI_Barrier(wc->mpi_comm_mygroup);
 290     }
 291 #endif
 292
 293 #ifdef DEBUG_WCYCLE
 294     debug_start_check(wc, ewc);
 295 #endif
 296
 297     cycle              = gmx_cycles_read();
 298     wc->wcc[ewc].start = cycle;
 299     if (wc->wcc_all != nullptr)
 300     {
 301         wc->wc_depth++;
 302         if (ewc == ewcRUN)
 303         {
 304             wallcycle_all_start(wc, ewc, cycle);
 305         }
 306         else if (wc->wc_depth == 3)
 307         {
 308             wallcycle_all_stop(wc, ewc, cycle);
 309         }
 310     }
 311 }
 312
 313 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 314 {
 315     if (wc == nullptr)
 316     {
 317         return;
 318     }
 319
 320     wallcycle_start(wc, ewc);
 321     wc->wcc[ewc].n--;
 322 }
 323
 324 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 325 {
 326     gmx_cycles_t cycle, last;
 327
 328     if (wc == nullptr)
 329     {
 330         return 0;
 331     }
 332
 333 #if GMX_MPI
 334     if (wc->wc_barrier)
 335     {
 336         MPI_Barrier(wc->mpi_comm_mygroup);
 337     }
 338 #endif
 339
 340 #ifdef DEBUG_WCYCLE
 341     debug_stop_check(wc, ewc);
 342 #endif
 343
 344     /* When processes or threads migrate between cores, the cycle counting
 345      * can get messed up if the cycle counter on different cores are not
 346      * synchronized. When this happens we expect both large negative and
 347      * positive cycle differences. We can detect negative cycle differences.
 348      * Detecting too large positive counts if difficult, since count can be
 349      * large, especially for ewcRUN. If we detect a negative count,
 350      * we will not print the cycle accounting table.
 351      */
 352     cycle                    = gmx_cycles_read();
 353     if (cycle >= wc->wcc[ewc].start)
 354     {
 355         last                 = cycle - wc->wcc[ewc].start;
 356     }
 357     else
 358     {
 359         last                 = 0;
 360         wc->haveInvalidCount = TRUE;
 361     }
 362     wc->wcc[ewc].c          += last;
 363     wc->wcc[ewc].n++;
 364     if (wc->wcc_all)
 365     {
 366         wc->wc_depth--;
 367         if (ewc == ewcRUN)
 368         {
 369             wallcycle_all_stop(wc, ewc, cycle);
 370         }
 371         else if (wc->wc_depth == 2)
 372         {
 373             wallcycle_all_start(wc, ewc, cycle);
 374         }
 375     }
 376
 377     return last;
 378 }
 379
 380 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c)
 381 {
 382     *n = wc->wcc[ewc].n;
 383     *c = static_cast<double>(wc->wcc[ewc].c);
 384 }
 385
 386 void wallcycle_reset_all(gmx_wallcycle_t wc)
 387 {
 388     int i;
 389
 390     if (wc == nullptr)
 391     {
 392         return;
 393     }
 394
 395     for (i = 0; i < ewcNR; i++)
 396     {
 397         wc->wcc[i].n = 0;
 398         wc->wcc[i].c = 0;
 399     }
 400     wc->haveInvalidCount = FALSE;
 401
 402     if (wc->wcc_all)
 403     {
 404         for (i = 0; i < ewcNR*ewcNR; i++)
 405         {
 406             wc->wcc_all[i].n = 0;
 407             wc->wcc_all[i].c = 0;
 408         }
 409     }
 410     if (wc->wcsc)
 411     {
 412         for (i = 0; i < ewcsNR; i++)
 413         {
 414             wc->wcsc[i].n = 0;
 415             wc->wcsc[i].c = 0;
 416         }
 417     }
 418 }
 419
 420 static gmx_bool is_pme_counter(int ewc)
 421 {
 422     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 423 }
 424
 425 static gmx_bool is_pme_subcounter(int ewc)
 426 {
 427     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 428 }
 429
 430 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 431 static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
 432 {
 433     if (wcc[ewc_sub].n > 0)
 434     {
 435         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 436         {
 437             wcc[ewc_main].c -= wcc[ewc_sub].c;
 438         }
 439         else
 440         {
 441             /* Something is wrong with the cycle counting */
 442             wcc[ewc_main].c  = 0;
 443         }
 444     }
 445 }
 446
 447 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 448 {
 449     if (wc == nullptr)
 450     {
 451         return;
 452     }
 453
 454     for (int i = 0; i < ewcNR; i++)
 455     {
 456         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 457         {
 458             wc->wcc[i].c *= nthreads_pme;
 459
 460             if (wc->wcc_all)
 461             {
 462                 for (int j = 0; j < ewcNR; j++)
 463                 {
 464                     wc->wcc_all[i*ewcNR+j].c *= nthreads_pme;
 465                 }
 466             }
 467         }
 468         else
 469         {
 470             wc->wcc[i].c *= nthreads_pp;
 471
 472             if (wc->wcc_all)
 473             {
 474                 for (int j = 0; j < ewcNR; j++)
 475                 {
 476                     wc->wcc_all[i*ewcNR+j].c *= nthreads_pp;
 477                 }
 478             }
 479         }
 480     }
 481     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 482     {
 483         for (int i = 0; i < ewcsNR; i++)
 484         {
 485             wc->wcsc[i].c *= nthreads_pp;
 486         }
 487     }
 488 }
 489
 490 /* TODO Make an object for this function to return, containing some
 491  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 492  * wcsc, AND the original wcc for rank 0.
 493  *
 494  * The GPU timing is reported only for rank 0, so we want to preserve
 495  * the original wcycle on that rank. Rank 0 also reports the global
 496  * counts before that, so needs something to contain the global data
 497  * without over-writing the rank-0 data. The current implementation
 498  * uses cycles_sum to manage this, which works OK now because wcsc and
 499  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 500  * for the future. Also, there's no need for MPI_Allreduce, since
 501  * only MASTERRANK uses any of the results. */
 502 WallcycleCounts wallcycle_sum(const t_commrec *cr, gmx_wallcycle_t wc)
 503 {
 504     WallcycleCounts cycles_sum;
 505     wallcc_t       *wcc;
 506     double          cycles[ewcNR+ewcsNR];
 507 #if GMX_MPI
 508     double          cycles_n[ewcNR+ewcsNR+1];
 509 #endif
 510     int             i;
 511     int             nsum;
 512
 513     if (wc == nullptr)
 514     {
 515         /* Default construction of std::array of non-class T can leave
 516            the values indeterminate, just like a C array, and icc
 517            warns about it. */
 518         cycles_sum.fill(0);
 519         return cycles_sum;
 520     }
 521
 522     wcc = wc->wcc;
 523
 524     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 525     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 526
 527     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 528
 529     if (cr->npmenodes == 0)
 530     {
 531         /* All nodes do PME (or no PME at all) */
 532         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 533     }
 534     else
 535     {
 536         /* The are PME-only nodes */
 537         if (wcc[ewcPMEMESH].n > 0)
 538         {
 539             /* This must be a PME only node, calculate the Wait + Comm. time */
 540             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
 541             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 542         }
 543     }
 544
 545     /* Store the cycles in a double buffer for summing */
 546     for (i = 0; i < ewcNR; i++)
 547     {
 548 #if GMX_MPI
 549         cycles_n[i] = static_cast<double>(wcc[i].n);
 550 #endif
 551         cycles[i]   = static_cast<double>(wcc[i].c);
 552     }
 553     nsum = ewcNR;
 554     if (wc->wcsc)
 555     {
 556         for (i = 0; i < ewcsNR; i++)
 557         {
 558 #if GMX_MPI
 559             cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
 560 #endif
 561             cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
 562         }
 563         nsum += ewcsNR;
 564     }
 565
 566 #if GMX_MPI
 567     if (cr->nnodes > 1)
 568     {
 569         double buf[ewcNR+ewcsNR+1];
 570
 571         // TODO this code is used only at the end of the run, so we
 572         // can just do a simple reduce of haveInvalidCount in
 573         // wallcycle_print, and avoid bugs
 574         cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
 575         // TODO Use MPI_Reduce
 576         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
 577                       cr->mpi_comm_mysim);
 578         for (i = 0; i < ewcNR; i++)
 579         {
 580             wcc[i].n = gmx::roundToInt(buf[i]);
 581         }
 582         wc->haveInvalidCount = (buf[nsum] > 0);
 583         if (wc->wcsc)
 584         {
 585             for (i = 0; i < ewcsNR; i++)
 586             {
 587                 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR+i]);
 588             }
 589         }
 590
 591         // TODO Use MPI_Reduce
 592         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM,
 593                       cr->mpi_comm_mysim);
 594
 595         if (wc->wcc_all != nullptr)
 596         {
 597             double *buf_all, *cyc_all;
 598
 599             snew(cyc_all, ewcNR*ewcNR);
 600             snew(buf_all, ewcNR*ewcNR);
 601             for (i = 0; i < ewcNR*ewcNR; i++)
 602             {
 603                 cyc_all[i] = wc->wcc_all[i].c;
 604             }
 605             // TODO Use MPI_Reduce
 606             MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 607                           cr->mpi_comm_mysim);
 608             for (i = 0; i < ewcNR*ewcNR; i++)
 609             {
 610                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 611             }
 612             sfree(buf_all);
 613             sfree(cyc_all);
 614         }
 615     }
 616     else
 617 #endif
 618     {
 619         for (i = 0; i < nsum; i++)
 620         {
 621             cycles_sum[i] = cycles[i];
 622         }
 623     }
 624
 625     return cycles_sum;
 626 }
 627
 628 static void print_cycles(FILE *fplog, double c2t, const char *name,
 629                          int nnodes, int nthreads,
 630                          int ncalls, double c_sum, double tot)
 631 {
 632     char   nnodes_str[STRLEN];
 633     char   nthreads_str[STRLEN];
 634     char   ncalls_str[STRLEN];
 635     double wallt;
 636     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 637
 638     if (c_sum > 0)
 639     {
 640         if (ncalls > 0)
 641         {
 642             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 643             if (nnodes < 0)
 644             {
 645                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 646             }
 647             else
 648             {
 649                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 650             }
 651             if (nthreads < 0)
 652             {
 653                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 654             }
 655             else
 656             {
 657                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 658             }
 659         }
 660         else
 661         {
 662             nnodes_str[0]   = 0;
 663             nthreads_str[0] = 0;
 664             ncalls_str[0]   = 0;
 665         }
 666         /* Convert the cycle count to wallclock time for this task */
 667         wallt = c_sum*c2t;
 668
 669         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 670                 name, nnodes_str, nthreads_str, ncalls_str, wallt,
 671                 c_sum*1e-9, percentage);
 672     }
 673 }
 674
 675 static void print_gputimes(FILE *fplog, const char *name,
 676                            int n, double t, double tot_t)
 677 {
 678     char num[11];
 679     char avg_perf[11];
 680
 681     if (n > 0)
 682     {
 683         snprintf(num, sizeof(num), "%10d", n);
 684         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 685     }
 686     else
 687     {
 688         sprintf(num, "          ");
 689         sprintf(avg_perf, "          ");
 690     }
 691     if (t != tot_t && tot_t > 0)
 692     {
 693         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 694                 name, num, t/1000, avg_perf, 100 * t/tot_t);
 695     }
 696     else
 697     {
 698         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 699                 name, "", t/1000, avg_perf, 100.0);
 700     }
 701 }
 702
 703 static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 704 {
 705     int nrank_tot = nrank_pp + nrank_pme;
 706     if (0 == nrank_pme)
 707     {
 708         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 709         if (nth_pp > 1)
 710         {
 711             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 712         }
 713         /* Don't report doing PP+PME, because we can't tell here if
 714          * this is RF, etc. */
 715     }
 716     else
 717     {
 718         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 719         if (nth_pp > 1)
 720         {
 721             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 722         }
 723         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 724         if (nth_pme > 1)
 725         {
 726             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 727         }
 728     }
 729
 730     fprintf(fplog, "\n\n");
 731     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 732     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 733 }
 734
 735
 736 void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
 737                      int nth_pp, int nth_pme, double realtime,
 738                      gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
 739                      const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
 740                      const gmx_wallclock_gpu_pme_t *gpu_pme_t)
 741 {
 742     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 743     double      c2t, c2t_pp, c2t_pme = 0;
 744     int         i, j, npp, nth_tot;
 745     char        buf[STRLEN];
 746     const char *hline = "-----------------------------------------------------------------------------";
 747
 748     if (wc == nullptr)
 749     {
 750         return;
 751     }
 752
 753     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 754     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 755     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 756     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 757     npp     = nnodes - npme;
 758     /* npme is the number of PME-only ranks used, and we always do PP work */
 759     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 760
 761     nth_tot = npp*nth_pp + npme*nth_pme;
 762
 763     /* When using PME-only nodes, the next line is valid for both
 764        PP-only and PME-only nodes because they started ewcRUN at the
 765        same time. */
 766     tot        = cyc_sum[ewcRUN];
 767     tot_for_pp = 0;
 768
 769     if (tot <= 0.0)
 770     {
 771         /* TODO This is heavy handed, but until someone reworks the
 772            code so that it is provably robust with respect to
 773            non-positive values for all possible timer and cycle
 774            counters, there is less value gained from printing whatever
 775            timing data might still be sensible for some non-Jenkins
 776            run, than is lost from diagnosing Jenkins FP exceptions on
 777            runs about whose execution time we don't care. */
 778         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 779                 "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting",
 780                 tot);
 781         return;
 782     }
 783
 784     if (wc->haveInvalidCount)
 785     {
 786         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
 787         return;
 788     }
 789
 790
 791     /* Conversion factor from cycles to seconds */
 792     c2t     = realtime/tot;
 793     c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
 794     if (npme > 0)
 795     {
 796         c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
 797     }
 798     else
 799     {
 800         c2t_pme = 0;
 801     }
 802
 803     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 804
 805     print_header(fplog, npp, nth_pp, npme, nth_pme);
 806
 807     fprintf(fplog, "%s\n", hline);
 808     for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 809     {
 810         if (is_pme_subcounter(i))
 811         {
 812             /* Do not count these at all */
 813         }
 814         else if (npme > 0 && is_pme_counter(i))
 815         {
 816             /* Print timing information for PME-only nodes, but add an
 817              * asterisk so the reader of the table can know that the
 818              * walltimes are not meant to add up. The asterisk still
 819              * fits in the required maximum of 19 characters. */
 820             char buffer[STRLEN];
 821             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 822             print_cycles(fplog, c2t_pme, buffer,
 823                          npme, nth_pme,
 824                          wc->wcc[i].n, cyc_sum[i], tot);
 825         }
 826         else
 827         {
 828             /* Print timing information when it is for a PP or PP+PME
 829                node */
 830             print_cycles(fplog, c2t_pp, wcn[i],
 831                          npp, nth_pp,
 832                          wc->wcc[i].n, cyc_sum[i], tot);
 833             tot_for_pp += cyc_sum[i];
 834         }
 835     }
 836     if (wc->wcc_all != nullptr)
 837     {
 838         for (i = 0; i < ewcNR; i++)
 839         {
 840             for (j = 0; j < ewcNR; j++)
 841             {
 842                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 843                 print_cycles(fplog, c2t_pp, buf,
 844                              npp, nth_pp,
 845                              wc->wcc_all[i*ewcNR+j].n,
 846                              wc->wcc_all[i*ewcNR+j].c,
 847                              tot);
 848             }
 849         }
 850     }
 851     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 852     print_cycles(fplog, c2t_pp, "Rest",
 853                  npp, nth_pp,
 854                  -1, tot_for_rest - tot_for_pp, tot);
 855     fprintf(fplog, "%s\n", hline);
 856     print_cycles(fplog, c2t, "Total",
 857                  npp, nth_pp,
 858                  -1, tot, tot);
 859     fprintf(fplog, "%s\n", hline);
 860
 861     if (npme > 0)
 862     {
 863         fprintf(fplog,
 864                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 865                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 866                 "%s\n", hline);
 867     }
 868
 869     if (wc->wcc[ewcPMEMESH].n > 0)
 870     {
 871         // A workaround to not print breakdown when no subcounters were recorded.
 872         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 873         std::vector<int> validPmeSubcounterIndices;
 874         for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 875         {
 876             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 877             {
 878                 validPmeSubcounterIndices.push_back(i);
 879             }
 880         }
 881
 882         if (!validPmeSubcounterIndices.empty())
 883         {
 884             fprintf(fplog, " Breakdown of PME mesh computation\n");
 885             fprintf(fplog, "%s\n", hline);
 886             for (auto i : validPmeSubcounterIndices)
 887             {
 888                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
 889                              npme > 0 ? npme : npp, nth_pme,
 890                              wc->wcc[i].n, cyc_sum[i], tot);
 891             }
 892             fprintf(fplog, "%s\n", hline);
 893         }
 894     }
 895
 896     if (useCycleSubcounters && wc->wcsc)
 897     {
 898         fprintf(fplog, " Breakdown of PP computation\n");
 899         fprintf(fplog, "%s\n", hline);
 900         for (i = 0; i < ewcsNR; i++)
 901         {
 902             print_cycles(fplog, c2t_pp, wcsn[i],
 903                          npp, nth_pp,
 904                          wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
 905         }
 906         fprintf(fplog, "%s\n", hline);
 907     }
 908
 909     /* print GPU timing summary */
 910     double tot_gpu = 0.0;
 911     if (gpu_pme_t)
 912     {
 913         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 914         {
 915             tot_gpu += gpu_pme_t->timing[k].t;
 916         }
 917     }
 918     if (gpu_nbnxn_t)
 919     {
 920         const char *k_log_str[2][2] = {
 921             {"Nonbonded F kernel", "Nonbonded F+ene k."},
 922             {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 923         };
 924         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 925
 926         /* add up the kernel timings */
 927         for (i = 0; i < 2; i++)
 928         {
 929             for (j = 0; j < 2; j++)
 930             {
 931                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 932             }
 933         }
 934         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 935
 936         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 937         if (wc->wcc[ewcPMEMESH].n > 0)
 938         {
 939             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 940         }
 941         tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */
 942
 943         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 944         fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 945         fprintf(fplog, "%s\n", hline);
 946         print_gputimes(fplog, "Pair list H2D",
 947                        gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 948         print_gputimes(fplog, "X / q H2D",
 949                        gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 950
 951         for (i = 0; i < 2; i++)
 952         {
 953             for (j = 0; j < 2; j++)
 954             {
 955                 if (gpu_nbnxn_t->ktime[i][j].c)
 956                 {
 957                     print_gputimes(fplog, k_log_str[i][j],
 958                                    gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 959                 }
 960             }
 961         }
 962         if (gpu_pme_t)
 963         {
 964             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 965             {
 966                 if (gpu_pme_t->timing[k].c)
 967                 {
 968                     print_gputimes(fplog, PMEStageNames[k],
 969                                    gpu_pme_t->timing[k].c,
 970                                    gpu_pme_t->timing[k].t,
 971                                    tot_gpu);
 972                 }
 973             }
 974         }
 975         if (gpu_nbnxn_t->pruneTime.c)
 976         {
 977             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
 978         }
 979         print_gputimes(fplog, "F D2H",  gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
 980         fprintf(fplog, "%s\n", hline);
 981         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
 982         fprintf(fplog, "%s\n", hline);
 983         if (gpu_nbnxn_t->dynamicPruneTime.c)
 984         {
 985             /* We print the dynamic pruning kernel timings after a separator
 986              * and avoid adding it to tot_gpu as this is not in the force
 987              * overlap. We print the fraction as relative to the rest.
 988              */
 989             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
 990             fprintf(fplog, "%s\n", hline);
 991         }
 992         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 993         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
 994         {
 995             fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n",
 996                     tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 997                     gpu_cpu_ratio);
 998         }
 999
1000         /* only print notes related to CPU-GPU load balance with PME */
1001         if (wc->wcc[ewcPMEMESH].n > 0)
1002         {
1003             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1004
1005             /* print note if the imbalance is high with PME case in which
1006              * CPU-GPU load balancing is possible */
1007             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1008             {
1009                 /* Only the sim master calls this function, so always print to stderr */
1010                 if (gpu_cpu_ratio < 0.8)
1011                 {
1012                     if (npp > 1)
1013                     {
1014                         /* The user could have used -notunepme,
1015                          * but we currently can't check that here.
1016                          */
1017                         GMX_LOG(mdlog.warning).asParagraph().appendText(
1018                                 "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
1019                                 "      GPU resources. Maybe the domain decomposition limits the PME tuning.\n"
1020                                 "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
1021                     }
1022                     else
1023                     {
1024                         /* We should not end up here, unless the box is
1025                          * too small for increasing the cut-off for PME tuning.
1026                          */
1027                         GMX_LOG(mdlog.warning).asParagraph().appendText(
1028                                 "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
1029                                 "      GPU resources.");
1030                     }
1031                 }
1032                 if (gpu_cpu_ratio > 1.25)
1033                 {
1034                     GMX_LOG(mdlog.warning).asParagraph().appendText(
1035                             "NOTE: The GPU has >25% more load than the CPU. This imbalance wastes\n"
1036                             "      CPU resources.");
1037                 }
1038             }
1039         }
1040     }
1041
1042     if (wc->wc_barrier)
1043     {
1044         GMX_LOG(mdlog.warning).asParagraph().appendText(
1045                 "MPI_Barrier was called before each cycle start/stop\n"
1046                 "call, so timings are not those of real runs.");
1047     }
1048
1049     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
1050         (cyc_sum[ewcDOMDEC] > tot*0.1 ||
1051          cyc_sum[ewcNS] > tot*0.1))
1052     {
1053         /* Only the sim master calls this function, so always print to stderr */
1054         if (wc->wcc[ewcDOMDEC].n == 0)
1055         {
1056             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1057                     "NOTE: %d %% of the run time was spent in pair search,\n"
1058                     "      you might want to increase nstlist (this has no effect on accuracy)\n",
1059                     gmx::roundToInt(100*cyc_sum[ewcNS]/tot));
1060         }
1061         else
1062         {
1063             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1064                     "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1065                     "      %d %% of the run time was spent in pair search,\n"
1066                     "      you might want to increase nstlist (this has no effect on accuracy)\n",
1067                     gmx::roundToInt(100*cyc_sum[ewcDOMDEC]/tot),
1068                     gmx::roundToInt(100*cyc_sum[ewcNS]/tot));
1069         }
1070     }
1071
1072     if (cyc_sum[ewcMoveE] > tot*0.05)
1073     {
1074         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1075                 "NOTE: %d %% of the run time was spent communicating energies,\n"
1076                 "      you might want to increase some nst* mdp options\n",
1077                 gmx::roundToInt(100*cyc_sum[ewcMoveE]/tot));
1078     }
1079 }
1080
1081 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1082 {
1083     if (wc == nullptr)
1084     {
1085         return -1;
1086     }
1087
1088     return wc->reset_counters;
1089 }
1090
1091 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1092 {
1093     if (wc == nullptr)
1094     {
1095         return;
1096     }
1097
1098     wc->reset_counters = reset_counters;
1099 }
1100
1101 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1102 {
1103     if (useCycleSubcounters && wc != nullptr)
1104     {
1105         wc->wcsc[ewcs].start = gmx_cycles_read();
1106     }
1107 }
1108
1109 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1110 {
1111     if (useCycleSubcounters && wc != nullptr)
1112     {
1113         wallcycle_sub_start(wc, ewcs);
1114         wc->wcsc[ewcs].n--;
1115     }
1116 }
1117
1118 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1119 {
1120     if (useCycleSubcounters && wc != nullptr)
1121     {
1122         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1123         wc->wcsc[ewcs].n++;
1124     }
1125 }