src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "wallcycle.h"
  40
  41 #include "config.h"
  42
  43 #include <cstdlib>
  44
  45 #include <array>
  46 #include <vector>
  47
  48 #include "gromacs/mdtypes/commrec.h"
  49 #include "gromacs/timing/cyclecounter.h"
  50 #include "gromacs/timing/gpu_timing.h"
  51 #include "gromacs/timing/wallcyclereporting.h"
  52 #include "gromacs/utility/cstringutil.h"
  53 #include "gromacs/utility/gmxassert.h"
  54 #include "gromacs/utility/gmxmpi.h"
  55 #include "gromacs/utility/logger.h"
  56 #include "gromacs/utility/smalloc.h"
  57 #include "gromacs/utility/snprintf.h"
  58
  59 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
  60
  61 /* DEBUG_WCYCLE adds consistency checking for the counters.
  62  * It checks if you stop a counter different from the last
  63  * one that was opened and if you do nest too deep.
  64  */
  65 /* #define DEBUG_WCYCLE */
  66
  67 #ifdef DEBUG_WCYCLE
  68 #include "gromacs/utility/fatalerror.h"
  69 #endif
  70
  71 typedef struct
  72 {
  73     int          n;
  74     gmx_cycles_t c;
  75     gmx_cycles_t start;
  76 } wallcc_t;
  77
  78 typedef struct gmx_wallcycle
  79 {
  80     wallcc_t        *wcc;
  81     /* did we detect one or more invalid cycle counts */
  82     gmx_bool         haveInvalidCount;
  83     /* variables for testing/debugging */
  84     gmx_bool         wc_barrier;
  85     wallcc_t        *wcc_all;
  86     int              wc_depth;
  87 #ifdef DEBUG_WCYCLE
  88 #define DEPTH_MAX 6
  89     int               counterlist[DEPTH_MAX];
  90     int               count_depth;
  91 #endif
  92     int               ewc_prev;
  93     gmx_cycles_t      cycle_prev;
  94     gmx_int64_t       reset_counters;
  95 #if GMX_MPI
  96     MPI_Comm          mpi_comm_mygroup;
  97 #endif
  98     wallcc_t         *wcsc;
  99 } gmx_wallcycle_t_t;
 100
 101 /* Each name should not exceed 19 printing characters
 102    (ie. terminating null can be twentieth) */
 103 static const char *wcn[ewcNR] =
 104 {
 105     "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 106     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 107     "Comm. coord.", "Force", "Wait + Comm. F", "PME mesh",
 108     "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
 109     "PME wait for PP", "Wait + Recv. PME F",
 110     "Wait PME GPU spread", "PME 3D-FFT", "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
 111     "Wait PME GPU gather", "Reduce GPU PME F",
 112     "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
 113     "Vsite spread", "COM pull force", "AWH",
 114     "Write traj.", "Update", "Constraints", "Comm. energies",
 115     "Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
 116 };
 117
 118 static const char *wcsn[ewcsNR] =
 119 {
 120     "DD redist.", "DD NS grid + sort", "DD setup comm.",
 121     "DD make top.", "DD make constr.", "DD top. other",
 122     "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 123     "Bonded F",
 124     "Bonded-FEP F",
 125     "Restraints F",
 126     "Listed buffer ops.",
 127     "Nonbonded pruning",
 128     "Nonbonded F",
 129     "Launch NB GPU tasks",
 130     "Launch PME GPU tasks",
 131     "Ewald F correction",
 132     "NB X buffer ops.",
 133     "NB F buffer ops.",
 134 };
 135
 136 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 137 static const char *PMEStageNames[] =
 138 {
 139     "PME spline",
 140     "PME spread",
 141     "PME spline + spread",
 142     "PME 3D-FFT r2c",
 143     "PME solve",
 144     "PME 3D-FFT c2r",
 145     "PME gather",
 146 };
 147
 148 gmx_bool wallcycle_have_counter(void)
 149 {
 150     return gmx_cycles_have_counter();
 151 }
 152
 153 gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused *cr)
 154 {
 155     gmx_wallcycle_t wc;
 156
 157
 158     if (!wallcycle_have_counter())
 159     {
 160         return nullptr;
 161     }
 162
 163     snew(wc, 1);
 164
 165     wc->haveInvalidCount    = FALSE;
 166     wc->wc_barrier          = FALSE;
 167     wc->wcc_all             = nullptr;
 168     wc->wc_depth            = 0;
 169     wc->ewc_prev            = -1;
 170     wc->reset_counters      = resetstep;
 171
 172 #if GMX_MPI
 173     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
 174     {
 175         if (fplog)
 176         {
 177             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 178         }
 179         wc->wc_barrier       = TRUE;
 180         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 181     }
 182 #endif
 183
 184     snew(wc->wcc, ewcNR);
 185     if (getenv("GMX_CYCLE_ALL") != nullptr)
 186     {
 187         if (fplog)
 188         {
 189             fprintf(fplog, "\nWill time all the code during the run\n\n");
 190         }
 191         snew(wc->wcc_all, ewcNR*ewcNR);
 192     }
 193
 194     if (useCycleSubcounters)
 195     {
 196         snew(wc->wcsc, ewcsNR);
 197     }
 198
 199 #ifdef DEBUG_WCYCLE
 200     wc->count_depth = 0;
 201 #endif
 202
 203     return wc;
 204 }
 205
 206 /* TODO: Should be called from finish_run() or runner()
 207    void wallcycle_destroy(gmx_wallcycle_t wc)
 208    {
 209     if (wc == nullptr)
 210     {
 211         return;
 212     }
 213
 214     if (wc->wcc != nullptr)
 215     {
 216         sfree(wc->wcc);
 217     }
 218     if (wc->wcc_all != nullptr)
 219     {
 220         sfree(wc->wcc_all);
 221     }
 222     if (wc->wcsc != nullptr)
 223     {
 224         sfree(wc->wcsc);
 225     }
 226     sfree(wc);
 227    }
 228  */
 229
 230 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 231 {
 232     wc->ewc_prev   = ewc;
 233     wc->cycle_prev = cycle;
 234 }
 235
 236 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 237 {
 238     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 239     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 240 }
 241
 242
 243 #ifdef DEBUG_WCYCLE
 244 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 245 {
 246     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 247
 248     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 249     {
 250         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 251                   wc->count_depth);
 252     }
 253     wc->counterlist[wc->count_depth] = ewc;
 254     wc->count_depth++;
 255 }
 256
 257 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 258 {
 259     wc->count_depth--;
 260
 261     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 262
 263     if (wc->count_depth < 0)
 264     {
 265         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 266     }
 267     if (wc->counterlist[wc->count_depth] != ewc)
 268     {
 269         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 270                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 271     }
 272 }
 273 #endif
 274
 275 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 276 {
 277     gmx_cycles_t cycle;
 278
 279     if (wc == nullptr)
 280     {
 281         return;
 282     }
 283
 284 #if GMX_MPI
 285     if (wc->wc_barrier)
 286     {
 287         MPI_Barrier(wc->mpi_comm_mygroup);
 288     }
 289 #endif
 290
 291 #ifdef DEBUG_WCYCLE
 292     debug_start_check(wc, ewc);
 293 #endif
 294
 295     cycle              = gmx_cycles_read();
 296     wc->wcc[ewc].start = cycle;
 297     if (wc->wcc_all != nullptr)
 298     {
 299         wc->wc_depth++;
 300         if (ewc == ewcRUN)
 301         {
 302             wallcycle_all_start(wc, ewc, cycle);
 303         }
 304         else if (wc->wc_depth == 3)
 305         {
 306             wallcycle_all_stop(wc, ewc, cycle);
 307         }
 308     }
 309 }
 310
 311 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 312 {
 313     if (wc == nullptr)
 314     {
 315         return;
 316     }
 317
 318     wallcycle_start(wc, ewc);
 319     wc->wcc[ewc].n--;
 320 }
 321
 322 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 323 {
 324     gmx_cycles_t cycle, last;
 325
 326     if (wc == nullptr)
 327     {
 328         return 0;
 329     }
 330
 331 #if GMX_MPI
 332     if (wc->wc_barrier)
 333     {
 334         MPI_Barrier(wc->mpi_comm_mygroup);
 335     }
 336 #endif
 337
 338 #ifdef DEBUG_WCYCLE
 339     debug_stop_check(wc, ewc);
 340 #endif
 341
 342     /* When processes or threads migrate between cores, the cycle counting
 343      * can get messed up if the cycle counter on different cores are not
 344      * synchronized. When this happens we expect both large negative and
 345      * positive cycle differences. We can detect negative cycle differences.
 346      * Detecting too large positive counts if difficult, since count can be
 347      * large, especially for ewcRUN. If we detect a negative count,
 348      * we will not print the cycle accounting table.
 349      */
 350     cycle                    = gmx_cycles_read();
 351     if (cycle >= wc->wcc[ewc].start)
 352     {
 353         last                 = cycle - wc->wcc[ewc].start;
 354     }
 355     else
 356     {
 357         last                 = 0;
 358         wc->haveInvalidCount = TRUE;
 359     }
 360     wc->wcc[ewc].c          += last;
 361     wc->wcc[ewc].n++;
 362     if (wc->wcc_all)
 363     {
 364         wc->wc_depth--;
 365         if (ewc == ewcRUN)
 366         {
 367             wallcycle_all_stop(wc, ewc, cycle);
 368         }
 369         else if (wc->wc_depth == 2)
 370         {
 371             wallcycle_all_start(wc, ewc, cycle);
 372         }
 373     }
 374
 375     return last;
 376 }
 377
 378 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c)
 379 {
 380     *n = wc->wcc[ewc].n;
 381     *c = static_cast<double>(wc->wcc[ewc].c);
 382 }
 383
 384 void wallcycle_reset_all(gmx_wallcycle_t wc)
 385 {
 386     int i;
 387
 388     if (wc == nullptr)
 389     {
 390         return;
 391     }
 392
 393     for (i = 0; i < ewcNR; i++)
 394     {
 395         wc->wcc[i].n = 0;
 396         wc->wcc[i].c = 0;
 397     }
 398     wc->haveInvalidCount = FALSE;
 399
 400     if (wc->wcc_all)
 401     {
 402         for (i = 0; i < ewcNR*ewcNR; i++)
 403         {
 404             wc->wcc_all[i].n = 0;
 405             wc->wcc_all[i].c = 0;
 406         }
 407     }
 408     if (wc->wcsc)
 409     {
 410         for (i = 0; i < ewcsNR; i++)
 411         {
 412             wc->wcsc[i].n = 0;
 413             wc->wcsc[i].c = 0;
 414         }
 415     }
 416 }
 417
 418 static gmx_bool is_pme_counter(int ewc)
 419 {
 420     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 421 }
 422
 423 static gmx_bool is_pme_subcounter(int ewc)
 424 {
 425     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 426 }
 427
 428 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 429 static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
 430 {
 431     if (wcc[ewc_sub].n > 0)
 432     {
 433         if (wcc[ewc_main].c >= wcc[ewc_sub].c)
 434         {
 435             wcc[ewc_main].c -= wcc[ewc_sub].c;
 436         }
 437         else
 438         {
 439             /* Something is wrong with the cycle counting */
 440             wcc[ewc_main].c  = 0;
 441         }
 442     }
 443 }
 444
 445 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 446 {
 447     if (wc == nullptr)
 448     {
 449         return;
 450     }
 451
 452     for (int i = 0; i < ewcNR; i++)
 453     {
 454         if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
 455         {
 456             wc->wcc[i].c *= nthreads_pme;
 457
 458             if (wc->wcc_all)
 459             {
 460                 for (int j = 0; j < ewcNR; j++)
 461                 {
 462                     wc->wcc_all[i*ewcNR+j].c *= nthreads_pme;
 463                 }
 464             }
 465         }
 466         else
 467         {
 468             wc->wcc[i].c *= nthreads_pp;
 469
 470             if (wc->wcc_all)
 471             {
 472                 for (int j = 0; j < ewcNR; j++)
 473                 {
 474                     wc->wcc_all[i*ewcNR+j].c *= nthreads_pp;
 475                 }
 476             }
 477         }
 478     }
 479     if (useCycleSubcounters && wc->wcsc && !isPmeRank)
 480     {
 481         for (int i = 0; i < ewcsNR; i++)
 482         {
 483             wc->wcsc[i].c *= nthreads_pp;
 484         }
 485     }
 486 }
 487
 488 /* TODO Make an object for this function to return, containing some
 489  * vectors of something like wallcc_t for the summed wcc, wcc_all and
 490  * wcsc, AND the original wcc for rank 0.
 491  *
 492  * The GPU timing is reported only for rank 0, so we want to preserve
 493  * the original wcycle on that rank. Rank 0 also reports the global
 494  * counts before that, so needs something to contain the global data
 495  * without over-writing the rank-0 data. The current implementation
 496  * uses cycles_sum to manage this, which works OK now because wcsc and
 497  * wcc_all are unused by the GPU reporting, but it is not satisfactory
 498  * for the future. Also, there's no need for MPI_Allreduce, since
 499  * only MASTERRANK uses any of the results. */
 500 WallcycleCounts wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 501 {
 502     WallcycleCounts cycles_sum;
 503     wallcc_t       *wcc;
 504     double          cycles[ewcNR+ewcsNR];
 505 #if GMX_MPI
 506     double          cycles_n[ewcNR+ewcsNR+1];
 507 #endif
 508     int             i;
 509     int             nsum;
 510
 511     if (wc == nullptr)
 512     {
 513         /* Default construction of std::array of non-class T can leave
 514            the values indeterminate, just like a C array, and icc
 515            warns about it. */
 516         cycles_sum.fill(0);
 517         return cycles_sum;
 518     }
 519
 520     wcc = wc->wcc;
 521
 522     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 523     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 524
 525     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 526
 527     if (cr->npmenodes == 0)
 528     {
 529         /* All nodes do PME (or no PME at all) */
 530         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 531     }
 532     else
 533     {
 534         /* The are PME-only nodes */
 535         if (wcc[ewcPMEMESH].n > 0)
 536         {
 537             /* This must be a PME only node, calculate the Wait + Comm. time */
 538             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
 539             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 540         }
 541     }
 542
 543     /* Store the cycles in a double buffer for summing */
 544     for (i = 0; i < ewcNR; i++)
 545     {
 546 #if GMX_MPI
 547         cycles_n[i] = static_cast<double>(wcc[i].n);
 548 #endif
 549         cycles[i]   = static_cast<double>(wcc[i].c);
 550     }
 551     nsum = ewcNR;
 552     if (wc->wcsc)
 553     {
 554         for (i = 0; i < ewcsNR; i++)
 555         {
 556 #if GMX_MPI
 557             cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
 558 #endif
 559             cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
 560         }
 561         nsum += ewcsNR;
 562     }
 563
 564 #if GMX_MPI
 565     if (cr->nnodes > 1)
 566     {
 567         double buf[ewcNR+ewcsNR+1];
 568
 569         // TODO this code is used only at the end of the run, so we
 570         // can just do a simple reduce of haveInvalidCount in
 571         // wallcycle_print, and avoid bugs
 572         cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
 573         // TODO Use MPI_Reduce
 574         MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
 575                       cr->mpi_comm_mysim);
 576         for (i = 0; i < ewcNR; i++)
 577         {
 578             wcc[i].n = static_cast<int>(buf[i] + 0.5);
 579         }
 580         wc->haveInvalidCount = (buf[nsum] > 0);
 581         if (wc->wcsc)
 582         {
 583             for (i = 0; i < ewcsNR; i++)
 584             {
 585                 wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5);
 586             }
 587         }
 588
 589         // TODO Use MPI_Reduce
 590         MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM,
 591                       cr->mpi_comm_mysim);
 592
 593         if (wc->wcc_all != nullptr)
 594         {
 595             double *buf_all, *cyc_all;
 596
 597             snew(cyc_all, ewcNR*ewcNR);
 598             snew(buf_all, ewcNR*ewcNR);
 599             for (i = 0; i < ewcNR*ewcNR; i++)
 600             {
 601                 cyc_all[i] = wc->wcc_all[i].c;
 602             }
 603             // TODO Use MPI_Reduce
 604             MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 605                           cr->mpi_comm_mysim);
 606             for (i = 0; i < ewcNR*ewcNR; i++)
 607             {
 608                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 609             }
 610             sfree(buf_all);
 611             sfree(cyc_all);
 612         }
 613     }
 614     else
 615 #endif
 616     {
 617         for (i = 0; i < nsum; i++)
 618         {
 619             cycles_sum[i] = cycles[i];
 620         }
 621     }
 622
 623     return cycles_sum;
 624 }
 625
 626 static void print_cycles(FILE *fplog, double c2t, const char *name,
 627                          int nnodes, int nthreads,
 628                          int ncalls, double c_sum, double tot)
 629 {
 630     char   nnodes_str[STRLEN];
 631     char   nthreads_str[STRLEN];
 632     char   ncalls_str[STRLEN];
 633     double wallt;
 634     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 635
 636     if (c_sum > 0)
 637     {
 638         if (ncalls > 0)
 639         {
 640             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 641             if (nnodes < 0)
 642             {
 643                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 644             }
 645             else
 646             {
 647                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 648             }
 649             if (nthreads < 0)
 650             {
 651                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 652             }
 653             else
 654             {
 655                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 656             }
 657         }
 658         else
 659         {
 660             nnodes_str[0]   = 0;
 661             nthreads_str[0] = 0;
 662             ncalls_str[0]   = 0;
 663         }
 664         /* Convert the cycle count to wallclock time for this task */
 665         wallt = c_sum*c2t;
 666
 667         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 668                 name, nnodes_str, nthreads_str, ncalls_str, wallt,
 669                 c_sum*1e-9, percentage);
 670     }
 671 }
 672
 673 static void print_gputimes(FILE *fplog, const char *name,
 674                            int n, double t, double tot_t)
 675 {
 676     char num[11];
 677     char avg_perf[11];
 678
 679     if (n > 0)
 680     {
 681         snprintf(num, sizeof(num), "%10d", n);
 682         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 683     }
 684     else
 685     {
 686         sprintf(num, "          ");
 687         sprintf(avg_perf, "          ");
 688     }
 689     if (t != tot_t && tot_t > 0)
 690     {
 691         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 692                 name, num, t/1000, avg_perf, 100 * t/tot_t);
 693     }
 694     else
 695     {
 696         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 697                 name, "", t/1000, avg_perf, 100.0);
 698     }
 699 }
 700
 701 static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 702 {
 703     int nrank_tot = nrank_pp + nrank_pme;
 704     if (0 == nrank_pme)
 705     {
 706         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 707         if (nth_pp > 1)
 708         {
 709             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 710         }
 711         /* Don't report doing PP+PME, because we can't tell here if
 712          * this is RF, etc. */
 713     }
 714     else
 715     {
 716         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 717         if (nth_pp > 1)
 718         {
 719             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 720         }
 721         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 722         if (nth_pme > 1)
 723         {
 724             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 725         }
 726     }
 727
 728     fprintf(fplog, "\n\n");
 729     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 730     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 731 }
 732
 733
 734 void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
 735                      int nth_pp, int nth_pme, double realtime,
 736                      gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
 737                      const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
 738                      const gmx_wallclock_gpu_pme_t *gpu_pme_t)
 739 {
 740     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
 741     double      c2t, c2t_pp, c2t_pme = 0;
 742     int         i, j, npp, nth_tot;
 743     char        buf[STRLEN];
 744     const char *hline = "-----------------------------------------------------------------------------";
 745
 746     if (wc == nullptr)
 747     {
 748         return;
 749     }
 750
 751     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 752     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 753     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 754     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 755     npp     = nnodes - npme;
 756     /* npme is the number of PME-only ranks used, and we always do PP work */
 757     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 758
 759     nth_tot = npp*nth_pp + npme*nth_pme;
 760
 761     /* When using PME-only nodes, the next line is valid for both
 762        PP-only and PME-only nodes because they started ewcRUN at the
 763        same time. */
 764     tot        = cyc_sum[ewcRUN];
 765     tot_for_pp = 0;
 766
 767     if (tot <= 0.0)
 768     {
 769         /* TODO This is heavy handed, but until someone reworks the
 770            code so that it is provably robust with respect to
 771            non-positive values for all possible timer and cycle
 772            counters, there is less value gained from printing whatever
 773            timing data might still be sensible for some non-Jenkins
 774            run, than is lost from diagnosing Jenkins FP exceptions on
 775            runs about whose execution time we don't care. */
 776         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 777                 "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting",
 778                 tot);
 779         return;
 780     }
 781
 782     if (wc->haveInvalidCount)
 783     {
 784         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
 785         return;
 786     }
 787
 788
 789     /* Conversion factor from cycles to seconds */
 790     c2t     = realtime/tot;
 791     c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
 792     if (npme > 0)
 793     {
 794         c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
 795     }
 796     else
 797     {
 798         c2t_pme = 0;
 799     }
 800
 801     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 802
 803     print_header(fplog, npp, nth_pp, npme, nth_pme);
 804
 805     fprintf(fplog, "%s\n", hline);
 806     for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 807     {
 808         if (is_pme_subcounter(i))
 809         {
 810             /* Do not count these at all */
 811         }
 812         else if (npme > 0 && is_pme_counter(i))
 813         {
 814             /* Print timing information for PME-only nodes, but add an
 815              * asterisk so the reader of the table can know that the
 816              * walltimes are not meant to add up. The asterisk still
 817              * fits in the required maximum of 19 characters. */
 818             char buffer[STRLEN];
 819             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 820             print_cycles(fplog, c2t_pme, buffer,
 821                          npme, nth_pme,
 822                          wc->wcc[i].n, cyc_sum[i], tot);
 823         }
 824         else
 825         {
 826             /* Print timing information when it is for a PP or PP+PME
 827                node */
 828             print_cycles(fplog, c2t_pp, wcn[i],
 829                          npp, nth_pp,
 830                          wc->wcc[i].n, cyc_sum[i], tot);
 831             tot_for_pp += cyc_sum[i];
 832         }
 833     }
 834     if (wc->wcc_all != nullptr)
 835     {
 836         for (i = 0; i < ewcNR; i++)
 837         {
 838             for (j = 0; j < ewcNR; j++)
 839             {
 840                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 841                 print_cycles(fplog, c2t_pp, buf,
 842                              npp, nth_pp,
 843                              wc->wcc_all[i*ewcNR+j].n,
 844                              wc->wcc_all[i*ewcNR+j].c,
 845                              tot);
 846             }
 847         }
 848     }
 849     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 850     print_cycles(fplog, c2t_pp, "Rest",
 851                  npp, nth_pp,
 852                  -1, tot_for_rest - tot_for_pp, tot);
 853     fprintf(fplog, "%s\n", hline);
 854     print_cycles(fplog, c2t, "Total",
 855                  npp, nth_pp,
 856                  -1, tot, tot);
 857     fprintf(fplog, "%s\n", hline);
 858
 859     if (npme > 0)
 860     {
 861         fprintf(fplog,
 862                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 863                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 864                 "%s\n", hline);
 865     }
 866
 867     if (wc->wcc[ewcPMEMESH].n > 0)
 868     {
 869         // A workaround to not print breakdown when no subcounters were recorded.
 870         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
 871         std::vector<int> validPmeSubcounterIndices;
 872         for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 873         {
 874             if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
 875             {
 876                 validPmeSubcounterIndices.push_back(i);
 877             }
 878         }
 879
 880         if (!validPmeSubcounterIndices.empty())
 881         {
 882             fprintf(fplog, " Breakdown of PME mesh computation\n");
 883             fprintf(fplog, "%s\n", hline);
 884             for (auto i : validPmeSubcounterIndices)
 885             {
 886                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
 887                              npme > 0 ? npme : npp, nth_pme,
 888                              wc->wcc[i].n, cyc_sum[i], tot);
 889             }
 890             fprintf(fplog, "%s\n", hline);
 891         }
 892     }
 893
 894     if (useCycleSubcounters && wc->wcsc)
 895     {
 896         fprintf(fplog, " Breakdown of PP computation\n");
 897         fprintf(fplog, "%s\n", hline);
 898         for (i = 0; i < ewcsNR; i++)
 899         {
 900             print_cycles(fplog, c2t_pp, wcsn[i],
 901                          npp, nth_pp,
 902                          wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
 903         }
 904         fprintf(fplog, "%s\n", hline);
 905     }
 906
 907     /* print GPU timing summary */
 908     double tot_gpu = 0.0;
 909     if (gpu_pme_t)
 910     {
 911         for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 912         {
 913             tot_gpu += gpu_pme_t->timing[k].t;
 914         }
 915     }
 916     if (gpu_nbnxn_t)
 917     {
 918         const char *k_log_str[2][2] = {
 919             {"Nonbonded F kernel", "Nonbonded F+ene k."},
 920             {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 921         };
 922         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 923
 924         /* add up the kernel timings */
 925         for (i = 0; i < 2; i++)
 926         {
 927             for (j = 0; j < 2; j++)
 928             {
 929                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
 930             }
 931         }
 932         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 933
 934         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 935         if (wc->wcc[ewcPMEMESH].n > 0)
 936         {
 937             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 938         }
 939         tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */
 940
 941         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 942         fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 943         fprintf(fplog, "%s\n", hline);
 944         print_gputimes(fplog, "Pair list H2D",
 945                        gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
 946         print_gputimes(fplog, "X / q H2D",
 947                        gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 948
 949         for (i = 0; i < 2; i++)
 950         {
 951             for (j = 0; j < 2; j++)
 952             {
 953                 if (gpu_nbnxn_t->ktime[i][j].c)
 954                 {
 955                     print_gputimes(fplog, k_log_str[i][j],
 956                                    gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
 957                 }
 958             }
 959         }
 960         if (gpu_pme_t)
 961         {
 962             for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
 963             {
 964                 if (gpu_pme_t->timing[k].c)
 965                 {
 966                     print_gputimes(fplog, PMEStageNames[k],
 967                                    gpu_pme_t->timing[k].c,
 968                                    gpu_pme_t->timing[k].t,
 969                                    tot_gpu);
 970                 }
 971             }
 972         }
 973         if (gpu_nbnxn_t->pruneTime.c)
 974         {
 975             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
 976         }
 977         print_gputimes(fplog, "F D2H",  gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
 978         fprintf(fplog, "%s\n", hline);
 979         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
 980         fprintf(fplog, "%s\n", hline);
 981         if (gpu_nbnxn_t->dynamicPruneTime.c)
 982         {
 983             /* We print the dynamic pruning kernel timings after a separator
 984              * and avoid adding it to tot_gpu as this is not in the force
 985              * overlap. We print the fraction as relative to the rest.
 986              */
 987             print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
 988             fprintf(fplog, "%s\n", hline);
 989         }
 990         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 991         if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
 992         {
 993             fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n",
 994                     tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 995                     gpu_cpu_ratio);
 996         }
 997
 998         /* only print notes related to CPU-GPU load balance with PME */
 999         if (wc->wcc[ewcPMEMESH].n > 0)
1000         {
1001             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1002
1003             /* print note if the imbalance is high with PME case in which
1004              * CPU-GPU load balancing is possible */
1005             if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1006             {
1007                 /* Only the sim master calls this function, so always print to stderr */
1008                 if (gpu_cpu_ratio < 0.8)
1009                 {
1010                     if (npp > 1)
1011                     {
1012                         /* The user could have used -notunepme,
1013                          * but we currently can't check that here.
1014                          */
1015                         GMX_LOG(mdlog.warning).asParagraph().appendText(
1016                                 "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
1017                                 "      GPU resources. Maybe the domain decomposition limits the PME tuning.\n"
1018                                 "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
1019                     }
1020                     else
1021                     {
1022                         /* We should not end up here, unless the box is
1023                          * too small for increasing the cut-off for PME tuning.
1024                          */
1025                         GMX_LOG(mdlog.warning).asParagraph().appendText(
1026                                 "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
1027                                 "      GPU resources.");
1028                     }
1029                 }
1030                 if (gpu_cpu_ratio > 1.25)
1031                 {
1032                     GMX_LOG(mdlog.warning).asParagraph().appendText(
1033                             "NOTE: The GPU has >25% more load than the CPU. This imbalance wastes\n"
1034                             "      CPU resources.");
1035                 }
1036             }
1037         }
1038     }
1039
1040     if (wc->wc_barrier)
1041     {
1042         GMX_LOG(mdlog.warning).asParagraph().appendText(
1043                 "MPI_Barrier was called before each cycle start/stop\n"
1044                 "call, so timings are not those of real runs.");
1045     }
1046
1047     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
1048         (cyc_sum[ewcDOMDEC] > tot*0.1 ||
1049          cyc_sum[ewcNS] > tot*0.1))
1050     {
1051         /* Only the sim master calls this function, so always print to stderr */
1052         if (wc->wcc[ewcDOMDEC].n == 0)
1053         {
1054             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1055                     "NOTE: %d %% of the run time was spent in pair search,\n"
1056                     "      you might want to increase nstlist (this has no effect on accuracy)\n",
1057                     (int)(100*cyc_sum[ewcNS]/tot+0.5));
1058         }
1059         else
1060         {
1061             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1062                     "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1063                     "      %d %% of the run time was spent in pair search,\n"
1064                     "      you might want to increase nstlist (this has no effect on accuracy)\n",
1065                     (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
1066                     (int)(100*cyc_sum[ewcNS]/tot+0.5));
1067         }
1068     }
1069
1070     if (cyc_sum[ewcMoveE] > tot*0.05)
1071     {
1072         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1073                 "NOTE: %d %% of the run time was spent communicating energies,\n"
1074                 "      you might want to use the -gcom option of mdrun\n",
1075                 (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
1076     }
1077 }
1078
1079 extern gmx_int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1080 {
1081     if (wc == nullptr)
1082     {
1083         return -1;
1084     }
1085
1086     return wc->reset_counters;
1087 }
1088
1089 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_int64_t reset_counters)
1090 {
1091     if (wc == nullptr)
1092     {
1093         return;
1094     }
1095
1096     wc->reset_counters = reset_counters;
1097 }
1098
1099 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1100 {
1101     if (useCycleSubcounters && wc != nullptr)
1102     {
1103         wc->wcsc[ewcs].start = gmx_cycles_read();
1104     }
1105 }
1106
1107 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1108 {
1109     if (useCycleSubcounters && wc != nullptr)
1110     {
1111         wallcycle_sub_start(wc, ewcs);
1112         wc->wcsc[ewcs].n--;
1113     }
1114 }
1115
1116 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1117 {
1118     if (useCycleSubcounters && wc != nullptr)
1119     {
1120         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1121         wc->wcsc[ewcs].n++;
1122     }
1123 }