src/gromacs/timing/wallcycle.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "wallcycle.h"
  40
  41 #include "config.h"
  42
  43 #include <cstdlib>
  44
  45 #include "gromacs/legacyheaders/md_logging.h"
  46 #include "gromacs/legacyheaders/types/commrec.h"
  47 #include "gromacs/timing/cyclecounter.h"
  48 #include "gromacs/timing/gpu_timing.h"
  49 #include "gromacs/utility/cstringutil.h"
  50 #include "gromacs/utility/gmxassert.h"
  51 #include "gromacs/utility/gmxmpi.h"
  52 #include "gromacs/utility/smalloc.h"
  53 #include "gromacs/utility/snprintf.h"
  54
  55 /* DEBUG_WCYCLE adds consistency checking for the counters.
  56  * It checks if you stop a counter different from the last
  57  * one that was opened and if you do nest too deep.
  58  */
  59 /* #define DEBUG_WCYCLE */
  60
  61 #ifdef DEBUG_WCYCLE
  62 #include "gromacs/utility/fatalerror.h"
  63 #endif
  64
  65 typedef struct
  66 {
  67     int          n;
  68     gmx_cycles_t c;
  69     gmx_cycles_t start;
  70     gmx_cycles_t last;
  71 } wallcc_t;
  72
  73 typedef struct gmx_wallcycle
  74 {
  75     wallcc_t        *wcc;
  76     /* variables for testing/debugging */
  77     gmx_bool         wc_barrier;
  78     wallcc_t        *wcc_all;
  79     int              wc_depth;
  80 #ifdef DEBUG_WCYCLE
  81 #define DEPTH_MAX 6
  82     int               counterlist[DEPTH_MAX];
  83     int               count_depth;
  84 #endif
  85     int               ewc_prev;
  86     gmx_cycles_t      cycle_prev;
  87     gmx_int64_t       reset_counters;
  88 #ifdef GMX_MPI
  89     MPI_Comm          mpi_comm_mygroup;
  90 #endif
  91     int               nthreads_pp;
  92     int               nthreads_pme;
  93 #ifdef GMX_CYCLE_SUBCOUNTERS
  94     wallcc_t         *wcsc;
  95 #endif
  96     double           *cycles_sum;
  97 } gmx_wallcycle_t_t;
  98
  99 /* Each name should not exceed 19 printing characters
 100    (ie. terminating null can be twentieth) */
 101 static const char *wcn[ewcNR] =
 102 {
 103     "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 104     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 105     "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
 106     "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
 107     "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "Wait GPU loc. est.", "NB X/F buffer ops.",
 108     "Vsite spread", "COM pull force",
 109     "Write traj.", "Update", "Constraints", "Comm. energies",
 110     "Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
 111 };
 112
 113 #ifdef GMX_CYCLE_SUBCOUNTERS
 114 static const char *wcsn[ewcsNR] =
 115 {
 116     "DD redist.", "DD NS grid + sort", "DD setup comm.",
 117     "DD make top.", "DD make constr.", "DD top. other",
 118     "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 119     "Bonded F",
 120     "Bonded-FEP F",
 121     "Restraints F",
 122     "Listed buffer ops.",
 123     "Nonbonded F",
 124     "Ewald F correction",
 125     "NB X buffer ops.",
 126     "NB F buffer ops.",
 127 };
 128 #endif
 129
 130 gmx_bool wallcycle_have_counter(void)
 131 {
 132     return gmx_cycles_have_counter();
 133 }
 134
 135 gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused *cr,
 136                                int nthreads_pp, int nthreads_pme)
 137 {
 138     gmx_wallcycle_t wc;
 139
 140
 141     if (!wallcycle_have_counter())
 142     {
 143         return NULL;
 144     }
 145
 146     snew(wc, 1);
 147
 148     wc->wc_barrier          = FALSE;
 149     wc->wcc_all             = NULL;
 150     wc->wc_depth            = 0;
 151     wc->ewc_prev            = -1;
 152     wc->reset_counters      = resetstep;
 153     wc->nthreads_pp         = nthreads_pp;
 154     wc->nthreads_pme        = nthreads_pme;
 155     wc->cycles_sum          = NULL;
 156
 157 #ifdef GMX_MPI
 158     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
 159     {
 160         if (fplog)
 161         {
 162             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 163         }
 164         wc->wc_barrier       = TRUE;
 165         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 166     }
 167 #endif
 168
 169     snew(wc->wcc, ewcNR);
 170     if (getenv("GMX_CYCLE_ALL") != NULL)
 171     {
 172         if (fplog)
 173         {
 174             fprintf(fplog, "\nWill time all the code during the run\n\n");
 175         }
 176         snew(wc->wcc_all, ewcNR*ewcNR);
 177     }
 178
 179 #ifdef GMX_CYCLE_SUBCOUNTERS
 180     snew(wc->wcsc, ewcsNR);
 181 #endif
 182
 183 #ifdef DEBUG_WCYCLE
 184     wc->count_depth = 0;
 185 #endif
 186
 187     return wc;
 188 }
 189
 190 void wallcycle_destroy(gmx_wallcycle_t wc)
 191 {
 192     if (wc == NULL)
 193     {
 194         return;
 195     }
 196
 197     if (wc->wcc != NULL)
 198     {
 199         sfree(wc->wcc);
 200     }
 201     if (wc->wcc_all != NULL)
 202     {
 203         sfree(wc->wcc_all);
 204     }
 205 #ifdef GMX_CYCLE_SUBCOUNTERS
 206     if (wc->wcsc != NULL)
 207     {
 208         sfree(wc->wcsc);
 209     }
 210 #endif
 211     sfree(wc);
 212 }
 213
 214 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 215 {
 216     wc->ewc_prev   = ewc;
 217     wc->cycle_prev = cycle;
 218 }
 219
 220 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 221 {
 222     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 223     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 224 }
 225
 226
 227 #ifdef DEBUG_WCYCLE
 228 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 229 {
 230     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 231
 232     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 233     {
 234         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 235                   wc->count_depth);
 236     }
 237     wc->counterlist[wc->count_depth] = ewc;
 238     wc->count_depth++;
 239 }
 240
 241 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 242 {
 243     wc->count_depth--;
 244
 245     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 246
 247     if (wc->count_depth < 0)
 248     {
 249         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 250     }
 251     if (wc->counterlist[wc->count_depth] != ewc)
 252     {
 253         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 254                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 255     }
 256 }
 257 #endif
 258
 259 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 260 {
 261     gmx_cycles_t cycle;
 262
 263     if (wc == NULL)
 264     {
 265         return;
 266     }
 267
 268 #ifdef GMX_MPI
 269     if (wc->wc_barrier)
 270     {
 271         MPI_Barrier(wc->mpi_comm_mygroup);
 272     }
 273 #endif
 274
 275 #ifdef DEBUG_WCYCLE
 276     debug_start_check(wc, ewc);
 277 #endif
 278
 279     cycle              = gmx_cycles_read();
 280     wc->wcc[ewc].start = cycle;
 281     if (wc->wcc_all != NULL)
 282     {
 283         wc->wc_depth++;
 284         if (ewc == ewcRUN)
 285         {
 286             wallcycle_all_start(wc, ewc, cycle);
 287         }
 288         else if (wc->wc_depth == 3)
 289         {
 290             wallcycle_all_stop(wc, ewc, cycle);
 291         }
 292     }
 293 }
 294
 295 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 296 {
 297     if (wc == NULL)
 298     {
 299         return;
 300     }
 301
 302     wallcycle_start(wc, ewc);
 303     wc->wcc[ewc].n--;
 304 }
 305
 306 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 307 {
 308     gmx_cycles_t cycle, last;
 309
 310     if (wc == NULL)
 311     {
 312         return 0;
 313     }
 314
 315 #ifdef GMX_MPI
 316     if (wc->wc_barrier)
 317     {
 318         MPI_Barrier(wc->mpi_comm_mygroup);
 319     }
 320 #endif
 321
 322 #ifdef DEBUG_WCYCLE
 323     debug_stop_check(wc, ewc);
 324 #endif
 325
 326     cycle           = gmx_cycles_read();
 327     last            = cycle - wc->wcc[ewc].start;
 328     wc->wcc[ewc].c += last;
 329     wc->wcc[ewc].n++;
 330     if (wc->wcc_all)
 331     {
 332         wc->wc_depth--;
 333         if (ewc == ewcRUN)
 334         {
 335             wallcycle_all_stop(wc, ewc, cycle);
 336         }
 337         else if (wc->wc_depth == 2)
 338         {
 339             wallcycle_all_start(wc, ewc, cycle);
 340         }
 341     }
 342
 343     return last;
 344 }
 345
 346 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c)
 347 {
 348     *n = wc->wcc[ewc].n;
 349     *c = static_cast<double>(wc->wcc[ewc].c);
 350 }
 351
 352 void wallcycle_reset_all(gmx_wallcycle_t wc)
 353 {
 354     int i;
 355
 356     if (wc == NULL)
 357     {
 358         return;
 359     }
 360
 361     for (i = 0; i < ewcNR; i++)
 362     {
 363         wc->wcc[i].n = 0;
 364         wc->wcc[i].c = 0;
 365     }
 366     if (wc->wcc_all)
 367     {
 368         for (i = 0; i < ewcNR*ewcNR; i++)
 369         {
 370             wc->wcc_all[i].n = 0;
 371             wc->wcc_all[i].c = 0;
 372         }
 373     }
 374 #ifdef GMX_CYCLE_SUBCOUNTERS
 375     for (i = 0; i < ewcsNR; i++)
 376     {
 377         wc->wcsc[i].n = 0;
 378         wc->wcsc[i].c = 0;
 379     }
 380 #endif
 381 }
 382
 383 static gmx_bool is_pme_counter(int ewc)
 384 {
 385     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 386 }
 387
 388 static gmx_bool is_pme_subcounter(int ewc)
 389 {
 390     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 391 }
 392
 393 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
 394 static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
 395 {
 396     if (wcc[ewc_sub].n > 0)
 397     {
 398         GMX_ASSERT(wcc[ewc_main].c >= wcc[ewc_sub].c, "Subcounter cannot have more ticks than parent");
 399
 400         wcc[ewc_main].c -= wcc[ewc_sub].c;
 401     }
 402 }
 403
 404 void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 405 {
 406     wallcc_t *wcc;
 407     double    cycles[ewcNR+ewcsNR];
 408 #ifdef GMX_MPI
 409     double    cycles_n[ewcNR+ewcsNR];
 410     double    buf[ewcNR+ewcsNR];
 411     double   *buf_all, *cyc_all;
 412 #endif
 413     int       i, j;
 414     int       nsum;
 415
 416     if (wc == NULL)
 417     {
 418         return;
 419     }
 420
 421     snew(wc->cycles_sum, ewcNR+ewcsNR);
 422
 423     wcc = wc->wcc;
 424
 425     /* The GPU wait estimate counter is used for load balancing only
 426      * and will mess up the total due to double counting: clear it.
 427      */
 428     wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
 429     wcc[ewcWAIT_GPU_NB_L_EST].c = 0;
 430
 431     for (i = 0; i < ewcNR; i++)
 432     {
 433         if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
 434         {
 435             wcc[i].c *= wc->nthreads_pme;
 436
 437             if (wc->wcc_all)
 438             {
 439                 for (j = 0; j < ewcNR; j++)
 440                 {
 441                     wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
 442                 }
 443             }
 444         }
 445         else
 446         {
 447             wcc[i].c *= wc->nthreads_pp;
 448
 449             if (wc->wcc_all)
 450             {
 451                 for (j = 0; j < ewcNR; j++)
 452                 {
 453                     wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
 454                 }
 455             }
 456         }
 457     }
 458
 459     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
 460     subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
 461
 462     subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
 463
 464     if (cr->npmenodes == 0)
 465     {
 466         /* All nodes do PME (or no PME at all) */
 467         subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
 468     }
 469     else
 470     {
 471         /* The are PME-only nodes */
 472         if (wcc[ewcPMEMESH].n > 0)
 473         {
 474             /* This must be a PME only node, calculate the Wait + Comm. time */
 475             GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
 476             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 477         }
 478     }
 479
 480     /* Store the cycles in a double buffer for summing */
 481     for (i = 0; i < ewcNR; i++)
 482     {
 483 #ifdef GMX_MPI
 484         cycles_n[i] = static_cast<double>(wcc[i].n);
 485 #endif
 486         cycles[i]   = static_cast<double>(wcc[i].c);
 487     }
 488     nsum = ewcNR;
 489 #ifdef GMX_CYCLE_SUBCOUNTERS
 490     for (i = 0; i < ewcsNR; i++)
 491     {
 492         wc->wcsc[i].c    *= wc->nthreads_pp;
 493 #ifdef GMX_MPI
 494         cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
 495 #endif
 496         cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
 497     }
 498     nsum += ewcsNR;
 499 #endif
 500
 501 #ifdef GMX_MPI
 502     if (cr->nnodes > 1)
 503     {
 504         MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
 505                       cr->mpi_comm_mysim);
 506         for (i = 0; i < ewcNR; i++)
 507         {
 508             wcc[i].n = static_cast<int>(buf[i] + 0.5);
 509         }
 510 #ifdef GMX_CYCLE_SUBCOUNTERS
 511         for (i = 0; i < ewcsNR; i++)
 512         {
 513             wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5);
 514         }
 515 #endif
 516
 517         MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM,
 518                       cr->mpi_comm_mysim);
 519
 520         if (wc->wcc_all != NULL)
 521         {
 522             snew(cyc_all, ewcNR*ewcNR);
 523             snew(buf_all, ewcNR*ewcNR);
 524             for (i = 0; i < ewcNR*ewcNR; i++)
 525             {
 526                 cyc_all[i] = wc->wcc_all[i].c;
 527             }
 528             MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 529                           cr->mpi_comm_mysim);
 530             for (i = 0; i < ewcNR*ewcNR; i++)
 531             {
 532                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
 533             }
 534             sfree(buf_all);
 535             sfree(cyc_all);
 536         }
 537     }
 538     else
 539 #endif
 540     {
 541         for (i = 0; i < nsum; i++)
 542         {
 543             wc->cycles_sum[i] = cycles[i];
 544         }
 545     }
 546 }
 547
 548 static void print_cycles(FILE *fplog, double c2t, const char *name,
 549                          int nnodes, int nthreads,
 550                          int ncalls, double c_sum, double tot)
 551 {
 552     char   nnodes_str[6];
 553     char   nthreads_str[6];
 554     char   ncalls_str[11];
 555     double wallt;
 556     double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
 557
 558     if (c_sum > 0)
 559     {
 560         if (ncalls > 0)
 561         {
 562             snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
 563             if (nnodes < 0)
 564             {
 565                 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
 566             }
 567             else
 568             {
 569                 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
 570             }
 571             if (nthreads < 0)
 572             {
 573                 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
 574             }
 575             else
 576             {
 577                 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
 578             }
 579         }
 580         else
 581         {
 582             nnodes_str[0]   = 0;
 583             nthreads_str[0] = 0;
 584             ncalls_str[0]   = 0;
 585         }
 586         /* Convert the cycle count to wallclock time for this task */
 587         wallt = c_sum*c2t;
 588
 589         fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
 590                 name, nnodes_str, nthreads_str, ncalls_str, wallt,
 591                 c_sum*1e-9, percentage);
 592     }
 593 }
 594
 595 static void print_gputimes(FILE *fplog, const char *name,
 596                            int n, double t, double tot_t)
 597 {
 598     char num[11];
 599     char avg_perf[11];
 600
 601     if (n > 0)
 602     {
 603         snprintf(num, sizeof(num), "%10d", n);
 604         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 605     }
 606     else
 607     {
 608         sprintf(num, "          ");
 609         sprintf(avg_perf, "          ");
 610     }
 611     if (t != tot_t && tot_t > 0)
 612     {
 613         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 614                 name, num, t/1000, avg_perf, 100 * t/tot_t);
 615     }
 616     else
 617     {
 618         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 619                 name, "", t/1000, avg_perf, 100.0);
 620     }
 621 }
 622
 623 static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 624 {
 625     int nrank_tot = nrank_pp + nrank_pme;
 626     if (0 == nrank_pme)
 627     {
 628         fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
 629         if (nth_pp > 1)
 630         {
 631             fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
 632         }
 633         /* Don't report doing PP+PME, because we can't tell here if
 634          * this is RF, etc. */
 635     }
 636     else
 637     {
 638         fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
 639         if (nth_pp > 1)
 640         {
 641             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
 642         }
 643         fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
 644         if (nth_pme > 1)
 645         {
 646             fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
 647         }
 648     }
 649
 650     fprintf(fplog, "\n\n");
 651     fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
 652     fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
 653 }
 654
 655 void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
 656                      gmx_wallcycle_t wc, struct gmx_wallclock_gpu_t *gpu_t)
 657 {
 658     double     *cyc_sum;
 659     double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
 660     double      c2t, c2t_pp, c2t_pme = 0;
 661     int         i, j, npp, nth_pp, nth_pme, nth_tot;
 662     char        buf[STRLEN];
 663     const char *hline = "-----------------------------------------------------------------------------";
 664
 665     if (wc == NULL)
 666     {
 667         return;
 668     }
 669
 670     nth_pp  = wc->nthreads_pp;
 671     GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
 672
 673     nth_pme = wc->nthreads_pme;
 674     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
 675
 676     cyc_sum = wc->cycles_sum;
 677
 678     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
 679     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
 680     npp     = nnodes - npme;
 681     /* npme is the number of PME-only ranks used, and we always do PP work */
 682     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 683
 684     nth_tot = npp*nth_pp + npme*nth_pme;
 685
 686     /* When using PME-only nodes, the next line is valid for both
 687        PP-only and PME-only nodes because they started ewcRUN at the
 688        same time. */
 689     tot        = cyc_sum[ewcRUN];
 690     tot_for_pp = 0;
 691
 692     if (tot <= 0.0)
 693     {
 694         /* TODO This is heavy handed, but until someone reworks the
 695            code so that it is provably robust with respect to
 696            non-positive values for all possible timer and cycle
 697            counters, there is less value gained from printing whatever
 698            timing data might still be sensible for some non-Jenkins
 699            run, than is lost from diagnosing Jenkins FP exceptions on
 700            runs about whose execution time we don't care. */
 701         md_print_warn(NULL, fplog, "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting\n", tot);
 702         return;
 703     }
 704
 705     /* Conversion factor from cycles to seconds */
 706     c2t     = realtime/tot;
 707     c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
 708     if (npme > 0)
 709     {
 710         c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
 711     }
 712     else
 713     {
 714         c2t_pme = 0;
 715     }
 716
 717     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 718
 719     print_header(fplog, npp, nth_pp, npme, nth_pme);
 720
 721     fprintf(fplog, "%s\n", hline);
 722     for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 723     {
 724         if (is_pme_subcounter(i))
 725         {
 726             /* Do not count these at all */
 727         }
 728         else if (npme > 0 && is_pme_counter(i))
 729         {
 730             /* Print timing information for PME-only nodes, but add an
 731              * asterisk so the reader of the table can know that the
 732              * walltimes are not meant to add up. The asterisk still
 733              * fits in the required maximum of 19 characters. */
 734             char buffer[STRLEN];
 735             snprintf(buffer, STRLEN, "%s *", wcn[i]);
 736             print_cycles(fplog, c2t_pme, buffer,
 737                          npme, nth_pme,
 738                          wc->wcc[i].n, cyc_sum[i], tot);
 739         }
 740         else
 741         {
 742             /* Print timing information when it is for a PP or PP+PME
 743                node */
 744             print_cycles(fplog, c2t_pp, wcn[i],
 745                          npp, nth_pp,
 746                          wc->wcc[i].n, cyc_sum[i], tot);
 747             tot_for_pp += cyc_sum[i];
 748         }
 749     }
 750     if (wc->wcc_all != NULL)
 751     {
 752         for (i = 0; i < ewcNR; i++)
 753         {
 754             for (j = 0; j < ewcNR; j++)
 755             {
 756                 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 757                 print_cycles(fplog, c2t_pp, buf,
 758                              npp, nth_pp,
 759                              wc->wcc_all[i*ewcNR+j].n,
 760                              wc->wcc_all[i*ewcNR+j].c,
 761                              tot);
 762             }
 763         }
 764     }
 765     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
 766     print_cycles(fplog, c2t_pp, "Rest",
 767                  npp, nth_pp,
 768                  -1, tot_for_rest - tot_for_pp, tot);
 769     fprintf(fplog, "%s\n", hline);
 770     print_cycles(fplog, c2t, "Total",
 771                  npp, nth_pp,
 772                  -1, tot, tot);
 773     fprintf(fplog, "%s\n", hline);
 774
 775     if (npme > 0)
 776     {
 777         fprintf(fplog,
 778                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
 779                 "    twice the total reported, but the cycle count total and %% are correct.\n"
 780                 "%s\n", hline);
 781     }
 782
 783     if (wc->wcc[ewcPMEMESH].n > 0)
 784     {
 785         fprintf(fplog, " Breakdown of PME mesh computation\n");
 786         fprintf(fplog, "%s\n", hline);
 787         for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 788         {
 789             if (is_pme_subcounter(i))
 790             {
 791                 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
 792                              npme > 0 ? npme : npp, nth_pme,
 793                              wc->wcc[i].n, cyc_sum[i], tot);
 794             }
 795         }
 796         fprintf(fplog, "%s\n", hline);
 797     }
 798
 799 #ifdef GMX_CYCLE_SUBCOUNTERS
 800     fprintf(fplog, " Breakdown of PP computation\n");
 801     fprintf(fplog, "%s\n", hline);
 802     for (i = 0; i < ewcsNR; i++)
 803     {
 804         print_cycles(fplog, c2t_pp, wcsn[i],
 805                      npp, nth_pp,
 806                      wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
 807     }
 808     fprintf(fplog, "%s\n", hline);
 809 #endif
 810
 811     /* print GPU timing summary */
 812     if (gpu_t)
 813     {
 814         const char *k_log_str[2][2] = {
 815             {"Nonbonded F kernel", "Nonbonded F+ene k."},
 816             {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 817         };
 818
 819         tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
 820
 821         /* add up the kernel timings */
 822         tot_k = 0.0;
 823         for (i = 0; i < 2; i++)
 824         {
 825             for (j = 0; j < 2; j++)
 826             {
 827                 tot_k += gpu_t->ktime[i][j].t;
 828             }
 829         }
 830         tot_gpu += tot_k;
 831
 832         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 833         if (wc->wcc[ewcPMEMESH].n > 0)
 834         {
 835             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 836         }
 837         tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */
 838
 839         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 840         fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 841         fprintf(fplog, "%s\n", hline);
 842         print_gputimes(fplog, "Pair list H2D",
 843                        gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
 844         print_gputimes(fplog, "X / q H2D",
 845                        gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
 846
 847         for (i = 0; i < 2; i++)
 848         {
 849             for (j = 0; j < 2; j++)
 850             {
 851                 if (gpu_t->ktime[i][j].c)
 852                 {
 853                     print_gputimes(fplog, k_log_str[i][j],
 854                                    gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
 855                 }
 856             }
 857         }
 858
 859         print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
 860         fprintf(fplog, "%s\n", hline);
 861         print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
 862         fprintf(fplog, "%s\n", hline);
 863
 864         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 865         if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
 866         {
 867             fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
 868                     tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 869                     gpu_cpu_ratio);
 870         }
 871
 872         /* only print notes related to CPU-GPU load balance with PME */
 873         if (wc->wcc[ewcPMEMESH].n > 0)
 874         {
 875             fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
 876
 877             /* print note if the imbalance is high with PME case in which
 878              * CPU-GPU load balancing is possible */
 879             if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
 880             {
 881                 /* Only the sim master calls this function, so always print to stderr */
 882                 if (gpu_cpu_ratio < 0.75)
 883                 {
 884                     if (npp > 1)
 885                     {
 886                         /* The user could have used -notunepme,
 887                          * but we currently can't check that here.
 888                          */
 889                         md_print_warn(NULL, fplog,
 890                                       "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 891                                       "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
 892                                       "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
 893                     }
 894                     else
 895                     {
 896                         /* We should not end up here, unless the box is
 897                          * too small for increasing the cut-off for PME tuning.
 898                          */
 899                         md_print_warn(NULL, fplog,
 900                                       "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 901                                       "      performance loss.");
 902                     }
 903                 }
 904                 if (gpu_cpu_ratio > 1.2)
 905                 {
 906                     md_print_warn(NULL, fplog,
 907                                   "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
 908                                   "      performance loss, consider using a shorter cut-off and a finer PME grid.");
 909                 }
 910             }
 911         }
 912     }
 913
 914     if (wc->wc_barrier)
 915     {
 916         md_print_warn(NULL, fplog,
 917                       "MPI_Barrier was called before each cycle start/stop\n"
 918                       "call, so timings are not those of real runs.\n");
 919     }
 920
 921     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
 922         (cyc_sum[ewcDOMDEC] > tot*0.1 ||
 923          cyc_sum[ewcNS] > tot*0.1))
 924     {
 925         /* Only the sim master calls this function, so always print to stderr */
 926         if (wc->wcc[ewcDOMDEC].n == 0)
 927         {
 928             md_print_warn(NULL, fplog,
 929                           "NOTE: %d %% of the run time was spent in pair search,\n"
 930                           "      you might want to increase nstlist (this has no effect on accuracy)\n",
 931                           (int)(100*cyc_sum[ewcNS]/tot+0.5));
 932         }
 933         else
 934         {
 935             md_print_warn(NULL, fplog,
 936                           "NOTE: %d %% of the run time was spent in domain decomposition,\n"
 937                           "      %d %% of the run time was spent in pair search,\n"
 938                           "      you might want to increase nstlist (this has no effect on accuracy)\n",
 939                           (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
 940                           (int)(100*cyc_sum[ewcNS]/tot+0.5));
 941         }
 942     }
 943
 944     if (cyc_sum[ewcMoveE] > tot*0.05)
 945     {
 946         /* Only the sim master calls this function, so always print to stderr */
 947         md_print_warn(NULL, fplog,
 948                       "NOTE: %d %% of the run time was spent communicating energies,\n"
 949                       "      you might want to use the -gcom option of mdrun\n",
 950                       (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
 951     }
 952 }
 953
 954 extern gmx_int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
 955 {
 956     if (wc == NULL)
 957     {
 958         return -1;
 959     }
 960
 961     return wc->reset_counters;
 962 }
 963
 964 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_int64_t reset_counters)
 965 {
 966     if (wc == NULL)
 967     {
 968         return;
 969     }
 970
 971     wc->reset_counters = reset_counters;
 972 }
 973
 974 #ifdef GMX_CYCLE_SUBCOUNTERS
 975
 976 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
 977 {
 978     if (wc != NULL)
 979     {
 980         wc->wcsc[ewcs].start = gmx_cycles_read();
 981     }
 982 }
 983
 984 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
 985 {
 986     if (wc == NULL)
 987     {
 988         return;
 989     }
 990
 991     wallcycle_sub_start(wc, ewcs);
 992     wc->wcsc[ewcs].n--;
 993 }
 994
 995 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
 996 {
 997     if (wc != NULL)
 998     {
 999         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1000         wc->wcsc[ewcs].n++;
1001     }
1002 }
1003
1004 #else
1005
1006 void wallcycle_sub_start(gmx_wallcycle_t gmx_unused wc, int gmx_unused ewcs)
1007 {
1008 }
1009 void wallcycle_sub_start_nocount(gmx_wallcycle_t gmx_unused wc, int gmx_unused ewcs)
1010 {
1011 }
1012 void wallcycle_sub_stop(gmx_wallcycle_t gmx_unused wc, int gmx_unused ewcs)
1013 {
1014 }
1015
1016 #endif /* GMX_CYCLE_SUBCOUNTERS */