src/mdlib/gmx_wallcycle.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39
  40 #ifdef HAVE_CONFIG_H
  41 #include <config.h>
  42 #endif
  43
  44 #include <string.h>
  45 #include "gmx_wallcycle.h"
  46 #include "gmx_cyclecounter.h"
  47 #include "smalloc.h"
  48 #include "gmx_fatal.h"
  49 #include "md_logging.h"
  50 #include "string2.h"
  51
  52 #ifdef GMX_LIB_MPI
  53 #include <mpi.h>
  54 #endif
  55 #ifdef GMX_THREAD_MPI
  56 #include "tmpi.h"
  57 #endif
  58
  59 /* DEBUG_WCYCLE adds consistency checking for the counters.
  60  * It checks if you stop a counter different from the last
  61  * one that was opened and if you do nest too deep.
  62  */
  63 /* #define DEBUG_WCYCLE */
  64
  65 typedef struct
  66 {
  67     int          n;
  68     gmx_cycles_t c;
  69     gmx_cycles_t start;
  70     gmx_cycles_t last;
  71 } wallcc_t;
  72
  73 typedef struct gmx_wallcycle
  74 {
  75     wallcc_t        *wcc;
  76     /* variables for testing/debugging */
  77     gmx_bool         wc_barrier;
  78     wallcc_t        *wcc_all;
  79     int              wc_depth;
  80 #ifdef DEBUG_WCYCLE
  81 #define DEPTH_MAX 6
  82     int               counterlist[DEPTH_MAX];
  83     int               count_depth;
  84 #endif
  85     int               ewc_prev;
  86     gmx_cycles_t      cycle_prev;
  87     gmx_large_int_t   reset_counters;
  88 #ifdef GMX_MPI
  89     MPI_Comm          mpi_comm_mygroup;
  90 #endif
  91     int               nthreads_pp;
  92     int               nthreads_pme;
  93 #ifdef GMX_CYCLE_SUBCOUNTERS
  94     wallcc_t         *wcsc;
  95 #endif
  96     double           *cycles_sum;
  97 } gmx_wallcycle_t_t;
  98
  99 /* Each name should not exceed 19 characters */
 100 static const char *wcn[ewcNR] =
 101 {
 102     "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 103     "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 104     "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
 105     "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve",
 106     "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
 107     "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies",
 108     "Enforced rotation", "Add rot. forces", "Test"
 109 };
 110
 111 static const char *wcsn[ewcsNR] =
 112 {
 113     "DD redist.", "DD NS grid + sort", "DD setup comm.",
 114     "DD make top.", "DD make constr.", "DD top. other",
 115     "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 116     "Bonded F", "Nonbonded F", "Ewald F correction",
 117     "NB X buffer ops.", "NB F buffer ops."
 118 };
 119
 120 gmx_bool wallcycle_have_counter(void)
 121 {
 122     return gmx_cycles_have_counter();
 123 }
 124
 125 gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec *cr,
 126                                int nthreads_pp, int nthreads_pme)
 127 {
 128     gmx_wallcycle_t wc;
 129
 130
 131     if (!wallcycle_have_counter())
 132     {
 133         return NULL;
 134     }
 135
 136     snew(wc, 1);
 137
 138     wc->wc_barrier          = FALSE;
 139     wc->wcc_all             = NULL;
 140     wc->wc_depth            = 0;
 141     wc->ewc_prev            = -1;
 142     wc->reset_counters      = resetstep;
 143     wc->nthreads_pp         = nthreads_pp;
 144     wc->nthreads_pme        = nthreads_pme;
 145     wc->cycles_sum          = NULL;
 146
 147 #ifdef GMX_MPI
 148     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
 149     {
 150         if (fplog)
 151         {
 152             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 153         }
 154         wc->wc_barrier       = TRUE;
 155         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 156     }
 157 #endif
 158
 159     snew(wc->wcc, ewcNR);
 160     if (getenv("GMX_CYCLE_ALL") != NULL)
 161     {
 162         if (fplog)
 163         {
 164             fprintf(fplog, "\nWill time all the code during the run\n\n");
 165         }
 166         snew(wc->wcc_all, ewcNR*ewcNR);
 167     }
 168
 169 #ifdef GMX_CYCLE_SUBCOUNTERS
 170     snew(wc->wcsc, ewcsNR);
 171 #endif
 172
 173 #ifdef DEBUG_WCYCLE
 174     wc->count_depth = 0;
 175 #endif
 176
 177     return wc;
 178 }
 179
 180 void wallcycle_destroy(gmx_wallcycle_t wc)
 181 {
 182     if (wc == NULL)
 183     {
 184         return;
 185     }
 186
 187     if (wc->wcc != NULL)
 188     {
 189         sfree(wc->wcc);
 190     }
 191     if (wc->wcc_all != NULL)
 192     {
 193         sfree(wc->wcc_all);
 194     }
 195 #ifdef GMX_CYCLE_SUBCOUNTERS
 196     if (wc->wcsc != NULL)
 197     {
 198         sfree(wc->wcsc);
 199     }
 200 #endif
 201     sfree(wc);
 202 }
 203
 204 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 205 {
 206     wc->ewc_prev   = ewc;
 207     wc->cycle_prev = cycle;
 208 }
 209
 210 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 211 {
 212     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 213     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 214 }
 215
 216
 217 #ifdef DEBUG_WCYCLE
 218 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 219 {
 220     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 221
 222     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 223     {
 224         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 225                   wc->count_depth);
 226     }
 227     wc->counterlist[wc->count_depth] = ewc;
 228     wc->count_depth++;
 229 }
 230
 231 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 232 {
 233     wc->count_depth--;
 234
 235     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 236
 237     if (wc->count_depth < 0)
 238     {
 239         gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 240     }
 241     if (wc->counterlist[wc->count_depth] != ewc)
 242     {
 243         gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 244                   wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 245     }
 246 }
 247 #endif
 248
 249 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 250 {
 251     gmx_cycles_t cycle;
 252
 253     if (wc == NULL)
 254     {
 255         return;
 256     }
 257
 258 #ifdef GMX_MPI
 259     if (wc->wc_barrier)
 260     {
 261         MPI_Barrier(wc->mpi_comm_mygroup);
 262     }
 263 #endif
 264
 265 #ifdef DEBUG_WCYCLE
 266     debug_start_check(wc, ewc);
 267 #endif
 268
 269     cycle              = gmx_cycles_read();
 270     wc->wcc[ewc].start = cycle;
 271     if (wc->wcc_all != NULL)
 272     {
 273         wc->wc_depth++;
 274         if (ewc == ewcRUN)
 275         {
 276             wallcycle_all_start(wc, ewc, cycle);
 277         }
 278         else if (wc->wc_depth == 3)
 279         {
 280             wallcycle_all_stop(wc, ewc, cycle);
 281         }
 282     }
 283 }
 284
 285 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 286 {
 287     if (wc == NULL)
 288     {
 289         return;
 290     }
 291
 292     wallcycle_start(wc, ewc);
 293     wc->wcc[ewc].n--;
 294 }
 295
 296 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 297 {
 298     gmx_cycles_t cycle, last;
 299
 300     if (wc == NULL)
 301     {
 302         return 0;
 303     }
 304
 305 #ifdef GMX_MPI
 306     if (wc->wc_barrier)
 307     {
 308         MPI_Barrier(wc->mpi_comm_mygroup);
 309     }
 310 #endif
 311
 312 #ifdef DEBUG_WCYCLE
 313     debug_stop_check(wc, ewc);
 314 #endif
 315
 316     cycle           = gmx_cycles_read();
 317     last            = cycle - wc->wcc[ewc].start;
 318     wc->wcc[ewc].c += last;
 319     wc->wcc[ewc].n++;
 320     if (wc->wcc_all)
 321     {
 322         wc->wc_depth--;
 323         if (ewc == ewcRUN)
 324         {
 325             wallcycle_all_stop(wc, ewc, cycle);
 326         }
 327         else if (wc->wc_depth == 2)
 328         {
 329             wallcycle_all_start(wc, ewc, cycle);
 330         }
 331     }
 332
 333     return last;
 334 }
 335
 336 void wallcycle_reset_all(gmx_wallcycle_t wc)
 337 {
 338     int i;
 339
 340     if (wc == NULL)
 341     {
 342         return;
 343     }
 344
 345     for (i = 0; i < ewcNR; i++)
 346     {
 347         wc->wcc[i].n = 0;
 348         wc->wcc[i].c = 0;
 349     }
 350     if (wc->wcc_all)
 351     {
 352         for (i = 0; i < ewcNR*ewcNR; i++)
 353         {
 354             wc->wcc_all[i].n = 0;
 355             wc->wcc_all[i].c = 0;
 356         }
 357     }
 358 #ifdef GMX_CYCLE_SUBCOUNTERS
 359     for (i = 0; i < ewcsNR; i++)
 360     {
 361         wc->wcsc[i].n = 0;
 362         wc->wcsc[i].c = 0;
 363     }
 364 #endif
 365 }
 366
 367 static gmx_bool is_pme_counter(int ewc)
 368 {
 369     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 370 }
 371
 372 static gmx_bool is_pme_subcounter(int ewc)
 373 {
 374     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 375 }
 376
 377 void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 378 {
 379     wallcc_t *wcc;
 380     double   *cycles;
 381     double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
 382     int       i, j;
 383     int       nsum;
 384
 385     if (wc == NULL)
 386     {
 387         return;
 388     }
 389
 390     snew(wc->cycles_sum, ewcNR+ewcsNR);
 391     cycles = wc->cycles_sum;
 392
 393     wcc = wc->wcc;
 394
 395     for (i = 0; i < ewcNR; i++)
 396     {
 397         if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
 398         {
 399             wcc[i].c *= wc->nthreads_pme;
 400
 401             if (wc->wcc_all)
 402             {
 403                 for (j = 0; j < ewcNR; j++)
 404                 {
 405                     wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
 406                 }
 407             }
 408         }
 409         else
 410         {
 411             wcc[i].c *= wc->nthreads_pp;
 412
 413             if (wc->wcc_all)
 414             {
 415                 for (j = 0; j < ewcNR; j++)
 416                 {
 417                     wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
 418                 }
 419             }
 420         }
 421     }
 422
 423     if (wcc[ewcDDCOMMLOAD].n > 0)
 424     {
 425         wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c;
 426     }
 427     if (wcc[ewcDDCOMMBOUND].n > 0)
 428     {
 429         wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c;
 430     }
 431     if (wcc[ewcPME_FFTCOMM].n > 0)
 432     {
 433         wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c;
 434     }
 435
 436     if (cr->npmenodes == 0)
 437     {
 438         /* All nodes do PME (or no PME at all) */
 439         if (wcc[ewcPMEMESH].n > 0)
 440         {
 441             wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c;
 442         }
 443     }
 444     else
 445     {
 446         /* The are PME-only nodes */
 447         if (wcc[ewcPMEMESH].n > 0)
 448         {
 449             /* This must be a PME only node, calculate the Wait + Comm. time */
 450             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 451         }
 452     }
 453
 454     /* Store the cycles in a double buffer for summing */
 455     for (i = 0; i < ewcNR; i++)
 456     {
 457         cycles_n[i] = (double)wcc[i].n;
 458         cycles[i]   = (double)wcc[i].c;
 459     }
 460     nsum = ewcNR;
 461 #ifdef GMX_CYCLE_SUBCOUNTERS
 462     for (i = 0; i < ewcsNR; i++)
 463     {
 464         wc->wcsc[i].c    *= wc->nthreads_pp;
 465         cycles_n[ewcNR+i] = (double)wc->wcsc[i].n;
 466         cycles[ewcNR+i]   = (double)wc->wcsc[i].c;
 467     }
 468     nsum += ewcsNR;
 469 #endif
 470
 471 #ifdef GMX_MPI
 472     if (cr->nnodes > 1)
 473     {
 474         MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
 475                       cr->mpi_comm_mysim);
 476         for (i = 0; i < ewcNR; i++)
 477         {
 478             wcc[i].n = (int)(buf[i] + 0.5);
 479         }
 480 #ifdef GMX_CYCLE_SUBCOUNTERS
 481         for (i = 0; i < ewcsNR; i++)
 482         {
 483             wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5);
 484         }
 485 #endif
 486
 487         MPI_Allreduce(cycles, buf, nsum, MPI_DOUBLE, MPI_SUM,
 488                       cr->mpi_comm_mysim);
 489         for (i = 0; i < nsum; i++)
 490         {
 491             cycles[i] = buf[i];
 492         }
 493
 494         if (wc->wcc_all != NULL)
 495         {
 496             snew(cyc_all, ewcNR*ewcNR);
 497             snew(buf_all, ewcNR*ewcNR);
 498             for (i = 0; i < ewcNR*ewcNR; i++)
 499             {
 500                 cyc_all[i] = wc->wcc_all[i].c;
 501             }
 502             MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 503                           cr->mpi_comm_mysim);
 504             for (i = 0; i < ewcNR*ewcNR; i++)
 505             {
 506                 wc->wcc_all[i].c = buf_all[i];
 507             }
 508             sfree(buf_all);
 509             sfree(cyc_all);
 510         }
 511     }
 512 #endif
 513 }
 514
 515 static void print_cycles(FILE *fplog, double c2t, const char *name,
 516                          int nthreads_tot,
 517                          int nnodes, int nthreads,
 518                          int n, double c, double tot)
 519 {
 520     char   num[11];
 521     char   thstr[6];
 522     double wallt;
 523
 524     if (c > 0)
 525     {
 526         if (n > 0)
 527         {
 528             snprintf(num, sizeof(num), "%10d", n);
 529             if (nthreads < 0)
 530             {
 531                 snprintf(thstr, sizeof(thstr), "N/A");
 532             }
 533             else
 534             {
 535                 snprintf(thstr, sizeof(thstr), "%4d", nthreads);
 536             }
 537         }
 538         else
 539         {
 540             sprintf(num, "          ");
 541             sprintf(thstr, "    ");
 542         }
 543         /* Convert the cycle count to wallclock time for this task */
 544         if (nthreads > 0)
 545         {
 546             /* Cycle count has been multiplied by the thread count,
 547              * correct for the number of threads used.
 548              */
 549             wallt = c*c2t*nthreads_tot/(double)(nnodes*nthreads);
 550         }
 551         else
 552         {
 553             /* nthreads=-1 signals total run time, no correction required */
 554             wallt = c*c2t;
 555         }
 556         fprintf(fplog, " %-19s %4d %4s %10s  %10.3f %12.3f   %5.1f\n",
 557                 name, nnodes, thstr, num, wallt, c*1e-9, 100*c/tot);
 558     }
 559 }
 560
 561 static void print_gputimes(FILE *fplog, const char *name,
 562                            int n, double t, double tot_t)
 563 {
 564     char num[11];
 565     char avg_perf[11];
 566
 567     if (n > 0)
 568     {
 569         snprintf(num, sizeof(num), "%10d", n);
 570         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 571     }
 572     else
 573     {
 574         sprintf(num, "          ");
 575         sprintf(avg_perf, "          ");
 576     }
 577     if (t != tot_t)
 578     {
 579         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 580                 name, num, t/1000, avg_perf, 100 * t/tot_t);
 581     }
 582     else
 583     {
 584         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 585                 name, "", t/1000, avg_perf, 100.0);
 586     }
 587 }
 588
 589 void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
 590                      gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
 591 {
 592     double     *cycles;
 593     double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
 594     int         i, j, npp, nth_pp, nth_pme, nth_tot;
 595     char        buf[STRLEN];
 596     const char *hline = "-----------------------------------------------------------------------------";
 597
 598     if (wc == NULL)
 599     {
 600         return;
 601     }
 602
 603     nth_pp  = wc->nthreads_pp;
 604     nth_pme = wc->nthreads_pme;
 605
 606     cycles = wc->cycles_sum;
 607
 608     if (npme > 0)
 609     {
 610         npp = nnodes - npme;
 611
 612         nth_tot = npp*nth_pp + npme*nth_pme;
 613     }
 614     else
 615     {
 616         npp  = nnodes;
 617         npme = nnodes;
 618
 619         nth_tot = npp*nth_pp;
 620     }
 621
 622     tot = cycles[ewcRUN];
 623
 624     /* Conversion factor from cycles to seconds */
 625     if (tot > 0)
 626     {
 627         c2t = realtime/tot;
 628     }
 629     else
 630     {
 631         c2t = 0;
 632     }
 633
 634     fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 635
 636     fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
 637     fprintf(fplog, "%s\n", hline);
 638     sum = 0;
 639     for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 640     {
 641         if (!is_pme_subcounter(i))
 642         {
 643             print_cycles(fplog, c2t, wcn[i], nth_tot,
 644                          is_pme_counter(i) ? npme : npp,
 645                          is_pme_counter(i) ? nth_pme : nth_pp,
 646                          wc->wcc[i].n, cycles[i], tot);
 647             sum += cycles[i];
 648         }
 649     }
 650     if (wc->wcc_all != NULL)
 651     {
 652         for (i = 0; i < ewcNR; i++)
 653         {
 654             for (j = 0; j < ewcNR; j++)
 655             {
 656                 snprintf(buf, 9, "%-9s", wcn[i]);
 657                 buf[9] = ' ';
 658                 snprintf(buf+10, 9, "%-9s", wcn[j]);
 659                 buf[19] = '\0';
 660                 print_cycles(fplog, c2t, buf, nth_tot,
 661                              is_pme_counter(i) ? npme : npp,
 662                              is_pme_counter(i) ? nth_pme : nth_pp,
 663                              wc->wcc_all[i*ewcNR+j].n,
 664                              wc->wcc_all[i*ewcNR+j].c,
 665                              tot);
 666             }
 667         }
 668     }
 669     print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
 670     fprintf(fplog, "%s\n", hline);
 671     print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
 672     fprintf(fplog, "%s\n", hline);
 673
 674     if (wc->wcc[ewcPMEMESH].n > 0)
 675     {
 676         fprintf(fplog, "%s\n", hline);
 677         for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 678         {
 679             if (is_pme_subcounter(i))
 680             {
 681                 print_cycles(fplog, c2t, wcn[i], nth_tot,
 682                              is_pme_counter(i) ? npme : npp,
 683                              is_pme_counter(i) ? nth_pme : nth_pp,
 684                              wc->wcc[i].n, cycles[i], tot);
 685             }
 686         }
 687         fprintf(fplog, "%s\n", hline);
 688     }
 689
 690 #ifdef GMX_CYCLE_SUBCOUNTERS
 691     fprintf(fplog, "%s\n", hline);
 692     for (i = 0; i < ewcsNR; i++)
 693     {
 694         print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp,
 695                      wc->wcsc[i].n, cycles[ewcNR+i], tot);
 696     }
 697     fprintf(fplog, "%s\n", hline);
 698 #endif
 699
 700     /* print GPU timing summary */
 701     if (gpu_t)
 702     {
 703         const char *k_log_str[2][2] = {
 704             {"Nonbonded F kernel", "Nonbonded F+ene k."},
 705             {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 706         };
 707
 708         tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
 709
 710         /* add up the kernel timings */
 711         tot_k = 0.0;
 712         for (i = 0; i < 2; i++)
 713         {
 714             for (j = 0; j < 2; j++)
 715             {
 716                 tot_k += gpu_t->ktime[i][j].t;
 717             }
 718         }
 719         tot_gpu += tot_k;
 720
 721         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 722         if (wc->wcc[ewcPMEMESH].n > 0)
 723         {
 724             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 725         }
 726         tot_cpu_overlap *= c2t * 1000; /* convert s to ms */
 727
 728         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 729         fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 730         fprintf(fplog, "%s\n", hline);
 731         print_gputimes(fplog, "Pair list H2D",
 732                        gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
 733         print_gputimes(fplog, "X / q H2D",
 734                        gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
 735
 736         for (i = 0; i < 2; i++)
 737         {
 738             for (j = 0; j < 2; j++)
 739             {
 740                 if (gpu_t->ktime[i][j].c)
 741                 {
 742                     print_gputimes(fplog, k_log_str[i][j],
 743                                    gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
 744                 }
 745             }
 746         }
 747
 748         print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
 749         fprintf(fplog, "%s\n", hline);
 750         print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
 751         fprintf(fplog, "%s\n", hline);
 752
 753         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 754         fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
 755                 tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 756                 gpu_cpu_ratio);
 757
 758         /* only print notes related to CPU-GPU load balance with PME */
 759         if (wc->wcc[ewcPMEMESH].n > 0)
 760         {
 761             fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
 762
 763             /* print note if the imbalance is high with PME case in which
 764              * CPU-GPU load balancing is possible */
 765             if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
 766             {
 767                 /* Only the sim master calls this function, so always print to stderr */
 768                 if (gpu_cpu_ratio < 0.75)
 769                 {
 770                     if (npp > 1)
 771                     {
 772                         /* The user could have used -notunepme,
 773                          * but we currently can't check that here.
 774                          */
 775                         md_print_warn(NULL, fplog,
 776                                       "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 777                                       "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
 778                                       "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
 779                     }
 780                     else
 781                     {
 782                         /* We should not end up here, unless the box is
 783                          * too small for increasing the cut-off for PME tuning.
 784                          */
 785                         md_print_warn(NULL, fplog,
 786                                       "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 787                                       "      performance loss.");
 788                     }
 789                 }
 790                 if (gpu_cpu_ratio > 1.2)
 791                 {
 792                     md_print_warn(NULL, fplog,
 793                                   "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
 794                                   "      performance loss, consider using a shorter cut-off and a finer PME grid.");
 795                 }
 796             }
 797         }
 798     }
 799
 800     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
 801         (cycles[ewcDOMDEC] > tot*0.1 ||
 802          cycles[ewcNS] > tot*0.1))
 803     {
 804         /* Only the sim master calls this function, so always print to stderr */
 805         if (wc->wcc[ewcDOMDEC].n == 0)
 806         {
 807             md_print_warn(NULL, fplog,
 808                           "NOTE: %d %% of the run time was spent in pair search,\n"
 809                           "      you might want to increase nstlist (this has no effect on accuracy)\n",
 810                           (int)(100*cycles[ewcNS]/tot+0.5));
 811         }
 812         else
 813         {
 814             md_print_warn(NULL, fplog,
 815                           "NOTE: %d %% of the run time was spent in domain decomposition,\n"
 816                           "      %d %% of the run time was spent in pair search,\n"
 817                           "      you might want to increase nstlist (this has no effect on accuracy)\n",
 818                           (int)(100*cycles[ewcDOMDEC]/tot+0.5),
 819                           (int)(100*cycles[ewcNS]/tot+0.5));
 820         }
 821     }
 822
 823     if (cycles[ewcMoveE] > tot*0.05)
 824     {
 825         /* Only the sim master calls this function, so always print to stderr */
 826         md_print_warn(NULL, fplog,
 827                       "NOTE: %d %% of the run time was spent communicating energies,\n"
 828                       "      you might want to use the -gcom option of mdrun\n",
 829                       (int)(100*cycles[ewcMoveE]/tot+0.5));
 830     }
 831 }
 832
 833 extern gmx_large_int_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
 834 {
 835     if (wc == NULL)
 836     {
 837         return -1;
 838     }
 839
 840     return wc->reset_counters;
 841 }
 842
 843 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_large_int_t reset_counters)
 844 {
 845     if (wc == NULL)
 846     {
 847         return;
 848     }
 849
 850     wc->reset_counters = reset_counters;
 851 }
 852
 853 #ifdef GMX_CYCLE_SUBCOUNTERS
 854
 855 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
 856 {
 857     if (wc != NULL)
 858     {
 859         wc->wcsc[ewcs].start = gmx_cycles_read();
 860     }
 861 }
 862
 863 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
 864 {
 865     if (wc != NULL)
 866     {
 867         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
 868         wc->wcsc[ewcs].n++;
 869     }
 870 }
 871
 872 #endif /* GMX_CYCLE_SUBCOUNTERS */