src/mdlib/gmx_wallcycle.c

   1 /*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2008, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <string.h>
  42 #include "gmx_wallcycle.h"
  43 #include "gmx_cyclecounter.h"
  44 #include "smalloc.h"
  45 #include "gmx_fatal.h"
  46 #include "md_logging.h"
  47 #include "string2.h"
  48
  49 #ifdef GMX_LIB_MPI
  50 #include <mpi.h>
  51 #endif
  52 #ifdef GMX_THREAD_MPI
  53 #include "tmpi.h"
  54 #endif
  55
  56 /* DEBUG_WCYCLE adds consistency checking for the counters.
  57  * It checks if you stop a counter different from the last
  58  * one that was opened and if you do nest too deep.
  59  */
  60 /* #define DEBUG_WCYCLE */
  61
  62 typedef struct
  63 {
  64     int          n;
  65     gmx_cycles_t c;
  66     gmx_cycles_t start;
  67     gmx_cycles_t last;
  68 } wallcc_t;
  69
  70 typedef struct gmx_wallcycle
  71 {
  72     wallcc_t     *wcc;
  73     /* variables for testing/debugging */
  74     gmx_bool         wc_barrier;
  75     wallcc_t     *wcc_all;
  76     int          wc_depth;
  77 #ifdef DEBUG_WCYCLE
  78 #define DEPTH_MAX 6
  79     int          counterlist[DEPTH_MAX];
  80     int          count_depth;
  81 #endif
  82     int          ewc_prev;
  83     gmx_cycles_t cycle_prev;
  84     gmx_large_int_t   reset_counters;
  85 #ifdef GMX_MPI
  86     MPI_Comm     mpi_comm_mygroup;
  87 #endif
  88     int          nthreads_pp;
  89     int          nthreads_pme;
  90 #ifdef GMX_CYCLE_SUBCOUNTERS
  91     wallcc_t     *wcsc;
  92 #endif
  93     double       *cycles_sum;
  94 } gmx_wallcycle_t_t;
  95
  96 /* Each name should not exceed 19 characters */
  97 static const char *wcn[ewcNR] =
  98 { "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
  99   "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 100   "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
 101   "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve",
 102   "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
 103   "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies",
 104   "Enforced rotation", "Add rot. forces", "Test" };
 105
 106 static const char *wcsn[ewcsNR] =
 107 { "DD redist.", "DD NS grid + sort", "DD setup comm.",
 108   "DD make top.", "DD make constr.", "DD top. other",
 109   "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 110   "Bonded F", "Nonbonded F", "Ewald F correction",
 111   "NB X buffer ops.", "NB F buffer ops."
 112 };
 113
 114 gmx_bool wallcycle_have_counter(void)
 115 {
 116   return gmx_cycles_have_counter();
 117 }
 118
 119 gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr,
 120                                int nthreads_pp, int nthreads_pme)
 121 {
 122     gmx_wallcycle_t wc;
 123
 124
 125     if (!wallcycle_have_counter())
 126     {
 127         return NULL;
 128     }
 129
 130     snew(wc,1);
 131
 132     wc->wc_barrier          = FALSE;
 133     wc->wcc_all             = NULL;
 134     wc->wc_depth            = 0;
 135     wc->ewc_prev            = -1;
 136     wc->reset_counters      = resetstep;
 137     wc->nthreads_pp         = nthreads_pp;
 138     wc->nthreads_pme        = nthreads_pme;
 139     wc->cycles_sum          = NULL;
 140
 141 #ifdef GMX_MPI
 142     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
 143     {
 144         if (fplog)
 145         {
 146             fprintf(fplog,"\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 147         }
 148         wc->wc_barrier = TRUE;
 149         wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 150     }
 151 #endif
 152
 153     snew(wc->wcc,ewcNR);
 154     if (getenv("GMX_CYCLE_ALL") != NULL)
 155     {
 156         if (fplog)
 157         {
 158             fprintf(fplog,"\nWill time all the code during the run\n\n");
 159         }
 160         snew(wc->wcc_all,ewcNR*ewcNR);
 161     }
 162
 163 #ifdef GMX_CYCLE_SUBCOUNTERS
 164     snew(wc->wcsc,ewcsNR);
 165 #endif
 166
 167 #ifdef DEBUG_WCYCLE
 168     wc->count_depth = 0;
 169 #endif
 170
 171     return wc;
 172 }
 173
 174 void wallcycle_destroy(gmx_wallcycle_t wc)
 175 {
 176     if (wc == NULL)
 177     {
 178         return;
 179     }
 180
 181     if (wc->wcc != NULL)
 182     {
 183         sfree(wc->wcc);
 184     }
 185     if (wc->wcc_all != NULL)
 186     {
 187         sfree(wc->wcc_all);
 188     }
 189 #ifdef GMX_CYCLE_SUBCOUNTERS
 190     if (wc->wcsc != NULL)
 191     {
 192         sfree(wc->wcsc);
 193     }
 194 #endif
 195     sfree(wc);
 196 }
 197
 198 static void wallcycle_all_start(gmx_wallcycle_t wc,int ewc,gmx_cycles_t cycle)
 199 {
 200     wc->ewc_prev = ewc;
 201     wc->cycle_prev = cycle;
 202 }
 203
 204 static void wallcycle_all_stop(gmx_wallcycle_t wc,int ewc,gmx_cycles_t cycle)
 205 {
 206     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 207     wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 208 }
 209
 210
 211 #ifdef DEBUG_WCYCLE
 212 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 213 {
 214     /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 215
 216     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 217     {
 218         gmx_fatal(FARGS,"wallcycle counter depth out of range: %d",
 219                   wc->count_depth);
 220     }
 221     wc->counterlist[wc->count_depth] = ewc;
 222     wc->count_depth++;
 223 }
 224
 225 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 226 {
 227     wc->count_depth--;
 228
 229     /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 230
 231     if (wc->count_depth < 0)
 232     {
 233         gmx_fatal(FARGS,"wallcycle counter depth out of range when stopping %s: %d",wcn[ewc],wc->count_depth);
 234     }
 235     if (wc->counterlist[wc->count_depth] != ewc)
 236     {
 237         gmx_fatal(FARGS,"wallcycle mismatch at stop, start %s, stop %s",
 238                   wcn[wc->counterlist[wc->count_depth]],wcn[ewc]);
 239     }
 240 }
 241 #endif
 242
 243 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 244 {
 245     gmx_cycles_t cycle;
 246
 247     if (wc == NULL)
 248     {
 249         return;
 250     }
 251
 252 #ifdef GMX_MPI
 253     if (wc->wc_barrier)
 254     {
 255         MPI_Barrier(wc->mpi_comm_mygroup);
 256     }
 257 #endif
 258
 259 #ifdef DEBUG_WCYCLE
 260     debug_start_check(wc,ewc);
 261 #endif
 262
 263     cycle = gmx_cycles_read();
 264     wc->wcc[ewc].start = cycle;
 265     if (wc->wcc_all != NULL)
 266     {
 267         wc->wc_depth++;
 268         if (ewc == ewcRUN)
 269         {
 270             wallcycle_all_start(wc,ewc,cycle);
 271         }
 272         else if (wc->wc_depth == 3)
 273         {
 274             wallcycle_all_stop(wc,ewc,cycle);
 275         }
 276     }
 277 }
 278
 279 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 280 {
 281     if (wc == NULL)
 282     {
 283         return;
 284     }
 285
 286     wallcycle_start(wc, ewc);
 287     wc->wcc[ewc].n--;
 288 }
 289
 290 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 291 {
 292     gmx_cycles_t cycle,last;
 293
 294     if (wc == NULL)
 295     {
 296         return 0;
 297     }
 298
 299 #ifdef GMX_MPI
 300     if (wc->wc_barrier)
 301     {
 302         MPI_Barrier(wc->mpi_comm_mygroup);
 303     }
 304 #endif
 305
 306 #ifdef DEBUG_WCYCLE
 307     debug_stop_check(wc,ewc);
 308 #endif
 309
 310     cycle = gmx_cycles_read();
 311     last = cycle - wc->wcc[ewc].start;
 312     wc->wcc[ewc].c += last;
 313     wc->wcc[ewc].n++;
 314     if (wc->wcc_all)
 315     {
 316         wc->wc_depth--;
 317         if (ewc == ewcRUN)
 318         {
 319             wallcycle_all_stop(wc,ewc,cycle);
 320         }
 321         else if (wc->wc_depth == 2)
 322         {
 323             wallcycle_all_start(wc,ewc,cycle);
 324         }
 325     }
 326
 327     return last;
 328 }
 329
 330 void wallcycle_reset_all(gmx_wallcycle_t wc)
 331 {
 332     int i;
 333
 334     if (wc == NULL)
 335     {
 336         return;
 337     }
 338
 339     for(i=0; i<ewcNR; i++)
 340     {
 341         wc->wcc[i].n = 0;
 342         wc->wcc[i].c = 0;
 343     }
 344     if (wc->wcc_all)
 345     {
 346         for(i=0; i<ewcNR*ewcNR; i++)
 347         {
 348             wc->wcc_all[i].n = 0;
 349             wc->wcc_all[i].c = 0;
 350         }
 351     }
 352 #ifdef GMX_CYCLE_SUBCOUNTERS
 353     for (i=0; i<ewcsNR; i++)
 354     {
 355         wc->wcsc[i].n = 0;
 356         wc->wcsc[i].c = 0;
 357     }
 358 #endif
 359 }
 360
 361 static gmx_bool is_pme_counter(int ewc)
 362 {
 363     return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 364 }
 365
 366 static gmx_bool is_pme_subcounter(int ewc)
 367 {
 368     return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 369 }
 370
 371 void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 372 {
 373     wallcc_t *wcc;
 374     double *cycles;
 375     double cycles_n[ewcNR+ewcsNR],buf[ewcNR+ewcsNR],*cyc_all,*buf_all;
 376     int    i,j;
 377     int    nsum;
 378
 379     if (wc == NULL)
 380     {
 381         return;
 382     }
 383
 384     snew(wc->cycles_sum,ewcNR+ewcsNR);
 385     cycles = wc->cycles_sum;
 386
 387     wcc = wc->wcc;
 388
 389     for(i=0; i<ewcNR; i++)
 390     {
 391         if (is_pme_counter(i) || (i==ewcRUN && cr->duty == DUTY_PME))
 392         {
 393             wcc[i].c *= wc->nthreads_pme;
 394
 395             if (wc->wcc_all)
 396             {
 397                 for(j=0; j<ewcNR; j++)
 398                 {
 399                     wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
 400                 }
 401             }
 402         }
 403         else
 404         {
 405             wcc[i].c *= wc->nthreads_pp;
 406
 407             if (wc->wcc_all)
 408             {
 409                 for(j=0; j<ewcNR; j++)
 410                 {
 411                     wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
 412                 }
 413             }
 414         }
 415     }
 416
 417     if (wcc[ewcDDCOMMLOAD].n > 0)
 418     {
 419         wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c;
 420     }
 421     if (wcc[ewcDDCOMMBOUND].n > 0)
 422     {
 423         wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c;
 424     }
 425     if (wcc[ewcPME_FFTCOMM].n > 0)
 426     {
 427         wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c;
 428     }
 429
 430     if (cr->npmenodes == 0)
 431     {
 432         /* All nodes do PME (or no PME at all) */
 433         if (wcc[ewcPMEMESH].n > 0)
 434         {
 435             wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c;
 436         }
 437     }
 438     else
 439     {
 440         /* The are PME-only nodes */
 441         if (wcc[ewcPMEMESH].n > 0)
 442         {
 443             /* This must be a PME only node, calculate the Wait + Comm. time */
 444             wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 445         }
 446     }
 447
 448     /* Store the cycles in a double buffer for summing */
 449     for(i=0; i<ewcNR; i++)
 450     {
 451         cycles_n[i] = (double)wcc[i].n;
 452         cycles[i]   = (double)wcc[i].c;
 453     }
 454     nsum = ewcNR;
 455 #ifdef GMX_CYCLE_SUBCOUNTERS
 456     for(i=0; i<ewcsNR; i++)
 457     {
 458         wc->wcsc[i].c *= wc->nthreads_pp;
 459         cycles_n[ewcNR+i] = (double)wc->wcsc[i].n;
 460         cycles[ewcNR+i]   = (double)wc->wcsc[i].c;
 461     }
 462     nsum += ewcsNR;
 463 #endif
 464
 465 #ifdef GMX_MPI
 466     if (cr->nnodes > 1)
 467     {
 468         MPI_Allreduce(cycles_n,buf,nsum,MPI_DOUBLE,MPI_MAX,
 469                       cr->mpi_comm_mysim);
 470         for(i=0; i<ewcNR; i++)
 471         {
 472             wcc[i].n = (int)(buf[i] + 0.5);
 473         }
 474 #ifdef GMX_CYCLE_SUBCOUNTERS
 475         for(i=0; i<ewcsNR; i++)
 476         {
 477             wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5);
 478         }
 479 #endif
 480
 481         MPI_Allreduce(cycles,buf,nsum,MPI_DOUBLE,MPI_SUM,
 482                       cr->mpi_comm_mysim);
 483         for(i=0; i<nsum; i++)
 484         {
 485             cycles[i] = buf[i];
 486         }
 487
 488         if (wc->wcc_all != NULL)
 489         {
 490             snew(cyc_all,ewcNR*ewcNR);
 491             snew(buf_all,ewcNR*ewcNR);
 492             for(i=0; i<ewcNR*ewcNR; i++)
 493             {
 494                 cyc_all[i] = wc->wcc_all[i].c;
 495             }
 496             MPI_Allreduce(cyc_all,buf_all,ewcNR*ewcNR,MPI_DOUBLE,MPI_SUM,
 497                           cr->mpi_comm_mysim);
 498             for(i=0; i<ewcNR*ewcNR; i++)
 499             {
 500                 wc->wcc_all[i].c = buf_all[i];
 501             }
 502             sfree(buf_all);
 503             sfree(cyc_all);
 504         }
 505     }
 506 #endif
 507 }
 508
 509 static void print_cycles(FILE *fplog, double c2t, const char *name,
 510                          int nnodes_tot,int nnodes, int nthreads,
 511                          int n, double c, double tot)
 512 {
 513     char num[11];
 514     char thstr[6];
 515     double wallt;
 516
 517     if (c > 0)
 518     {
 519         if (n > 0)
 520         {
 521             snprintf(num,sizeof(num),"%10d",n);
 522             if (nthreads < 0)
 523                 snprintf(thstr, sizeof(thstr), "N/A");
 524             else
 525                 snprintf(thstr, sizeof(thstr), "%4d", nthreads);
 526         }
 527         else
 528         {
 529             sprintf(num,"          ");
 530             sprintf(thstr, "    ");
 531         }
 532         wallt = c*c2t*nnodes_tot/(double)nnodes;
 533         fprintf(fplog," %-19s %4d %4s %10s  %10.3f %12.3f   %5.1f\n",
 534                 name,nnodes,thstr,num,wallt,c*1e-9,100*c/tot);
 535     }
 536 }
 537
 538 static void print_gputimes(FILE *fplog, const char *name,
 539                            int n, double t, double tot_t)
 540 {
 541     char num[11];
 542     char avg_perf[11];
 543
 544     if (n > 0)
 545     {
 546         snprintf(num, sizeof(num), "%10d", n);
 547         snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 548     }
 549     else
 550     {
 551       sprintf(num,"          ");
 552       sprintf(avg_perf,"          ");
 553     }
 554     if (t != tot_t)
 555     {
 556         fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 557                 name, num, t/1000, avg_perf, 100 * t/tot_t);
 558     }
 559     else
 560     {
 561          fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 562                name, "", t/1000, avg_perf, 100.0);
 563     }
 564 }
 565
 566 void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
 567                      gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
 568 {
 569     double *cycles;
 570     double c2t,tot,tot_gpu,tot_cpu_overlap,gpu_cpu_ratio,sum,tot_k;
 571     int    i,j,npp,nth_pp,nth_pme;
 572     char   buf[STRLEN];
 573     const char *hline = "-----------------------------------------------------------------------------";
 574
 575     if (wc == NULL)
 576     {
 577         return;
 578     }
 579
 580     nth_pp  = wc->nthreads_pp;
 581     nth_pme = wc->nthreads_pme;
 582
 583     cycles = wc->cycles_sum;
 584
 585     if (npme > 0)
 586     {
 587         npp = nnodes - npme;
 588     }
 589     else
 590     {
 591         npp  = nnodes;
 592         npme = nnodes;
 593     }
 594     tot = cycles[ewcRUN];
 595
 596     /* Conversion factor from cycles to seconds */
 597     if (tot > 0)
 598     {
 599         c2t = realtime/tot;
 600     }
 601     else
 602     {
 603         c2t = 0;
 604     }
 605
 606     fprintf(fplog,"\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 607
 608     fprintf(fplog," Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n",'%');
 609     fprintf(fplog,"%s\n",hline);
 610     sum = 0;
 611     for(i=ewcPPDURINGPME+1; i<ewcNR; i++)
 612     {
 613         if (!is_pme_subcounter(i))
 614         {
 615             print_cycles(fplog,c2t,wcn[i],nnodes,
 616                          is_pme_counter(i) ? npme : npp,
 617                          is_pme_counter(i) ? nth_pme : nth_pp,
 618                          wc->wcc[i].n,cycles[i],tot);
 619             sum += cycles[i];
 620         }
 621     }
 622     if (wc->wcc_all != NULL)
 623     {
 624         for(i=0; i<ewcNR; i++)
 625         {
 626             for(j=0; j<ewcNR; j++)
 627             {
 628                 snprintf(buf,9,"%-9s",wcn[i]);
 629                 buf[9] = ' ';
 630                 snprintf(buf+10,9,"%-9s",wcn[j]);
 631                 buf[19] = '\0';
 632                 print_cycles(fplog,c2t,buf,nnodes,
 633                              is_pme_counter(i) ? npme : npp,
 634                              is_pme_counter(i) ? nth_pme : nth_pp,
 635                              wc->wcc_all[i*ewcNR+j].n,
 636                              wc->wcc_all[i*ewcNR+j].c,
 637                              tot);
 638             }
 639         }
 640     }
 641     print_cycles(fplog,c2t,"Rest",npp,npp,-1,0,tot-sum,tot);
 642     fprintf(fplog,"%s\n",hline);
 643     print_cycles(fplog,c2t,"Total",nnodes,nnodes,-1,0,tot,tot);
 644     fprintf(fplog,"%s\n",hline);
 645
 646     if (wc->wcc[ewcPMEMESH].n > 0)
 647     {
 648         fprintf(fplog,"%s\n",hline);
 649         for(i=ewcPPDURINGPME+1; i<ewcNR; i++)
 650         {
 651             if (is_pme_subcounter(i))
 652             {
 653                 print_cycles(fplog,c2t,wcn[i],nnodes,
 654                              is_pme_counter(i) ? npme : npp,
 655                              is_pme_counter(i) ? nth_pme : nth_pp,
 656                              wc->wcc[i].n,cycles[i],tot);
 657             }
 658         }
 659         fprintf(fplog,"%s\n",hline);
 660     }
 661
 662 #ifdef GMX_CYCLE_SUBCOUNTERS
 663     fprintf(fplog,"%s\n",hline);
 664     for(i=0; i<ewcsNR; i++)
 665     {
 666         print_cycles(fplog,c2t,wcsn[i],nnodes,npp,nth_pp,
 667                      wc->wcsc[i].n,cycles[ewcNR+i],tot);
 668     }
 669     fprintf(fplog,"%s\n",hline);
 670 #endif
 671
 672     /* print GPU timing summary */
 673     if (gpu_t)
 674     {
 675         const char *k_log_str[2][2] = {
 676                 {"Nonbonded F kernel", "Nonbonded F+ene k."},
 677                 {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}};
 678
 679         tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
 680
 681         /* add up the kernel timings */
 682         tot_k = 0.0;
 683         for (i = 0; i < 2; i++)
 684         {
 685             for(j = 0; j < 2; j++)
 686             {
 687                 tot_k += gpu_t->ktime[i][j].t;
 688             }
 689         }
 690         tot_gpu += tot_k;
 691
 692         tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 693         if (wc->wcc[ewcPMEMESH].n > 0)
 694         {
 695             tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 696         }
 697         tot_cpu_overlap *= c2t * 1000; /* convert s to ms */
 698
 699         fprintf(fplog, "\n GPU timings\n%s\n", hline);
 700         fprintf(fplog," Computing:                         Count  Wall t (s)      ms/step       %c\n",'%');
 701         fprintf(fplog, "%s\n", hline);
 702         print_gputimes(fplog, "Pair list H2D",
 703                 gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
 704          print_gputimes(fplog, "X / q H2D",
 705                 gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
 706
 707         for (i = 0; i < 2; i++)
 708         {
 709             for(j = 0; j < 2; j++)
 710             {
 711                 if (gpu_t->ktime[i][j].c)
 712                 {
 713                     print_gputimes(fplog, k_log_str[i][j],
 714                             gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
 715                 }
 716             }
 717         }
 718
 719         print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
 720         fprintf(fplog, "%s\n", hline);
 721         print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
 722         fprintf(fplog, "%s\n", hline);
 723
 724         gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 725         fprintf(fplog, "\n Force evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
 726                 tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 727                 gpu_cpu_ratio);
 728
 729         /* only print notes related to CPU-GPU load balance with PME */
 730         if (wc->wcc[ewcPMEMESH].n > 0)
 731         {
 732             fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
 733
 734             /* print note if the imbalance is high with PME case in which
 735              * CPU-GPU load balancing is possible */
 736             if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
 737             {
 738                 /* Only the sim master calls this function, so always print to stderr */
 739                 if (gpu_cpu_ratio < 0.75)
 740                 {
 741                     if (npp > 1)
 742                     {
 743                         /* The user could have used -notunepme,
 744                          * but we currently can't check that here.
 745                          */
 746                         md_print_warn(NULL,fplog,
 747                                       "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 748                                       "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
 749                                       "      In that case, try setting the DD grid manually (-dd) or lowering -dds.\n");
 750                     }
 751                     else
 752                     {
 753                         /* We should not end up here, unless the box is
 754                          * too small for increasing the cut-off for PME tuning.
 755                          */
 756                         md_print_warn(NULL,fplog,
 757                                       "NOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 758                                       "      performance loss.\n");
 759                     }
 760                 }
 761                 if (gpu_cpu_ratio > 1.2)
 762                 {
 763                     md_print_warn(NULL,fplog,
 764                                   "NOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
 765                                   "      performance loss, consider using a shorter cut-off and a finer PME grid.\n");
 766                 }
 767             }
 768         }
 769     }
 770
 771     if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
 772         (cycles[ewcDOMDEC] > tot*0.1 ||
 773          cycles[ewcNS] > tot*0.1))
 774     {
 775         /* Only the sim master calls this function, so always print to stderr */
 776         if (wc->wcc[ewcDOMDEC].n == 0)
 777         {
 778             md_print_warn(NULL,fplog,
 779                           "NOTE: %d %% of the run time was spent in pair search,\n"
 780                           "      you might want to increase nstlist (this has no effect on accuracy)\n",
 781                           (int)(100*cycles[ewcNS]/tot+0.5));
 782         }
 783         else
 784         {
 785             md_print_warn(NULL,fplog,
 786                           "NOTE: %d %% of the run time was spent in domain decomposition,\n"
 787                           "      %d %% of the run time was spent in pair search,\n"
 788                           "      you might want to increase nstlist (this has no effect on accuracy)\n",
 789                           (int)(100*cycles[ewcDOMDEC]/tot+0.5),
 790                           (int)(100*cycles[ewcNS]/tot+0.5));
 791         }
 792     }
 793
 794     if (cycles[ewcMoveE] > tot*0.05)
 795     {
 796         /* Only the sim master calls this function, so always print to stderr */
 797         md_print_warn(NULL,fplog,
 798                       "NOTE: %d %% of the run time was spent communicating energies,\n"
 799                       "      you might want to use the -gcom option of mdrun\n",
 800                       (int)(100*cycles[ewcMoveE]/tot+0.5));
 801     }
 802 }
 803
 804 extern gmx_large_int_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
 805 {
 806     if (wc == NULL)
 807     {
 808         return -1;
 809     }
 810
 811     return wc->reset_counters;
 812 }
 813
 814 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_large_int_t reset_counters)
 815 {
 816     if (wc == NULL)
 817         return;
 818
 819     wc->reset_counters = reset_counters;
 820 }
 821
 822 #ifdef GMX_CYCLE_SUBCOUNTERS
 823
 824 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
 825 {
 826     if (wc != NULL)
 827     {
 828         wc->wcsc[ewcs].start = gmx_cycles_read();
 829     }
 830 }
 831
 832 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
 833 {
 834     if (wc != NULL)
 835     {
 836         wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
 837         wc->wcsc[ewcs].n++;
 838     }
 839 }
 840
 841 #endif /* GMX_CYCLE_SUBCOUNTERS */