src/gromacs/gmxana/gmx_tune_pme.c

   1 /*
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2008, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <time.h>
  40 #ifdef HAVE_SYS_TIME_H
  41 #include <sys/time.h>
  42 #endif
  43
  44
  45
  46 #include "statutil.h"
  47 #include "typedefs.h"
  48 #include "smalloc.h"
  49 #include "vec.h"
  50 #include "copyrite.h"
  51 #include "statutil.h"
  52 #include "tpxio.h"
  53 #include "string2.h"
  54 #include "readinp.h"
  55 #include "calcgrid.h"
  56 #include "checkpoint.h"
  57 #include "macros.h"
  58 #include "gmx_ana.h"
  59 #include "names.h"
  60 #include "perf_est.h"
  61
  62
  63
  64 /* Enum for situations that can occur during log file parsing, the
  65  * corresponding string entries can be found in do_the_tests() in
  66  * const char* ParseLog[] */
  67 enum {
  68     eParselogOK,
  69     eParselogNotFound,
  70     eParselogNoPerfData,
  71     eParselogTerm,
  72     eParselogResetProblem,
  73     eParselogNoDDGrid,
  74     eParselogTPXVersion,
  75     eParselogNotParallel,
  76     eParselogFatal,
  77     eParselogNr
  78 };
  79
  80
  81 typedef struct
  82 {
  83     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  84     int     nx, ny, nz;   /* DD grid */
  85     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  86     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  87     double  Gcycles_Av;
  88     float  *ns_per_day;
  89     float   ns_per_day_Av;
  90     float  *PME_f_load;     /* PME mesh/force load average*/
  91     float   PME_f_load_Av;  /* Average average ;) ... */
  92     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  93 } t_perf;
  94
  95
  96 typedef struct
  97 {
  98     int             nr_inputfiles;  /* The number of tpr and mdp input files */
  99     gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
 100     gmx_large_int_t orig_init_step; /* Init step for the real simulation */
 101     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 102     real           *rvdw;           /* The vdW radii */
 103     real           *rlist;          /* Neighbourlist cutoff radius */
 104     real           *rlistlong;
 105     int            *nkx, *nky, *nkz;
 106     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 107 } t_inputinfo;
 108
 109
 110 static void sep_line(FILE *fp)
 111 {
 112     fprintf(fp, "\n------------------------------------------------------------\n");
 113 }
 114
 115
 116 /* Wrapper for system calls */
 117 static int gmx_system_call(char *command)
 118 {
 119 #ifdef GMX_NO_SYSTEM
 120     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 121 #else
 122     return ( system(command) );
 123 #endif
 124 }
 125
 126
 127 /* Check if string starts with substring */
 128 static gmx_bool str_starts(const char *string, const char *substring)
 129 {
 130     return ( strncmp(string, substring, strlen(substring)) == 0);
 131 }
 132
 133
 134 static void cleandata(t_perf *perfdata, int test_nr)
 135 {
 136     perfdata->Gcycles[test_nr]    = 0.0;
 137     perfdata->ns_per_day[test_nr] = 0.0;
 138     perfdata->PME_f_load[test_nr] = 0.0;
 139
 140     return;
 141 }
 142
 143
 144 static gmx_bool is_equal(real a, real b)
 145 {
 146     real diff, eps = 1.0e-7;
 147
 148
 149     diff = a - b;
 150
 151     if (diff < 0.0)
 152     {
 153         diff = -diff;
 154     }
 155
 156     if (diff < eps)
 157     {
 158         return TRUE;
 159     }
 160     else
 161     {
 162         return FALSE;
 163     }
 164 }
 165
 166
 167 static void finalize(const char *fn_out)
 168 {
 169     char  buf[STRLEN];
 170     FILE *fp;
 171
 172
 173     fp = fopen(fn_out, "r");
 174     fprintf(stdout, "\n\n");
 175
 176     while (fgets(buf, STRLEN-1, fp) != NULL)
 177     {
 178         fprintf(stdout, "%s", buf);
 179     }
 180     fclose(fp);
 181     fprintf(stdout, "\n\n");
 182 }
 183
 184
 185 enum {
 186     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 187 };
 188
 189 static int parse_logfile(const char *logfile, const char *errfile,
 190                          t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
 191                          int nnodes)
 192 {
 193     FILE           *fp;
 194     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 195     const char      matchstrdd[]  = "Domain decomposition grid";
 196     const char      matchstrcr[]  = "resetting all time and cycle counters";
 197     const char      matchstrbal[] = "Average PME mesh/force load:";
 198     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 199     const char      errSIG[]      = "signal, stopping at the next";
 200     int             iFound;
 201     int             procs;
 202     float           dum1, dum2, dum3, dum4;
 203     int             ndum;
 204     int             npme;
 205     gmx_large_int_t resetsteps     = -1;
 206     gmx_bool        bFoundResetStr = FALSE;
 207     gmx_bool        bResetChecked  = FALSE;
 208
 209
 210     if (!gmx_fexist(logfile))
 211     {
 212         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 213         cleandata(perfdata, test_nr);
 214         return eParselogNotFound;
 215     }
 216
 217     fp = fopen(logfile, "r");
 218     perfdata->PME_f_load[test_nr] = -1.0;
 219     perfdata->guessPME            = -1;
 220
 221     iFound = eFoundNothing;
 222     if (1 == nnodes)
 223     {
 224         iFound = eFoundDDStr; /* Skip some case statements */
 225     }
 226
 227     while (fgets(line, STRLEN, fp) != NULL)
 228     {
 229         /* Remove leading spaces */
 230         ltrim(line);
 231
 232         /* Check for TERM and INT signals from user: */
 233         if (strstr(line, errSIG) != NULL)
 234         {
 235             fclose(fp);
 236             cleandata(perfdata, test_nr);
 237             return eParselogTerm;
 238         }
 239
 240         /* Check whether cycle resetting  worked */
 241         if (presteps > 0 && !bFoundResetStr)
 242         {
 243             if (strstr(line, matchstrcr) != NULL)
 244             {
 245                 sprintf(dumstring, "step %s", gmx_large_int_pfmt);
 246                 sscanf(line, dumstring, &resetsteps);
 247                 bFoundResetStr = TRUE;
 248                 if (resetsteps == presteps+cpt_steps)
 249                 {
 250                     bResetChecked = TRUE;
 251                 }
 252                 else
 253                 {
 254                     sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
 255                     sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
 256                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 257                             "         though they were supposed to be reset at step %s!\n",
 258                             dumstring, dumstring2);
 259                 }
 260             }
 261         }
 262
 263         /* Look for strings that appear in a certain order in the log file: */
 264         switch (iFound)
 265         {
 266             case eFoundNothing:
 267                 /* Look for domain decomp grid and separate PME nodes: */
 268                 if (str_starts(line, matchstrdd))
 269                 {
 270                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 271                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 272                     if (perfdata->nPMEnodes == -1)
 273                     {
 274                         perfdata->guessPME = npme;
 275                     }
 276                     else if (perfdata->nPMEnodes != npme)
 277                     {
 278                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 279                     }
 280                     iFound = eFoundDDStr;
 281                 }
 282                 /* Catch a few errors that might have occured: */
 283                 else if (str_starts(line, "There is no domain decomposition for"))
 284                 {
 285                     fclose(fp);
 286                     return eParselogNoDDGrid;
 287                 }
 288                 else if (str_starts(line, "reading tpx file"))
 289                 {
 290                     fclose(fp);
 291                     return eParselogTPXVersion;
 292                 }
 293                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 294                 {
 295                     fclose(fp);
 296                     return eParselogNotParallel;
 297                 }
 298                 break;
 299             case eFoundDDStr:
 300                 /* Look for PME mesh/force balance (not necessarily present, though) */
 301                 if (str_starts(line, matchstrbal))
 302                 {
 303                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 304                 }
 305                 /* Look for matchstring */
 306                 if (str_starts(line, matchstring))
 307                 {
 308                     iFound = eFoundAccountingStr;
 309                 }
 310                 break;
 311             case eFoundAccountingStr:
 312                 /* Already found matchstring - look for cycle data */
 313                 if (str_starts(line, "Total  "))
 314                 {
 315                     sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
 316                     iFound = eFoundCycleStr;
 317                 }
 318                 break;
 319             case eFoundCycleStr:
 320                 /* Already found cycle data - look for remaining performance info and return */
 321                 if (str_starts(line, "Performance:"))
 322                 {
 323                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 324                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 325                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 326                     fclose(fp);
 327                     if (bResetChecked || presteps == 0)
 328                     {
 329                         return eParselogOK;
 330                     }
 331                     else
 332                     {
 333                         return eParselogResetProblem;
 334                     }
 335                 }
 336                 break;
 337         }
 338     } /* while */
 339
 340     /* Close the log file */
 341     fclose(fp);
 342
 343     /* Check why there is no performance data in the log file.
 344      * Did a fatal errors occur? */
 345     if (gmx_fexist(errfile))
 346     {
 347         fp = fopen(errfile, "r");
 348         while (fgets(line, STRLEN, fp) != NULL)
 349         {
 350             if (str_starts(line, "Fatal error:") )
 351             {
 352                 if (fgets(line, STRLEN, fp) != NULL)
 353                 {
 354                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 355                             "%s\n", line);
 356                 }
 357                 fclose(fp);
 358                 cleandata(perfdata, test_nr);
 359                 return eParselogFatal;
 360             }
 361         }
 362         fclose(fp);
 363     }
 364     else
 365     {
 366         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 367     }
 368
 369     /* Giving up ... we could not find out why there is no performance data in
 370      * the log file. */
 371     fprintf(stdout, "No performance data in log file.\n");
 372     cleandata(perfdata, test_nr);
 373
 374     return eParselogNoPerfData;
 375 }
 376
 377
 378 static gmx_bool analyze_data(
 379         FILE         *fp,
 380         const char   *fn,
 381         t_perf      **perfdata,
 382         int           nnodes,
 383         int           ntprs,
 384         int           ntests,
 385         int           nrepeats,
 386         t_inputinfo  *info,
 387         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 388         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 389 {
 390     int      i, j, k;
 391     int      line  = 0, line_win = -1;
 392     int      k_win = -1, i_win = -1, winPME;
 393     double   s     = 0.0; /* standard deviation */
 394     t_perf  *pd;
 395     char     strbuf[STRLEN];
 396     char     str_PME_f_load[13];
 397     gmx_bool bCanUseOrigTPR;
 398     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 399
 400
 401     if (nrepeats > 1)
 402     {
 403         sep_line(fp);
 404         fprintf(fp, "Summary of successful runs:\n");
 405         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 406         if (nnodes > 1)
 407         {
 408             fprintf(fp, "    DD grid");
 409         }
 410         fprintf(fp, "\n");
 411     }
 412
 413
 414     for (k = 0; k < ntprs; k++)
 415     {
 416         for (i = 0; i < ntests; i++)
 417         {
 418             /* Select the right dataset: */
 419             pd = &(perfdata[k][i]);
 420
 421             pd->Gcycles_Av    = 0.0;
 422             pd->PME_f_load_Av = 0.0;
 423             pd->ns_per_day_Av = 0.0;
 424
 425             if (pd->nPMEnodes == -1)
 426             {
 427                 sprintf(strbuf, "(%3d)", pd->guessPME);
 428             }
 429             else
 430             {
 431                 sprintf(strbuf, "     ");
 432             }
 433
 434             /* Get the average run time of a setting */
 435             for (j = 0; j < nrepeats; j++)
 436             {
 437                 pd->Gcycles_Av    += pd->Gcycles[j];
 438                 pd->PME_f_load_Av += pd->PME_f_load[j];
 439             }
 440             pd->Gcycles_Av    /= nrepeats;
 441             pd->PME_f_load_Av /= nrepeats;
 442
 443             for (j = 0; j < nrepeats; j++)
 444             {
 445                 if (pd->ns_per_day[j] > 0.0)
 446                 {
 447                     pd->ns_per_day_Av += pd->ns_per_day[j];
 448                 }
 449                 else
 450                 {
 451                     /* Somehow the performance number was not aquired for this run,
 452                      * therefor set the average to some negative value: */
 453                     pd->ns_per_day_Av = -1.0f*nrepeats;
 454                     break;
 455                 }
 456             }
 457             pd->ns_per_day_Av /= nrepeats;
 458
 459             /* Nicer output: */
 460             if (pd->PME_f_load_Av > 0.0)
 461             {
 462                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 463             }
 464             else
 465             {
 466                 sprintf(str_PME_f_load, "%s", "         -  ");
 467             }
 468
 469
 470             /* We assume we had a successful run if both averages are positive */
 471             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 472             {
 473                 /* Output statistics if repeats were done */
 474                 if (nrepeats > 1)
 475                 {
 476                     /* Calculate the standard deviation */
 477                     s = 0.0;
 478                     for (j = 0; j < nrepeats; j++)
 479                     {
 480                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 481                     }
 482                     s /= (nrepeats - 1);
 483                     s  = sqrt(s);
 484
 485                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 486                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 487                             pd->ns_per_day_Av, str_PME_f_load);
 488                     if (nnodes > 1)
 489                     {
 490                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 491                     }
 492                     fprintf(fp, "\n");
 493                 }
 494                 /* Store the index of the best run found so far in 'winner': */
 495                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 496                 {
 497                     k_win    = k;
 498                     i_win    = i;
 499                     line_win = line;
 500                 }
 501                 line++;
 502             }
 503         }
 504     }
 505
 506     if (k_win == -1)
 507     {
 508         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 509     }
 510
 511     sep_line(fp);
 512
 513     winPME = perfdata[k_win][i_win].nPMEnodes;
 514
 515     if (1 == ntests)
 516     {
 517         /* We stuck to a fixed number of PME-only nodes */
 518         sprintf(strbuf, "settings No. %d", k_win);
 519     }
 520     else
 521     {
 522         /* We have optimized the number of PME-only nodes */
 523         if (winPME == -1)
 524         {
 525             sprintf(strbuf, "%s", "the automatic number of PME nodes");
 526         }
 527         else
 528         {
 529             sprintf(strbuf, "%d PME nodes", winPME);
 530         }
 531     }
 532     fprintf(fp, "Best performance was achieved with %s", strbuf);
 533     if ((nrepeats > 1) && (ntests > 1))
 534     {
 535         fprintf(fp, " (see line %d)", line_win);
 536     }
 537     fprintf(fp, "\n");
 538
 539     /* Only mention settings if they were modified: */
 540     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 541     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 542     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 543                      info->nky[k_win] == info->nky[0] &&
 544                      info->nkz[k_win] == info->nkz[0]);
 545
 546     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 547     {
 548         fprintf(fp, "Optimized PME settings:\n");
 549         bCanUseOrigTPR = FALSE;
 550     }
 551     else
 552     {
 553         bCanUseOrigTPR = TRUE;
 554     }
 555
 556     if (bRefinedCoul)
 557     {
 558         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 559     }
 560
 561     if (bRefinedVdW)
 562     {
 563         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 564     }
 565
 566     if (bRefinedGrid)
 567     {
 568         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 569                 info->nkx[0], info->nky[0], info->nkz[0]);
 570     }
 571
 572     if (bCanUseOrigTPR && ntprs > 1)
 573     {
 574         fprintf(fp, "and original PME settings.\n");
 575     }
 576
 577     fflush(fp);
 578
 579     /* Return the index of the mdp file that showed the highest performance
 580      * and the optimal number of PME nodes */
 581     *index_tpr    = k_win;
 582     *npme_optimal = winPME;
 583
 584     return bCanUseOrigTPR;
 585 }
 586
 587
 588 /* Get the commands we need to set up the runs from environment variables */
 589 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char *cmd_mdrun[])
 590 {
 591     char      *cp;
 592     FILE      *fp;
 593     const char def_mpirun[]   = "mpirun";
 594     const char def_mdrun[]    = "mdrun";
 595
 596     const char empty_mpirun[] = "";
 597
 598     /* Get the commands we need to set up the runs from environment variables */
 599     if (!bThreads)
 600     {
 601         if ( (cp = getenv("MPIRUN")) != NULL)
 602         {
 603             *cmd_mpirun = strdup(cp);
 604         }
 605         else
 606         {
 607             *cmd_mpirun = strdup(def_mpirun);
 608         }
 609     }
 610     else
 611     {
 612         *cmd_mpirun = strdup(empty_mpirun);
 613     }
 614
 615     if ( (cp = getenv("MDRUN" )) != NULL)
 616     {
 617         *cmd_mdrun  = strdup(cp);
 618     }
 619     else
 620     {
 621         *cmd_mdrun  = strdup(def_mdrun);
 622     }
 623 }
 624
 625 /* Check that the commands will run mdrun (perhaps via mpirun) by
 626  * running a very quick test simulation. Requires MPI environment to
 627  * be available if applicable. */
 628 static void check_mdrun_works(gmx_bool    bThreads,
 629                               const char *cmd_mpirun,
 630                               const char *cmd_np,
 631                               const char *cmd_mdrun)
 632 {
 633     char      *command = NULL;
 634     char      *cp;
 635     char       line[STRLEN];
 636     FILE      *fp;
 637     const char filename[]     = "benchtest.log";
 638
 639     /* This string should always be identical to the one in copyrite.c,
 640      * gmx_print_version_info() in the defined(GMX_MPI) section */
 641     const char match_mpi[]    = "MPI library:        MPI";
 642     const char match_mdrun[]  = "Program: ";
 643     gmx_bool   bMdrun         = FALSE;
 644     gmx_bool   bMPI           = FALSE;
 645
 646     /* Run a small test to see whether mpirun + mdrun work  */
 647     fprintf(stdout, "Making sure that mdrun can be executed. ");
 648     if (bThreads)
 649     {
 650         snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 651         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 652     }
 653     else
 654     {
 655         snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 656         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 657     }
 658     fprintf(stdout, "Trying '%s' ... ", command);
 659     make_backup(filename);
 660     gmx_system_call(command);
 661
 662     /* Check if we find the characteristic string in the output: */
 663     if (!gmx_fexist(filename))
 664     {
 665         gmx_fatal(FARGS, "Output from test run could not be found.");
 666     }
 667
 668     fp = fopen(filename, "r");
 669     /* We need to scan the whole output file, since sometimes the queuing system
 670      * also writes stuff to stdout/err */
 671     while (!feof(fp) )
 672     {
 673         cp = fgets(line, STRLEN, fp);
 674         if (cp != NULL)
 675         {
 676             if (str_starts(line, match_mdrun) )
 677             {
 678                 bMdrun = TRUE;
 679             }
 680             if (str_starts(line, match_mpi) )
 681             {
 682                 bMPI = TRUE;
 683             }
 684         }
 685     }
 686     fclose(fp);
 687
 688     if (bThreads)
 689     {
 690         if (bMPI)
 691         {
 692             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 693                       "(%s)\n"
 694                       "seems to have been compiled with MPI instead.",
 695                       *cmd_mdrun);
 696         }
 697     }
 698     else
 699     {
 700         if (bMdrun && !bMPI)
 701         {
 702             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 703                       "(%s)\n"
 704                       "seems to have been compiled without MPI support.",
 705                       *cmd_mdrun);
 706         }
 707     }
 708
 709     if (!bMdrun)
 710     {
 711         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 712                   filename);
 713     }
 714
 715     fprintf(stdout, "passed.\n");
 716
 717     /* Clean up ... */
 718     remove(filename);
 719     sfree(command);
 720 }
 721
 722
 723 static void launch_simulation(
 724         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 725         FILE       *fp,             /* General log file */
 726         gmx_bool    bThreads,       /* whether to use threads */
 727         char       *cmd_mpirun,     /* Command for mpirun */
 728         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 729         char       *cmd_mdrun,      /* Command for mdrun */
 730         char       *args_for_mdrun, /* Arguments for mdrun */
 731         const char *simulation_tpr, /* This tpr will be simulated */
 732         int         nPMEnodes)      /* Number of PME nodes to use */
 733 {
 734     char  *command;
 735
 736
 737     /* Make enough space for the system call command,
 738      * (100 extra chars for -npme ... etc. options should suffice): */
 739     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 740
 741     /* Note that the -passall options requires args_for_mdrun to be at the end
 742      * of the command line string */
 743     if (bThreads)
 744     {
 745         sprintf(command, "%s%s-npme %d -s %s %s",
 746                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 747     }
 748     else
 749     {
 750         sprintf(command, "%s%s%s -npme %d -s %s %s",
 751                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 752     }
 753
 754     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 755     sep_line(fp);
 756     fflush(fp);
 757
 758     /* Now the real thing! */
 759     if (bLaunch)
 760     {
 761         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 762         sep_line(stdout);
 763         fflush(stdout);
 764         gmx_system_call(command);
 765     }
 766 }
 767
 768
 769 static void modify_PMEsettings(
 770         gmx_large_int_t simsteps,    /* Set this value as number of time steps */
 771         gmx_large_int_t init_step,   /* Set this value as init_step */
 772         const char     *fn_best_tpr, /* tpr file with the best performance */
 773         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 774 {
 775     t_inputrec   *ir;
 776     t_state       state;
 777     gmx_mtop_t    mtop;
 778     char          buf[200];
 779
 780     snew(ir, 1);
 781     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 782
 783     /* Reset nsteps and init_step to the value of the input .tpr file */
 784     ir->nsteps    = simsteps;
 785     ir->init_step = init_step;
 786
 787     /* Write the tpr file which will be launched */
 788     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt);
 789     fprintf(stdout, buf, ir->nsteps);
 790     fflush(stdout);
 791     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 792
 793     sfree(ir);
 794 }
 795
 796
 797 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 798
 799 /* Make additional TPR files with more computational load for the
 800  * direct space processors: */
 801 static void make_benchmark_tprs(
 802         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 803         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 804         gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs       */
 805         gmx_large_int_t statesteps,      /* Step counter in checkpoint file               */
 806         real            rmin,            /* Minimal Coulomb radius                        */
 807         real            rmax,            /* Maximal Coulomb radius                        */
 808         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 809         int            *ntprs,           /* No. of TPRs to write, each with a different
 810                                             rcoulomb and fourierspacing                   */
 811         t_inputinfo    *info,            /* Contains information about mdp file options   */
 812         FILE           *fp)              /* Write the output here                         */
 813 {
 814     int           i, j, d;
 815     t_inputrec   *ir;
 816     t_state       state;
 817     gmx_mtop_t    mtop;
 818     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 819     char          buf[200];
 820     rvec          box_size;
 821     gmx_bool      bNote = FALSE;
 822     real          add;              /* Add this to rcoul for the next test    */
 823     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 824     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 825
 826
 827     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 828             *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
 829     fprintf(stdout, buf, benchsteps);
 830     if (statesteps > 0)
 831     {
 832         sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
 833         fprintf(stdout, buf, statesteps);
 834         benchsteps += statesteps;
 835     }
 836     fprintf(stdout, ".\n");
 837
 838
 839     snew(ir, 1);
 840     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 841
 842     /* Check if some kind of PME was chosen */
 843     if (EEL_PME(ir->coulombtype) == FALSE)
 844     {
 845         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 846                   EELTYPE(eelPME));
 847     }
 848
 849     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 850     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 851           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 852     {
 853         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 854                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 855     }
 856     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 857     else if (ir->rcoulomb > ir->rlist)
 858     {
 859         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 860                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 861     }
 862
 863     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 864     {
 865         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 866         bScaleRvdw = FALSE;
 867     }
 868
 869     /* Reduce the number of steps for the benchmarks */
 870     info->orig_sim_steps = ir->nsteps;
 871     ir->nsteps           = benchsteps;
 872     /* We must not use init_step from the input tpr file for the benchmarks */
 873     info->orig_init_step = ir->init_step;
 874     ir->init_step        = 0;
 875
 876     /* For PME-switch potentials, keep the radial distance of the buffer region */
 877     nlist_buffer   = ir->rlist - ir->rcoulomb;
 878
 879     /* Determine length of triclinic box vectors */
 880     for (d = 0; d < DIM; d++)
 881     {
 882         box_size[d] = 0;
 883         for (i = 0; i < DIM; i++)
 884         {
 885             box_size[d] += state.box[d][i]*state.box[d][i];
 886         }
 887         box_size[d] = sqrt(box_size[d]);
 888     }
 889
 890     if (ir->fourier_spacing > 0)
 891     {
 892         info->fsx[0] = ir->fourier_spacing;
 893         info->fsy[0] = ir->fourier_spacing;
 894         info->fsz[0] = ir->fourier_spacing;
 895     }
 896     else
 897     {
 898         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 899         info->fsx[0] = box_size[XX]/ir->nkx;
 900         info->fsy[0] = box_size[YY]/ir->nky;
 901         info->fsz[0] = box_size[ZZ]/ir->nkz;
 902     }
 903
 904     /* If no value for the fourierspacing was provided on the command line, we
 905      * use the reconstruction from the tpr file */
 906     if (ir->fourier_spacing > 0)
 907     {
 908         /* Use the spacing from the tpr */
 909         fourierspacing = ir->fourier_spacing;
 910     }
 911     else
 912     {
 913         /* Use the maximum observed spacing */
 914         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 915     }
 916
 917     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 918
 919     /* For performance comparisons the number of particles is useful to have */
 920     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 921
 922     /* Print information about settings of which some are potentially modified: */
 923     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 924     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 925             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 926     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 927     if (EVDW_SWITCHED(ir->vdwtype))
 928     {
 929         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 930     }
 931     if (EPME_SWITCHED(ir->coulombtype))
 932     {
 933         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 934     }
 935     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 936     {
 937         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 938     }
 939
 940     /* Print a descriptive line about the tpr settings tested */
 941     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 942     fprintf(fp, " No.   scaling  rcoulomb");
 943     fprintf(fp, "  nkx  nky  nkz");
 944     fprintf(fp, "   spacing");
 945     if (evdwCUT == ir->vdwtype)
 946     {
 947         fprintf(fp, "      rvdw");
 948     }
 949     if (EPME_SWITCHED(ir->coulombtype))
 950     {
 951         fprintf(fp, "     rlist");
 952     }
 953     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 954     {
 955         fprintf(fp, " rlistlong");
 956     }
 957     fprintf(fp, "  tpr file\n");
 958
 959     /* Loop to create the requested number of tpr input files */
 960     for (j = 0; j < *ntprs; j++)
 961     {
 962         /* The first .tpr is the provided one, just need to modify nsteps,
 963          * so skip the following block */
 964         if (j != 0)
 965         {
 966             /* Determine which Coulomb radii rc to use in the benchmarks */
 967             add = (rmax-rmin)/(*ntprs-1);
 968             if (is_equal(rmin, info->rcoulomb[0]))
 969             {
 970                 ir->rcoulomb = rmin + j*add;
 971             }
 972             else if (is_equal(rmax, info->rcoulomb[0]))
 973             {
 974                 ir->rcoulomb = rmin + (j-1)*add;
 975             }
 976             else
 977             {
 978                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 979                 add          = (rmax-rmin)/(*ntprs-2);
 980                 ir->rcoulomb = rmin + (j-1)*add;
 981             }
 982
 983             /* Determine the scaling factor fac */
 984             fac = ir->rcoulomb/info->rcoulomb[0];
 985
 986             /* Scale the Fourier grid spacing */
 987             ir->nkx = ir->nky = ir->nkz = 0;
 988             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
 989
 990             /* Adjust other radii since various conditions neet to be fulfilled */
 991             if (eelPME == ir->coulombtype)
 992             {
 993                 /* plain PME, rcoulomb must be equal to rlist */
 994                 ir->rlist = ir->rcoulomb;
 995             }
 996             else
 997             {
 998                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
 999                 ir->rlist = ir->rcoulomb + nlist_buffer;
1000             }
1001
1002             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1003             {
1004                 /* For vdw cutoff, rvdw >= rlist */
1005                 ir->rvdw = max(info->rvdw[0], ir->rlist);
1006             }
1007
1008             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1009
1010         } /* end of "if (j != 0)" */
1011
1012         /* for j==0: Save the original settings
1013          * for j >0: Save modified radii and Fourier grids */
1014         info->rcoulomb[j]  = ir->rcoulomb;
1015         info->rvdw[j]      = ir->rvdw;
1016         info->nkx[j]       = ir->nkx;
1017         info->nky[j]       = ir->nky;
1018         info->nkz[j]       = ir->nkz;
1019         info->rlist[j]     = ir->rlist;
1020         info->rlistlong[j] = ir->rlistlong;
1021         info->fsx[j]       = fac*fourierspacing;
1022         info->fsy[j]       = fac*fourierspacing;
1023         info->fsz[j]       = fac*fourierspacing;
1024
1025         /* Write the benchmark tpr file */
1026         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1027         sprintf(buf, "_bench%.2d.tpr", j);
1028         strcat(fn_bench_tprs[j], buf);
1029         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1030         fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
1031         if (j > 0)
1032         {
1033             fprintf(stdout, ", scaling factor %f\n", fac);
1034         }
1035         else
1036         {
1037             fprintf(stdout, ", unmodified settings\n");
1038         }
1039
1040         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1041
1042         /* Write information about modified tpr settings to log file */
1043         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1044         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1045         fprintf(fp, " %9f ", info->fsx[j]);
1046         if (evdwCUT == ir->vdwtype)
1047         {
1048             fprintf(fp, "%10f", ir->rvdw);
1049         }
1050         if (EPME_SWITCHED(ir->coulombtype))
1051         {
1052             fprintf(fp, "%10f", ir->rlist);
1053         }
1054         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1055         {
1056             fprintf(fp, "%10f", ir->rlistlong);
1057         }
1058         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1059
1060         /* Make it clear to the user that some additional settings were modified */
1061         if (!is_equal(ir->rvdw, info->rvdw[0])
1062             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1063         {
1064             bNote = TRUE;
1065         }
1066     }
1067     if (bNote)
1068     {
1069         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1070                 "other input settings were also changed (see table above).\n"
1071                 "Please check if the modified settings are appropriate.\n");
1072     }
1073     fflush(stdout);
1074     fflush(fp);
1075     sfree(ir);
1076 }
1077
1078
1079 /* Rename the files we want to keep to some meaningful filename and
1080  * delete the rest */
1081 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1082                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1083 {
1084     char        numstring[STRLEN];
1085     char        newfilename[STRLEN];
1086     const char *fn = NULL;
1087     int         i;
1088     const char *opt;
1089
1090
1091     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1092
1093     for (i = 0; i < nfile; i++)
1094     {
1095         opt = (char *)fnm[i].opt;
1096         if (strcmp(opt, "-p") == 0)
1097         {
1098             /* do nothing; keep this file */
1099             ;
1100         }
1101         else if (strcmp(opt, "-bg") == 0)
1102         {
1103             /* Give the log file a nice name so one can later see which parameters were used */
1104             numstring[0] = '\0';
1105             if (nr > 0)
1106             {
1107                 sprintf(numstring, "_%d", nr);
1108             }
1109             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1110             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1111             {
1112                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1113                 make_backup(newfilename);
1114                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1115             }
1116         }
1117         else if (strcmp(opt, "-err") == 0)
1118         {
1119             /* This file contains the output of stderr. We want to keep it in
1120              * cases where there have been problems. */
1121             fn           = opt2fn(opt, nfile, fnm);
1122             numstring[0] = '\0';
1123             if (nr > 0)
1124             {
1125                 sprintf(numstring, "_%d", nr);
1126             }
1127             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1128             if (gmx_fexist(fn))
1129             {
1130                 if (bKeepStderr)
1131                 {
1132                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1133                     make_backup(newfilename);
1134                     rename(fn, newfilename);
1135                 }
1136                 else
1137                 {
1138                     fprintf(stdout, "Deleting %s\n", fn);
1139                     remove(fn);
1140                 }
1141             }
1142         }
1143         /* Delete the files which are created for each benchmark run: (options -b*) */
1144         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1145         {
1146             fn = opt2fn(opt, nfile, fnm);
1147             if (gmx_fexist(fn))
1148             {
1149                 fprintf(stdout, "Deleting %s\n", fn);
1150                 remove(fn);
1151             }
1152         }
1153     }
1154 }
1155
1156
1157 /* Returns the largest common factor of n1 and n2 */
1158 static int largest_common_factor(int n1, int n2)
1159 {
1160     int factor, nmax;
1161
1162     nmax = min(n1, n2);
1163     for (factor = nmax; factor > 0; factor--)
1164     {
1165         if (0 == (n1 % factor) && 0 == (n2 % factor) )
1166         {
1167             return(factor);
1168         }
1169     }
1170     return 0; /* one for the compiler */
1171 }
1172
1173 enum {
1174     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1175 };
1176
1177 /* Create a list of numbers of PME nodes to test */
1178 static void make_npme_list(
1179         const char *npmevalues_opt, /* Make a complete list with all
1180                                      * possibilities or a short list that keeps only
1181                                      * reasonable numbers of PME nodes                  */
1182         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1183         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1184         int         nnodes,         /* Total number of nodes to do the tests on         */
1185         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1186         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1187 {
1188     int i, npme, npp;
1189     int min_factor = 1;   /* We request that npp and npme have this minimal
1190                            * largest common factor (depends on npp)           */
1191     int nlistmax;         /* Max. list size                                   */
1192     int nlist;            /* Actual number of entries in list                 */
1193     int eNPME = 0;
1194
1195
1196     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1197     if (0 == strcmp(npmevalues_opt, "all") )
1198     {
1199         eNPME = eNpmeAll;
1200     }
1201     else if (0 == strcmp(npmevalues_opt, "subset") )
1202     {
1203         eNPME = eNpmeSubset;
1204     }
1205     else /* "auto" or "range" */
1206     {
1207         if (nnodes <= 64)
1208         {
1209             eNPME = eNpmeAll;
1210         }
1211         else if (nnodes < 128)
1212         {
1213             eNPME = eNpmeReduced;
1214         }
1215         else
1216         {
1217             eNPME = eNpmeSubset;
1218         }
1219     }
1220
1221     /* Calculate how many entries we could possibly have (in case of -npme all) */
1222     if (nnodes > 2)
1223     {
1224         nlistmax = maxPMEnodes - minPMEnodes + 3;
1225         if (0 == minPMEnodes)
1226         {
1227             nlistmax--;
1228         }
1229     }
1230     else
1231     {
1232         nlistmax = 1;
1233     }
1234
1235     /* Now make the actual list which is at most of size nlist */
1236     snew(*nPMEnodes, nlistmax);
1237     nlist = 0; /* start counting again, now the real entries in the list */
1238     for (i = 0; i < nlistmax - 2; i++)
1239     {
1240         npme = maxPMEnodes - i;
1241         npp  = nnodes-npme;
1242         switch (eNPME)
1243         {
1244             case eNpmeAll:
1245                 min_factor = 1;
1246                 break;
1247             case eNpmeReduced:
1248                 min_factor = 2;
1249                 break;
1250             case eNpmeSubset:
1251                 /* For 2d PME we want a common largest factor of at least the cube
1252                  * root of the number of PP nodes */
1253                 min_factor = (int) pow(npp, 1.0/3.0);
1254                 break;
1255             default:
1256                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1257                 break;
1258         }
1259         if (largest_common_factor(npp, npme) >= min_factor)
1260         {
1261             (*nPMEnodes)[nlist] = npme;
1262             nlist++;
1263         }
1264     }
1265     /* We always test 0 PME nodes and the automatic number */
1266     *nentries             = nlist + 2;
1267     (*nPMEnodes)[nlist  ] =  0;
1268     (*nPMEnodes)[nlist+1] = -1;
1269
1270     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1271     for (i = 0; i < *nentries-1; i++)
1272     {
1273         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1274     }
1275     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1276 }
1277
1278
1279 /* Allocate memory to store the performance data */
1280 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1281 {
1282     int i, j, k;
1283
1284
1285     for (k = 0; k < ntprs; k++)
1286     {
1287         snew(perfdata[k], datasets);
1288         for (i = 0; i < datasets; i++)
1289         {
1290             for (j = 0; j < repeats; j++)
1291             {
1292                 snew(perfdata[k][i].Gcycles, repeats);
1293                 snew(perfdata[k][i].ns_per_day, repeats);
1294                 snew(perfdata[k][i].PME_f_load, repeats);
1295             }
1296         }
1297     }
1298 }
1299
1300
1301 /* Check for errors on mdrun -h */
1302 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
1303 {
1304     char *command, *msg;
1305     int   ret;
1306
1307
1308     snew(command, length +  15);
1309     snew(msg, length + 500);
1310
1311     fprintf(stdout, "Making sure the benchmarks can be executed ...\n");
1312     /* FIXME: mdrun -h no longer actually does anything useful.
1313      * It unconditionally prints the help, ignoring all other options. */
1314     sprintf(command, "%s-h -quiet", mdrun_cmd_line);
1315     ret = gmx_system_call(command);
1316
1317     if (0 != ret)
1318     {
1319         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1320          * get the error message from mdrun itself */
1321         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1322                 "mdrun for the source of the problem. Did you provide a command line\n"
1323                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1324                 "\n%s\n\n", command);
1325
1326         fprintf(stderr, "%s", msg);
1327         sep_line(fp);
1328         fprintf(fp, "%s", msg);
1329
1330         exit(ret);
1331     }
1332
1333     sfree(command);
1334     sfree(msg    );
1335 }
1336
1337
1338 static void do_the_tests(
1339         FILE           *fp,             /* General g_tune_pme output file         */
1340         char          **tpr_names,      /* Filenames of the input files to test   */
1341         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1342         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1343         int             npme_fixed,     /* If >= -1, test fixed number of PME
1344                                          * nodes only                             */
1345         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1346         t_perf        **perfdata,       /* Here the performace data is stored     */
1347         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1348         int             repeats,        /* Repeat each test this often            */
1349         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1350         int             nr_tprs,        /* Total number of tpr files to test      */
1351         gmx_bool        bThreads,       /* Threads or MPI?                        */
1352         char           *cmd_mpirun,     /* mpirun command string                  */
1353         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1354         char           *cmd_mdrun,      /* mdrun command string                   */
1355         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1356         const t_filenm *fnm,            /* List of filenames from command line    */
1357         int             nfile,          /* Number of files specified on the cmdl. */
1358         int             presteps,       /* DLB equilibration steps, is checked    */
1359         gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint    */
1360 {
1361     int      i, nr, k, ret, count = 0, totaltests;
1362     int     *nPMEnodes = NULL;
1363     t_perf  *pd        = NULL;
1364     int      cmdline_length;
1365     char    *command, *cmd_stub;
1366     char     buf[STRLEN];
1367     gmx_bool bResetProblem = FALSE;
1368     gmx_bool bFirst        = TRUE;
1369
1370
1371     /* This string array corresponds to the eParselog enum type at the start
1372      * of this file */
1373     const char* ParseLog[] = {
1374         "OK.",
1375         "Logfile not found!",
1376         "No timings, logfile truncated?",
1377         "Run was terminated.",
1378         "Counters were not reset properly.",
1379         "No DD grid found for these settings.",
1380         "TPX version conflict!",
1381         "mdrun was not started in parallel!",
1382         "An error occured."
1383     };
1384     char        str_PME_f_load[13];
1385
1386
1387     /* Allocate space for the mdrun command line. 100 extra characters should
1388        be more than enough for the -npme etcetera arguments */
1389     cmdline_length =  strlen(cmd_mpirun)
1390         + strlen(cmd_np)
1391         + strlen(cmd_mdrun)
1392         + strlen(cmd_args_bench)
1393         + strlen(tpr_names[0]) + 100;
1394     snew(command, cmdline_length);
1395     snew(cmd_stub, cmdline_length);
1396
1397     /* Construct the part of the command line that stays the same for all tests: */
1398     if (bThreads)
1399     {
1400         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1401     }
1402     else
1403     {
1404         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1405     }
1406
1407     /* Create a list of numbers of PME nodes to test */
1408     if (npme_fixed < -1)
1409     {
1410         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1411                        nnodes, minPMEnodes, maxPMEnodes);
1412     }
1413     else
1414     {
1415         *pmeentries  = 1;
1416         snew(nPMEnodes, 1);
1417         nPMEnodes[0] = npme_fixed;
1418         fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
1419     }
1420
1421     if (0 == repeats)
1422     {
1423         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1424         ffclose(fp);
1425         finalize(opt2fn("-p", nfile, fnm));
1426         exit(0);
1427     }
1428
1429     /* Allocate one dataset for each tpr input file: */
1430     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1431
1432     /*****************************************/
1433     /* Main loop over all tpr files to test: */
1434     /*****************************************/
1435     totaltests = nr_tprs*(*pmeentries)*repeats;
1436     for (k = 0; k < nr_tprs; k++)
1437     {
1438         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1439         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1440         /* Loop over various numbers of PME nodes: */
1441         for (i = 0; i < *pmeentries; i++)
1442         {
1443             pd = &perfdata[k][i];
1444
1445             /* Loop over the repeats for each scenario: */
1446             for (nr = 0; nr < repeats; nr++)
1447             {
1448                 pd->nPMEnodes = nPMEnodes[i];
1449
1450                 /* Add -npme and -s to the command line and save it. Note that
1451                  * the -passall (if set) options requires cmd_args_bench to be
1452                  * at the end of the command line string */
1453                 snew(pd->mdrun_cmd_line, cmdline_length);
1454                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1455                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1456
1457                 /* To prevent that all benchmarks fail due to a show-stopper argument
1458                  * on the mdrun command line, we make a quick check with mdrun -h first */
1459                 if (bFirst)
1460                 {
1461                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
1462                 }
1463                 bFirst = FALSE;
1464
1465                 /* Do a benchmark simulation: */
1466                 if (repeats > 1)
1467                 {
1468                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1469                 }
1470                 else
1471                 {
1472                     buf[0] = '\0';
1473                 }
1474                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1475                         (100.0*count)/totaltests,
1476                         k+1, nr_tprs, i+1, *pmeentries, buf);
1477                 make_backup(opt2fn("-err", nfile, fnm));
1478                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1479                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1480                 gmx_system_call(command);
1481
1482                 /* Collect the performance data from the log file; also check stderr
1483                  * for fatal errors */
1484                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1485                                     pd, nr, presteps, cpt_steps, nnodes);
1486                 if ((presteps > 0) && (ret == eParselogResetProblem))
1487                 {
1488                     bResetProblem = TRUE;
1489                 }
1490
1491                 if (-1 == pd->nPMEnodes)
1492                 {
1493                     sprintf(buf, "(%3d)", pd->guessPME);
1494                 }
1495                 else
1496                 {
1497                     sprintf(buf, "     ");
1498                 }
1499
1500                 /* Nicer output */
1501                 if (pd->PME_f_load[nr] > 0.0)
1502                 {
1503                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1504                 }
1505                 else
1506                 {
1507                     sprintf(str_PME_f_load, "%s", "         -  ");
1508                 }
1509
1510                 /* Write the data we got to disk */
1511                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1512                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1513                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1514                 {
1515                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1516                 }
1517                 fprintf(fp, "\n");
1518                 fflush(fp);
1519                 count++;
1520
1521                 /* Do some cleaning up and delete the files we do not need any more */
1522                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1523
1524                 /* If the first run with this number of processors already failed, do not try again: */
1525                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1526                 {
1527                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1528                     count += repeats-(nr+1);
1529                     break;
1530                 }
1531             } /* end of repeats loop */
1532         }     /* end of -npme loop */
1533     }         /* end of tpr file loop */
1534
1535     if (bResetProblem)
1536     {
1537         sep_line(fp);
1538         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1539         sep_line(fp);
1540     }
1541     sfree(command);
1542     sfree(cmd_stub);
1543 }
1544
1545
1546 static void check_input(
1547         int             nnodes,
1548         int             repeats,
1549         int            *ntprs,
1550         real           *rmin,
1551         real            rcoulomb,
1552         real           *rmax,
1553         real            maxPMEfraction,
1554         real            minPMEfraction,
1555         int             npme_fixed,
1556         gmx_large_int_t bench_nsteps,
1557         const t_filenm *fnm,
1558         int             nfile,
1559         int             sim_part,
1560         int             presteps,
1561         int             npargs,
1562         t_pargs        *pa)
1563 {
1564     int old;
1565
1566
1567     /* Make sure the input file exists */
1568     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1569     {
1570         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1571     }
1572
1573     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1574     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1575     {
1576         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1577                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1578     }
1579
1580     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1581     if (repeats < 0)
1582     {
1583         gmx_fatal(FARGS, "Number of repeats < 0!");
1584     }
1585
1586     /* Check number of nodes */
1587     if (nnodes < 1)
1588     {
1589         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1590     }
1591
1592     /* Automatically choose -ntpr if not set */
1593     if (*ntprs < 1)
1594     {
1595         if (nnodes < 16)
1596         {
1597             *ntprs = 1;
1598         }
1599         else
1600         {
1601             *ntprs = 3;
1602             /* Set a reasonable scaling factor for rcoulomb */
1603             if (*rmax <= 0)
1604             {
1605                 *rmax = rcoulomb * 1.2;
1606             }
1607         }
1608         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1609     }
1610     else
1611     {
1612         if (1 == *ntprs)
1613         {
1614             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1615         }
1616     }
1617
1618     /* Make shure that rmin <= rcoulomb <= rmax */
1619     if (*rmin <= 0)
1620     {
1621         *rmin = rcoulomb;
1622     }
1623     if (*rmax <= 0)
1624     {
1625         *rmax = rcoulomb;
1626     }
1627     if (!(*rmin <= *rmax) )
1628     {
1629         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1630                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1631     }
1632     /* Add test scenarios if rmin or rmax were set */
1633     if (*ntprs <= 2)
1634     {
1635         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1636         {
1637             (*ntprs)++;
1638             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1639                     *rmin, *ntprs);
1640         }
1641         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1642         {
1643             (*ntprs)++;
1644             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1645                     *rmax, *ntprs);
1646         }
1647     }
1648     old = *ntprs;
1649     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1650     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1651     {
1652         *ntprs = max(*ntprs, 2);
1653     }
1654
1655     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1656     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1657     {
1658         *ntprs = max(*ntprs, 3);
1659     }
1660
1661     if (old != *ntprs)
1662     {
1663         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1664     }
1665
1666     if (*ntprs > 1)
1667     {
1668         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1669         {
1670             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1671                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1672                     "with correspondingly adjusted PME grid settings\n");
1673             *ntprs = 1;
1674         }
1675     }
1676
1677     /* Check whether max and min fraction are within required values */
1678     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1679     {
1680         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1681     }
1682     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1683     {
1684         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1685     }
1686     if (maxPMEfraction < minPMEfraction)
1687     {
1688         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1689     }
1690
1691     /* Check whether the number of steps - if it was set - has a reasonable value */
1692     if (bench_nsteps < 0)
1693     {
1694         gmx_fatal(FARGS, "Number of steps must be positive.");
1695     }
1696
1697     if (bench_nsteps > 10000 || bench_nsteps < 100)
1698     {
1699         fprintf(stderr, "WARNING: steps=");
1700         fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
1701         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1702     }
1703
1704     if (presteps < 0)
1705     {
1706         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1707     }
1708
1709     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1710     if (*ntprs > 1)
1711     {
1712         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1713         {
1714             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1715         }
1716     }
1717
1718     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1719      * only. We need to check whether the requested number of PME-only nodes
1720      * makes sense. */
1721     if (npme_fixed > -1)
1722     {
1723         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1724         if (2*npme_fixed > nnodes)
1725         {
1726             gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
1727                       nnodes/2, nnodes, npme_fixed);
1728         }
1729         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1730         {
1731             fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
1732                     100.0*((real)npme_fixed / (real)nnodes));
1733         }
1734         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1735         {
1736             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1737                     "      fixed number of PME-only nodes is requested with -fix.\n");
1738         }
1739     }
1740 }
1741
1742
1743 /* Returns TRUE when "opt" is needed at launch time */
1744 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1745 {
1746     /* Apart from the input .tpr and the output log files we need all options that
1747      * were set on the command line and that do not start with -b */
1748     if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
1749            || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
1750     {
1751         return FALSE;
1752     }
1753
1754     return bSet;
1755 }
1756
1757
1758 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1759 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1760 {
1761     /* Apart from the input .tpr, all files starting with "-b" are for
1762      * _b_enchmark files exclusively */
1763     if (0 == strncmp(opt, "-s", 2))
1764     {
1765         return FALSE;
1766     }
1767
1768     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1769     {
1770         if (!bOptional || bSet)
1771         {
1772             return TRUE;
1773         }
1774         else
1775         {
1776             return FALSE;
1777         }
1778     }
1779     else
1780     {
1781         if (bIsOutput)
1782         {
1783             return FALSE;
1784         }
1785         else
1786         {
1787             if (bSet) /* These are additional input files like -cpi -ei */
1788             {
1789                 return TRUE;
1790             }
1791             else
1792             {
1793                 return FALSE;
1794             }
1795         }
1796     }
1797 }
1798
1799
1800 /* Adds 'buf' to 'str' */
1801 static void add_to_string(char **str, char *buf)
1802 {
1803     int len;
1804
1805
1806     len = strlen(*str) + strlen(buf) + 1;
1807     srenew(*str, len);
1808     strcat(*str, buf);
1809 }
1810
1811
1812 /* Create the command line for the benchmark as well as for the real run */
1813 static void create_command_line_snippets(
1814         gmx_bool  bAppendFiles,
1815         gmx_bool  bKeepAndNumCPT,
1816         gmx_bool  bResetHWay,
1817         int       presteps,
1818         int       nfile,
1819         t_filenm  fnm[],
1820         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1821         char     *cmd_args_launch[], /* command line arguments for simulation run */
1822         char      extra_args[])      /* Add this to the end of the command line */
1823 {
1824     int         i;
1825     char       *opt;
1826     const char *name;
1827     char        strbuf[STRLEN];
1828
1829
1830     /* strlen needs at least '\0' as a string: */
1831     snew(*cmd_args_bench, 1);
1832     snew(*cmd_args_launch, 1);
1833     *cmd_args_launch[0] = '\0';
1834     *cmd_args_bench[0]  = '\0';
1835
1836
1837     /*******************************************/
1838     /* 1. Process other command line arguments */
1839     /*******************************************/
1840     if (presteps > 0)
1841     {
1842         /* Add equilibration steps to benchmark options */
1843         sprintf(strbuf, "-resetstep %d ", presteps);
1844         add_to_string(cmd_args_bench, strbuf);
1845     }
1846     /* These switches take effect only at launch time */
1847     if (FALSE == bAppendFiles)
1848     {
1849         add_to_string(cmd_args_launch, "-noappend ");
1850     }
1851     if (bKeepAndNumCPT)
1852     {
1853         add_to_string(cmd_args_launch, "-cpnum ");
1854     }
1855     if (bResetHWay)
1856     {
1857         add_to_string(cmd_args_launch, "-resethway ");
1858     }
1859
1860     /********************/
1861     /* 2. Process files */
1862     /********************/
1863     for (i = 0; i < nfile; i++)
1864     {
1865         opt  = (char *)fnm[i].opt;
1866         name = opt2fn(opt, nfile, fnm);
1867
1868         /* Strbuf contains the options, now let's sort out where we need that */
1869         sprintf(strbuf, "%s %s ", opt, name);
1870
1871         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1872         {
1873             /* All options starting with -b* need the 'b' removed,
1874              * therefore overwrite strbuf */
1875             if (0 == strncmp(opt, "-b", 2))
1876             {
1877                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1878             }
1879
1880             add_to_string(cmd_args_bench, strbuf);
1881         }
1882
1883         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1884         {
1885             add_to_string(cmd_args_launch, strbuf);
1886         }
1887     }
1888
1889     add_to_string(cmd_args_bench, extra_args);
1890     add_to_string(cmd_args_launch, extra_args);
1891 }
1892
1893
1894 /* Set option opt */
1895 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1896 {
1897     int i;
1898
1899     for (i = 0; (i < nfile); i++)
1900     {
1901         if (strcmp(opt, fnm[i].opt) == 0)
1902         {
1903             fnm[i].flag |= ffSET;
1904         }
1905     }
1906 }
1907
1908
1909 /* This routine inspects the tpr file and ...
1910  * 1. checks for output files that get triggered by a tpr option. These output
1911  *    files are marked as 'set' to allow for a proper cleanup after each
1912  *    tuning run.
1913  * 2. returns the PME:PP load ratio
1914  * 3. returns rcoulomb from the tpr */
1915 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1916 {
1917     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1918     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1919     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1920     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1921     t_inputrec   ir;
1922     t_state      state;
1923     gmx_mtop_t   mtop;
1924
1925
1926     /* Check tpr file for options that trigger extra output files */
1927     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1928     bPull = (epullNO != ir.ePull);
1929     bFree = (efepNO  != ir.efep );
1930     bNM   = (eiNM    == ir.eI   );
1931     bTpi  = EI_TPI(ir.eI);
1932
1933     /* Set these output files on the tuning command-line */
1934     if (bPull)
1935     {
1936         setopt("-pf", nfile, fnm);
1937         setopt("-px", nfile, fnm);
1938     }
1939     if (bFree)
1940     {
1941         setopt("-dhdl", nfile, fnm);
1942     }
1943     if (bTpi)
1944     {
1945         setopt("-tpi", nfile, fnm);
1946         setopt("-tpid", nfile, fnm);
1947     }
1948     if (bNM)
1949     {
1950         setopt("-mtx", nfile, fnm);
1951     }
1952
1953     *rcoulomb = ir.rcoulomb;
1954
1955     /* Return the estimate for the number of PME nodes */
1956     return pme_load_estimate(&mtop, &ir, state.box);
1957 }
1958
1959
1960 static void couple_files_options(int nfile, t_filenm fnm[])
1961 {
1962     int      i;
1963     gmx_bool bSet, bBench;
1964     char    *opt;
1965     char     buf[20];
1966
1967
1968     for (i = 0; i < nfile; i++)
1969     {
1970         opt    = (char *)fnm[i].opt;
1971         bSet   = ((fnm[i].flag & ffSET) != 0);
1972         bBench = (0 == strncmp(opt, "-b", 2));
1973
1974         /* Check optional files */
1975         /* If e.g. -eo is set, then -beo also needs to be set */
1976         if (is_optional(&fnm[i]) && bSet && !bBench)
1977         {
1978             sprintf(buf, "-b%s", &opt[1]);
1979             setopt(buf, nfile, fnm);
1980         }
1981         /* If -beo is set, then -eo also needs to be! */
1982         if (is_optional(&fnm[i]) && bSet && bBench)
1983         {
1984             sprintf(buf, "-%s", &opt[2]);
1985             setopt(buf, nfile, fnm);
1986         }
1987     }
1988 }
1989
1990
1991 static double gettime()
1992 {
1993 #ifdef HAVE_GETTIMEOFDAY
1994     struct timeval t;
1995     double         seconds;
1996
1997     gettimeofday(&t, NULL);
1998
1999     seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
2000
2001     return seconds;
2002 #else
2003     double  seconds;
2004
2005     seconds = time(NULL);
2006
2007     return seconds;
2008 #endif
2009 }
2010
2011
2012 #define BENCHSTEPS (1000)
2013
2014 int gmx_tune_pme(int argc, char *argv[])
2015 {
2016     const char     *desc[] = {
2017         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically",
2018         "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines",
2019         "which setting is fastest. It will also test whether performance can",
2020         "be enhanced by shifting load from the reciprocal to the real space",
2021         "part of the Ewald sum. ",
2022         "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options",
2023         "for [TT]mdrun[tt] as needed.[PAR]",
2024         "Which executables are used can be set in the environment variables",
2025         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2026         "will be used as defaults. Note that for certain MPI frameworks you",
2027         "need to provide a machine- or hostfile. This can also be passed",
2028         "via the MPIRUN variable, e.g.[PAR]",
2029         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2030         "Please call [TT]g_tune_pme[tt] with the normal options you would pass to",
2031         "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the",
2032         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2033         "to repeat each test several times to get better statistics. [PAR]",
2034         "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads",
2035         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2036         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2037         "Typically, the first test (number 0) will be with the settings from the input",
2038         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2039         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2040         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2041         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2042         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2043         "if you just seek the optimal number of PME-only nodes; in that case",
2044         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2045         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2046         "MD systems. The dynamic load balancing needs about 100 time steps",
2047         "to adapt to local load imbalances, therefore the time step counters",
2048         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2049         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2050         "From the 'DD' load imbalance entries in the md.log output file you",
2051         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2052         "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2053         "After calling [TT]mdrun[tt] several times, detailed performance information",
2054         "is available in the output file [TT]perf.out.[tt] ",
2055         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2056         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2057         "If you want the simulation to be started automatically with the",
2058         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2059     };
2060
2061     int             nnodes         = 1;
2062     int             repeats        = 2;
2063     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2064     real            maxPMEfraction = 0.50;
2065     real            minPMEfraction = 0.25;
2066     int             maxPMEnodes, minPMEnodes;
2067     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2068     float           guessPMEnodes;
2069     int             npme_fixed     = -2;              /* If >= -1, use only this number
2070                                                        * of PME-only nodes                */
2071     int             ntprs          = 0;
2072     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2073     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2074     gmx_bool        bScaleRvdw     = TRUE;
2075     gmx_large_int_t bench_nsteps   = BENCHSTEPS;
2076     gmx_large_int_t new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2077     gmx_large_int_t cpt_steps      = 0;   /* Step counter in .cpt input file   */
2078     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2079     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2080     gmx_bool        bLaunch        = FALSE;
2081     char           *ExtraArgs      = NULL;
2082     char          **tpr_names      = NULL;
2083     const char     *simulation_tpr = NULL;
2084     int             best_npme, best_tpr;
2085     int             sim_part = 1; /* For benchmarks with checkpoint files */
2086     char            bbuf[STRLEN];
2087
2088     /* Default program names if nothing else is found */
2089     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2090     char         *cmd_args_bench, *cmd_args_launch;
2091     char         *cmd_np = NULL;
2092
2093     t_perf      **perfdata = NULL;
2094     t_inputinfo  *info;
2095     int           i;
2096     FILE         *fp;
2097     t_commrec    *cr;
2098
2099     /* Print out how long the tuning took */
2100     double          seconds;
2101
2102     static t_filenm fnm[] = {
2103         /* g_tune_pme */
2104         { efOUT, "-p",      "perf",     ffWRITE },
2105         { efLOG, "-err",    "bencherr", ffWRITE },
2106         { efTPX, "-so",     "tuned",    ffWRITE },
2107         /* mdrun: */
2108         { efTPX, NULL,      NULL,       ffREAD },
2109         { efTRN, "-o",      NULL,       ffWRITE },
2110         { efXTC, "-x",      NULL,       ffOPTWR },
2111         { efCPT, "-cpi",    NULL,       ffOPTRD },
2112         { efCPT, "-cpo",    NULL,       ffOPTWR },
2113         { efSTO, "-c",      "confout",  ffWRITE },
2114         { efEDR, "-e",      "ener",     ffWRITE },
2115         { efLOG, "-g",      "md",       ffWRITE },
2116         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2117         { efXVG, "-field",  "field",    ffOPTWR },
2118         { efXVG, "-table",  "table",    ffOPTRD },
2119         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2120         { efXVG, "-tablep", "tablep",   ffOPTRD },
2121         { efXVG, "-tableb", "table",    ffOPTRD },
2122         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2123         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2124         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2125         { efEDI, "-ei",     "sam",      ffOPTRD },
2126         { efXVG, "-eo",     "edsam",    ffOPTWR },
2127         { efXVG, "-devout", "deviatie", ffOPTWR },
2128         { efXVG, "-runav",  "runaver",  ffOPTWR },
2129         { efXVG, "-px",     "pullx",    ffOPTWR },
2130         { efXVG, "-pf",     "pullf",    ffOPTWR },
2131         { efXVG, "-ro",     "rotation", ffOPTWR },
2132         { efLOG, "-ra",     "rotangles", ffOPTWR },
2133         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2134         { efLOG, "-rt",     "rottorque", ffOPTWR },
2135         { efMTX, "-mtx",    "nm",       ffOPTWR },
2136         { efNDX, "-dn",     "dipole",   ffOPTWR },
2137         /* Output files that are deleted after each benchmark run */
2138         { efTRN, "-bo",     "bench",    ffWRITE },
2139         { efXTC, "-bx",     "bench",    ffWRITE },
2140         { efCPT, "-bcpo",   "bench",    ffWRITE },
2141         { efSTO, "-bc",     "bench",    ffWRITE },
2142         { efEDR, "-be",     "bench",    ffWRITE },
2143         { efLOG, "-bg",     "bench",    ffWRITE },
2144         { efXVG, "-beo",    "benchedo", ffOPTWR },
2145         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2146         { efXVG, "-bfield", "benchfld", ffOPTWR },
2147         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2148         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2149         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2150         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2151         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2152         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2153         { efXVG, "-bro",    "benchrot", ffOPTWR },
2154         { efLOG, "-bra",    "benchrota", ffOPTWR },
2155         { efLOG, "-brs",    "benchrots", ffOPTWR },
2156         { efLOG, "-brt",    "benchrott", ffOPTWR },
2157         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2158         { efNDX, "-bdn",    "bench",    ffOPTWR }
2159     };
2160
2161     gmx_bool        bThreads     = FALSE;
2162
2163     int             nthreads = 1;
2164
2165     const char     *procstring[] =
2166     { NULL, "-np", "-n", "none", NULL };
2167     const char     *npmevalues_opt[] =
2168     { NULL, "auto", "all", "subset", NULL };
2169
2170     gmx_bool     bAppendFiles          = TRUE;
2171     gmx_bool     bKeepAndNumCPT        = FALSE;
2172     gmx_bool     bResetCountersHalfWay = FALSE;
2173     gmx_bool     bBenchmark            = TRUE;
2174
2175     output_env_t oenv = NULL;
2176
2177     t_pargs      pa[] = {
2178         /***********************/
2179         /* g_tune_pme options: */
2180         /***********************/
2181         { "-np",       FALSE, etINT,  {&nnodes},
2182           "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
2183         { "-npstring", FALSE, etENUM, {procstring},
2184           "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
2185         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2186           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2187         { "-r",        FALSE, etINT,  {&repeats},
2188           "Repeat each test this often" },
2189         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2190           "Max fraction of PME nodes to test with" },
2191         { "-min",      FALSE, etREAL, {&minPMEfraction},
2192           "Min fraction of PME nodes to test with" },
2193         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2194           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2195           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2196         { "-fix",      FALSE, etINT,  {&npme_fixed},
2197           "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2198         { "-rmax",     FALSE, etREAL, {&rmax},
2199           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2200         { "-rmin",     FALSE, etREAL, {&rmin},
2201           "If >0, minimal rcoulomb for -ntpr>1" },
2202         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2203           "Scale rvdw along with rcoulomb"},
2204         { "-ntpr",     FALSE, etINT,  {&ntprs},
2205           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2206           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2207         { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
2208           "Take timings for this many steps in the benchmark runs" },
2209         { "-resetstep", FALSE, etINT,  {&presteps},
2210           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2211         { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
2212           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2213         { "-launch",   FALSE, etBOOL, {&bLaunch},
2214           "Launch the real simulation after optimization" },
2215         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2216           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2217         /******************/
2218         /* mdrun options: */
2219         /******************/
2220         /* We let g_tune_pme parse and understand these options, because we need to
2221          * prevent that they appear on the mdrun command line for the benchmarks */
2222         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2223           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2224         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2225           "Keep and number checkpoint files (launch only)" },
2226         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2227           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2228     };
2229
2230 #define NFILE asize(fnm)
2231
2232     seconds = gettime();
2233
2234     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2235                            NFILE, fnm, asize(pa), pa, asize(desc), desc,
2236                            0, NULL, &oenv))
2237     {
2238         return 0;
2239     }
2240
2241     /* Store the remaining unparsed command line entries in a string which
2242      * is then attached to the mdrun command line */
2243     snew(ExtraArgs, 1);
2244     ExtraArgs[0] = '\0';
2245     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2246     {
2247         add_to_string(&ExtraArgs, argv[i]);
2248         add_to_string(&ExtraArgs, " ");
2249     }
2250
2251     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2252     {
2253         bThreads = TRUE;
2254         if (opt2parg_bSet("-npstring", asize(pa), pa))
2255         {
2256             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2257         }
2258
2259         if (nnodes > 1)
2260         {
2261             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2262         }
2263         /* and now we just set this; a bit of an ugly hack*/
2264         nnodes = nthreads;
2265     }
2266     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2267     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2268
2269     /* Automatically set -beo options if -eo is set etc. */
2270     couple_files_options(NFILE, fnm);
2271
2272     /* Construct the command line arguments for benchmark runs
2273      * as well as for the simulation run */
2274     if (bThreads)
2275     {
2276         sprintf(bbuf, " -ntmpi %d ", nthreads);
2277     }
2278     else
2279     {
2280         /* This string will be used for MPI runs and will appear after the
2281          * mpirun command. */
2282         if (strcmp(procstring[0], "none") != 0)
2283         {
2284             sprintf(bbuf, " %s %d ", procstring[0], nnodes);
2285         }
2286         else
2287         {
2288             sprintf(bbuf, " ");
2289         }
2290     }
2291
2292     cmd_np = bbuf;
2293
2294     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2295                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2296
2297     /* Read in checkpoint file if requested */
2298     sim_part = 1;
2299     if (opt2bSet("-cpi", NFILE, fnm))
2300     {
2301         snew(cr, 1);
2302         cr->duty = DUTY_PP; /* makes the following routine happy */
2303         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2304                                         &sim_part, &cpt_steps, cr,
2305                                         FALSE, NFILE, fnm, NULL, NULL);
2306         sfree(cr);
2307         sim_part++;
2308         /* sim_part will now be 1 if no checkpoint file was found */
2309         if (sim_part <= 1)
2310         {
2311             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2312         }
2313     }
2314
2315     /* Open performance output file and write header info */
2316     fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
2317
2318     /* Make a quick consistency check of command line parameters */
2319     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2320                 maxPMEfraction, minPMEfraction, npme_fixed,
2321                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2322                 asize(pa), pa);
2323
2324     /* Determine the maximum and minimum number of PME nodes to test,
2325      * the actual list of settings is build in do_the_tests(). */
2326     if ((nnodes > 2) && (npme_fixed < -1))
2327     {
2328         if (0 == strcmp(npmevalues_opt[0], "auto"))
2329         {
2330             /* Determine the npme range automatically based on the PME:PP load guess */
2331             if (guessPMEratio > 1.0)
2332             {
2333                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2334                 maxPMEnodes = nnodes/2;
2335                 minPMEnodes = nnodes/2;
2336             }
2337             else
2338             {
2339                 /* PME : PP load is in the range 0..1, let's test around the guess */
2340                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2341                 minPMEnodes   = floor(0.7*guessPMEnodes);
2342                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2343                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2344             }
2345         }
2346         else
2347         {
2348             /* Determine the npme range based on user input */
2349             maxPMEnodes = floor(maxPMEfraction*nnodes);
2350             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2351             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2352             if (maxPMEnodes != minPMEnodes)
2353             {
2354                 fprintf(stdout, "- %d ", maxPMEnodes);
2355             }
2356             fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2357         }
2358     }
2359     else
2360     {
2361         maxPMEnodes = 0;
2362         minPMEnodes = 0;
2363     }
2364
2365     /* Get the commands we need to set up the runs from environment variables */
2366     get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2367     if (bBenchmark && repeats > 0)
2368     {
2369         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
2370     }
2371
2372     /* Print some header info to file */
2373     sep_line(fp);
2374     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2375     sep_line(fp);
2376     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2377     if (!bThreads)
2378     {
2379         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2380         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2381         if (strcmp(procstring[0], "none") != 0)
2382         {
2383             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2384         }
2385         else
2386         {
2387             fprintf(fp, "Not setting number of nodes in system call\n");
2388         }
2389     }
2390     else
2391     {
2392         fprintf(fp, "Number of threads       : %d\n", nnodes);
2393     }
2394
2395     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2396     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2397     fprintf(fp, "Benchmark steps         : ");
2398     fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
2399     fprintf(fp, "\n");
2400     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2401     if (sim_part > 1)
2402     {
2403         fprintf(fp, "Checkpoint time step    : ");
2404         fprintf(fp, gmx_large_int_pfmt, cpt_steps);
2405         fprintf(fp, "\n");
2406     }
2407     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2408
2409     if (new_sim_nsteps >= 0)
2410     {
2411         bOverwrite = TRUE;
2412         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2413         fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
2414         fprintf(stderr, " steps.\n");
2415         fprintf(fp, "Simulation steps        : ");
2416         fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
2417         fprintf(fp, "\n");
2418     }
2419     if (repeats > 1)
2420     {
2421         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2422     }
2423
2424     if (npme_fixed >= -1)
2425     {
2426         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2427     }
2428
2429     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2430     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2431
2432     /* Allocate memory for the inputinfo struct: */
2433     snew(info, 1);
2434     info->nr_inputfiles = ntprs;
2435     for (i = 0; i < ntprs; i++)
2436     {
2437         snew(info->rcoulomb, ntprs);
2438         snew(info->rvdw, ntprs);
2439         snew(info->rlist, ntprs);
2440         snew(info->rlistlong, ntprs);
2441         snew(info->nkx, ntprs);
2442         snew(info->nky, ntprs);
2443         snew(info->nkz, ntprs);
2444         snew(info->fsx, ntprs);
2445         snew(info->fsy, ntprs);
2446         snew(info->fsz, ntprs);
2447     }
2448     /* Make alternative tpr files to test: */
2449     snew(tpr_names, ntprs);
2450     for (i = 0; i < ntprs; i++)
2451     {
2452         snew(tpr_names[i], STRLEN);
2453     }
2454
2455     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2456      * different grids could be found. */
2457     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2458                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2459
2460     /********************************************************************************/
2461     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2462     /********************************************************************************/
2463     snew(perfdata, ntprs);
2464     if (bBenchmark)
2465     {
2466         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2467                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2468                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2469
2470         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0);
2471
2472         /* Analyse the results and give a suggestion for optimal settings: */
2473         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2474                                 repeats, info, &best_tpr, &best_npme);
2475
2476         /* Take the best-performing tpr file and enlarge nsteps to original value */
2477         if (bKeepTPR && !bOverwrite)
2478         {
2479             simulation_tpr = opt2fn("-s", NFILE, fnm);
2480         }
2481         else
2482         {
2483             simulation_tpr = opt2fn("-so", NFILE, fnm);
2484             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2485                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2486         }
2487
2488         /* Let's get rid of the temporary benchmark input files */
2489         for (i = 0; i < ntprs; i++)
2490         {
2491             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2492             remove(tpr_names[i]);
2493         }
2494
2495         /* Now start the real simulation if the user requested it ... */
2496         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2497                           cmd_args_launch, simulation_tpr, best_npme);
2498     }
2499     ffclose(fp);
2500
2501     /* ... or simply print the performance results to screen: */
2502     if (!bLaunch)
2503     {
2504         finalize(opt2fn("-p", NFILE, fnm));
2505     }
2506
2507     return 0;
2508 }