src/gromacs/gmxana/gmx_tune_pme.c

   1 /*
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2008, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <time.h>
  40 #ifdef HAVE_SYS_TIME_H
  41 #include <sys/time.h>
  42 #endif
  43
  44
  45
  46 #include "statutil.h"
  47 #include "typedefs.h"
  48 #include "smalloc.h"
  49 #include "vec.h"
  50 #include "copyrite.h"
  51 #include "statutil.h"
  52 #include "tpxio.h"
  53 #include "string2.h"
  54 #include "readinp.h"
  55 #include "calcgrid.h"
  56 #include "checkpoint.h"
  57 #include "macros.h"
  58 #include "gmx_ana.h"
  59 #include "names.h"
  60 #include "perf_est.h"
  61
  62
  63
  64 /* Enum for situations that can occur during log file parsing, the
  65  * corresponding string entries can be found in do_the_tests() in
  66  * const char* ParseLog[] */
  67 enum {
  68     eParselogOK,
  69     eParselogNotFound,
  70     eParselogNoPerfData,
  71     eParselogTerm,
  72     eParselogResetProblem,
  73     eParselogNoDDGrid,
  74     eParselogTPXVersion,
  75     eParselogNotParallel,
  76     eParselogFatal,
  77     eParselogNr
  78 };
  79
  80
  81 typedef struct
  82 {
  83     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  84     int     nx, ny, nz;   /* DD grid */
  85     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  86     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  87     double  Gcycles_Av;
  88     float  *ns_per_day;
  89     float   ns_per_day_Av;
  90     float  *PME_f_load;     /* PME mesh/force load average*/
  91     float   PME_f_load_Av;  /* Average average ;) ... */
  92     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  93 } t_perf;
  94
  95
  96 typedef struct
  97 {
  98     int             nr_inputfiles;  /* The number of tpr and mdp input files */
  99     gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
 100     gmx_large_int_t orig_init_step; /* Init step for the real simulation */
 101     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 102     real           *rvdw;           /* The vdW radii */
 103     real           *rlist;          /* Neighbourlist cutoff radius */
 104     real           *rlistlong;
 105     int            *nkx, *nky, *nkz;
 106     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 107 } t_inputinfo;
 108
 109
 110 static void sep_line(FILE *fp)
 111 {
 112     fprintf(fp, "\n------------------------------------------------------------\n");
 113 }
 114
 115
 116 /* Wrapper for system calls */
 117 static int gmx_system_call(char *command)
 118 {
 119 #ifdef GMX_NO_SYSTEM
 120     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 121 #else
 122     return ( system(command) );
 123 #endif
 124 }
 125
 126
 127 /* Check if string starts with substring */
 128 static gmx_bool str_starts(const char *string, const char *substring)
 129 {
 130     return ( strncmp(string, substring, strlen(substring)) == 0);
 131 }
 132
 133
 134 static void cleandata(t_perf *perfdata, int test_nr)
 135 {
 136     perfdata->Gcycles[test_nr]    = 0.0;
 137     perfdata->ns_per_day[test_nr] = 0.0;
 138     perfdata->PME_f_load[test_nr] = 0.0;
 139
 140     return;
 141 }
 142
 143
 144 static gmx_bool is_equal(real a, real b)
 145 {
 146     real diff, eps = 1.0e-7;
 147
 148
 149     diff = a - b;
 150
 151     if (diff < 0.0)
 152     {
 153         diff = -diff;
 154     }
 155
 156     if (diff < eps)
 157     {
 158         return TRUE;
 159     }
 160     else
 161     {
 162         return FALSE;
 163     }
 164 }
 165
 166
 167 static void finalize(const char *fn_out)
 168 {
 169     char  buf[STRLEN];
 170     FILE *fp;
 171
 172
 173     fp = fopen(fn_out, "r");
 174     fprintf(stdout, "\n\n");
 175
 176     while (fgets(buf, STRLEN-1, fp) != NULL)
 177     {
 178         fprintf(stdout, "%s", buf);
 179     }
 180     fclose(fp);
 181     fprintf(stdout, "\n\n");
 182 }
 183
 184
 185 enum {
 186     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 187 };
 188
 189 static int parse_logfile(const char *logfile, const char *errfile,
 190                          t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
 191                          int nnodes)
 192 {
 193     FILE           *fp;
 194     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 195     const char      matchstrdd[]  = "Domain decomposition grid";
 196     const char      matchstrcr[]  = "resetting all time and cycle counters";
 197     const char      matchstrbal[] = "Average PME mesh/force load:";
 198     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 199     const char      errSIG[]      = "signal, stopping at the next";
 200     int             iFound;
 201     int             procs;
 202     float           dum1, dum2, dum3, dum4;
 203     int             ndum;
 204     int             npme;
 205     gmx_large_int_t resetsteps     = -1;
 206     gmx_bool        bFoundResetStr = FALSE;
 207     gmx_bool        bResetChecked  = FALSE;
 208
 209
 210     if (!gmx_fexist(logfile))
 211     {
 212         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 213         cleandata(perfdata, test_nr);
 214         return eParselogNotFound;
 215     }
 216
 217     fp = fopen(logfile, "r");
 218     perfdata->PME_f_load[test_nr] = -1.0;
 219     perfdata->guessPME            = -1;
 220
 221     iFound = eFoundNothing;
 222     if (1 == nnodes)
 223     {
 224         iFound = eFoundDDStr; /* Skip some case statements */
 225     }
 226
 227     while (fgets(line, STRLEN, fp) != NULL)
 228     {
 229         /* Remove leading spaces */
 230         ltrim(line);
 231
 232         /* Check for TERM and INT signals from user: */
 233         if (strstr(line, errSIG) != NULL)
 234         {
 235             fclose(fp);
 236             cleandata(perfdata, test_nr);
 237             return eParselogTerm;
 238         }
 239
 240         /* Check whether cycle resetting  worked */
 241         if (presteps > 0 && !bFoundResetStr)
 242         {
 243             if (strstr(line, matchstrcr) != NULL)
 244             {
 245                 sprintf(dumstring, "step %s", gmx_large_int_pfmt);
 246                 sscanf(line, dumstring, &resetsteps);
 247                 bFoundResetStr = TRUE;
 248                 if (resetsteps == presteps+cpt_steps)
 249                 {
 250                     bResetChecked = TRUE;
 251                 }
 252                 else
 253                 {
 254                     sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
 255                     sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
 256                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 257                             "         though they were supposed to be reset at step %s!\n",
 258                             dumstring, dumstring2);
 259                 }
 260             }
 261         }
 262
 263         /* Look for strings that appear in a certain order in the log file: */
 264         switch (iFound)
 265         {
 266             case eFoundNothing:
 267                 /* Look for domain decomp grid and separate PME nodes: */
 268                 if (str_starts(line, matchstrdd))
 269                 {
 270                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 271                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 272                     if (perfdata->nPMEnodes == -1)
 273                     {
 274                         perfdata->guessPME = npme;
 275                     }
 276                     else if (perfdata->nPMEnodes != npme)
 277                     {
 278                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 279                     }
 280                     iFound = eFoundDDStr;
 281                 }
 282                 /* Catch a few errors that might have occured: */
 283                 else if (str_starts(line, "There is no domain decomposition for"))
 284                 {
 285                     fclose(fp);
 286                     return eParselogNoDDGrid;
 287                 }
 288                 else if (str_starts(line, "reading tpx file"))
 289                 {
 290                     fclose(fp);
 291                     return eParselogTPXVersion;
 292                 }
 293                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 294                 {
 295                     fclose(fp);
 296                     return eParselogNotParallel;
 297                 }
 298                 break;
 299             case eFoundDDStr:
 300                 /* Look for PME mesh/force balance (not necessarily present, though) */
 301                 if (str_starts(line, matchstrbal))
 302                 {
 303                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 304                 }
 305                 /* Look for matchstring */
 306                 if (str_starts(line, matchstring))
 307                 {
 308                     iFound = eFoundAccountingStr;
 309                 }
 310                 break;
 311             case eFoundAccountingStr:
 312                 /* Already found matchstring - look for cycle data */
 313                 if (str_starts(line, "Total  "))
 314                 {
 315                     sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
 316                     iFound = eFoundCycleStr;
 317                 }
 318                 break;
 319             case eFoundCycleStr:
 320                 /* Already found cycle data - look for remaining performance info and return */
 321                 if (str_starts(line, "Performance:"))
 322                 {
 323                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 324                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 325                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 326                     fclose(fp);
 327                     if (bResetChecked || presteps == 0)
 328                     {
 329                         return eParselogOK;
 330                     }
 331                     else
 332                     {
 333                         return eParselogResetProblem;
 334                     }
 335                 }
 336                 break;
 337         }
 338     } /* while */
 339
 340     /* Close the log file */
 341     fclose(fp);
 342
 343     /* Check why there is no performance data in the log file.
 344      * Did a fatal errors occur? */
 345     if (gmx_fexist(errfile))
 346     {
 347         fp = fopen(errfile, "r");
 348         while (fgets(line, STRLEN, fp) != NULL)
 349         {
 350             if (str_starts(line, "Fatal error:") )
 351             {
 352                 if (fgets(line, STRLEN, fp) != NULL)
 353                 {
 354                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 355                             "%s\n", line);
 356                 }
 357                 fclose(fp);
 358                 cleandata(perfdata, test_nr);
 359                 return eParselogFatal;
 360             }
 361         }
 362         fclose(fp);
 363     }
 364     else
 365     {
 366         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 367     }
 368
 369     /* Giving up ... we could not find out why there is no performance data in
 370      * the log file. */
 371     fprintf(stdout, "No performance data in log file.\n");
 372     cleandata(perfdata, test_nr);
 373
 374     return eParselogNoPerfData;
 375 }
 376
 377
 378 static gmx_bool analyze_data(
 379         FILE         *fp,
 380         const char   *fn,
 381         t_perf      **perfdata,
 382         int           nnodes,
 383         int           ntprs,
 384         int           ntests,
 385         int           nrepeats,
 386         t_inputinfo  *info,
 387         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 388         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 389 {
 390     int      i, j, k;
 391     int      line  = 0, line_win = -1;
 392     int      k_win = -1, i_win = -1, winPME;
 393     double   s     = 0.0; /* standard deviation */
 394     t_perf  *pd;
 395     char     strbuf[STRLEN];
 396     char     str_PME_f_load[13];
 397     gmx_bool bCanUseOrigTPR;
 398     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 399
 400
 401     if (nrepeats > 1)
 402     {
 403         sep_line(fp);
 404         fprintf(fp, "Summary of successful runs:\n");
 405         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 406         if (nnodes > 1)
 407         {
 408             fprintf(fp, "    DD grid");
 409         }
 410         fprintf(fp, "\n");
 411     }
 412
 413
 414     for (k = 0; k < ntprs; k++)
 415     {
 416         for (i = 0; i < ntests; i++)
 417         {
 418             /* Select the right dataset: */
 419             pd = &(perfdata[k][i]);
 420
 421             pd->Gcycles_Av    = 0.0;
 422             pd->PME_f_load_Av = 0.0;
 423             pd->ns_per_day_Av = 0.0;
 424
 425             if (pd->nPMEnodes == -1)
 426             {
 427                 sprintf(strbuf, "(%3d)", pd->guessPME);
 428             }
 429             else
 430             {
 431                 sprintf(strbuf, "     ");
 432             }
 433
 434             /* Get the average run time of a setting */
 435             for (j = 0; j < nrepeats; j++)
 436             {
 437                 pd->Gcycles_Av    += pd->Gcycles[j];
 438                 pd->PME_f_load_Av += pd->PME_f_load[j];
 439             }
 440             pd->Gcycles_Av    /= nrepeats;
 441             pd->PME_f_load_Av /= nrepeats;
 442
 443             for (j = 0; j < nrepeats; j++)
 444             {
 445                 if (pd->ns_per_day[j] > 0.0)
 446                 {
 447                     pd->ns_per_day_Av += pd->ns_per_day[j];
 448                 }
 449                 else
 450                 {
 451                     /* Somehow the performance number was not aquired for this run,
 452                      * therefor set the average to some negative value: */
 453                     pd->ns_per_day_Av = -1.0f*nrepeats;
 454                     break;
 455                 }
 456             }
 457             pd->ns_per_day_Av /= nrepeats;
 458
 459             /* Nicer output: */
 460             if (pd->PME_f_load_Av > 0.0)
 461             {
 462                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 463             }
 464             else
 465             {
 466                 sprintf(str_PME_f_load, "%s", "         -  ");
 467             }
 468
 469
 470             /* We assume we had a successful run if both averages are positive */
 471             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 472             {
 473                 /* Output statistics if repeats were done */
 474                 if (nrepeats > 1)
 475                 {
 476                     /* Calculate the standard deviation */
 477                     s = 0.0;
 478                     for (j = 0; j < nrepeats; j++)
 479                     {
 480                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 481                     }
 482                     s /= (nrepeats - 1);
 483                     s  = sqrt(s);
 484
 485                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 486                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 487                             pd->ns_per_day_Av, str_PME_f_load);
 488                     if (nnodes > 1)
 489                     {
 490                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 491                     }
 492                     fprintf(fp, "\n");
 493                 }
 494                 /* Store the index of the best run found so far in 'winner': */
 495                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 496                 {
 497                     k_win    = k;
 498                     i_win    = i;
 499                     line_win = line;
 500                 }
 501                 line++;
 502             }
 503         }
 504     }
 505
 506     if (k_win == -1)
 507     {
 508         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 509     }
 510
 511     sep_line(fp);
 512
 513     winPME = perfdata[k_win][i_win].nPMEnodes;
 514
 515     if (1 == ntests)
 516     {
 517         /* We stuck to a fixed number of PME-only nodes */
 518         sprintf(strbuf, "settings No. %d", k_win);
 519     }
 520     else
 521     {
 522         /* We have optimized the number of PME-only nodes */
 523         if (winPME == -1)
 524         {
 525             sprintf(strbuf, "%s", "the automatic number of PME nodes");
 526         }
 527         else
 528         {
 529             sprintf(strbuf, "%d PME nodes", winPME);
 530         }
 531     }
 532     fprintf(fp, "Best performance was achieved with %s", strbuf);
 533     if ((nrepeats > 1) && (ntests > 1))
 534     {
 535         fprintf(fp, " (see line %d)", line_win);
 536     }
 537     fprintf(fp, "\n");
 538
 539     /* Only mention settings if they were modified: */
 540     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 541     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 542     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 543                      info->nky[k_win] == info->nky[0] &&
 544                      info->nkz[k_win] == info->nkz[0]);
 545
 546     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 547     {
 548         fprintf(fp, "Optimized PME settings:\n");
 549         bCanUseOrigTPR = FALSE;
 550     }
 551     else
 552     {
 553         bCanUseOrigTPR = TRUE;
 554     }
 555
 556     if (bRefinedCoul)
 557     {
 558         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 559     }
 560
 561     if (bRefinedVdW)
 562     {
 563         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 564     }
 565
 566     if (bRefinedGrid)
 567     {
 568         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 569                 info->nkx[0], info->nky[0], info->nkz[0]);
 570     }
 571
 572     if (bCanUseOrigTPR && ntprs > 1)
 573     {
 574         fprintf(fp, "and original PME settings.\n");
 575     }
 576
 577     fflush(fp);
 578
 579     /* Return the index of the mdp file that showed the highest performance
 580      * and the optimal number of PME nodes */
 581     *index_tpr    = k_win;
 582     *npme_optimal = winPME;
 583
 584     return bCanUseOrigTPR;
 585 }
 586
 587
 588 /* Get the commands we need to set up the runs from environment variables */
 589 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char cmd_np[],
 590                               char *cmd_mdrun[], int repeats)
 591 {
 592     char      *cp;
 593     FILE      *fp;
 594     const char def_mpirun[]   = "mpirun";
 595     const char def_mdrun[]    = "mdrun";
 596
 597     const char empty_mpirun[] = "";
 598
 599     /* Get the commands we need to set up the runs from environment variables */
 600     if (!bThreads)
 601     {
 602         if ( (cp = getenv("MPIRUN")) != NULL)
 603         {
 604             *cmd_mpirun = strdup(cp);
 605         }
 606         else
 607         {
 608             *cmd_mpirun = strdup(def_mpirun);
 609         }
 610     }
 611     else
 612     {
 613         *cmd_mpirun = strdup(empty_mpirun);
 614     }
 615
 616     if ( (cp = getenv("MDRUN" )) != NULL)
 617     {
 618         *cmd_mdrun  = strdup(cp);
 619     }
 620     else
 621     {
 622         *cmd_mdrun  = strdup(def_mdrun);
 623     }
 624 }
 625
 626 /* Check that the commands will run mdrun (perhaps via mpirun) by
 627  * running a very quick test simulation. Requires MPI environment to
 628  * be available if applicable. */
 629 static void check_mdrun_works(gmx_bool bThreads,
 630                               const char *cmd_mpirun,
 631                               const char *cmd_np,
 632                               const char *cmd_mdrun)
 633 {
 634     char      *command = NULL;
 635     char      *cp;
 636     char       line[STRLEN];
 637     FILE      *fp;
 638     const char filename[]     = "benchtest.log";
 639
 640     /* This string should always be identical to the one in copyrite.c,
 641      * gmx_print_version_info() in the defined(GMX_MPI) section */
 642     const char match_mpi[]    = "MPI library:        MPI";
 643     const char match_mdrun[]  = "Program: ";
 644     gmx_bool   bMdrun         = FALSE;
 645     gmx_bool   bMPI           = FALSE;
 646
 647     /* Run a small test to see whether mpirun + mdrun work  */
 648     fprintf(stdout, "Making sure that mdrun can be executed. ");
 649     if (bThreads)
 650     {
 651         snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 652         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 653     }
 654     else
 655     {
 656         snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 657         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 658     }
 659     fprintf(stdout, "Trying '%s' ... ", command);
 660     make_backup(filename);
 661     gmx_system_call(command);
 662
 663     /* Check if we find the characteristic string in the output: */
 664     if (!gmx_fexist(filename))
 665     {
 666         gmx_fatal(FARGS, "Output from test run could not be found.");
 667     }
 668
 669     fp = fopen(filename, "r");
 670     /* We need to scan the whole output file, since sometimes the queuing system
 671      * also writes stuff to stdout/err */
 672     while (!feof(fp) )
 673     {
 674         cp = fgets(line, STRLEN, fp);
 675         if (cp != NULL)
 676         {
 677             if (str_starts(line, match_mdrun) )
 678             {
 679                 bMdrun = TRUE;
 680             }
 681             if (str_starts(line, match_mpi) )
 682             {
 683                 bMPI = TRUE;
 684             }
 685         }
 686     }
 687     fclose(fp);
 688
 689     if (bThreads)
 690     {
 691         if (bMPI)
 692         {
 693             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 694                       "(%s)\n"
 695                       "seems to have been compiled with MPI instead.",
 696                       *cmd_mdrun);
 697         }
 698     }
 699     else
 700     {
 701         if (bMdrun && !bMPI)
 702         {
 703             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 704                       "(%s)\n"
 705                       "seems to have been compiled without MPI support.",
 706                       *cmd_mdrun);
 707         }
 708     }
 709
 710     if (!bMdrun)
 711     {
 712         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 713                   filename);
 714     }
 715
 716     fprintf(stdout, "passed.\n");
 717
 718     /* Clean up ... */
 719     remove(filename);
 720     sfree(command);
 721 }
 722
 723
 724 static void launch_simulation(
 725         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 726         FILE       *fp,             /* General log file */
 727         gmx_bool    bThreads,       /* whether to use threads */
 728         char       *cmd_mpirun,     /* Command for mpirun */
 729         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 730         char       *cmd_mdrun,      /* Command for mdrun */
 731         char       *args_for_mdrun, /* Arguments for mdrun */
 732         const char *simulation_tpr, /* This tpr will be simulated */
 733         int         nPMEnodes)      /* Number of PME nodes to use */
 734 {
 735     char  *command;
 736
 737
 738     /* Make enough space for the system call command,
 739      * (100 extra chars for -npme ... etc. options should suffice): */
 740     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 741
 742     /* Note that the -passall options requires args_for_mdrun to be at the end
 743      * of the command line string */
 744     if (bThreads)
 745     {
 746         sprintf(command, "%s%s-npme %d -s %s %s",
 747                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 748     }
 749     else
 750     {
 751         sprintf(command, "%s%s%s -npme %d -s %s %s",
 752                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 753     }
 754
 755     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 756     sep_line(fp);
 757     fflush(fp);
 758
 759     /* Now the real thing! */
 760     if (bLaunch)
 761     {
 762         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 763         sep_line(stdout);
 764         fflush(stdout);
 765         gmx_system_call(command);
 766     }
 767 }
 768
 769
 770 static void modify_PMEsettings(
 771         gmx_large_int_t simsteps,    /* Set this value as number of time steps */
 772         gmx_large_int_t init_step,   /* Set this value as init_step */
 773         const char     *fn_best_tpr, /* tpr file with the best performance */
 774         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 775 {
 776     t_inputrec   *ir;
 777     t_state       state;
 778     gmx_mtop_t    mtop;
 779     char          buf[200];
 780
 781     snew(ir, 1);
 782     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 783
 784     /* Reset nsteps and init_step to the value of the input .tpr file */
 785     ir->nsteps    = simsteps;
 786     ir->init_step = init_step;
 787
 788     /* Write the tpr file which will be launched */
 789     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt);
 790     fprintf(stdout, buf, ir->nsteps);
 791     fflush(stdout);
 792     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 793
 794     sfree(ir);
 795 }
 796
 797
 798 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 799
 800 /* Make additional TPR files with more computational load for the
 801  * direct space processors: */
 802 static void make_benchmark_tprs(
 803         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 804         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 805         gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs       */
 806         gmx_large_int_t statesteps,      /* Step counter in checkpoint file               */
 807         real            rmin,            /* Minimal Coulomb radius                        */
 808         real            rmax,            /* Maximal Coulomb radius                        */
 809         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 810         int            *ntprs,           /* No. of TPRs to write, each with a different
 811                                             rcoulomb and fourierspacing                   */
 812         t_inputinfo    *info,            /* Contains information about mdp file options   */
 813         FILE           *fp)              /* Write the output here                         */
 814 {
 815     int           i, j, d;
 816     t_inputrec   *ir;
 817     t_state       state;
 818     gmx_mtop_t    mtop;
 819     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 820     char          buf[200];
 821     rvec          box_size;
 822     gmx_bool      bNote = FALSE;
 823     real          add;              /* Add this to rcoul for the next test    */
 824     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 825     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 826
 827
 828     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 829             *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
 830     fprintf(stdout, buf, benchsteps);
 831     if (statesteps > 0)
 832     {
 833         sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
 834         fprintf(stdout, buf, statesteps);
 835         benchsteps += statesteps;
 836     }
 837     fprintf(stdout, ".\n");
 838
 839
 840     snew(ir, 1);
 841     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 842
 843     /* Check if some kind of PME was chosen */
 844     if (EEL_PME(ir->coulombtype) == FALSE)
 845     {
 846         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 847                   EELTYPE(eelPME));
 848     }
 849
 850     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 851     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 852           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 853     {
 854         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 855                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 856     }
 857     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 858     else if (ir->rcoulomb > ir->rlist)
 859     {
 860         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 861                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 862     }
 863
 864     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 865     {
 866         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 867         bScaleRvdw = FALSE;
 868     }
 869
 870     /* Reduce the number of steps for the benchmarks */
 871     info->orig_sim_steps = ir->nsteps;
 872     ir->nsteps           = benchsteps;
 873     /* We must not use init_step from the input tpr file for the benchmarks */
 874     info->orig_init_step = ir->init_step;
 875     ir->init_step        = 0;
 876
 877     /* For PME-switch potentials, keep the radial distance of the buffer region */
 878     nlist_buffer   = ir->rlist - ir->rcoulomb;
 879
 880     /* Determine length of triclinic box vectors */
 881     for (d = 0; d < DIM; d++)
 882     {
 883         box_size[d] = 0;
 884         for (i = 0; i < DIM; i++)
 885         {
 886             box_size[d] += state.box[d][i]*state.box[d][i];
 887         }
 888         box_size[d] = sqrt(box_size[d]);
 889     }
 890
 891     if (ir->fourier_spacing > 0)
 892     {
 893         info->fsx[0] = ir->fourier_spacing;
 894         info->fsy[0] = ir->fourier_spacing;
 895         info->fsz[0] = ir->fourier_spacing;
 896     }
 897     else
 898     {
 899         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 900         info->fsx[0] = box_size[XX]/ir->nkx;
 901         info->fsy[0] = box_size[YY]/ir->nky;
 902         info->fsz[0] = box_size[ZZ]/ir->nkz;
 903     }
 904
 905     /* If no value for the fourierspacing was provided on the command line, we
 906      * use the reconstruction from the tpr file */
 907     if (ir->fourier_spacing > 0)
 908     {
 909         /* Use the spacing from the tpr */
 910         fourierspacing = ir->fourier_spacing;
 911     }
 912     else
 913     {
 914         /* Use the maximum observed spacing */
 915         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 916     }
 917
 918     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 919
 920     /* For performance comparisons the number of particles is useful to have */
 921     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 922
 923     /* Print information about settings of which some are potentially modified: */
 924     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 925     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 926             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 927     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 928     if (EVDW_SWITCHED(ir->vdwtype))
 929     {
 930         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 931     }
 932     if (EPME_SWITCHED(ir->coulombtype))
 933     {
 934         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 935     }
 936     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 937     {
 938         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 939     }
 940
 941     /* Print a descriptive line about the tpr settings tested */
 942     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 943     fprintf(fp, " No.   scaling  rcoulomb");
 944     fprintf(fp, "  nkx  nky  nkz");
 945     fprintf(fp, "   spacing");
 946     if (evdwCUT == ir->vdwtype)
 947     {
 948         fprintf(fp, "      rvdw");
 949     }
 950     if (EPME_SWITCHED(ir->coulombtype))
 951     {
 952         fprintf(fp, "     rlist");
 953     }
 954     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 955     {
 956         fprintf(fp, " rlistlong");
 957     }
 958     fprintf(fp, "  tpr file\n");
 959
 960     /* Loop to create the requested number of tpr input files */
 961     for (j = 0; j < *ntprs; j++)
 962     {
 963         /* The first .tpr is the provided one, just need to modify nsteps,
 964          * so skip the following block */
 965         if (j != 0)
 966         {
 967             /* Determine which Coulomb radii rc to use in the benchmarks */
 968             add = (rmax-rmin)/(*ntprs-1);
 969             if (is_equal(rmin, info->rcoulomb[0]))
 970             {
 971                 ir->rcoulomb = rmin + j*add;
 972             }
 973             else if (is_equal(rmax, info->rcoulomb[0]))
 974             {
 975                 ir->rcoulomb = rmin + (j-1)*add;
 976             }
 977             else
 978             {
 979                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 980                 add          = (rmax-rmin)/(*ntprs-2);
 981                 ir->rcoulomb = rmin + (j-1)*add;
 982             }
 983
 984             /* Determine the scaling factor fac */
 985             fac = ir->rcoulomb/info->rcoulomb[0];
 986
 987             /* Scale the Fourier grid spacing */
 988             ir->nkx = ir->nky = ir->nkz = 0;
 989             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
 990
 991             /* Adjust other radii since various conditions neet to be fulfilled */
 992             if (eelPME == ir->coulombtype)
 993             {
 994                 /* plain PME, rcoulomb must be equal to rlist */
 995                 ir->rlist = ir->rcoulomb;
 996             }
 997             else
 998             {
 999                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1000                 ir->rlist = ir->rcoulomb + nlist_buffer;
1001             }
1002
1003             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1004             {
1005                 /* For vdw cutoff, rvdw >= rlist */
1006                 ir->rvdw = max(info->rvdw[0], ir->rlist);
1007             }
1008
1009             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1010
1011         } /* end of "if (j != 0)" */
1012
1013         /* for j==0: Save the original settings
1014          * for j >0: Save modified radii and Fourier grids */
1015         info->rcoulomb[j]  = ir->rcoulomb;
1016         info->rvdw[j]      = ir->rvdw;
1017         info->nkx[j]       = ir->nkx;
1018         info->nky[j]       = ir->nky;
1019         info->nkz[j]       = ir->nkz;
1020         info->rlist[j]     = ir->rlist;
1021         info->rlistlong[j] = ir->rlistlong;
1022         info->fsx[j]       = fac*fourierspacing;
1023         info->fsy[j]       = fac*fourierspacing;
1024         info->fsz[j]       = fac*fourierspacing;
1025
1026         /* Write the benchmark tpr file */
1027         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1028         sprintf(buf, "_bench%.2d.tpr", j);
1029         strcat(fn_bench_tprs[j], buf);
1030         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1031         fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
1032         if (j > 0)
1033         {
1034             fprintf(stdout, ", scaling factor %f\n", fac);
1035         }
1036         else
1037         {
1038             fprintf(stdout, ", unmodified settings\n");
1039         }
1040
1041         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1042
1043         /* Write information about modified tpr settings to log file */
1044         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1045         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1046         fprintf(fp, " %9f ", info->fsx[j]);
1047         if (evdwCUT == ir->vdwtype)
1048         {
1049             fprintf(fp, "%10f", ir->rvdw);
1050         }
1051         if (EPME_SWITCHED(ir->coulombtype))
1052         {
1053             fprintf(fp, "%10f", ir->rlist);
1054         }
1055         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1056         {
1057             fprintf(fp, "%10f", ir->rlistlong);
1058         }
1059         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1060
1061         /* Make it clear to the user that some additional settings were modified */
1062         if (!is_equal(ir->rvdw, info->rvdw[0])
1063             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1064         {
1065             bNote = TRUE;
1066         }
1067     }
1068     if (bNote)
1069     {
1070         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1071                 "other input settings were also changed (see table above).\n"
1072                 "Please check if the modified settings are appropriate.\n");
1073     }
1074     fflush(stdout);
1075     fflush(fp);
1076     sfree(ir);
1077 }
1078
1079
1080 /* Rename the files we want to keep to some meaningful filename and
1081  * delete the rest */
1082 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1083                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1084 {
1085     char        numstring[STRLEN];
1086     char        newfilename[STRLEN];
1087     const char *fn = NULL;
1088     int         i;
1089     const char *opt;
1090
1091
1092     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1093
1094     for (i = 0; i < nfile; i++)
1095     {
1096         opt = (char *)fnm[i].opt;
1097         if (strcmp(opt, "-p") == 0)
1098         {
1099             /* do nothing; keep this file */
1100             ;
1101         }
1102         else if (strcmp(opt, "-bg") == 0)
1103         {
1104             /* Give the log file a nice name so one can later see which parameters were used */
1105             numstring[0] = '\0';
1106             if (nr > 0)
1107             {
1108                 sprintf(numstring, "_%d", nr);
1109             }
1110             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1111             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1112             {
1113                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1114                 make_backup(newfilename);
1115                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1116             }
1117         }
1118         else if (strcmp(opt, "-err") == 0)
1119         {
1120             /* This file contains the output of stderr. We want to keep it in
1121              * cases where there have been problems. */
1122             fn           = opt2fn(opt, nfile, fnm);
1123             numstring[0] = '\0';
1124             if (nr > 0)
1125             {
1126                 sprintf(numstring, "_%d", nr);
1127             }
1128             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1129             if (gmx_fexist(fn))
1130             {
1131                 if (bKeepStderr)
1132                 {
1133                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1134                     make_backup(newfilename);
1135                     rename(fn, newfilename);
1136                 }
1137                 else
1138                 {
1139                     fprintf(stdout, "Deleting %s\n", fn);
1140                     remove(fn);
1141                 }
1142             }
1143         }
1144         /* Delete the files which are created for each benchmark run: (options -b*) */
1145         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1146         {
1147             fn = opt2fn(opt, nfile, fnm);
1148             if (gmx_fexist(fn))
1149             {
1150                 fprintf(stdout, "Deleting %s\n", fn);
1151                 remove(fn);
1152             }
1153         }
1154     }
1155 }
1156
1157
1158 /* Returns the largest common factor of n1 and n2 */
1159 static int largest_common_factor(int n1, int n2)
1160 {
1161     int factor, nmax;
1162
1163     nmax = min(n1, n2);
1164     for (factor = nmax; factor > 0; factor--)
1165     {
1166         if (0 == (n1 % factor) && 0 == (n2 % factor) )
1167         {
1168             return(factor);
1169         }
1170     }
1171     return 0; /* one for the compiler */
1172 }
1173
1174 enum {
1175     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1176 };
1177
1178 /* Create a list of numbers of PME nodes to test */
1179 static void make_npme_list(
1180         const char *npmevalues_opt, /* Make a complete list with all
1181                                      * possibilities or a short list that keeps only
1182                                      * reasonable numbers of PME nodes                  */
1183         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1184         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1185         int         nnodes,         /* Total number of nodes to do the tests on         */
1186         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1187         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1188 {
1189     int i, npme, npp;
1190     int min_factor = 1;   /* We request that npp and npme have this minimal
1191                            * largest common factor (depends on npp)           */
1192     int nlistmax;         /* Max. list size                                   */
1193     int nlist;            /* Actual number of entries in list                 */
1194     int eNPME = 0;
1195
1196
1197     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1198     if (0 == strcmp(npmevalues_opt, "all") )
1199     {
1200         eNPME = eNpmeAll;
1201     }
1202     else if (0 == strcmp(npmevalues_opt, "subset") )
1203     {
1204         eNPME = eNpmeSubset;
1205     }
1206     else /* "auto" or "range" */
1207     {
1208         if (nnodes <= 64)
1209         {
1210             eNPME = eNpmeAll;
1211         }
1212         else if (nnodes < 128)
1213         {
1214             eNPME = eNpmeReduced;
1215         }
1216         else
1217         {
1218             eNPME = eNpmeSubset;
1219         }
1220     }
1221
1222     /* Calculate how many entries we could possibly have (in case of -npme all) */
1223     if (nnodes > 2)
1224     {
1225         nlistmax = maxPMEnodes - minPMEnodes + 3;
1226         if (0 == minPMEnodes)
1227         {
1228             nlistmax--;
1229         }
1230     }
1231     else
1232     {
1233         nlistmax = 1;
1234     }
1235
1236     /* Now make the actual list which is at most of size nlist */
1237     snew(*nPMEnodes, nlistmax);
1238     nlist = 0; /* start counting again, now the real entries in the list */
1239     for (i = 0; i < nlistmax - 2; i++)
1240     {
1241         npme = maxPMEnodes - i;
1242         npp  = nnodes-npme;
1243         switch (eNPME)
1244         {
1245             case eNpmeAll:
1246                 min_factor = 1;
1247                 break;
1248             case eNpmeReduced:
1249                 min_factor = 2;
1250                 break;
1251             case eNpmeSubset:
1252                 /* For 2d PME we want a common largest factor of at least the cube
1253                  * root of the number of PP nodes */
1254                 min_factor = (int) pow(npp, 1.0/3.0);
1255                 break;
1256             default:
1257                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1258                 break;
1259         }
1260         if (largest_common_factor(npp, npme) >= min_factor)
1261         {
1262             (*nPMEnodes)[nlist] = npme;
1263             nlist++;
1264         }
1265     }
1266     /* We always test 0 PME nodes and the automatic number */
1267     *nentries             = nlist + 2;
1268     (*nPMEnodes)[nlist  ] =  0;
1269     (*nPMEnodes)[nlist+1] = -1;
1270
1271     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1272     for (i = 0; i < *nentries-1; i++)
1273     {
1274         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1275     }
1276     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1277 }
1278
1279
1280 /* Allocate memory to store the performance data */
1281 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1282 {
1283     int i, j, k;
1284
1285
1286     for (k = 0; k < ntprs; k++)
1287     {
1288         snew(perfdata[k], datasets);
1289         for (i = 0; i < datasets; i++)
1290         {
1291             for (j = 0; j < repeats; j++)
1292             {
1293                 snew(perfdata[k][i].Gcycles, repeats);
1294                 snew(perfdata[k][i].ns_per_day, repeats);
1295                 snew(perfdata[k][i].PME_f_load, repeats);
1296             }
1297         }
1298     }
1299 }
1300
1301
1302 /* Check for errors on mdrun -h */
1303 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
1304 {
1305     char *command, *msg;
1306     int   ret;
1307
1308
1309     snew(command, length +  15);
1310     snew(msg, length + 500);
1311
1312     fprintf(stdout, "Making sure the benchmarks can be executed ...\n");
1313     /* FIXME: mdrun -h no longer actually does anything useful.
1314      * It unconditionally prints the help, ignoring all other options. */
1315     sprintf(command, "%s-h -quiet", mdrun_cmd_line);
1316     ret = gmx_system_call(command);
1317
1318     if (0 != ret)
1319     {
1320         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1321          * get the error message from mdrun itself */
1322         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1323                 "mdrun for the source of the problem. Did you provide a command line\n"
1324                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1325                 "\n%s\n\n", command);
1326
1327         fprintf(stderr, "%s", msg);
1328         sep_line(fp);
1329         fprintf(fp, "%s", msg);
1330
1331         exit(ret);
1332     }
1333
1334     sfree(command);
1335     sfree(msg    );
1336 }
1337
1338
1339 static void do_the_tests(
1340         FILE           *fp,             /* General g_tune_pme output file         */
1341         char          **tpr_names,      /* Filenames of the input files to test   */
1342         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1343         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1344         int             npme_fixed,     /* If >= -1, test fixed number of PME
1345                                          * nodes only                             */
1346         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1347         t_perf        **perfdata,       /* Here the performace data is stored     */
1348         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1349         int             repeats,        /* Repeat each test this often            */
1350         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1351         int             nr_tprs,        /* Total number of tpr files to test      */
1352         gmx_bool        bThreads,       /* Threads or MPI?                        */
1353         char           *cmd_mpirun,     /* mpirun command string                  */
1354         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1355         char           *cmd_mdrun,      /* mdrun command string                   */
1356         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1357         const t_filenm *fnm,            /* List of filenames from command line    */
1358         int             nfile,          /* Number of files specified on the cmdl. */
1359         int             presteps,       /* DLB equilibration steps, is checked    */
1360         gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint    */
1361 {
1362     int      i, nr, k, ret, count = 0, totaltests;
1363     int     *nPMEnodes = NULL;
1364     t_perf  *pd        = NULL;
1365     int      cmdline_length;
1366     char    *command, *cmd_stub;
1367     char     buf[STRLEN];
1368     gmx_bool bResetProblem = FALSE;
1369     gmx_bool bFirst        = TRUE;
1370
1371
1372     /* This string array corresponds to the eParselog enum type at the start
1373      * of this file */
1374     const char* ParseLog[] = {
1375         "OK.",
1376         "Logfile not found!",
1377         "No timings, logfile truncated?",
1378         "Run was terminated.",
1379         "Counters were not reset properly.",
1380         "No DD grid found for these settings.",
1381         "TPX version conflict!",
1382         "mdrun was not started in parallel!",
1383         "An error occured."
1384     };
1385     char        str_PME_f_load[13];
1386
1387
1388     /* Allocate space for the mdrun command line. 100 extra characters should
1389        be more than enough for the -npme etcetera arguments */
1390     cmdline_length =  strlen(cmd_mpirun)
1391         + strlen(cmd_np)
1392         + strlen(cmd_mdrun)
1393         + strlen(cmd_args_bench)
1394         + strlen(tpr_names[0]) + 100;
1395     snew(command, cmdline_length);
1396     snew(cmd_stub, cmdline_length);
1397
1398     /* Construct the part of the command line that stays the same for all tests: */
1399     if (bThreads)
1400     {
1401         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1402     }
1403     else
1404     {
1405         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1406     }
1407
1408     /* Create a list of numbers of PME nodes to test */
1409     if (npme_fixed < -1)
1410     {
1411         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1412                        nnodes, minPMEnodes, maxPMEnodes);
1413     }
1414     else
1415     {
1416         *pmeentries  = 1;
1417         snew(nPMEnodes, 1);
1418         nPMEnodes[0] = npme_fixed;
1419         fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
1420     }
1421
1422     if (0 == repeats)
1423     {
1424         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1425         ffclose(fp);
1426         finalize(opt2fn("-p", nfile, fnm));
1427         exit(0);
1428     }
1429
1430     /* Allocate one dataset for each tpr input file: */
1431     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1432
1433     /*****************************************/
1434     /* Main loop over all tpr files to test: */
1435     /*****************************************/
1436     totaltests = nr_tprs*(*pmeentries)*repeats;
1437     for (k = 0; k < nr_tprs; k++)
1438     {
1439         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1440         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1441         /* Loop over various numbers of PME nodes: */
1442         for (i = 0; i < *pmeentries; i++)
1443         {
1444             pd = &perfdata[k][i];
1445
1446             /* Loop over the repeats for each scenario: */
1447             for (nr = 0; nr < repeats; nr++)
1448             {
1449                 pd->nPMEnodes = nPMEnodes[i];
1450
1451                 /* Add -npme and -s to the command line and save it. Note that
1452                  * the -passall (if set) options requires cmd_args_bench to be
1453                  * at the end of the command line string */
1454                 snew(pd->mdrun_cmd_line, cmdline_length);
1455                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1456                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1457
1458                 /* To prevent that all benchmarks fail due to a show-stopper argument
1459                  * on the mdrun command line, we make a quick check with mdrun -h first */
1460                 if (bFirst)
1461                 {
1462                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
1463                 }
1464                 bFirst = FALSE;
1465
1466                 /* Do a benchmark simulation: */
1467                 if (repeats > 1)
1468                 {
1469                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1470                 }
1471                 else
1472                 {
1473                     buf[0] = '\0';
1474                 }
1475                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1476                         (100.0*count)/totaltests,
1477                         k+1, nr_tprs, i+1, *pmeentries, buf);
1478                 make_backup(opt2fn("-err", nfile, fnm));
1479                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1480                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1481                 gmx_system_call(command);
1482
1483                 /* Collect the performance data from the log file; also check stderr
1484                  * for fatal errors */
1485                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1486                                     pd, nr, presteps, cpt_steps, nnodes);
1487                 if ((presteps > 0) && (ret == eParselogResetProblem))
1488                 {
1489                     bResetProblem = TRUE;
1490                 }
1491
1492                 if (-1 == pd->nPMEnodes)
1493                 {
1494                     sprintf(buf, "(%3d)", pd->guessPME);
1495                 }
1496                 else
1497                 {
1498                     sprintf(buf, "     ");
1499                 }
1500
1501                 /* Nicer output */
1502                 if (pd->PME_f_load[nr] > 0.0)
1503                 {
1504                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1505                 }
1506                 else
1507                 {
1508                     sprintf(str_PME_f_load, "%s", "         -  ");
1509                 }
1510
1511                 /* Write the data we got to disk */
1512                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1513                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1514                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1515                 {
1516                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1517                 }
1518                 fprintf(fp, "\n");
1519                 fflush(fp);
1520                 count++;
1521
1522                 /* Do some cleaning up and delete the files we do not need any more */
1523                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1524
1525                 /* If the first run with this number of processors already failed, do not try again: */
1526                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1527                 {
1528                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1529                     count += repeats-(nr+1);
1530                     break;
1531                 }
1532             } /* end of repeats loop */
1533         }     /* end of -npme loop */
1534     }         /* end of tpr file loop */
1535
1536     if (bResetProblem)
1537     {
1538         sep_line(fp);
1539         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1540         sep_line(fp);
1541     }
1542     sfree(command);
1543     sfree(cmd_stub);
1544 }
1545
1546
1547 static void check_input(
1548         int             nnodes,
1549         int             repeats,
1550         int            *ntprs,
1551         real           *rmin,
1552         real            rcoulomb,
1553         real           *rmax,
1554         real            maxPMEfraction,
1555         real            minPMEfraction,
1556         int             npme_fixed,
1557         gmx_large_int_t bench_nsteps,
1558         const t_filenm *fnm,
1559         int             nfile,
1560         int             sim_part,
1561         int             presteps,
1562         int             npargs,
1563         t_pargs        *pa)
1564 {
1565     int old;
1566
1567
1568     /* Make sure the input file exists */
1569     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1570     {
1571         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1572     }
1573
1574     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1575     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1576     {
1577         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1578                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1579     }
1580
1581     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1582     if (repeats < 0)
1583     {
1584         gmx_fatal(FARGS, "Number of repeats < 0!");
1585     }
1586
1587     /* Check number of nodes */
1588     if (nnodes < 1)
1589     {
1590         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1591     }
1592
1593     /* Automatically choose -ntpr if not set */
1594     if (*ntprs < 1)
1595     {
1596         if (nnodes < 16)
1597         {
1598             *ntprs = 1;
1599         }
1600         else
1601         {
1602             *ntprs = 3;
1603             /* Set a reasonable scaling factor for rcoulomb */
1604             if (*rmax <= 0)
1605             {
1606                 *rmax = rcoulomb * 1.2;
1607             }
1608         }
1609         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1610     }
1611     else
1612     {
1613         if (1 == *ntprs)
1614         {
1615             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1616         }
1617     }
1618
1619     /* Make shure that rmin <= rcoulomb <= rmax */
1620     if (*rmin <= 0)
1621     {
1622         *rmin = rcoulomb;
1623     }
1624     if (*rmax <= 0)
1625     {
1626         *rmax = rcoulomb;
1627     }
1628     if (!(*rmin <= *rmax) )
1629     {
1630         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1631                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1632     }
1633     /* Add test scenarios if rmin or rmax were set */
1634     if (*ntprs <= 2)
1635     {
1636         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1637         {
1638             (*ntprs)++;
1639             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1640                     *rmin, *ntprs);
1641         }
1642         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1643         {
1644             (*ntprs)++;
1645             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1646                     *rmax, *ntprs);
1647         }
1648     }
1649     old = *ntprs;
1650     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1651     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1652     {
1653         *ntprs = max(*ntprs, 2);
1654     }
1655
1656     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1657     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1658     {
1659         *ntprs = max(*ntprs, 3);
1660     }
1661
1662     if (old != *ntprs)
1663     {
1664         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1665     }
1666
1667     if (*ntprs > 1)
1668     {
1669         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1670         {
1671             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1672                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1673                     "with correspondingly adjusted PME grid settings\n");
1674             *ntprs = 1;
1675         }
1676     }
1677
1678     /* Check whether max and min fraction are within required values */
1679     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1680     {
1681         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1682     }
1683     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1684     {
1685         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1686     }
1687     if (maxPMEfraction < minPMEfraction)
1688     {
1689         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1690     }
1691
1692     /* Check whether the number of steps - if it was set - has a reasonable value */
1693     if (bench_nsteps < 0)
1694     {
1695         gmx_fatal(FARGS, "Number of steps must be positive.");
1696     }
1697
1698     if (bench_nsteps > 10000 || bench_nsteps < 100)
1699     {
1700         fprintf(stderr, "WARNING: steps=");
1701         fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
1702         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1703     }
1704
1705     if (presteps < 0)
1706     {
1707         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1708     }
1709
1710     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1711     if (*ntprs > 1)
1712     {
1713         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1714         {
1715             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1716         }
1717     }
1718
1719     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1720      * only. We need to check whether the requested number of PME-only nodes
1721      * makes sense. */
1722     if (npme_fixed > -1)
1723     {
1724         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1725         if (2*npme_fixed > nnodes)
1726         {
1727             gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
1728                       nnodes/2, nnodes, npme_fixed);
1729         }
1730         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1731         {
1732             fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
1733                     100.0*((real)npme_fixed / (real)nnodes));
1734         }
1735         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1736         {
1737             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1738                     "      fixed number of PME-only nodes is requested with -fix.\n");
1739         }
1740     }
1741 }
1742
1743
1744 /* Returns TRUE when "opt" is needed at launch time */
1745 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1746 {
1747     /* Apart from the input .tpr and the output log files we need all options that
1748      * were set on the command line and that do not start with -b */
1749     if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
1750            || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
1751     {
1752         return FALSE;
1753     }
1754
1755     return bSet;
1756 }
1757
1758
1759 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1760 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1761 {
1762     /* Apart from the input .tpr, all files starting with "-b" are for
1763      * _b_enchmark files exclusively */
1764     if (0 == strncmp(opt, "-s", 2))
1765     {
1766         return FALSE;
1767     }
1768
1769     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1770     {
1771         if (!bOptional || bSet)
1772         {
1773             return TRUE;
1774         }
1775         else
1776         {
1777             return FALSE;
1778         }
1779     }
1780     else
1781     {
1782         if (bIsOutput)
1783         {
1784             return FALSE;
1785         }
1786         else
1787         {
1788             if (bSet) /* These are additional input files like -cpi -ei */
1789             {
1790                 return TRUE;
1791             }
1792             else
1793             {
1794                 return FALSE;
1795             }
1796         }
1797     }
1798 }
1799
1800
1801 /* Adds 'buf' to 'str' */
1802 static void add_to_string(char **str, char *buf)
1803 {
1804     int len;
1805
1806
1807     len = strlen(*str) + strlen(buf) + 1;
1808     srenew(*str, len);
1809     strcat(*str, buf);
1810 }
1811
1812
1813 /* Create the command line for the benchmark as well as for the real run */
1814 static void create_command_line_snippets(
1815         gmx_bool  bAppendFiles,
1816         gmx_bool  bKeepAndNumCPT,
1817         gmx_bool  bResetHWay,
1818         int       presteps,
1819         int       nfile,
1820         t_filenm  fnm[],
1821         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1822         char     *cmd_args_launch[], /* command line arguments for simulation run */
1823         char      extra_args[])      /* Add this to the end of the command line */
1824 {
1825     int         i;
1826     char       *opt;
1827     const char *name;
1828     char        strbuf[STRLEN];
1829
1830
1831     /* strlen needs at least '\0' as a string: */
1832     snew(*cmd_args_bench, 1);
1833     snew(*cmd_args_launch, 1);
1834     *cmd_args_launch[0] = '\0';
1835     *cmd_args_bench[0]  = '\0';
1836
1837
1838     /*******************************************/
1839     /* 1. Process other command line arguments */
1840     /*******************************************/
1841     if (presteps > 0)
1842     {
1843         /* Add equilibration steps to benchmark options */
1844         sprintf(strbuf, "-resetstep %d ", presteps);
1845         add_to_string(cmd_args_bench, strbuf);
1846     }
1847     /* These switches take effect only at launch time */
1848     if (FALSE == bAppendFiles)
1849     {
1850         add_to_string(cmd_args_launch, "-noappend ");
1851     }
1852     if (bKeepAndNumCPT)
1853     {
1854         add_to_string(cmd_args_launch, "-cpnum ");
1855     }
1856     if (bResetHWay)
1857     {
1858         add_to_string(cmd_args_launch, "-resethway ");
1859     }
1860
1861     /********************/
1862     /* 2. Process files */
1863     /********************/
1864     for (i = 0; i < nfile; i++)
1865     {
1866         opt  = (char *)fnm[i].opt;
1867         name = opt2fn(opt, nfile, fnm);
1868
1869         /* Strbuf contains the options, now let's sort out where we need that */
1870         sprintf(strbuf, "%s %s ", opt, name);
1871
1872         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1873         {
1874             /* All options starting with -b* need the 'b' removed,
1875              * therefore overwrite strbuf */
1876             if (0 == strncmp(opt, "-b", 2))
1877             {
1878                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1879             }
1880
1881             add_to_string(cmd_args_bench, strbuf);
1882         }
1883
1884         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1885         {
1886             add_to_string(cmd_args_launch, strbuf);
1887         }
1888     }
1889
1890     add_to_string(cmd_args_bench, extra_args);
1891     add_to_string(cmd_args_launch, extra_args);
1892 }
1893
1894
1895 /* Set option opt */
1896 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1897 {
1898     int i;
1899
1900     for (i = 0; (i < nfile); i++)
1901     {
1902         if (strcmp(opt, fnm[i].opt) == 0)
1903         {
1904             fnm[i].flag |= ffSET;
1905         }
1906     }
1907 }
1908
1909
1910 /* This routine inspects the tpr file and ...
1911  * 1. checks for output files that get triggered by a tpr option. These output
1912  *    files are marked as 'set' to allow for a proper cleanup after each
1913  *    tuning run.
1914  * 2. returns the PME:PP load ratio
1915  * 3. returns rcoulomb from the tpr */
1916 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1917 {
1918     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1919     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1920     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1921     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1922     t_inputrec   ir;
1923     t_state      state;
1924     gmx_mtop_t   mtop;
1925
1926
1927     /* Check tpr file for options that trigger extra output files */
1928     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1929     bPull = (epullNO != ir.ePull);
1930     bFree = (efepNO  != ir.efep );
1931     bNM   = (eiNM    == ir.eI   );
1932     bTpi  = EI_TPI(ir.eI);
1933
1934     /* Set these output files on the tuning command-line */
1935     if (bPull)
1936     {
1937         setopt("-pf", nfile, fnm);
1938         setopt("-px", nfile, fnm);
1939     }
1940     if (bFree)
1941     {
1942         setopt("-dhdl", nfile, fnm);
1943     }
1944     if (bTpi)
1945     {
1946         setopt("-tpi", nfile, fnm);
1947         setopt("-tpid", nfile, fnm);
1948     }
1949     if (bNM)
1950     {
1951         setopt("-mtx", nfile, fnm);
1952     }
1953
1954     *rcoulomb = ir.rcoulomb;
1955
1956     /* Return the estimate for the number of PME nodes */
1957     return pme_load_estimate(&mtop, &ir, state.box);
1958 }
1959
1960
1961 static void couple_files_options(int nfile, t_filenm fnm[])
1962 {
1963     int      i;
1964     gmx_bool bSet, bBench;
1965     char    *opt;
1966     char     buf[20];
1967
1968
1969     for (i = 0; i < nfile; i++)
1970     {
1971         opt    = (char *)fnm[i].opt;
1972         bSet   = ((fnm[i].flag & ffSET) != 0);
1973         bBench = (0 == strncmp(opt, "-b", 2));
1974
1975         /* Check optional files */
1976         /* If e.g. -eo is set, then -beo also needs to be set */
1977         if (is_optional(&fnm[i]) && bSet && !bBench)
1978         {
1979             sprintf(buf, "-b%s", &opt[1]);
1980             setopt(buf, nfile, fnm);
1981         }
1982         /* If -beo is set, then -eo also needs to be! */
1983         if (is_optional(&fnm[i]) && bSet && bBench)
1984         {
1985             sprintf(buf, "-%s", &opt[2]);
1986             setopt(buf, nfile, fnm);
1987         }
1988     }
1989 }
1990
1991
1992 static double gettime()
1993 {
1994 #ifdef HAVE_GETTIMEOFDAY
1995     struct timeval t;
1996     double         seconds;
1997
1998     gettimeofday(&t, NULL);
1999
2000     seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
2001
2002     return seconds;
2003 #else
2004     double  seconds;
2005
2006     seconds = time(NULL);
2007
2008     return seconds;
2009 #endif
2010 }
2011
2012
2013 #define BENCHSTEPS (1000)
2014
2015 int gmx_tune_pme(int argc, char *argv[])
2016 {
2017     const char     *desc[] = {
2018         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically",
2019         "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines",
2020         "which setting is fastest. It will also test whether performance can",
2021         "be enhanced by shifting load from the reciprocal to the real space",
2022         "part of the Ewald sum. ",
2023         "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options",
2024         "for [TT]mdrun[tt] as needed.[PAR]",
2025         "Which executables are used can be set in the environment variables",
2026         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2027         "will be used as defaults. Note that for certain MPI frameworks you",
2028         "need to provide a machine- or hostfile. This can also be passed",
2029         "via the MPIRUN variable, e.g.[PAR]",
2030         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2031         "Please call [TT]g_tune_pme[tt] with the normal options you would pass to",
2032         "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the",
2033         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2034         "to repeat each test several times to get better statistics. [PAR]",
2035         "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads",
2036         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2037         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2038         "Typically, the first test (number 0) will be with the settings from the input",
2039         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2040         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2041         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2042         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2043         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2044         "if you just seek the optimal number of PME-only nodes; in that case",
2045         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2046         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2047         "MD systems. The dynamic load balancing needs about 100 time steps",
2048         "to adapt to local load imbalances, therefore the time step counters",
2049         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2050         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2051         "From the 'DD' load imbalance entries in the md.log output file you",
2052         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2053         "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2054         "After calling [TT]mdrun[tt] several times, detailed performance information",
2055         "is available in the output file [TT]perf.out.[tt] ",
2056         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2057         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2058         "If you want the simulation to be started automatically with the",
2059         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2060     };
2061
2062     int             nnodes         = 1;
2063     int             repeats        = 2;
2064     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2065     real            maxPMEfraction = 0.50;
2066     real            minPMEfraction = 0.25;
2067     int             maxPMEnodes, minPMEnodes;
2068     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2069     float           guessPMEnodes;
2070     int             npme_fixed     = -2;              /* If >= -1, use only this number
2071                                                        * of PME-only nodes                */
2072     int             ntprs          = 0;
2073     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2074     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2075     gmx_bool        bScaleRvdw     = TRUE;
2076     gmx_large_int_t bench_nsteps   = BENCHSTEPS;
2077     gmx_large_int_t new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2078     gmx_large_int_t cpt_steps      = 0;   /* Step counter in .cpt input file   */
2079     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2080     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2081     gmx_bool        bLaunch        = FALSE;
2082     char           *ExtraArgs      = NULL;
2083     char          **tpr_names      = NULL;
2084     const char     *simulation_tpr = NULL;
2085     int             best_npme, best_tpr;
2086     int             sim_part = 1; /* For benchmarks with checkpoint files */
2087     char            bbuf[STRLEN];
2088
2089     /* Default program names if nothing else is found */
2090     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2091     char         *cmd_args_bench, *cmd_args_launch;
2092     char         *cmd_np = NULL;
2093
2094     t_perf      **perfdata = NULL;
2095     t_inputinfo  *info;
2096     int           i;
2097     FILE         *fp;
2098     t_commrec    *cr;
2099
2100     /* Print out how long the tuning took */
2101     double          seconds;
2102
2103     static t_filenm fnm[] = {
2104         /* g_tune_pme */
2105         { efOUT, "-p",      "perf",     ffWRITE },
2106         { efLOG, "-err",    "bencherr", ffWRITE },
2107         { efTPX, "-so",     "tuned",    ffWRITE },
2108         /* mdrun: */
2109         { efTPX, NULL,      NULL,       ffREAD },
2110         { efTRN, "-o",      NULL,       ffWRITE },
2111         { efXTC, "-x",      NULL,       ffOPTWR },
2112         { efCPT, "-cpi",    NULL,       ffOPTRD },
2113         { efCPT, "-cpo",    NULL,       ffOPTWR },
2114         { efSTO, "-c",      "confout",  ffWRITE },
2115         { efEDR, "-e",      "ener",     ffWRITE },
2116         { efLOG, "-g",      "md",       ffWRITE },
2117         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2118         { efXVG, "-field",  "field",    ffOPTWR },
2119         { efXVG, "-table",  "table",    ffOPTRD },
2120         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2121         { efXVG, "-tablep", "tablep",   ffOPTRD },
2122         { efXVG, "-tableb", "table",    ffOPTRD },
2123         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2124         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2125         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2126         { efEDI, "-ei",     "sam",      ffOPTRD },
2127         { efXVG, "-eo",     "edsam",    ffOPTWR },
2128         { efGCT, "-j",      "wham",     ffOPTRD },
2129         { efGCT, "-jo",     "bam",      ffOPTWR },
2130         { efXVG, "-ffout",  "gct",      ffOPTWR },
2131         { efXVG, "-devout", "deviatie", ffOPTWR },
2132         { efXVG, "-runav",  "runaver",  ffOPTWR },
2133         { efXVG, "-px",     "pullx",    ffOPTWR },
2134         { efXVG, "-pf",     "pullf",    ffOPTWR },
2135         { efXVG, "-ro",     "rotation", ffOPTWR },
2136         { efLOG, "-ra",     "rotangles", ffOPTWR },
2137         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2138         { efLOG, "-rt",     "rottorque", ffOPTWR },
2139         { efMTX, "-mtx",    "nm",       ffOPTWR },
2140         { efNDX, "-dn",     "dipole",   ffOPTWR },
2141         /* Output files that are deleted after each benchmark run */
2142         { efTRN, "-bo",     "bench",    ffWRITE },
2143         { efXTC, "-bx",     "bench",    ffWRITE },
2144         { efCPT, "-bcpo",   "bench",    ffWRITE },
2145         { efSTO, "-bc",     "bench",    ffWRITE },
2146         { efEDR, "-be",     "bench",    ffWRITE },
2147         { efLOG, "-bg",     "bench",    ffWRITE },
2148         { efXVG, "-beo",    "benchedo", ffOPTWR },
2149         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2150         { efXVG, "-bfield", "benchfld", ffOPTWR },
2151         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2152         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2153         { efGCT, "-bjo",    "bench",    ffOPTWR },
2154         { efXVG, "-bffout", "benchgct", ffOPTWR },
2155         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2156         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2157         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2158         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2159         { efXVG, "-bro",    "benchrot", ffOPTWR },
2160         { efLOG, "-bra",    "benchrota", ffOPTWR },
2161         { efLOG, "-brs",    "benchrots", ffOPTWR },
2162         { efLOG, "-brt",    "benchrott", ffOPTWR },
2163         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2164         { efNDX, "-bdn",    "bench",    ffOPTWR }
2165     };
2166
2167     gmx_bool        bThreads     = FALSE;
2168
2169     int             nthreads = 1;
2170
2171     const char     *procstring[] =
2172     { NULL, "-np", "-n", "none", NULL };
2173     const char     *npmevalues_opt[] =
2174     { NULL, "auto", "all", "subset", NULL };
2175
2176     gmx_bool     bAppendFiles          = TRUE;
2177     gmx_bool     bKeepAndNumCPT        = FALSE;
2178     gmx_bool     bResetCountersHalfWay = FALSE;
2179     gmx_bool     bBenchmark            = TRUE;
2180
2181     output_env_t oenv = NULL;
2182
2183     t_pargs      pa[] = {
2184         /***********************/
2185         /* g_tune_pme options: */
2186         /***********************/
2187         { "-np",       FALSE, etINT,  {&nnodes},
2188           "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
2189         { "-npstring", FALSE, etENUM, {procstring},
2190           "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
2191         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2192           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2193         { "-r",        FALSE, etINT,  {&repeats},
2194           "Repeat each test this often" },
2195         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2196           "Max fraction of PME nodes to test with" },
2197         { "-min",      FALSE, etREAL, {&minPMEfraction},
2198           "Min fraction of PME nodes to test with" },
2199         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2200           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2201           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2202         { "-fix",      FALSE, etINT,  {&npme_fixed},
2203           "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2204         { "-rmax",     FALSE, etREAL, {&rmax},
2205           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2206         { "-rmin",     FALSE, etREAL, {&rmin},
2207           "If >0, minimal rcoulomb for -ntpr>1" },
2208         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2209           "Scale rvdw along with rcoulomb"},
2210         { "-ntpr",     FALSE, etINT,  {&ntprs},
2211           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2212           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2213         { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
2214           "Take timings for this many steps in the benchmark runs" },
2215         { "-resetstep", FALSE, etINT,  {&presteps},
2216           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2217         { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
2218           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2219         { "-launch",   FALSE, etBOOL, {&bLaunch},
2220           "Launch the real simulation after optimization" },
2221         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2222           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2223         /******************/
2224         /* mdrun options: */
2225         /******************/
2226         /* We let g_tune_pme parse and understand these options, because we need to
2227          * prevent that they appear on the mdrun command line for the benchmarks */
2228         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2229           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2230         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2231           "Keep and number checkpoint files (launch only)" },
2232         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2233           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2234     };
2235
2236 #define NFILE asize(fnm)
2237
2238     seconds = gettime();
2239
2240     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2241                            NFILE, fnm, asize(pa), pa, asize(desc), desc,
2242                            0, NULL, &oenv))
2243     {
2244         return 0;
2245     }
2246
2247     /* Store the remaining unparsed command line entries in a string which
2248      * is then attached to the mdrun command line */
2249     snew(ExtraArgs, 1);
2250     ExtraArgs[0] = '\0';
2251     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2252     {
2253         add_to_string(&ExtraArgs, argv[i]);
2254         add_to_string(&ExtraArgs, " ");
2255     }
2256
2257     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2258     {
2259         bThreads = TRUE;
2260         if (opt2parg_bSet("-npstring", asize(pa), pa))
2261         {
2262             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2263         }
2264
2265         if (nnodes > 1)
2266         {
2267             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2268         }
2269         /* and now we just set this; a bit of an ugly hack*/
2270         nnodes = nthreads;
2271     }
2272     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2273     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2274
2275     /* Automatically set -beo options if -eo is set etc. */
2276     couple_files_options(NFILE, fnm);
2277
2278     /* Construct the command line arguments for benchmark runs
2279      * as well as for the simulation run */
2280     if (bThreads)
2281     {
2282         sprintf(bbuf, " -ntmpi %d ", nthreads);
2283     }
2284     else
2285     {
2286         /* This string will be used for MPI runs and will appear after the
2287          * mpirun command. */
2288         if (strcmp(procstring[0], "none") != 0)
2289         {
2290             sprintf(bbuf, " %s %d ", procstring[0], nnodes);
2291         }
2292         else
2293         {
2294             sprintf(bbuf, " ");
2295         }
2296     }
2297
2298     cmd_np = bbuf;
2299
2300     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2301                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2302
2303     /* Read in checkpoint file if requested */
2304     sim_part = 1;
2305     if (opt2bSet("-cpi", NFILE, fnm))
2306     {
2307         snew(cr, 1);
2308         cr->duty = DUTY_PP; /* makes the following routine happy */
2309         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2310                                         &sim_part, &cpt_steps, cr,
2311                                         FALSE, NFILE, fnm, NULL, NULL);
2312         sfree(cr);
2313         sim_part++;
2314         /* sim_part will now be 1 if no checkpoint file was found */
2315         if (sim_part <= 1)
2316         {
2317             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2318         }
2319     }
2320
2321     /* Open performance output file and write header info */
2322     fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
2323
2324     /* Make a quick consistency check of command line parameters */
2325     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2326                 maxPMEfraction, minPMEfraction, npme_fixed,
2327                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2328                 asize(pa), pa);
2329
2330     /* Determine the maximum and minimum number of PME nodes to test,
2331      * the actual list of settings is build in do_the_tests(). */
2332     if ((nnodes > 2) && (npme_fixed < -1))
2333     {
2334         if (0 == strcmp(npmevalues_opt[0], "auto"))
2335         {
2336             /* Determine the npme range automatically based on the PME:PP load guess */
2337             if (guessPMEratio > 1.0)
2338             {
2339                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2340                 maxPMEnodes = nnodes/2;
2341                 minPMEnodes = nnodes/2;
2342             }
2343             else
2344             {
2345                 /* PME : PP load is in the range 0..1, let's test around the guess */
2346                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2347                 minPMEnodes   = floor(0.7*guessPMEnodes);
2348                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2349                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2350             }
2351         }
2352         else
2353         {
2354             /* Determine the npme range based on user input */
2355             maxPMEnodes = floor(maxPMEfraction*nnodes);
2356             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2357             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2358             if (maxPMEnodes != minPMEnodes)
2359             {
2360                 fprintf(stdout, "- %d ", maxPMEnodes);
2361             }
2362             fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2363         }
2364     }
2365     else
2366     {
2367         maxPMEnodes = 0;
2368         minPMEnodes = 0;
2369     }
2370
2371     /* Get the commands we need to set up the runs from environment variables */
2372     get_program_paths(bThreads, &cmd_mpirun, cmd_np, &cmd_mdrun, repeats);
2373     if (bBenchmark && repeats > 0)
2374     {
2375         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
2376     }
2377
2378     /* Print some header info to file */
2379     sep_line(fp);
2380     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2381     sep_line(fp);
2382     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2383     if (!bThreads)
2384     {
2385         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2386         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2387         if (strcmp(procstring[0], "none") != 0)
2388         {
2389             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2390         }
2391         else
2392         {
2393             fprintf(fp, "Not setting number of nodes in system call\n");
2394         }
2395     }
2396     else
2397     {
2398         fprintf(fp, "Number of threads       : %d\n", nnodes);
2399     }
2400
2401     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2402     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2403     fprintf(fp, "Benchmark steps         : ");
2404     fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
2405     fprintf(fp, "\n");
2406     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2407     if (sim_part > 1)
2408     {
2409         fprintf(fp, "Checkpoint time step    : ");
2410         fprintf(fp, gmx_large_int_pfmt, cpt_steps);
2411         fprintf(fp, "\n");
2412     }
2413     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2414
2415     if (new_sim_nsteps >= 0)
2416     {
2417         bOverwrite = TRUE;
2418         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2419         fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
2420         fprintf(stderr, " steps.\n");
2421         fprintf(fp, "Simulation steps        : ");
2422         fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
2423         fprintf(fp, "\n");
2424     }
2425     if (repeats > 1)
2426     {
2427         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2428     }
2429
2430     if (npme_fixed >= -1)
2431     {
2432         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2433     }
2434
2435     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2436     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2437
2438     /* Allocate memory for the inputinfo struct: */
2439     snew(info, 1);
2440     info->nr_inputfiles = ntprs;
2441     for (i = 0; i < ntprs; i++)
2442     {
2443         snew(info->rcoulomb, ntprs);
2444         snew(info->rvdw, ntprs);
2445         snew(info->rlist, ntprs);
2446         snew(info->rlistlong, ntprs);
2447         snew(info->nkx, ntprs);
2448         snew(info->nky, ntprs);
2449         snew(info->nkz, ntprs);
2450         snew(info->fsx, ntprs);
2451         snew(info->fsy, ntprs);
2452         snew(info->fsz, ntprs);
2453     }
2454     /* Make alternative tpr files to test: */
2455     snew(tpr_names, ntprs);
2456     for (i = 0; i < ntprs; i++)
2457     {
2458         snew(tpr_names[i], STRLEN);
2459     }
2460
2461     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2462      * different grids could be found. */
2463     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2464                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2465
2466     /********************************************************************************/
2467     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2468     /********************************************************************************/
2469     snew(perfdata, ntprs);
2470     if (bBenchmark)
2471     {
2472         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2473                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2474                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2475
2476         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0);
2477
2478         /* Analyse the results and give a suggestion for optimal settings: */
2479         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2480                                 repeats, info, &best_tpr, &best_npme);
2481
2482         /* Take the best-performing tpr file and enlarge nsteps to original value */
2483         if (bKeepTPR && !bOverwrite)
2484         {
2485             simulation_tpr = opt2fn("-s", NFILE, fnm);
2486         }
2487         else
2488         {
2489             simulation_tpr = opt2fn("-so", NFILE, fnm);
2490             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2491                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2492         }
2493
2494         /* Let's get rid of the temporary benchmark input files */
2495         for (i = 0; i < ntprs; i++)
2496         {
2497             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2498             remove(tpr_names[i]);
2499         }
2500
2501         /* Now start the real simulation if the user requested it ... */
2502         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2503                           cmd_args_launch, simulation_tpr, best_npme);
2504     }
2505     ffclose(fp);
2506
2507     /* ... or simply print the performance results to screen: */
2508     if (!bLaunch)
2509     {
2510         finalize(opt2fn("-p", NFILE, fnm));
2511     }
2512
2513     return 0;
2514 }