src/gromacs/gmxana/gmx_tune_pme.c

   1 /*
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2008, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <time.h>
  40 #ifdef HAVE_SYS_TIME_H
  41 #include <sys/time.h>
  42 #endif
  43
  44
  45
  46 #include "gromacs/commandline/pargs.h"
  47 #include "typedefs.h"
  48 #include "smalloc.h"
  49 #include "vec.h"
  50 #include "copyrite.h"
  51 #include "gromacs/fileio/tpxio.h"
  52 #include "string2.h"
  53 #include "readinp.h"
  54 #include "calcgrid.h"
  55 #include "checkpoint.h"
  56 #include "macros.h"
  57 #include "gmx_ana.h"
  58 #include "names.h"
  59 #include "perf_est.h"
  60 #include "gromacs/timing/walltime_accounting.h"
  61
  62
  63 /* Enum for situations that can occur during log file parsing, the
  64  * corresponding string entries can be found in do_the_tests() in
  65  * const char* ParseLog[] */
  66 enum {
  67     eParselogOK,
  68     eParselogNotFound,
  69     eParselogNoPerfData,
  70     eParselogTerm,
  71     eParselogResetProblem,
  72     eParselogNoDDGrid,
  73     eParselogTPXVersion,
  74     eParselogNotParallel,
  75     eParselogLargePrimeFactor,
  76     eParselogFatal,
  77     eParselogNr
  78 };
  79
  80
  81 typedef struct
  82 {
  83     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  84     int     nx, ny, nz;   /* DD grid */
  85     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  86     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  87     double  Gcycles_Av;
  88     float  *ns_per_day;
  89     float   ns_per_day_Av;
  90     float  *PME_f_load;     /* PME mesh/force load average*/
  91     float   PME_f_load_Av;  /* Average average ;) ... */
  92     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  93 } t_perf;
  94
  95
  96 typedef struct
  97 {
  98     int             nr_inputfiles;  /* The number of tpr and mdp input files */
  99     gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
 100     gmx_large_int_t orig_init_step; /* Init step for the real simulation */
 101     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 102     real           *rvdw;           /* The vdW radii */
 103     real           *rlist;          /* Neighbourlist cutoff radius */
 104     real           *rlistlong;
 105     int            *nkx, *nky, *nkz;
 106     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 107 } t_inputinfo;
 108
 109
 110 static void sep_line(FILE *fp)
 111 {
 112     fprintf(fp, "\n------------------------------------------------------------\n");
 113 }
 114
 115
 116 /* Wrapper for system calls */
 117 static int gmx_system_call(char *command)
 118 {
 119 #ifdef GMX_NO_SYSTEM
 120     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 121 #else
 122     return ( system(command) );
 123 #endif
 124 }
 125
 126
 127 /* Check if string starts with substring */
 128 static gmx_bool str_starts(const char *string, const char *substring)
 129 {
 130     return ( strncmp(string, substring, strlen(substring)) == 0);
 131 }
 132
 133
 134 static void cleandata(t_perf *perfdata, int test_nr)
 135 {
 136     perfdata->Gcycles[test_nr]    = 0.0;
 137     perfdata->ns_per_day[test_nr] = 0.0;
 138     perfdata->PME_f_load[test_nr] = 0.0;
 139
 140     return;
 141 }
 142
 143
 144 static gmx_bool is_equal(real a, real b)
 145 {
 146     real diff, eps = 1.0e-7;
 147
 148
 149     diff = a - b;
 150
 151     if (diff < 0.0)
 152     {
 153         diff = -diff;
 154     }
 155
 156     if (diff < eps)
 157     {
 158         return TRUE;
 159     }
 160     else
 161     {
 162         return FALSE;
 163     }
 164 }
 165
 166
 167 static void finalize(const char *fn_out)
 168 {
 169     char  buf[STRLEN];
 170     FILE *fp;
 171
 172
 173     fp = fopen(fn_out, "r");
 174     fprintf(stdout, "\n\n");
 175
 176     while (fgets(buf, STRLEN-1, fp) != NULL)
 177     {
 178         fprintf(stdout, "%s", buf);
 179     }
 180     fclose(fp);
 181     fprintf(stdout, "\n\n");
 182 }
 183
 184
 185 enum {
 186     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 187 };
 188
 189 static int parse_logfile(const char *logfile, const char *errfile,
 190                          t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
 191                          int nnodes)
 192 {
 193     FILE           *fp;
 194     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 195     const char      matchstrdd[]  = "Domain decomposition grid";
 196     const char      matchstrcr[]  = "resetting all time and cycle counters";
 197     const char      matchstrbal[] = "Average PME mesh/force load:";
 198     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 199     const char      errSIG[]      = "signal, stopping at the next";
 200     int             iFound;
 201     int             procs;
 202     float           dum1, dum2, dum3, dum4;
 203     int             ndum;
 204     int             npme;
 205     gmx_large_int_t resetsteps     = -1;
 206     gmx_bool        bFoundResetStr = FALSE;
 207     gmx_bool        bResetChecked  = FALSE;
 208
 209
 210     if (!gmx_fexist(logfile))
 211     {
 212         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 213         cleandata(perfdata, test_nr);
 214         return eParselogNotFound;
 215     }
 216
 217     fp = fopen(logfile, "r");
 218     perfdata->PME_f_load[test_nr] = -1.0;
 219     perfdata->guessPME            = -1;
 220
 221     iFound = eFoundNothing;
 222     if (1 == nnodes)
 223     {
 224         iFound = eFoundDDStr; /* Skip some case statements */
 225     }
 226
 227     while (fgets(line, STRLEN, fp) != NULL)
 228     {
 229         /* Remove leading spaces */
 230         ltrim(line);
 231
 232         /* Check for TERM and INT signals from user: */
 233         if (strstr(line, errSIG) != NULL)
 234         {
 235             fclose(fp);
 236             cleandata(perfdata, test_nr);
 237             return eParselogTerm;
 238         }
 239
 240         /* Check whether cycle resetting  worked */
 241         if (presteps > 0 && !bFoundResetStr)
 242         {
 243             if (strstr(line, matchstrcr) != NULL)
 244             {
 245                 sprintf(dumstring, "step %s", gmx_large_int_pfmt);
 246                 sscanf(line, dumstring, &resetsteps);
 247                 bFoundResetStr = TRUE;
 248                 if (resetsteps == presteps+cpt_steps)
 249                 {
 250                     bResetChecked = TRUE;
 251                 }
 252                 else
 253                 {
 254                     sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
 255                     sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
 256                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 257                             "         though they were supposed to be reset at step %s!\n",
 258                             dumstring, dumstring2);
 259                 }
 260             }
 261         }
 262
 263         /* Look for strings that appear in a certain order in the log file: */
 264         switch (iFound)
 265         {
 266             case eFoundNothing:
 267                 /* Look for domain decomp grid and separate PME nodes: */
 268                 if (str_starts(line, matchstrdd))
 269                 {
 270                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 271                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 272                     if (perfdata->nPMEnodes == -1)
 273                     {
 274                         perfdata->guessPME = npme;
 275                     }
 276                     else if (perfdata->nPMEnodes != npme)
 277                     {
 278                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 279                     }
 280                     iFound = eFoundDDStr;
 281                 }
 282                 /* Catch a few errors that might have occured: */
 283                 else if (str_starts(line, "There is no domain decomposition for"))
 284                 {
 285                     fclose(fp);
 286                     return eParselogNoDDGrid;
 287                 }
 288                 else if (str_starts(line, "The number of nodes you selected"))
 289                 {
 290                     fclose(fp);
 291                     return eParselogLargePrimeFactor;
 292                 }
 293                 else if (str_starts(line, "reading tpx file"))
 294                 {
 295                     fclose(fp);
 296                     return eParselogTPXVersion;
 297                 }
 298                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 299                 {
 300                     fclose(fp);
 301                     return eParselogNotParallel;
 302                 }
 303                 break;
 304             case eFoundDDStr:
 305                 /* Look for PME mesh/force balance (not necessarily present, though) */
 306                 if (str_starts(line, matchstrbal))
 307                 {
 308                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 309                 }
 310                 /* Look for matchstring */
 311                 if (str_starts(line, matchstring))
 312                 {
 313                     iFound = eFoundAccountingStr;
 314                 }
 315                 break;
 316             case eFoundAccountingStr:
 317                 /* Already found matchstring - look for cycle data */
 318                 if (str_starts(line, "Total  "))
 319                 {
 320                     sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
 321                     iFound = eFoundCycleStr;
 322                 }
 323                 break;
 324             case eFoundCycleStr:
 325                 /* Already found cycle data - look for remaining performance info and return */
 326                 if (str_starts(line, "Performance:"))
 327                 {
 328                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 329                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 330                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 331                     fclose(fp);
 332                     if (bResetChecked || presteps == 0)
 333                     {
 334                         return eParselogOK;
 335                     }
 336                     else
 337                     {
 338                         return eParselogResetProblem;
 339                     }
 340                 }
 341                 break;
 342         }
 343     } /* while */
 344
 345     /* Close the log file */
 346     fclose(fp);
 347
 348     /* Check why there is no performance data in the log file.
 349      * Did a fatal errors occur? */
 350     if (gmx_fexist(errfile))
 351     {
 352         fp = fopen(errfile, "r");
 353         while (fgets(line, STRLEN, fp) != NULL)
 354         {
 355             if (str_starts(line, "Fatal error:") )
 356             {
 357                 if (fgets(line, STRLEN, fp) != NULL)
 358                 {
 359                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 360                             "%s\n", line);
 361                 }
 362                 fclose(fp);
 363                 cleandata(perfdata, test_nr);
 364                 return eParselogFatal;
 365             }
 366         }
 367         fclose(fp);
 368     }
 369     else
 370     {
 371         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 372     }
 373
 374     /* Giving up ... we could not find out why there is no performance data in
 375      * the log file. */
 376     fprintf(stdout, "No performance data in log file.\n");
 377     cleandata(perfdata, test_nr);
 378
 379     return eParselogNoPerfData;
 380 }
 381
 382
 383 static gmx_bool analyze_data(
 384         FILE         *fp,
 385         const char   *fn,
 386         t_perf      **perfdata,
 387         int           nnodes,
 388         int           ntprs,
 389         int           ntests,
 390         int           nrepeats,
 391         t_inputinfo  *info,
 392         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 393         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 394 {
 395     int      i, j, k;
 396     int      line  = 0, line_win = -1;
 397     int      k_win = -1, i_win = -1, winPME;
 398     double   s     = 0.0; /* standard deviation */
 399     t_perf  *pd;
 400     char     strbuf[STRLEN];
 401     char     str_PME_f_load[13];
 402     gmx_bool bCanUseOrigTPR;
 403     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 404
 405
 406     if (nrepeats > 1)
 407     {
 408         sep_line(fp);
 409         fprintf(fp, "Summary of successful runs:\n");
 410         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 411         if (nnodes > 1)
 412         {
 413             fprintf(fp, "    DD grid");
 414         }
 415         fprintf(fp, "\n");
 416     }
 417
 418
 419     for (k = 0; k < ntprs; k++)
 420     {
 421         for (i = 0; i < ntests; i++)
 422         {
 423             /* Select the right dataset: */
 424             pd = &(perfdata[k][i]);
 425
 426             pd->Gcycles_Av    = 0.0;
 427             pd->PME_f_load_Av = 0.0;
 428             pd->ns_per_day_Av = 0.0;
 429
 430             if (pd->nPMEnodes == -1)
 431             {
 432                 sprintf(strbuf, "(%3d)", pd->guessPME);
 433             }
 434             else
 435             {
 436                 sprintf(strbuf, "     ");
 437             }
 438
 439             /* Get the average run time of a setting */
 440             for (j = 0; j < nrepeats; j++)
 441             {
 442                 pd->Gcycles_Av    += pd->Gcycles[j];
 443                 pd->PME_f_load_Av += pd->PME_f_load[j];
 444             }
 445             pd->Gcycles_Av    /= nrepeats;
 446             pd->PME_f_load_Av /= nrepeats;
 447
 448             for (j = 0; j < nrepeats; j++)
 449             {
 450                 if (pd->ns_per_day[j] > 0.0)
 451                 {
 452                     pd->ns_per_day_Av += pd->ns_per_day[j];
 453                 }
 454                 else
 455                 {
 456                     /* Somehow the performance number was not aquired for this run,
 457                      * therefor set the average to some negative value: */
 458                     pd->ns_per_day_Av = -1.0f*nrepeats;
 459                     break;
 460                 }
 461             }
 462             pd->ns_per_day_Av /= nrepeats;
 463
 464             /* Nicer output: */
 465             if (pd->PME_f_load_Av > 0.0)
 466             {
 467                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 468             }
 469             else
 470             {
 471                 sprintf(str_PME_f_load, "%s", "         -  ");
 472             }
 473
 474
 475             /* We assume we had a successful run if both averages are positive */
 476             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 477             {
 478                 /* Output statistics if repeats were done */
 479                 if (nrepeats > 1)
 480                 {
 481                     /* Calculate the standard deviation */
 482                     s = 0.0;
 483                     for (j = 0; j < nrepeats; j++)
 484                     {
 485                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 486                     }
 487                     s /= (nrepeats - 1);
 488                     s  = sqrt(s);
 489
 490                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 491                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 492                             pd->ns_per_day_Av, str_PME_f_load);
 493                     if (nnodes > 1)
 494                     {
 495                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 496                     }
 497                     fprintf(fp, "\n");
 498                 }
 499                 /* Store the index of the best run found so far in 'winner': */
 500                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 501                 {
 502                     k_win    = k;
 503                     i_win    = i;
 504                     line_win = line;
 505                 }
 506                 line++;
 507             }
 508         }
 509     }
 510
 511     if (k_win == -1)
 512     {
 513         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 514     }
 515
 516     sep_line(fp);
 517
 518     winPME = perfdata[k_win][i_win].nPMEnodes;
 519
 520     if (1 == ntests)
 521     {
 522         /* We stuck to a fixed number of PME-only nodes */
 523         sprintf(strbuf, "settings No. %d", k_win);
 524     }
 525     else
 526     {
 527         /* We have optimized the number of PME-only nodes */
 528         if (winPME == -1)
 529         {
 530             sprintf(strbuf, "%s", "the automatic number of PME nodes");
 531         }
 532         else
 533         {
 534             sprintf(strbuf, "%d PME nodes", winPME);
 535         }
 536     }
 537     fprintf(fp, "Best performance was achieved with %s", strbuf);
 538     if ((nrepeats > 1) && (ntests > 1))
 539     {
 540         fprintf(fp, " (see line %d)", line_win);
 541     }
 542     fprintf(fp, "\n");
 543
 544     /* Only mention settings if they were modified: */
 545     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 546     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 547     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 548                      info->nky[k_win] == info->nky[0] &&
 549                      info->nkz[k_win] == info->nkz[0]);
 550
 551     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 552     {
 553         fprintf(fp, "Optimized PME settings:\n");
 554         bCanUseOrigTPR = FALSE;
 555     }
 556     else
 557     {
 558         bCanUseOrigTPR = TRUE;
 559     }
 560
 561     if (bRefinedCoul)
 562     {
 563         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 564     }
 565
 566     if (bRefinedVdW)
 567     {
 568         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 569     }
 570
 571     if (bRefinedGrid)
 572     {
 573         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 574                 info->nkx[0], info->nky[0], info->nkz[0]);
 575     }
 576
 577     if (bCanUseOrigTPR && ntprs > 1)
 578     {
 579         fprintf(fp, "and original PME settings.\n");
 580     }
 581
 582     fflush(fp);
 583
 584     /* Return the index of the mdp file that showed the highest performance
 585      * and the optimal number of PME nodes */
 586     *index_tpr    = k_win;
 587     *npme_optimal = winPME;
 588
 589     return bCanUseOrigTPR;
 590 }
 591
 592
 593 /* Get the commands we need to set up the runs from environment variables */
 594 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char *cmd_mdrun[])
 595 {
 596     char      *cp;
 597     FILE      *fp;
 598     const char def_mpirun[]   = "mpirun";
 599     const char def_mdrun[]    = "mdrun";
 600
 601     const char empty_mpirun[] = "";
 602
 603     /* Get the commands we need to set up the runs from environment variables */
 604     if (!bThreads)
 605     {
 606         if ( (cp = getenv("MPIRUN")) != NULL)
 607         {
 608             *cmd_mpirun = strdup(cp);
 609         }
 610         else
 611         {
 612             *cmd_mpirun = strdup(def_mpirun);
 613         }
 614     }
 615     else
 616     {
 617         *cmd_mpirun = strdup(empty_mpirun);
 618     }
 619
 620     if ( (cp = getenv("MDRUN" )) != NULL)
 621     {
 622         *cmd_mdrun  = strdup(cp);
 623     }
 624     else
 625     {
 626         *cmd_mdrun  = strdup(def_mdrun);
 627     }
 628 }
 629
 630 /* Check that the commands will run mdrun (perhaps via mpirun) by
 631  * running a very quick test simulation. Requires MPI environment to
 632  * be available if applicable. */
 633 static void check_mdrun_works(gmx_bool    bThreads,
 634                               const char *cmd_mpirun,
 635                               const char *cmd_np,
 636                               const char *cmd_mdrun)
 637 {
 638     char      *command = NULL;
 639     char      *cp;
 640     char       line[STRLEN];
 641     FILE      *fp;
 642     const char filename[]     = "benchtest.log";
 643
 644     /* This string should always be identical to the one in copyrite.c,
 645      * gmx_print_version_info() in the defined(GMX_MPI) section */
 646     const char match_mpi[]    = "MPI library:        MPI";
 647     const char match_mdrun[]  = "Program: ";
 648     gmx_bool   bMdrun         = FALSE;
 649     gmx_bool   bMPI           = FALSE;
 650
 651     /* Run a small test to see whether mpirun + mdrun work  */
 652     fprintf(stdout, "Making sure that mdrun can be executed. ");
 653     if (bThreads)
 654     {
 655         snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 656         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 657     }
 658     else
 659     {
 660         snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 661         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 662     }
 663     fprintf(stdout, "Trying '%s' ... ", command);
 664     make_backup(filename);
 665     gmx_system_call(command);
 666
 667     /* Check if we find the characteristic string in the output: */
 668     if (!gmx_fexist(filename))
 669     {
 670         gmx_fatal(FARGS, "Output from test run could not be found.");
 671     }
 672
 673     fp = fopen(filename, "r");
 674     /* We need to scan the whole output file, since sometimes the queuing system
 675      * also writes stuff to stdout/err */
 676     while (!feof(fp) )
 677     {
 678         cp = fgets(line, STRLEN, fp);
 679         if (cp != NULL)
 680         {
 681             if (str_starts(line, match_mdrun) )
 682             {
 683                 bMdrun = TRUE;
 684             }
 685             if (str_starts(line, match_mpi) )
 686             {
 687                 bMPI = TRUE;
 688             }
 689         }
 690     }
 691     fclose(fp);
 692
 693     if (bThreads)
 694     {
 695         if (bMPI)
 696         {
 697             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 698                       "(%s)\n"
 699                       "seems to have been compiled with MPI instead.",
 700                       *cmd_mdrun);
 701         }
 702     }
 703     else
 704     {
 705         if (bMdrun && !bMPI)
 706         {
 707             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 708                       "(%s)\n"
 709                       "seems to have been compiled without MPI support.",
 710                       *cmd_mdrun);
 711         }
 712     }
 713
 714     if (!bMdrun)
 715     {
 716         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 717                   filename);
 718     }
 719
 720     fprintf(stdout, "passed.\n");
 721
 722     /* Clean up ... */
 723     remove(filename);
 724     sfree(command);
 725 }
 726
 727
 728 static void launch_simulation(
 729         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 730         FILE       *fp,             /* General log file */
 731         gmx_bool    bThreads,       /* whether to use threads */
 732         char       *cmd_mpirun,     /* Command for mpirun */
 733         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 734         char       *cmd_mdrun,      /* Command for mdrun */
 735         char       *args_for_mdrun, /* Arguments for mdrun */
 736         const char *simulation_tpr, /* This tpr will be simulated */
 737         int         nPMEnodes)      /* Number of PME nodes to use */
 738 {
 739     char  *command;
 740
 741
 742     /* Make enough space for the system call command,
 743      * (100 extra chars for -npme ... etc. options should suffice): */
 744     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 745
 746     /* Note that the -passall options requires args_for_mdrun to be at the end
 747      * of the command line string */
 748     if (bThreads)
 749     {
 750         sprintf(command, "%s%s-npme %d -s %s %s",
 751                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 752     }
 753     else
 754     {
 755         sprintf(command, "%s%s%s -npme %d -s %s %s",
 756                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 757     }
 758
 759     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 760     sep_line(fp);
 761     fflush(fp);
 762
 763     /* Now the real thing! */
 764     if (bLaunch)
 765     {
 766         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 767         sep_line(stdout);
 768         fflush(stdout);
 769         gmx_system_call(command);
 770     }
 771 }
 772
 773
 774 static void modify_PMEsettings(
 775         gmx_large_int_t simsteps,    /* Set this value as number of time steps */
 776         gmx_large_int_t init_step,   /* Set this value as init_step */
 777         const char     *fn_best_tpr, /* tpr file with the best performance */
 778         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 779 {
 780     t_inputrec   *ir;
 781     t_state       state;
 782     gmx_mtop_t    mtop;
 783     char          buf[200];
 784
 785     snew(ir, 1);
 786     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 787
 788     /* Reset nsteps and init_step to the value of the input .tpr file */
 789     ir->nsteps    = simsteps;
 790     ir->init_step = init_step;
 791
 792     /* Write the tpr file which will be launched */
 793     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt);
 794     fprintf(stdout, buf, ir->nsteps);
 795     fflush(stdout);
 796     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 797
 798     sfree(ir);
 799 }
 800
 801
 802 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 803
 804 /* Make additional TPR files with more computational load for the
 805  * direct space processors: */
 806 static void make_benchmark_tprs(
 807         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 808         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 809         gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs       */
 810         gmx_large_int_t statesteps,      /* Step counter in checkpoint file               */
 811         real            rmin,            /* Minimal Coulomb radius                        */
 812         real            rmax,            /* Maximal Coulomb radius                        */
 813         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 814         int            *ntprs,           /* No. of TPRs to write, each with a different
 815                                             rcoulomb and fourierspacing                   */
 816         t_inputinfo    *info,            /* Contains information about mdp file options   */
 817         FILE           *fp)              /* Write the output here                         */
 818 {
 819     int           i, j, d;
 820     t_inputrec   *ir;
 821     t_state       state;
 822     gmx_mtop_t    mtop;
 823     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 824     char          buf[200];
 825     rvec          box_size;
 826     gmx_bool      bNote = FALSE;
 827     real          add;              /* Add this to rcoul for the next test    */
 828     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 829     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 830
 831
 832     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 833             *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
 834     fprintf(stdout, buf, benchsteps);
 835     if (statesteps > 0)
 836     {
 837         sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
 838         fprintf(stdout, buf, statesteps);
 839         benchsteps += statesteps;
 840     }
 841     fprintf(stdout, ".\n");
 842
 843
 844     snew(ir, 1);
 845     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 846
 847     /* Check if some kind of PME was chosen */
 848     if (EEL_PME(ir->coulombtype) == FALSE)
 849     {
 850         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 851                   EELTYPE(eelPME));
 852     }
 853
 854     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 855     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 856           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 857     {
 858         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 859                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 860     }
 861     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 862     else if (ir->rcoulomb > ir->rlist)
 863     {
 864         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 865                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 866     }
 867
 868     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 869     {
 870         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 871         bScaleRvdw = FALSE;
 872     }
 873
 874     /* Reduce the number of steps for the benchmarks */
 875     info->orig_sim_steps = ir->nsteps;
 876     ir->nsteps           = benchsteps;
 877     /* We must not use init_step from the input tpr file for the benchmarks */
 878     info->orig_init_step = ir->init_step;
 879     ir->init_step        = 0;
 880
 881     /* For PME-switch potentials, keep the radial distance of the buffer region */
 882     nlist_buffer   = ir->rlist - ir->rcoulomb;
 883
 884     /* Determine length of triclinic box vectors */
 885     for (d = 0; d < DIM; d++)
 886     {
 887         box_size[d] = 0;
 888         for (i = 0; i < DIM; i++)
 889         {
 890             box_size[d] += state.box[d][i]*state.box[d][i];
 891         }
 892         box_size[d] = sqrt(box_size[d]);
 893     }
 894
 895     if (ir->fourier_spacing > 0)
 896     {
 897         info->fsx[0] = ir->fourier_spacing;
 898         info->fsy[0] = ir->fourier_spacing;
 899         info->fsz[0] = ir->fourier_spacing;
 900     }
 901     else
 902     {
 903         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 904         info->fsx[0] = box_size[XX]/ir->nkx;
 905         info->fsy[0] = box_size[YY]/ir->nky;
 906         info->fsz[0] = box_size[ZZ]/ir->nkz;
 907     }
 908
 909     /* If no value for the fourierspacing was provided on the command line, we
 910      * use the reconstruction from the tpr file */
 911     if (ir->fourier_spacing > 0)
 912     {
 913         /* Use the spacing from the tpr */
 914         fourierspacing = ir->fourier_spacing;
 915     }
 916     else
 917     {
 918         /* Use the maximum observed spacing */
 919         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 920     }
 921
 922     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 923
 924     /* For performance comparisons the number of particles is useful to have */
 925     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 926
 927     /* Print information about settings of which some are potentially modified: */
 928     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 929     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 930             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 931     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 932     if (EVDW_SWITCHED(ir->vdwtype))
 933     {
 934         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 935     }
 936     if (EPME_SWITCHED(ir->coulombtype))
 937     {
 938         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 939     }
 940     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 941     {
 942         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 943     }
 944
 945     /* Print a descriptive line about the tpr settings tested */
 946     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 947     fprintf(fp, " No.   scaling  rcoulomb");
 948     fprintf(fp, "  nkx  nky  nkz");
 949     fprintf(fp, "   spacing");
 950     if (evdwCUT == ir->vdwtype)
 951     {
 952         fprintf(fp, "      rvdw");
 953     }
 954     if (EPME_SWITCHED(ir->coulombtype))
 955     {
 956         fprintf(fp, "     rlist");
 957     }
 958     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 959     {
 960         fprintf(fp, " rlistlong");
 961     }
 962     fprintf(fp, "  tpr file\n");
 963
 964     /* Loop to create the requested number of tpr input files */
 965     for (j = 0; j < *ntprs; j++)
 966     {
 967         /* The first .tpr is the provided one, just need to modify nsteps,
 968          * so skip the following block */
 969         if (j != 0)
 970         {
 971             /* Determine which Coulomb radii rc to use in the benchmarks */
 972             add = (rmax-rmin)/(*ntprs-1);
 973             if (is_equal(rmin, info->rcoulomb[0]))
 974             {
 975                 ir->rcoulomb = rmin + j*add;
 976             }
 977             else if (is_equal(rmax, info->rcoulomb[0]))
 978             {
 979                 ir->rcoulomb = rmin + (j-1)*add;
 980             }
 981             else
 982             {
 983                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 984                 add          = (rmax-rmin)/(*ntprs-2);
 985                 ir->rcoulomb = rmin + (j-1)*add;
 986             }
 987
 988             /* Determine the scaling factor fac */
 989             fac = ir->rcoulomb/info->rcoulomb[0];
 990
 991             /* Scale the Fourier grid spacing */
 992             ir->nkx = ir->nky = ir->nkz = 0;
 993             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
 994
 995             /* Adjust other radii since various conditions neet to be fulfilled */
 996             if (eelPME == ir->coulombtype)
 997             {
 998                 /* plain PME, rcoulomb must be equal to rlist */
 999                 ir->rlist = ir->rcoulomb;
1000             }
1001             else
1002             {
1003                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1004                 ir->rlist = ir->rcoulomb + nlist_buffer;
1005             }
1006
1007             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1008             {
1009                 /* For vdw cutoff, rvdw >= rlist */
1010                 ir->rvdw = max(info->rvdw[0], ir->rlist);
1011             }
1012
1013             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1014
1015         } /* end of "if (j != 0)" */
1016
1017         /* for j==0: Save the original settings
1018          * for j >0: Save modified radii and Fourier grids */
1019         info->rcoulomb[j]  = ir->rcoulomb;
1020         info->rvdw[j]      = ir->rvdw;
1021         info->nkx[j]       = ir->nkx;
1022         info->nky[j]       = ir->nky;
1023         info->nkz[j]       = ir->nkz;
1024         info->rlist[j]     = ir->rlist;
1025         info->rlistlong[j] = ir->rlistlong;
1026         info->fsx[j]       = fac*fourierspacing;
1027         info->fsy[j]       = fac*fourierspacing;
1028         info->fsz[j]       = fac*fourierspacing;
1029
1030         /* Write the benchmark tpr file */
1031         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1032         sprintf(buf, "_bench%.2d.tpr", j);
1033         strcat(fn_bench_tprs[j], buf);
1034         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1035         fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
1036         if (j > 0)
1037         {
1038             fprintf(stdout, ", scaling factor %f\n", fac);
1039         }
1040         else
1041         {
1042             fprintf(stdout, ", unmodified settings\n");
1043         }
1044
1045         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1046
1047         /* Write information about modified tpr settings to log file */
1048         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1049         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1050         fprintf(fp, " %9f ", info->fsx[j]);
1051         if (evdwCUT == ir->vdwtype)
1052         {
1053             fprintf(fp, "%10f", ir->rvdw);
1054         }
1055         if (EPME_SWITCHED(ir->coulombtype))
1056         {
1057             fprintf(fp, "%10f", ir->rlist);
1058         }
1059         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1060         {
1061             fprintf(fp, "%10f", ir->rlistlong);
1062         }
1063         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1064
1065         /* Make it clear to the user that some additional settings were modified */
1066         if (!is_equal(ir->rvdw, info->rvdw[0])
1067             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1068         {
1069             bNote = TRUE;
1070         }
1071     }
1072     if (bNote)
1073     {
1074         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1075                 "other input settings were also changed (see table above).\n"
1076                 "Please check if the modified settings are appropriate.\n");
1077     }
1078     fflush(stdout);
1079     fflush(fp);
1080     sfree(ir);
1081 }
1082
1083
1084 /* Rename the files we want to keep to some meaningful filename and
1085  * delete the rest */
1086 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1087                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1088 {
1089     char        numstring[STRLEN];
1090     char        newfilename[STRLEN];
1091     const char *fn = NULL;
1092     int         i;
1093     const char *opt;
1094
1095
1096     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1097
1098     for (i = 0; i < nfile; i++)
1099     {
1100         opt = (char *)fnm[i].opt;
1101         if (strcmp(opt, "-p") == 0)
1102         {
1103             /* do nothing; keep this file */
1104             ;
1105         }
1106         else if (strcmp(opt, "-bg") == 0)
1107         {
1108             /* Give the log file a nice name so one can later see which parameters were used */
1109             numstring[0] = '\0';
1110             if (nr > 0)
1111             {
1112                 sprintf(numstring, "_%d", nr);
1113             }
1114             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1115             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1116             {
1117                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1118                 make_backup(newfilename);
1119                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1120             }
1121         }
1122         else if (strcmp(opt, "-err") == 0)
1123         {
1124             /* This file contains the output of stderr. We want to keep it in
1125              * cases where there have been problems. */
1126             fn           = opt2fn(opt, nfile, fnm);
1127             numstring[0] = '\0';
1128             if (nr > 0)
1129             {
1130                 sprintf(numstring, "_%d", nr);
1131             }
1132             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1133             if (gmx_fexist(fn))
1134             {
1135                 if (bKeepStderr)
1136                 {
1137                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1138                     make_backup(newfilename);
1139                     rename(fn, newfilename);
1140                 }
1141                 else
1142                 {
1143                     fprintf(stdout, "Deleting %s\n", fn);
1144                     remove(fn);
1145                 }
1146             }
1147         }
1148         /* Delete the files which are created for each benchmark run: (options -b*) */
1149         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1150         {
1151             fn = opt2fn(opt, nfile, fnm);
1152             if (gmx_fexist(fn))
1153             {
1154                 fprintf(stdout, "Deleting %s\n", fn);
1155                 remove(fn);
1156             }
1157         }
1158     }
1159 }
1160
1161
1162 /* Returns the largest common factor of n1 and n2 */
1163 static int largest_common_factor(int n1, int n2)
1164 {
1165     int factor, nmax;
1166
1167     nmax = min(n1, n2);
1168     for (factor = nmax; factor > 0; factor--)
1169     {
1170         if (0 == (n1 % factor) && 0 == (n2 % factor) )
1171         {
1172             return(factor);
1173         }
1174     }
1175     return 0; /* one for the compiler */
1176 }
1177
1178 enum {
1179     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1180 };
1181
1182 /* Create a list of numbers of PME nodes to test */
1183 static void make_npme_list(
1184         const char *npmevalues_opt, /* Make a complete list with all
1185                                      * possibilities or a short list that keeps only
1186                                      * reasonable numbers of PME nodes                  */
1187         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1188         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1189         int         nnodes,         /* Total number of nodes to do the tests on         */
1190         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1191         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1192 {
1193     int i, npme, npp;
1194     int min_factor = 1;   /* We request that npp and npme have this minimal
1195                            * largest common factor (depends on npp)           */
1196     int nlistmax;         /* Max. list size                                   */
1197     int nlist;            /* Actual number of entries in list                 */
1198     int eNPME = 0;
1199
1200
1201     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1202     if (0 == strcmp(npmevalues_opt, "all") )
1203     {
1204         eNPME = eNpmeAll;
1205     }
1206     else if (0 == strcmp(npmevalues_opt, "subset") )
1207     {
1208         eNPME = eNpmeSubset;
1209     }
1210     else /* "auto" or "range" */
1211     {
1212         if (nnodes <= 64)
1213         {
1214             eNPME = eNpmeAll;
1215         }
1216         else if (nnodes < 128)
1217         {
1218             eNPME = eNpmeReduced;
1219         }
1220         else
1221         {
1222             eNPME = eNpmeSubset;
1223         }
1224     }
1225
1226     /* Calculate how many entries we could possibly have (in case of -npme all) */
1227     if (nnodes > 2)
1228     {
1229         nlistmax = maxPMEnodes - minPMEnodes + 3;
1230         if (0 == minPMEnodes)
1231         {
1232             nlistmax--;
1233         }
1234     }
1235     else
1236     {
1237         nlistmax = 1;
1238     }
1239
1240     /* Now make the actual list which is at most of size nlist */
1241     snew(*nPMEnodes, nlistmax);
1242     nlist = 0; /* start counting again, now the real entries in the list */
1243     for (i = 0; i < nlistmax - 2; i++)
1244     {
1245         npme = maxPMEnodes - i;
1246         npp  = nnodes-npme;
1247         switch (eNPME)
1248         {
1249             case eNpmeAll:
1250                 min_factor = 1;
1251                 break;
1252             case eNpmeReduced:
1253                 min_factor = 2;
1254                 break;
1255             case eNpmeSubset:
1256                 /* For 2d PME we want a common largest factor of at least the cube
1257                  * root of the number of PP nodes */
1258                 min_factor = (int) pow(npp, 1.0/3.0);
1259                 break;
1260             default:
1261                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1262                 break;
1263         }
1264         if (largest_common_factor(npp, npme) >= min_factor)
1265         {
1266             (*nPMEnodes)[nlist] = npme;
1267             nlist++;
1268         }
1269     }
1270     /* We always test 0 PME nodes and the automatic number */
1271     *nentries             = nlist + 2;
1272     (*nPMEnodes)[nlist  ] =  0;
1273     (*nPMEnodes)[nlist+1] = -1;
1274
1275     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1276     for (i = 0; i < *nentries-1; i++)
1277     {
1278         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1279     }
1280     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1281 }
1282
1283
1284 /* Allocate memory to store the performance data */
1285 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1286 {
1287     int i, j, k;
1288
1289
1290     for (k = 0; k < ntprs; k++)
1291     {
1292         snew(perfdata[k], datasets);
1293         for (i = 0; i < datasets; i++)
1294         {
1295             for (j = 0; j < repeats; j++)
1296             {
1297                 snew(perfdata[k][i].Gcycles, repeats);
1298                 snew(perfdata[k][i].ns_per_day, repeats);
1299                 snew(perfdata[k][i].PME_f_load, repeats);
1300             }
1301         }
1302     }
1303 }
1304
1305
1306 /* Check for errors on mdrun -h */
1307 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
1308 {
1309     char *command, *msg;
1310     int   ret;
1311
1312
1313     snew(command, length +  15);
1314     snew(msg, length + 500);
1315
1316     fprintf(stdout, "Making sure the benchmarks can be executed ...\n");
1317     /* FIXME: mdrun -h no longer actually does anything useful.
1318      * It unconditionally prints the help, ignoring all other options. */
1319     sprintf(command, "%s-h -quiet", mdrun_cmd_line);
1320     ret = gmx_system_call(command);
1321
1322     if (0 != ret)
1323     {
1324         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1325          * get the error message from mdrun itself */
1326         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1327                 "mdrun for the source of the problem. Did you provide a command line\n"
1328                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1329                 "\n%s\n\n", command);
1330
1331         fprintf(stderr, "%s", msg);
1332         sep_line(fp);
1333         fprintf(fp, "%s", msg);
1334
1335         exit(ret);
1336     }
1337
1338     sfree(command);
1339     sfree(msg    );
1340 }
1341
1342
1343 static void do_the_tests(
1344         FILE           *fp,             /* General g_tune_pme output file         */
1345         char          **tpr_names,      /* Filenames of the input files to test   */
1346         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1347         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1348         int             npme_fixed,     /* If >= -1, test fixed number of PME
1349                                          * nodes only                             */
1350         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1351         t_perf        **perfdata,       /* Here the performace data is stored     */
1352         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1353         int             repeats,        /* Repeat each test this often            */
1354         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1355         int             nr_tprs,        /* Total number of tpr files to test      */
1356         gmx_bool        bThreads,       /* Threads or MPI?                        */
1357         char           *cmd_mpirun,     /* mpirun command string                  */
1358         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1359         char           *cmd_mdrun,      /* mdrun command string                   */
1360         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1361         const t_filenm *fnm,            /* List of filenames from command line    */
1362         int             nfile,          /* Number of files specified on the cmdl. */
1363         int             presteps,       /* DLB equilibration steps, is checked    */
1364         gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint    */
1365 {
1366     int      i, nr, k, ret, count = 0, totaltests;
1367     int     *nPMEnodes = NULL;
1368     t_perf  *pd        = NULL;
1369     int      cmdline_length;
1370     char    *command, *cmd_stub;
1371     char     buf[STRLEN];
1372     gmx_bool bResetProblem = FALSE;
1373     gmx_bool bFirst        = TRUE;
1374
1375
1376     /* This string array corresponds to the eParselog enum type at the start
1377      * of this file */
1378     const char* ParseLog[] = {
1379         "OK.",
1380         "Logfile not found!",
1381         "No timings, logfile truncated?",
1382         "Run was terminated.",
1383         "Counters were not reset properly.",
1384         "No DD grid found for these settings.",
1385         "TPX version conflict!",
1386         "mdrun was not started in parallel!",
1387         "Number of PP nodes has a prime factor that is too large.",
1388         "An error occured."
1389     };
1390     char        str_PME_f_load[13];
1391
1392
1393     /* Allocate space for the mdrun command line. 100 extra characters should
1394        be more than enough for the -npme etcetera arguments */
1395     cmdline_length =  strlen(cmd_mpirun)
1396         + strlen(cmd_np)
1397         + strlen(cmd_mdrun)
1398         + strlen(cmd_args_bench)
1399         + strlen(tpr_names[0]) + 100;
1400     snew(command, cmdline_length);
1401     snew(cmd_stub, cmdline_length);
1402
1403     /* Construct the part of the command line that stays the same for all tests: */
1404     if (bThreads)
1405     {
1406         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1407     }
1408     else
1409     {
1410         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1411     }
1412
1413     /* Create a list of numbers of PME nodes to test */
1414     if (npme_fixed < -1)
1415     {
1416         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1417                        nnodes, minPMEnodes, maxPMEnodes);
1418     }
1419     else
1420     {
1421         *pmeentries  = 1;
1422         snew(nPMEnodes, 1);
1423         nPMEnodes[0] = npme_fixed;
1424         fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
1425     }
1426
1427     if (0 == repeats)
1428     {
1429         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1430         ffclose(fp);
1431         finalize(opt2fn("-p", nfile, fnm));
1432         exit(0);
1433     }
1434
1435     /* Allocate one dataset for each tpr input file: */
1436     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1437
1438     /*****************************************/
1439     /* Main loop over all tpr files to test: */
1440     /*****************************************/
1441     totaltests = nr_tprs*(*pmeentries)*repeats;
1442     for (k = 0; k < nr_tprs; k++)
1443     {
1444         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1445         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1446         /* Loop over various numbers of PME nodes: */
1447         for (i = 0; i < *pmeentries; i++)
1448         {
1449             pd = &perfdata[k][i];
1450
1451             /* Loop over the repeats for each scenario: */
1452             for (nr = 0; nr < repeats; nr++)
1453             {
1454                 pd->nPMEnodes = nPMEnodes[i];
1455
1456                 /* Add -npme and -s to the command line and save it. Note that
1457                  * the -passall (if set) options requires cmd_args_bench to be
1458                  * at the end of the command line string */
1459                 snew(pd->mdrun_cmd_line, cmdline_length);
1460                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1461                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1462
1463                 /* To prevent that all benchmarks fail due to a show-stopper argument
1464                  * on the mdrun command line, we make a quick check with mdrun -h first */
1465                 if (bFirst)
1466                 {
1467                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
1468                 }
1469                 bFirst = FALSE;
1470
1471                 /* Do a benchmark simulation: */
1472                 if (repeats > 1)
1473                 {
1474                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1475                 }
1476                 else
1477                 {
1478                     buf[0] = '\0';
1479                 }
1480                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1481                         (100.0*count)/totaltests,
1482                         k+1, nr_tprs, i+1, *pmeentries, buf);
1483                 make_backup(opt2fn("-err", nfile, fnm));
1484                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1485                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1486                 gmx_system_call(command);
1487
1488                 /* Collect the performance data from the log file; also check stderr
1489                  * for fatal errors */
1490                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1491                                     pd, nr, presteps, cpt_steps, nnodes);
1492                 if ((presteps > 0) && (ret == eParselogResetProblem))
1493                 {
1494                     bResetProblem = TRUE;
1495                 }
1496
1497                 if (-1 == pd->nPMEnodes)
1498                 {
1499                     sprintf(buf, "(%3d)", pd->guessPME);
1500                 }
1501                 else
1502                 {
1503                     sprintf(buf, "     ");
1504                 }
1505
1506                 /* Nicer output */
1507                 if (pd->PME_f_load[nr] > 0.0)
1508                 {
1509                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1510                 }
1511                 else
1512                 {
1513                     sprintf(str_PME_f_load, "%s", "         -  ");
1514                 }
1515
1516                 /* Write the data we got to disk */
1517                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1518                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1519                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1520                 {
1521                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1522                 }
1523                 fprintf(fp, "\n");
1524                 fflush(fp);
1525                 count++;
1526
1527                 /* Do some cleaning up and delete the files we do not need any more */
1528                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1529
1530                 /* If the first run with this number of processors already failed, do not try again: */
1531                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1532                 {
1533                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1534                     count += repeats-(nr+1);
1535                     break;
1536                 }
1537             } /* end of repeats loop */
1538         }     /* end of -npme loop */
1539     }         /* end of tpr file loop */
1540
1541     if (bResetProblem)
1542     {
1543         sep_line(fp);
1544         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1545         sep_line(fp);
1546     }
1547     sfree(command);
1548     sfree(cmd_stub);
1549 }
1550
1551
1552 static void check_input(
1553         int             nnodes,
1554         int             repeats,
1555         int            *ntprs,
1556         real           *rmin,
1557         real            rcoulomb,
1558         real           *rmax,
1559         real            maxPMEfraction,
1560         real            minPMEfraction,
1561         int             npme_fixed,
1562         gmx_large_int_t bench_nsteps,
1563         const t_filenm *fnm,
1564         int             nfile,
1565         int             sim_part,
1566         int             presteps,
1567         int             npargs,
1568         t_pargs        *pa)
1569 {
1570     int old;
1571
1572
1573     /* Make sure the input file exists */
1574     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1575     {
1576         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1577     }
1578
1579     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1580     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1581     {
1582         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1583                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1584     }
1585
1586     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1587     if (repeats < 0)
1588     {
1589         gmx_fatal(FARGS, "Number of repeats < 0!");
1590     }
1591
1592     /* Check number of nodes */
1593     if (nnodes < 1)
1594     {
1595         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1596     }
1597
1598     /* Automatically choose -ntpr if not set */
1599     if (*ntprs < 1)
1600     {
1601         if (nnodes < 16)
1602         {
1603             *ntprs = 1;
1604         }
1605         else
1606         {
1607             *ntprs = 3;
1608             /* Set a reasonable scaling factor for rcoulomb */
1609             if (*rmax <= 0)
1610             {
1611                 *rmax = rcoulomb * 1.2;
1612             }
1613         }
1614         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1615     }
1616     else
1617     {
1618         if (1 == *ntprs)
1619         {
1620             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1621         }
1622     }
1623
1624     /* Make shure that rmin <= rcoulomb <= rmax */
1625     if (*rmin <= 0)
1626     {
1627         *rmin = rcoulomb;
1628     }
1629     if (*rmax <= 0)
1630     {
1631         *rmax = rcoulomb;
1632     }
1633     if (!(*rmin <= *rmax) )
1634     {
1635         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1636                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1637     }
1638     /* Add test scenarios if rmin or rmax were set */
1639     if (*ntprs <= 2)
1640     {
1641         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1642         {
1643             (*ntprs)++;
1644             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1645                     *rmin, *ntprs);
1646         }
1647         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1648         {
1649             (*ntprs)++;
1650             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1651                     *rmax, *ntprs);
1652         }
1653     }
1654     old = *ntprs;
1655     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1656     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1657     {
1658         *ntprs = max(*ntprs, 2);
1659     }
1660
1661     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1662     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1663     {
1664         *ntprs = max(*ntprs, 3);
1665     }
1666
1667     if (old != *ntprs)
1668     {
1669         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1670     }
1671
1672     if (*ntprs > 1)
1673     {
1674         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1675         {
1676             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1677                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1678                     "with correspondingly adjusted PME grid settings\n");
1679             *ntprs = 1;
1680         }
1681     }
1682
1683     /* Check whether max and min fraction are within required values */
1684     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1685     {
1686         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1687     }
1688     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1689     {
1690         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1691     }
1692     if (maxPMEfraction < minPMEfraction)
1693     {
1694         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1695     }
1696
1697     /* Check whether the number of steps - if it was set - has a reasonable value */
1698     if (bench_nsteps < 0)
1699     {
1700         gmx_fatal(FARGS, "Number of steps must be positive.");
1701     }
1702
1703     if (bench_nsteps > 10000 || bench_nsteps < 100)
1704     {
1705         fprintf(stderr, "WARNING: steps=");
1706         fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
1707         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1708     }
1709
1710     if (presteps < 0)
1711     {
1712         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1713     }
1714
1715     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1716     if (*ntprs > 1)
1717     {
1718         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1719         {
1720             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1721         }
1722     }
1723
1724     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1725      * only. We need to check whether the requested number of PME-only nodes
1726      * makes sense. */
1727     if (npme_fixed > -1)
1728     {
1729         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1730         if (2*npme_fixed > nnodes)
1731         {
1732             gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
1733                       nnodes/2, nnodes, npme_fixed);
1734         }
1735         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1736         {
1737             fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
1738                     100.0*((real)npme_fixed / (real)nnodes));
1739         }
1740         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1741         {
1742             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1743                     "      fixed number of PME-only nodes is requested with -fix.\n");
1744         }
1745     }
1746 }
1747
1748
1749 /* Returns TRUE when "opt" is needed at launch time */
1750 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1751 {
1752     /* Apart from the input .tpr and the output log files we need all options that
1753      * were set on the command line and that do not start with -b */
1754     if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
1755            || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
1756     {
1757         return FALSE;
1758     }
1759
1760     return bSet;
1761 }
1762
1763
1764 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1765 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1766 {
1767     /* Apart from the input .tpr, all files starting with "-b" are for
1768      * _b_enchmark files exclusively */
1769     if (0 == strncmp(opt, "-s", 2))
1770     {
1771         return FALSE;
1772     }
1773
1774     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1775     {
1776         if (!bOptional || bSet)
1777         {
1778             return TRUE;
1779         }
1780         else
1781         {
1782             return FALSE;
1783         }
1784     }
1785     else
1786     {
1787         if (bIsOutput)
1788         {
1789             return FALSE;
1790         }
1791         else
1792         {
1793             if (bSet) /* These are additional input files like -cpi -ei */
1794             {
1795                 return TRUE;
1796             }
1797             else
1798             {
1799                 return FALSE;
1800             }
1801         }
1802     }
1803 }
1804
1805
1806 /* Adds 'buf' to 'str' */
1807 static void add_to_string(char **str, char *buf)
1808 {
1809     int len;
1810
1811
1812     len = strlen(*str) + strlen(buf) + 1;
1813     srenew(*str, len);
1814     strcat(*str, buf);
1815 }
1816
1817
1818 /* Create the command line for the benchmark as well as for the real run */
1819 static void create_command_line_snippets(
1820         gmx_bool  bAppendFiles,
1821         gmx_bool  bKeepAndNumCPT,
1822         gmx_bool  bResetHWay,
1823         int       presteps,
1824         int       nfile,
1825         t_filenm  fnm[],
1826         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1827         char     *cmd_args_launch[], /* command line arguments for simulation run */
1828         char      extra_args[])      /* Add this to the end of the command line */
1829 {
1830     int         i;
1831     char       *opt;
1832     const char *name;
1833     char        strbuf[STRLEN];
1834
1835
1836     /* strlen needs at least '\0' as a string: */
1837     snew(*cmd_args_bench, 1);
1838     snew(*cmd_args_launch, 1);
1839     *cmd_args_launch[0] = '\0';
1840     *cmd_args_bench[0]  = '\0';
1841
1842
1843     /*******************************************/
1844     /* 1. Process other command line arguments */
1845     /*******************************************/
1846     if (presteps > 0)
1847     {
1848         /* Add equilibration steps to benchmark options */
1849         sprintf(strbuf, "-resetstep %d ", presteps);
1850         add_to_string(cmd_args_bench, strbuf);
1851     }
1852     /* These switches take effect only at launch time */
1853     if (FALSE == bAppendFiles)
1854     {
1855         add_to_string(cmd_args_launch, "-noappend ");
1856     }
1857     if (bKeepAndNumCPT)
1858     {
1859         add_to_string(cmd_args_launch, "-cpnum ");
1860     }
1861     if (bResetHWay)
1862     {
1863         add_to_string(cmd_args_launch, "-resethway ");
1864     }
1865
1866     /********************/
1867     /* 2. Process files */
1868     /********************/
1869     for (i = 0; i < nfile; i++)
1870     {
1871         opt  = (char *)fnm[i].opt;
1872         name = opt2fn(opt, nfile, fnm);
1873
1874         /* Strbuf contains the options, now let's sort out where we need that */
1875         sprintf(strbuf, "%s %s ", opt, name);
1876
1877         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1878         {
1879             /* All options starting with -b* need the 'b' removed,
1880              * therefore overwrite strbuf */
1881             if (0 == strncmp(opt, "-b", 2))
1882             {
1883                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1884             }
1885
1886             add_to_string(cmd_args_bench, strbuf);
1887         }
1888
1889         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1890         {
1891             add_to_string(cmd_args_launch, strbuf);
1892         }
1893     }
1894
1895     add_to_string(cmd_args_bench, extra_args);
1896     add_to_string(cmd_args_launch, extra_args);
1897 }
1898
1899
1900 /* Set option opt */
1901 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1902 {
1903     int i;
1904
1905     for (i = 0; (i < nfile); i++)
1906     {
1907         if (strcmp(opt, fnm[i].opt) == 0)
1908         {
1909             fnm[i].flag |= ffSET;
1910         }
1911     }
1912 }
1913
1914
1915 /* This routine inspects the tpr file and ...
1916  * 1. checks for output files that get triggered by a tpr option. These output
1917  *    files are marked as 'set' to allow for a proper cleanup after each
1918  *    tuning run.
1919  * 2. returns the PME:PP load ratio
1920  * 3. returns rcoulomb from the tpr */
1921 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1922 {
1923     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1924     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1925     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1926     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1927     t_inputrec   ir;
1928     t_state      state;
1929     gmx_mtop_t   mtop;
1930
1931
1932     /* Check tpr file for options that trigger extra output files */
1933     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1934     bPull = (epullNO != ir.ePull);
1935     bFree = (efepNO  != ir.efep );
1936     bNM   = (eiNM    == ir.eI   );
1937     bTpi  = EI_TPI(ir.eI);
1938
1939     /* Set these output files on the tuning command-line */
1940     if (bPull)
1941     {
1942         setopt("-pf", nfile, fnm);
1943         setopt("-px", nfile, fnm);
1944     }
1945     if (bFree)
1946     {
1947         setopt("-dhdl", nfile, fnm);
1948     }
1949     if (bTpi)
1950     {
1951         setopt("-tpi", nfile, fnm);
1952         setopt("-tpid", nfile, fnm);
1953     }
1954     if (bNM)
1955     {
1956         setopt("-mtx", nfile, fnm);
1957     }
1958
1959     *rcoulomb = ir.rcoulomb;
1960
1961     /* Return the estimate for the number of PME nodes */
1962     return pme_load_estimate(&mtop, &ir, state.box);
1963 }
1964
1965
1966 static void couple_files_options(int nfile, t_filenm fnm[])
1967 {
1968     int      i;
1969     gmx_bool bSet, bBench;
1970     char    *opt;
1971     char     buf[20];
1972
1973
1974     for (i = 0; i < nfile; i++)
1975     {
1976         opt    = (char *)fnm[i].opt;
1977         bSet   = ((fnm[i].flag & ffSET) != 0);
1978         bBench = (0 == strncmp(opt, "-b", 2));
1979
1980         /* Check optional files */
1981         /* If e.g. -eo is set, then -beo also needs to be set */
1982         if (is_optional(&fnm[i]) && bSet && !bBench)
1983         {
1984             sprintf(buf, "-b%s", &opt[1]);
1985             setopt(buf, nfile, fnm);
1986         }
1987         /* If -beo is set, then -eo also needs to be! */
1988         if (is_optional(&fnm[i]) && bSet && bBench)
1989         {
1990             sprintf(buf, "-%s", &opt[2]);
1991             setopt(buf, nfile, fnm);
1992         }
1993     }
1994 }
1995
1996
1997 #define BENCHSTEPS (1000)
1998
1999 int gmx_tune_pme(int argc, char *argv[])
2000 {
2001     const char     *desc[] = {
2002         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, [THISMODULE] systematically",
2003         "times [gmx-mdrun] with various numbers of PME-only nodes and determines",
2004         "which setting is fastest. It will also test whether performance can",
2005         "be enhanced by shifting load from the reciprocal to the real space",
2006         "part of the Ewald sum. ",
2007         "Simply pass your [TT].tpr[tt] file to [THISMODULE] together with other options",
2008         "for [gmx-mdrun] as needed.[PAR]",
2009         "Which executables are used can be set in the environment variables",
2010         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2011         "will be used as defaults. Note that for certain MPI frameworks you",
2012         "need to provide a machine- or hostfile. This can also be passed",
2013         "via the MPIRUN variable, e.g.[PAR]",
2014         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2015         "Please call [THISMODULE] with the normal options you would pass to",
2016         "[gmx-mdrun] and add [TT]-np[tt] for the number of processors to perform the",
2017         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2018         "to repeat each test several times to get better statistics. [PAR]",
2019         "[THISMODULE] can test various real space / reciprocal space workloads",
2020         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2021         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2022         "Typically, the first test (number 0) will be with the settings from the input",
2023         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2024         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2025         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2026         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2027         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2028         "if you just seek the optimal number of PME-only nodes; in that case",
2029         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2030         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2031         "MD systems. The dynamic load balancing needs about 100 time steps",
2032         "to adapt to local load imbalances, therefore the time step counters",
2033         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2034         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2035         "From the 'DD' load imbalance entries in the md.log output file you",
2036         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2037         "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2038         "After calling [gmx-mdrun] several times, detailed performance information",
2039         "is available in the output file [TT]perf.out[tt].",
2040         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2041         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2042         "If you want the simulation to be started automatically with the",
2043         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2044     };
2045
2046     int             nnodes         = 1;
2047     int             repeats        = 2;
2048     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2049     real            maxPMEfraction = 0.50;
2050     real            minPMEfraction = 0.25;
2051     int             maxPMEnodes, minPMEnodes;
2052     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2053     float           guessPMEnodes;
2054     int             npme_fixed     = -2;              /* If >= -1, use only this number
2055                                                        * of PME-only nodes                */
2056     int             ntprs          = 0;
2057     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2058     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2059     gmx_bool        bScaleRvdw     = TRUE;
2060     gmx_large_int_t bench_nsteps   = BENCHSTEPS;
2061     gmx_large_int_t new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2062     gmx_large_int_t cpt_steps      = 0;   /* Step counter in .cpt input file   */
2063     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2064     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2065     gmx_bool        bLaunch        = FALSE;
2066     char           *ExtraArgs      = NULL;
2067     char          **tpr_names      = NULL;
2068     const char     *simulation_tpr = NULL;
2069     int             best_npme, best_tpr;
2070     int             sim_part = 1; /* For benchmarks with checkpoint files */
2071     char            bbuf[STRLEN];
2072
2073     /* Default program names if nothing else is found */
2074     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2075     char         *cmd_args_bench, *cmd_args_launch;
2076     char         *cmd_np = NULL;
2077
2078     t_perf      **perfdata = NULL;
2079     t_inputinfo  *info;
2080     int           i;
2081     FILE         *fp;
2082     t_commrec    *cr;
2083
2084     /* Print out how long the tuning took */
2085     double          seconds;
2086
2087     static t_filenm fnm[] = {
2088         /* g_tune_pme */
2089         { efOUT, "-p",      "perf",     ffWRITE },
2090         { efLOG, "-err",    "bencherr", ffWRITE },
2091         { efTPX, "-so",     "tuned",    ffWRITE },
2092         /* mdrun: */
2093         { efTPX, NULL,      NULL,       ffREAD },
2094         { efTRN, "-o",      NULL,       ffWRITE },
2095         { efXTC, "-x",      NULL,       ffOPTWR },
2096         { efCPT, "-cpi",    NULL,       ffOPTRD },
2097         { efCPT, "-cpo",    NULL,       ffOPTWR },
2098         { efSTO, "-c",      "confout",  ffWRITE },
2099         { efEDR, "-e",      "ener",     ffWRITE },
2100         { efLOG, "-g",      "md",       ffWRITE },
2101         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2102         { efXVG, "-field",  "field",    ffOPTWR },
2103         { efXVG, "-table",  "table",    ffOPTRD },
2104         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2105         { efXVG, "-tablep", "tablep",   ffOPTRD },
2106         { efXVG, "-tableb", "table",    ffOPTRD },
2107         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2108         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2109         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2110         { efEDI, "-ei",     "sam",      ffOPTRD },
2111         { efXVG, "-eo",     "edsam",    ffOPTWR },
2112         { efXVG, "-devout", "deviatie", ffOPTWR },
2113         { efXVG, "-runav",  "runaver",  ffOPTWR },
2114         { efXVG, "-px",     "pullx",    ffOPTWR },
2115         { efXVG, "-pf",     "pullf",    ffOPTWR },
2116         { efXVG, "-ro",     "rotation", ffOPTWR },
2117         { efLOG, "-ra",     "rotangles", ffOPTWR },
2118         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2119         { efLOG, "-rt",     "rottorque", ffOPTWR },
2120         { efMTX, "-mtx",    "nm",       ffOPTWR },
2121         { efNDX, "-dn",     "dipole",   ffOPTWR },
2122         /* Output files that are deleted after each benchmark run */
2123         { efTRN, "-bo",     "bench",    ffWRITE },
2124         { efXTC, "-bx",     "bench",    ffWRITE },
2125         { efCPT, "-bcpo",   "bench",    ffWRITE },
2126         { efSTO, "-bc",     "bench",    ffWRITE },
2127         { efEDR, "-be",     "bench",    ffWRITE },
2128         { efLOG, "-bg",     "bench",    ffWRITE },
2129         { efXVG, "-beo",    "benchedo", ffOPTWR },
2130         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2131         { efXVG, "-bfield", "benchfld", ffOPTWR },
2132         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2133         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2134         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2135         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2136         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2137         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2138         { efXVG, "-bro",    "benchrot", ffOPTWR },
2139         { efLOG, "-bra",    "benchrota", ffOPTWR },
2140         { efLOG, "-brs",    "benchrots", ffOPTWR },
2141         { efLOG, "-brt",    "benchrott", ffOPTWR },
2142         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2143         { efNDX, "-bdn",    "bench",    ffOPTWR }
2144     };
2145
2146     gmx_bool        bThreads     = FALSE;
2147
2148     int             nthreads = 1;
2149
2150     const char     *procstring[] =
2151     { NULL, "-np", "-n", "none", NULL };
2152     const char     *npmevalues_opt[] =
2153     { NULL, "auto", "all", "subset", NULL };
2154
2155     gmx_bool     bAppendFiles          = TRUE;
2156     gmx_bool     bKeepAndNumCPT        = FALSE;
2157     gmx_bool     bResetCountersHalfWay = FALSE;
2158     gmx_bool     bBenchmark            = TRUE;
2159
2160     output_env_t oenv = NULL;
2161
2162     t_pargs      pa[] = {
2163         /***********************/
2164         /* g_tune_pme options: */
2165         /***********************/
2166         { "-np",       FALSE, etINT,  {&nnodes},
2167           "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
2168         { "-npstring", FALSE, etENUM, {procstring},
2169           "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
2170         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2171           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2172         { "-r",        FALSE, etINT,  {&repeats},
2173           "Repeat each test this often" },
2174         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2175           "Max fraction of PME nodes to test with" },
2176         { "-min",      FALSE, etREAL, {&minPMEfraction},
2177           "Min fraction of PME nodes to test with" },
2178         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2179           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2180           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2181         { "-fix",      FALSE, etINT,  {&npme_fixed},
2182           "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2183         { "-rmax",     FALSE, etREAL, {&rmax},
2184           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2185         { "-rmin",     FALSE, etREAL, {&rmin},
2186           "If >0, minimal rcoulomb for -ntpr>1" },
2187         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2188           "Scale rvdw along with rcoulomb"},
2189         { "-ntpr",     FALSE, etINT,  {&ntprs},
2190           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2191           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2192         { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
2193           "Take timings for this many steps in the benchmark runs" },
2194         { "-resetstep", FALSE, etINT,  {&presteps},
2195           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2196         { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
2197           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2198         { "-launch",   FALSE, etBOOL, {&bLaunch},
2199           "Launch the real simulation after optimization" },
2200         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2201           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2202         /******************/
2203         /* mdrun options: */
2204         /******************/
2205         /* We let g_tune_pme parse and understand these options, because we need to
2206          * prevent that they appear on the mdrun command line for the benchmarks */
2207         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2208           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2209         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2210           "Keep and number checkpoint files (launch only)" },
2211         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2212           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2213     };
2214
2215 #define NFILE asize(fnm)
2216
2217     seconds = gmx_gettime();
2218
2219     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2220                            NFILE, fnm, asize(pa), pa, asize(desc), desc,
2221                            0, NULL, &oenv))
2222     {
2223         return 0;
2224     }
2225
2226     /* Store the remaining unparsed command line entries in a string which
2227      * is then attached to the mdrun command line */
2228     snew(ExtraArgs, 1);
2229     ExtraArgs[0] = '\0';
2230     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2231     {
2232         add_to_string(&ExtraArgs, argv[i]);
2233         add_to_string(&ExtraArgs, " ");
2234     }
2235
2236     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2237     {
2238         bThreads = TRUE;
2239         if (opt2parg_bSet("-npstring", asize(pa), pa))
2240         {
2241             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2242         }
2243
2244         if (nnodes > 1)
2245         {
2246             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2247         }
2248         /* and now we just set this; a bit of an ugly hack*/
2249         nnodes = nthreads;
2250     }
2251     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2252     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2253
2254     /* Automatically set -beo options if -eo is set etc. */
2255     couple_files_options(NFILE, fnm);
2256
2257     /* Construct the command line arguments for benchmark runs
2258      * as well as for the simulation run */
2259     if (bThreads)
2260     {
2261         sprintf(bbuf, " -ntmpi %d ", nthreads);
2262     }
2263     else
2264     {
2265         /* This string will be used for MPI runs and will appear after the
2266          * mpirun command. */
2267         if (strcmp(procstring[0], "none") != 0)
2268         {
2269             sprintf(bbuf, " %s %d ", procstring[0], nnodes);
2270         }
2271         else
2272         {
2273             sprintf(bbuf, " ");
2274         }
2275     }
2276
2277     cmd_np = bbuf;
2278
2279     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2280                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2281
2282     /* Read in checkpoint file if requested */
2283     sim_part = 1;
2284     if (opt2bSet("-cpi", NFILE, fnm))
2285     {
2286         snew(cr, 1);
2287         cr->duty = DUTY_PP; /* makes the following routine happy */
2288         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2289                                         &sim_part, &cpt_steps, cr,
2290                                         FALSE, NFILE, fnm, NULL, NULL);
2291         sfree(cr);
2292         sim_part++;
2293         /* sim_part will now be 1 if no checkpoint file was found */
2294         if (sim_part <= 1)
2295         {
2296             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2297         }
2298     }
2299
2300     /* Open performance output file and write header info */
2301     fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
2302
2303     /* Make a quick consistency check of command line parameters */
2304     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2305                 maxPMEfraction, minPMEfraction, npme_fixed,
2306                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2307                 asize(pa), pa);
2308
2309     /* Determine the maximum and minimum number of PME nodes to test,
2310      * the actual list of settings is build in do_the_tests(). */
2311     if ((nnodes > 2) && (npme_fixed < -1))
2312     {
2313         if (0 == strcmp(npmevalues_opt[0], "auto"))
2314         {
2315             /* Determine the npme range automatically based on the PME:PP load guess */
2316             if (guessPMEratio > 1.0)
2317             {
2318                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2319                 maxPMEnodes = nnodes/2;
2320                 minPMEnodes = nnodes/2;
2321             }
2322             else
2323             {
2324                 /* PME : PP load is in the range 0..1, let's test around the guess */
2325                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2326                 minPMEnodes   = floor(0.7*guessPMEnodes);
2327                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2328                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2329             }
2330         }
2331         else
2332         {
2333             /* Determine the npme range based on user input */
2334             maxPMEnodes = floor(maxPMEfraction*nnodes);
2335             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2336             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2337             if (maxPMEnodes != minPMEnodes)
2338             {
2339                 fprintf(stdout, "- %d ", maxPMEnodes);
2340             }
2341             fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2342         }
2343     }
2344     else
2345     {
2346         maxPMEnodes = 0;
2347         minPMEnodes = 0;
2348     }
2349
2350     /* Get the commands we need to set up the runs from environment variables */
2351     get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2352     if (bBenchmark && repeats > 0)
2353     {
2354         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
2355     }
2356
2357     /* Print some header info to file */
2358     sep_line(fp);
2359     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2360     sep_line(fp);
2361     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2362     if (!bThreads)
2363     {
2364         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2365         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2366         if (strcmp(procstring[0], "none") != 0)
2367         {
2368             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2369         }
2370         else
2371         {
2372             fprintf(fp, "Not setting number of nodes in system call\n");
2373         }
2374     }
2375     else
2376     {
2377         fprintf(fp, "Number of threads       : %d\n", nnodes);
2378     }
2379
2380     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2381     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2382     fprintf(fp, "Benchmark steps         : ");
2383     fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
2384     fprintf(fp, "\n");
2385     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2386     if (sim_part > 1)
2387     {
2388         fprintf(fp, "Checkpoint time step    : ");
2389         fprintf(fp, gmx_large_int_pfmt, cpt_steps);
2390         fprintf(fp, "\n");
2391     }
2392     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2393
2394     if (new_sim_nsteps >= 0)
2395     {
2396         bOverwrite = TRUE;
2397         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2398         fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
2399         fprintf(stderr, " steps.\n");
2400         fprintf(fp, "Simulation steps        : ");
2401         fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
2402         fprintf(fp, "\n");
2403     }
2404     if (repeats > 1)
2405     {
2406         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2407     }
2408
2409     if (npme_fixed >= -1)
2410     {
2411         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2412     }
2413
2414     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2415     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2416
2417     /* Allocate memory for the inputinfo struct: */
2418     snew(info, 1);
2419     info->nr_inputfiles = ntprs;
2420     for (i = 0; i < ntprs; i++)
2421     {
2422         snew(info->rcoulomb, ntprs);
2423         snew(info->rvdw, ntprs);
2424         snew(info->rlist, ntprs);
2425         snew(info->rlistlong, ntprs);
2426         snew(info->nkx, ntprs);
2427         snew(info->nky, ntprs);
2428         snew(info->nkz, ntprs);
2429         snew(info->fsx, ntprs);
2430         snew(info->fsy, ntprs);
2431         snew(info->fsz, ntprs);
2432     }
2433     /* Make alternative tpr files to test: */
2434     snew(tpr_names, ntprs);
2435     for (i = 0; i < ntprs; i++)
2436     {
2437         snew(tpr_names[i], STRLEN);
2438     }
2439
2440     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2441      * different grids could be found. */
2442     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2443                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2444
2445     /********************************************************************************/
2446     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2447     /********************************************************************************/
2448     snew(perfdata, ntprs);
2449     if (bBenchmark)
2450     {
2451         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2452                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2453                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2454
2455         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
2456
2457         /* Analyse the results and give a suggestion for optimal settings: */
2458         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2459                                 repeats, info, &best_tpr, &best_npme);
2460
2461         /* Take the best-performing tpr file and enlarge nsteps to original value */
2462         if (bKeepTPR && !bOverwrite)
2463         {
2464             simulation_tpr = opt2fn("-s", NFILE, fnm);
2465         }
2466         else
2467         {
2468             simulation_tpr = opt2fn("-so", NFILE, fnm);
2469             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2470                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2471         }
2472
2473         /* Let's get rid of the temporary benchmark input files */
2474         for (i = 0; i < ntprs; i++)
2475         {
2476             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2477             remove(tpr_names[i]);
2478         }
2479
2480         /* Now start the real simulation if the user requested it ... */
2481         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2482                           cmd_args_launch, simulation_tpr, best_npme);
2483     }
2484     ffclose(fp);
2485
2486     /* ... or simply print the performance results to screen: */
2487     if (!bLaunch)
2488     {
2489         finalize(opt2fn("-p", NFILE, fnm));
2490     }
2491
2492     return 0;
2493 }