src/gromacs/gmxana/gmx_tune_pme.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39 #include <time.h>
  40 #ifdef HAVE_SYS_TIME_H
  41 #include <sys/time.h>
  42 #endif
  43
  44 #include "gromacs/commandline/pargs.h"
  45 #include "typedefs.h"
  46 #include "types/commrec.h"
  47 #include "gromacs/utility/smalloc.h"
  48 #include "vec.h"
  49 #include "copyrite.h"
  50 #include "gromacs/fileio/tpxio.h"
  51 #include "gromacs/utility/cstringutil.h"
  52 #include "readinp.h"
  53 #include "calcgrid.h"
  54 #include "checkpoint.h"
  55 #include "macros.h"
  56 #include "gmx_ana.h"
  57 #include "names.h"
  58 #include "perf_est.h"
  59 #include "inputrec.h"
  60 #include "gromacs/timing/walltime_accounting.h"
  61 #include "gromacs/math/utilities.h"
  62
  63 #include "gmx_fatal.h"
  64
  65 /* Enum for situations that can occur during log file parsing, the
  66  * corresponding string entries can be found in do_the_tests() in
  67  * const char* ParseLog[] */
  68 enum {
  69     eParselogOK,
  70     eParselogNotFound,
  71     eParselogNoPerfData,
  72     eParselogTerm,
  73     eParselogResetProblem,
  74     eParselogNoDDGrid,
  75     eParselogTPXVersion,
  76     eParselogNotParallel,
  77     eParselogLargePrimeFactor,
  78     eParselogFatal,
  79     eParselogNr
  80 };
  81
  82
  83 typedef struct
  84 {
  85     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  86     int     nx, ny, nz;   /* DD grid */
  87     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  88     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  89     double  Gcycles_Av;
  90     float  *ns_per_day;
  91     float   ns_per_day_Av;
  92     float  *PME_f_load;     /* PME mesh/force load average*/
  93     float   PME_f_load_Av;  /* Average average ;) ... */
  94     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  95 } t_perf;
  96
  97
  98 typedef struct
  99 {
 100     int             nr_inputfiles;  /* The number of tpr and mdp input files */
 101     gmx_int64_t     orig_sim_steps; /* Number of steps to be done in the real simulation */
 102     gmx_int64_t     orig_init_step; /* Init step for the real simulation */
 103     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 104     real           *rvdw;           /* The vdW radii */
 105     real           *rlist;          /* Neighbourlist cutoff radius */
 106     real           *rlistlong;
 107     int            *nkx, *nky, *nkz;
 108     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 109 } t_inputinfo;
 110
 111
 112 static void sep_line(FILE *fp)
 113 {
 114     fprintf(fp, "\n------------------------------------------------------------\n");
 115 }
 116
 117
 118 /* Wrapper for system calls */
 119 static int gmx_system_call(char *command)
 120 {
 121 #ifdef GMX_NO_SYSTEM
 122     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 123 #else
 124     return ( system(command) );
 125 #endif
 126 }
 127
 128
 129 /* Check if string starts with substring */
 130 static gmx_bool str_starts(const char *string, const char *substring)
 131 {
 132     return ( strncmp(string, substring, strlen(substring)) == 0);
 133 }
 134
 135
 136 static void cleandata(t_perf *perfdata, int test_nr)
 137 {
 138     perfdata->Gcycles[test_nr]    = 0.0;
 139     perfdata->ns_per_day[test_nr] = 0.0;
 140     perfdata->PME_f_load[test_nr] = 0.0;
 141
 142     return;
 143 }
 144
 145
 146 static gmx_bool is_equal(real a, real b)
 147 {
 148     real diff, eps = 1.0e-7;
 149
 150
 151     diff = a - b;
 152
 153     if (diff < 0.0)
 154     {
 155         diff = -diff;
 156     }
 157
 158     if (diff < eps)
 159     {
 160         return TRUE;
 161     }
 162     else
 163     {
 164         return FALSE;
 165     }
 166 }
 167
 168
 169 static void remove_if_exists(const char *fn)
 170 {
 171     if (gmx_fexist(fn))
 172     {
 173         fprintf(stdout, "Deleting %s\n", fn);
 174         remove(fn);
 175     }
 176 }
 177
 178
 179 static void finalize(const char *fn_out)
 180 {
 181     char  buf[STRLEN];
 182     FILE *fp;
 183
 184
 185     fp = fopen(fn_out, "r");
 186     fprintf(stdout, "\n\n");
 187
 188     while (fgets(buf, STRLEN-1, fp) != NULL)
 189     {
 190         fprintf(stdout, "%s", buf);
 191     }
 192     fclose(fp);
 193     fprintf(stdout, "\n\n");
 194 }
 195
 196
 197 enum {
 198     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 199 };
 200
 201 static int parse_logfile(const char *logfile, const char *errfile,
 202                          t_perf *perfdata, int test_nr, int presteps, gmx_int64_t cpt_steps,
 203                          int nnodes)
 204 {
 205     FILE           *fp;
 206     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 207     const char      matchstrdd[]  = "Domain decomposition grid";
 208     const char      matchstrcr[]  = "resetting all time and cycle counters";
 209     const char      matchstrbal[] = "Average PME mesh/force load:";
 210     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 211     const char      errSIG[]      = "signal, stopping at the next";
 212     int             iFound;
 213     float           dum1, dum2, dum3, dum4;
 214     int             ndum;
 215     int             npme;
 216     gmx_int64_t     resetsteps     = -1;
 217     gmx_bool        bFoundResetStr = FALSE;
 218     gmx_bool        bResetChecked  = FALSE;
 219
 220
 221     if (!gmx_fexist(logfile))
 222     {
 223         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 224         cleandata(perfdata, test_nr);
 225         return eParselogNotFound;
 226     }
 227
 228     fp = fopen(logfile, "r");
 229     perfdata->PME_f_load[test_nr] = -1.0;
 230     perfdata->guessPME            = -1;
 231
 232     iFound = eFoundNothing;
 233     if (1 == nnodes)
 234     {
 235         iFound = eFoundDDStr; /* Skip some case statements */
 236     }
 237
 238     while (fgets(line, STRLEN, fp) != NULL)
 239     {
 240         /* Remove leading spaces */
 241         ltrim(line);
 242
 243         /* Check for TERM and INT signals from user: */
 244         if (strstr(line, errSIG) != NULL)
 245         {
 246             fclose(fp);
 247             cleandata(perfdata, test_nr);
 248             return eParselogTerm;
 249         }
 250
 251         /* Check whether cycle resetting  worked */
 252         if (presteps > 0 && !bFoundResetStr)
 253         {
 254             if (strstr(line, matchstrcr) != NULL)
 255             {
 256                 sprintf(dumstring, "step %s", "%"GMX_SCNd64);
 257                 sscanf(line, dumstring, &resetsteps);
 258                 bFoundResetStr = TRUE;
 259                 if (resetsteps == presteps+cpt_steps)
 260                 {
 261                     bResetChecked = TRUE;
 262                 }
 263                 else
 264                 {
 265                     sprintf(dumstring, "%"GMX_PRId64, resetsteps);
 266                     sprintf(dumstring2, "%"GMX_PRId64, presteps+cpt_steps);
 267                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 268                             "         though they were supposed to be reset at step %s!\n",
 269                             dumstring, dumstring2);
 270                 }
 271             }
 272         }
 273
 274         /* Look for strings that appear in a certain order in the log file: */
 275         switch (iFound)
 276         {
 277             case eFoundNothing:
 278                 /* Look for domain decomp grid and separate PME nodes: */
 279                 if (str_starts(line, matchstrdd))
 280                 {
 281                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
 282                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 283                     if (perfdata->nPMEnodes == -1)
 284                     {
 285                         perfdata->guessPME = npme;
 286                     }
 287                     else if (perfdata->nPMEnodes != npme)
 288                     {
 289                         gmx_fatal(FARGS, "PME ranks from command line and output file are not identical");
 290                     }
 291                     iFound = eFoundDDStr;
 292                 }
 293                 /* Catch a few errors that might have occured: */
 294                 else if (str_starts(line, "There is no domain decomposition for"))
 295                 {
 296                     fclose(fp);
 297                     return eParselogNoDDGrid;
 298                 }
 299                 else if (str_starts(line, "The number of ranks you selected"))
 300                 {
 301                     fclose(fp);
 302                     return eParselogLargePrimeFactor;
 303                 }
 304                 else if (str_starts(line, "reading tpx file"))
 305                 {
 306                     fclose(fp);
 307                     return eParselogTPXVersion;
 308                 }
 309                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 310                 {
 311                     fclose(fp);
 312                     return eParselogNotParallel;
 313                 }
 314                 break;
 315             case eFoundDDStr:
 316                 /* Look for PME mesh/force balance (not necessarily present, though) */
 317                 if (str_starts(line, matchstrbal))
 318                 {
 319                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 320                 }
 321                 /* Look for matchstring */
 322                 if (str_starts(line, matchstring))
 323                 {
 324                     iFound = eFoundAccountingStr;
 325                 }
 326                 break;
 327             case eFoundAccountingStr:
 328                 /* Already found matchstring - look for cycle data */
 329                 if (str_starts(line, "Total  "))
 330                 {
 331                     sscanf(line, "Total %*f %lf", &(perfdata->Gcycles[test_nr]));
 332                     iFound = eFoundCycleStr;
 333                 }
 334                 break;
 335             case eFoundCycleStr:
 336                 /* Already found cycle data - look for remaining performance info and return */
 337                 if (str_starts(line, "Performance:"))
 338                 {
 339                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 340                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 341                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 342                     fclose(fp);
 343                     if (bResetChecked || presteps == 0)
 344                     {
 345                         return eParselogOK;
 346                     }
 347                     else
 348                     {
 349                         return eParselogResetProblem;
 350                     }
 351                 }
 352                 break;
 353         }
 354     } /* while */
 355
 356     /* Close the log file */
 357     fclose(fp);
 358
 359     /* Check why there is no performance data in the log file.
 360      * Did a fatal errors occur? */
 361     if (gmx_fexist(errfile))
 362     {
 363         fp = fopen(errfile, "r");
 364         while (fgets(line, STRLEN, fp) != NULL)
 365         {
 366             if (str_starts(line, "Fatal error:") )
 367             {
 368                 if (fgets(line, STRLEN, fp) != NULL)
 369                 {
 370                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 371                             "%s\n", line);
 372                 }
 373                 fclose(fp);
 374                 cleandata(perfdata, test_nr);
 375                 return eParselogFatal;
 376             }
 377         }
 378         fclose(fp);
 379     }
 380     else
 381     {
 382         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 383     }
 384
 385     /* Giving up ... we could not find out why there is no performance data in
 386      * the log file. */
 387     fprintf(stdout, "No performance data in log file.\n");
 388     cleandata(perfdata, test_nr);
 389
 390     return eParselogNoPerfData;
 391 }
 392
 393
 394 static gmx_bool analyze_data(
 395         FILE         *fp,
 396         const char   *fn,
 397         t_perf      **perfdata,
 398         int           nnodes,
 399         int           ntprs,
 400         int           ntests,
 401         int           nrepeats,
 402         t_inputinfo  *info,
 403         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 404         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 405 {
 406     int      i, j, k;
 407     int      line  = 0, line_win = -1;
 408     int      k_win = -1, i_win = -1, winPME;
 409     double   s     = 0.0; /* standard deviation */
 410     t_perf  *pd;
 411     char     strbuf[STRLEN];
 412     char     str_PME_f_load[13];
 413     gmx_bool bCanUseOrigTPR;
 414     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 415
 416
 417     if (nrepeats > 1)
 418     {
 419         sep_line(fp);
 420         fprintf(fp, "Summary of successful runs:\n");
 421         fprintf(fp, "Line tpr PME ranks  Gcycles Av.     Std.dev.       ns/day        PME/f");
 422         if (nnodes > 1)
 423         {
 424             fprintf(fp, "    DD grid");
 425         }
 426         fprintf(fp, "\n");
 427     }
 428
 429
 430     for (k = 0; k < ntprs; k++)
 431     {
 432         for (i = 0; i < ntests; i++)
 433         {
 434             /* Select the right dataset: */
 435             pd = &(perfdata[k][i]);
 436
 437             pd->Gcycles_Av    = 0.0;
 438             pd->PME_f_load_Av = 0.0;
 439             pd->ns_per_day_Av = 0.0;
 440
 441             if (pd->nPMEnodes == -1)
 442             {
 443                 sprintf(strbuf, "(%3d)", pd->guessPME);
 444             }
 445             else
 446             {
 447                 sprintf(strbuf, "     ");
 448             }
 449
 450             /* Get the average run time of a setting */
 451             for (j = 0; j < nrepeats; j++)
 452             {
 453                 pd->Gcycles_Av    += pd->Gcycles[j];
 454                 pd->PME_f_load_Av += pd->PME_f_load[j];
 455             }
 456             pd->Gcycles_Av    /= nrepeats;
 457             pd->PME_f_load_Av /= nrepeats;
 458
 459             for (j = 0; j < nrepeats; j++)
 460             {
 461                 if (pd->ns_per_day[j] > 0.0)
 462                 {
 463                     pd->ns_per_day_Av += pd->ns_per_day[j];
 464                 }
 465                 else
 466                 {
 467                     /* Somehow the performance number was not aquired for this run,
 468                      * therefor set the average to some negative value: */
 469                     pd->ns_per_day_Av = -1.0f*nrepeats;
 470                     break;
 471                 }
 472             }
 473             pd->ns_per_day_Av /= nrepeats;
 474
 475             /* Nicer output: */
 476             if (pd->PME_f_load_Av > 0.0)
 477             {
 478                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 479             }
 480             else
 481             {
 482                 sprintf(str_PME_f_load, "%s", "         -  ");
 483             }
 484
 485
 486             /* We assume we had a successful run if both averages are positive */
 487             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 488             {
 489                 /* Output statistics if repeats were done */
 490                 if (nrepeats > 1)
 491                 {
 492                     /* Calculate the standard deviation */
 493                     s = 0.0;
 494                     for (j = 0; j < nrepeats; j++)
 495                     {
 496                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 497                     }
 498                     s /= (nrepeats - 1);
 499                     s  = sqrt(s);
 500
 501                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 502                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 503                             pd->ns_per_day_Av, str_PME_f_load);
 504                     if (nnodes > 1)
 505                     {
 506                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 507                     }
 508                     fprintf(fp, "\n");
 509                 }
 510                 /* Store the index of the best run found so far in 'winner': */
 511                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 512                 {
 513                     k_win    = k;
 514                     i_win    = i;
 515                     line_win = line;
 516                 }
 517                 line++;
 518             }
 519         }
 520     }
 521
 522     if (k_win == -1)
 523     {
 524         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 525     }
 526
 527     sep_line(fp);
 528
 529     winPME = perfdata[k_win][i_win].nPMEnodes;
 530
 531     if (1 == ntests)
 532     {
 533         /* We stuck to a fixed number of PME-only nodes */
 534         sprintf(strbuf, "settings No. %d", k_win);
 535     }
 536     else
 537     {
 538         /* We have optimized the number of PME-only nodes */
 539         if (winPME == -1)
 540         {
 541             sprintf(strbuf, "%s", "the automatic number of PME ranks");
 542         }
 543         else
 544         {
 545             sprintf(strbuf, "%d PME ranks", winPME);
 546         }
 547     }
 548     fprintf(fp, "Best performance was achieved with %s", strbuf);
 549     if ((nrepeats > 1) && (ntests > 1))
 550     {
 551         fprintf(fp, " (see line %d)", line_win);
 552     }
 553     fprintf(fp, "\n");
 554
 555     /* Only mention settings if they were modified: */
 556     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 557     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 558     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 559                      info->nky[k_win] == info->nky[0] &&
 560                      info->nkz[k_win] == info->nkz[0]);
 561
 562     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 563     {
 564         fprintf(fp, "Optimized PME settings:\n");
 565         bCanUseOrigTPR = FALSE;
 566     }
 567     else
 568     {
 569         bCanUseOrigTPR = TRUE;
 570     }
 571
 572     if (bRefinedCoul)
 573     {
 574         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 575     }
 576
 577     if (bRefinedVdW)
 578     {
 579         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 580     }
 581
 582     if (bRefinedGrid)
 583     {
 584         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 585                 info->nkx[0], info->nky[0], info->nkz[0]);
 586     }
 587
 588     if (bCanUseOrigTPR && ntprs > 1)
 589     {
 590         fprintf(fp, "and original PME settings.\n");
 591     }
 592
 593     fflush(fp);
 594
 595     /* Return the index of the mdp file that showed the highest performance
 596      * and the optimal number of PME nodes */
 597     *index_tpr    = k_win;
 598     *npme_optimal = winPME;
 599
 600     return bCanUseOrigTPR;
 601 }
 602
 603
 604 /* Get the commands we need to set up the runs from environment variables */
 605 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char *cmd_mdrun[])
 606 {
 607     char      *cp;
 608     FILE      *fp;
 609     const char def_mpirun[]   = "mpirun";
 610     const char def_mdrun[]    = "mdrun";
 611
 612     const char empty_mpirun[] = "";
 613
 614     /* Get the commands we need to set up the runs from environment variables */
 615     if (!bThreads)
 616     {
 617         if ( (cp = getenv("MPIRUN")) != NULL)
 618         {
 619             *cmd_mpirun = strdup(cp);
 620         }
 621         else
 622         {
 623             *cmd_mpirun = strdup(def_mpirun);
 624         }
 625     }
 626     else
 627     {
 628         *cmd_mpirun = strdup(empty_mpirun);
 629     }
 630
 631     if ( (cp = getenv("MDRUN" )) != NULL)
 632     {
 633         *cmd_mdrun  = strdup(cp);
 634     }
 635     else
 636     {
 637         *cmd_mdrun  = strdup(def_mdrun);
 638     }
 639 }
 640
 641 /* Check that the commands will run mdrun (perhaps via mpirun) by
 642  * running a very quick test simulation. Requires MPI environment to
 643  * be available if applicable. */
 644 static void check_mdrun_works(gmx_bool    bThreads,
 645                               const char *cmd_mpirun,
 646                               const char *cmd_np,
 647                               const char *cmd_mdrun)
 648 {
 649     char      *command = NULL;
 650     char      *cp;
 651     char       line[STRLEN];
 652     FILE      *fp;
 653     const char filename[]     = "benchtest.log";
 654
 655     /* This string should always be identical to the one in copyrite.c,
 656      * gmx_print_version_info() in the defined(GMX_MPI) section */
 657     const char match_mpi[]    = "MPI library:        MPI";
 658     const char match_mdrun[]  = "Executable: ";
 659     gmx_bool   bMdrun         = FALSE;
 660     gmx_bool   bMPI           = FALSE;
 661
 662     /* Run a small test to see whether mpirun + mdrun work  */
 663     fprintf(stdout, "Making sure that mdrun can be executed. ");
 664     if (bThreads)
 665     {
 666         snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 667         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 668     }
 669     else
 670     {
 671         snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 672         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 673     }
 674     fprintf(stdout, "Trying '%s' ... ", command);
 675     make_backup(filename);
 676     gmx_system_call(command);
 677
 678     /* Check if we find the characteristic string in the output: */
 679     if (!gmx_fexist(filename))
 680     {
 681         gmx_fatal(FARGS, "Output from test run could not be found.");
 682     }
 683
 684     fp = fopen(filename, "r");
 685     /* We need to scan the whole output file, since sometimes the queuing system
 686      * also writes stuff to stdout/err */
 687     while (!feof(fp) )
 688     {
 689         cp = fgets(line, STRLEN, fp);
 690         if (cp != NULL)
 691         {
 692             if (str_starts(line, match_mdrun) )
 693             {
 694                 bMdrun = TRUE;
 695             }
 696             if (str_starts(line, match_mpi) )
 697             {
 698                 bMPI = TRUE;
 699             }
 700         }
 701     }
 702     fclose(fp);
 703
 704     if (bThreads)
 705     {
 706         if (bMPI)
 707         {
 708             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 709                       "(%s)\n"
 710                       "seems to have been compiled with MPI instead.",
 711                       cmd_mdrun);
 712         }
 713     }
 714     else
 715     {
 716         if (bMdrun && !bMPI)
 717         {
 718             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 719                       "(%s)\n"
 720                       "seems to have been compiled without MPI support.",
 721                       cmd_mdrun);
 722         }
 723     }
 724
 725     if (!bMdrun)
 726     {
 727         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 728                   filename);
 729     }
 730
 731     fprintf(stdout, "passed.\n");
 732
 733     /* Clean up ... */
 734     remove(filename);
 735     sfree(command);
 736 }
 737
 738
 739 static void launch_simulation(
 740         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 741         FILE       *fp,             /* General log file */
 742         gmx_bool    bThreads,       /* whether to use threads */
 743         char       *cmd_mpirun,     /* Command for mpirun */
 744         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 745         char       *cmd_mdrun,      /* Command for mdrun */
 746         char       *args_for_mdrun, /* Arguments for mdrun */
 747         const char *simulation_tpr, /* This tpr will be simulated */
 748         int         nPMEnodes)      /* Number of PME nodes to use */
 749 {
 750     char  *command;
 751
 752
 753     /* Make enough space for the system call command,
 754      * (100 extra chars for -npme ... etc. options should suffice): */
 755     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 756
 757     /* Note that the -passall options requires args_for_mdrun to be at the end
 758      * of the command line string */
 759     if (bThreads)
 760     {
 761         sprintf(command, "%s%s-npme %d -s %s %s",
 762                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 763     }
 764     else
 765     {
 766         sprintf(command, "%s%s%s -npme %d -s %s %s",
 767                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 768     }
 769
 770     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 771     sep_line(fp);
 772     fflush(fp);
 773
 774     /* Now the real thing! */
 775     if (bLaunch)
 776     {
 777         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 778         sep_line(stdout);
 779         fflush(stdout);
 780         gmx_system_call(command);
 781     }
 782 }
 783
 784
 785 static void modify_PMEsettings(
 786         gmx_int64_t     simsteps,    /* Set this value as number of time steps */
 787         gmx_int64_t     init_step,   /* Set this value as init_step */
 788         const char     *fn_best_tpr, /* tpr file with the best performance */
 789         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 790 {
 791     t_inputrec   *ir;
 792     t_state       state;
 793     gmx_mtop_t    mtop;
 794     char          buf[200];
 795
 796     snew(ir, 1);
 797     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 798
 799     /* Reset nsteps and init_step to the value of the input .tpr file */
 800     ir->nsteps    = simsteps;
 801     ir->init_step = init_step;
 802
 803     /* Write the tpr file which will be launched */
 804     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, "%"GMX_PRId64);
 805     fprintf(stdout, buf, ir->nsteps);
 806     fflush(stdout);
 807     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 808
 809     sfree(ir);
 810 }
 811
 812
 813 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 814
 815 /* Make additional TPR files with more computational load for the
 816  * direct space processors: */
 817 static void make_benchmark_tprs(
 818         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 819         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 820         gmx_int64_t     benchsteps,      /* Number of time steps for benchmark runs       */
 821         gmx_int64_t     statesteps,      /* Step counter in checkpoint file               */
 822         real            rmin,            /* Minimal Coulomb radius                        */
 823         real            rmax,            /* Maximal Coulomb radius                        */
 824         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 825         int            *ntprs,           /* No. of TPRs to write, each with a different
 826                                             rcoulomb and fourierspacing                   */
 827         t_inputinfo    *info,            /* Contains information about mdp file options   */
 828         FILE           *fp)              /* Write the output here                         */
 829 {
 830     int           i, j, d;
 831     t_inputrec   *ir;
 832     t_state       state;
 833     gmx_mtop_t    mtop;
 834     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 835     char          buf[200];
 836     rvec          box_size;
 837     gmx_bool      bNote = FALSE;
 838     real          add;              /* Add this to rcoul for the next test    */
 839     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 840     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 841
 842
 843     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 844             *ntprs > 1 ? "s" : "", "%"GMX_PRId64, benchsteps > 1 ? "s" : "");
 845     fprintf(stdout, buf, benchsteps);
 846     if (statesteps > 0)
 847     {
 848         sprintf(buf, " (adding %s steps from checkpoint file)", "%"GMX_PRId64);
 849         fprintf(stdout, buf, statesteps);
 850         benchsteps += statesteps;
 851     }
 852     fprintf(stdout, ".\n");
 853
 854
 855     snew(ir, 1);
 856     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 857
 858     /* Check if some kind of PME was chosen */
 859     if (EEL_PME(ir->coulombtype) == FALSE)
 860     {
 861         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 862                   EELTYPE(eelPME));
 863     }
 864
 865     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 866     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 867           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 868     {
 869         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 870                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 871     }
 872     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 873     else if (ir->rcoulomb > ir->rlist)
 874     {
 875         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 876                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 877     }
 878
 879     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 880     {
 881         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 882         bScaleRvdw = FALSE;
 883     }
 884
 885     /* Reduce the number of steps for the benchmarks */
 886     info->orig_sim_steps = ir->nsteps;
 887     ir->nsteps           = benchsteps;
 888     /* We must not use init_step from the input tpr file for the benchmarks */
 889     info->orig_init_step = ir->init_step;
 890     ir->init_step        = 0;
 891
 892     /* For PME-switch potentials, keep the radial distance of the buffer region */
 893     nlist_buffer   = ir->rlist - ir->rcoulomb;
 894
 895     /* Determine length of triclinic box vectors */
 896     for (d = 0; d < DIM; d++)
 897     {
 898         box_size[d] = 0;
 899         for (i = 0; i < DIM; i++)
 900         {
 901             box_size[d] += state.box[d][i]*state.box[d][i];
 902         }
 903         box_size[d] = sqrt(box_size[d]);
 904     }
 905
 906     if (ir->fourier_spacing > 0)
 907     {
 908         info->fsx[0] = ir->fourier_spacing;
 909         info->fsy[0] = ir->fourier_spacing;
 910         info->fsz[0] = ir->fourier_spacing;
 911     }
 912     else
 913     {
 914         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 915         info->fsx[0] = box_size[XX]/ir->nkx;
 916         info->fsy[0] = box_size[YY]/ir->nky;
 917         info->fsz[0] = box_size[ZZ]/ir->nkz;
 918     }
 919
 920     /* If no value for the fourierspacing was provided on the command line, we
 921      * use the reconstruction from the tpr file */
 922     if (ir->fourier_spacing > 0)
 923     {
 924         /* Use the spacing from the tpr */
 925         fourierspacing = ir->fourier_spacing;
 926     }
 927     else
 928     {
 929         /* Use the maximum observed spacing */
 930         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 931     }
 932
 933     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 934
 935     /* For performance comparisons the number of particles is useful to have */
 936     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 937
 938     /* Print information about settings of which some are potentially modified: */
 939     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 940     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 941             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 942     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 943     if (ir_vdw_switched(ir))
 944     {
 945         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 946     }
 947     if (EPME_SWITCHED(ir->coulombtype))
 948     {
 949         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 950     }
 951     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 952     {
 953         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 954     }
 955
 956     /* Print a descriptive line about the tpr settings tested */
 957     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 958     fprintf(fp, " No.   scaling  rcoulomb");
 959     fprintf(fp, "  nkx  nky  nkz");
 960     fprintf(fp, "   spacing");
 961     if (evdwCUT == ir->vdwtype)
 962     {
 963         fprintf(fp, "      rvdw");
 964     }
 965     if (EPME_SWITCHED(ir->coulombtype))
 966     {
 967         fprintf(fp, "     rlist");
 968     }
 969     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 970     {
 971         fprintf(fp, " rlistlong");
 972     }
 973     fprintf(fp, "  tpr file\n");
 974
 975     /* Loop to create the requested number of tpr input files */
 976     for (j = 0; j < *ntprs; j++)
 977     {
 978         /* The first .tpr is the provided one, just need to modify nsteps,
 979          * so skip the following block */
 980         if (j != 0)
 981         {
 982             /* Determine which Coulomb radii rc to use in the benchmarks */
 983             add = (rmax-rmin)/(*ntprs-1);
 984             if (is_equal(rmin, info->rcoulomb[0]))
 985             {
 986                 ir->rcoulomb = rmin + j*add;
 987             }
 988             else if (is_equal(rmax, info->rcoulomb[0]))
 989             {
 990                 ir->rcoulomb = rmin + (j-1)*add;
 991             }
 992             else
 993             {
 994                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 995                 add          = (rmax-rmin)/(*ntprs-2);
 996                 ir->rcoulomb = rmin + (j-1)*add;
 997             }
 998
 999             /* Determine the scaling factor fac */
1000             fac = ir->rcoulomb/info->rcoulomb[0];
1001
1002             /* Scale the Fourier grid spacing */
1003             ir->nkx = ir->nky = ir->nkz = 0;
1004             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
1005
1006             /* Adjust other radii since various conditions need to be fulfilled */
1007             if (eelPME == ir->coulombtype)
1008             {
1009                 /* plain PME, rcoulomb must be equal to rlist */
1010                 ir->rlist = ir->rcoulomb;
1011             }
1012             else
1013             {
1014                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1015                 ir->rlist = ir->rcoulomb + nlist_buffer;
1016             }
1017
1018             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1019             {
1020                 if (ecutsVERLET == ir->cutoff_scheme)
1021                 {
1022                     /* With Verlet, the van der Waals radius must always equal the Coulomb radius */
1023                     ir->rvdw = ir->rcoulomb;
1024                 }
1025                 else
1026                 {
1027                     /* For vdw cutoff, rvdw >= rlist */
1028                     ir->rvdw = max(info->rvdw[0], ir->rlist);
1029                 }
1030             }
1031
1032             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1033
1034         } /* end of "if (j != 0)" */
1035
1036         /* for j==0: Save the original settings
1037          * for j >0: Save modified radii and Fourier grids */
1038         info->rcoulomb[j]  = ir->rcoulomb;
1039         info->rvdw[j]      = ir->rvdw;
1040         info->nkx[j]       = ir->nkx;
1041         info->nky[j]       = ir->nky;
1042         info->nkz[j]       = ir->nkz;
1043         info->rlist[j]     = ir->rlist;
1044         info->rlistlong[j] = ir->rlistlong;
1045         info->fsx[j]       = fac*fourierspacing;
1046         info->fsy[j]       = fac*fourierspacing;
1047         info->fsz[j]       = fac*fourierspacing;
1048
1049         /* Write the benchmark tpr file */
1050         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1051         sprintf(buf, "_bench%.2d.tpr", j);
1052         strcat(fn_bench_tprs[j], buf);
1053         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1054         fprintf(stdout, "%"GMX_PRId64, ir->nsteps);
1055         if (j > 0)
1056         {
1057             fprintf(stdout, ", scaling factor %f\n", fac);
1058         }
1059         else
1060         {
1061             fprintf(stdout, ", unmodified settings\n");
1062         }
1063
1064         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1065
1066         /* Write information about modified tpr settings to log file */
1067         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1068         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1069         fprintf(fp, " %9f ", info->fsx[j]);
1070         if (evdwCUT == ir->vdwtype)
1071         {
1072             fprintf(fp, "%10f", ir->rvdw);
1073         }
1074         if (EPME_SWITCHED(ir->coulombtype))
1075         {
1076             fprintf(fp, "%10f", ir->rlist);
1077         }
1078         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1079         {
1080             fprintf(fp, "%10f", ir->rlistlong);
1081         }
1082         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1083
1084         /* Make it clear to the user that some additional settings were modified */
1085         if (!is_equal(ir->rvdw, info->rvdw[0])
1086             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1087         {
1088             bNote = TRUE;
1089         }
1090     }
1091     if (bNote)
1092     {
1093         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1094                 "other input settings were also changed (see table above).\n"
1095                 "Please check if the modified settings are appropriate.\n");
1096     }
1097     fflush(stdout);
1098     fflush(fp);
1099     sfree(ir);
1100 }
1101
1102
1103 /* Rename the files we want to keep to some meaningful filename and
1104  * delete the rest */
1105 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1106                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1107 {
1108     char        numstring[STRLEN];
1109     char        newfilename[STRLEN];
1110     const char *fn = NULL;
1111     int         i;
1112     const char *opt;
1113
1114
1115     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1116
1117     for (i = 0; i < nfile; i++)
1118     {
1119         opt = (char *)fnm[i].opt;
1120         if (strcmp(opt, "-p") == 0)
1121         {
1122             /* do nothing; keep this file */
1123             ;
1124         }
1125         else if (strcmp(opt, "-bg") == 0)
1126         {
1127             /* Give the log file a nice name so one can later see which parameters were used */
1128             numstring[0] = '\0';
1129             if (nr > 0)
1130             {
1131                 sprintf(numstring, "_%d", nr);
1132             }
1133             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1134             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1135             {
1136                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1137                 make_backup(newfilename);
1138                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1139             }
1140         }
1141         else if (strcmp(opt, "-err") == 0)
1142         {
1143             /* This file contains the output of stderr. We want to keep it in
1144              * cases where there have been problems. */
1145             fn           = opt2fn(opt, nfile, fnm);
1146             numstring[0] = '\0';
1147             if (nr > 0)
1148             {
1149                 sprintf(numstring, "_%d", nr);
1150             }
1151             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1152             if (gmx_fexist(fn))
1153             {
1154                 if (bKeepStderr)
1155                 {
1156                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1157                     make_backup(newfilename);
1158                     rename(fn, newfilename);
1159                 }
1160                 else
1161                 {
1162                     fprintf(stdout, "Deleting %s\n", fn);
1163                     remove(fn);
1164                 }
1165             }
1166         }
1167         /* Delete the files which are created for each benchmark run: (options -b*) */
1168         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1169         {
1170             remove_if_exists(opt2fn(opt, nfile, fnm));
1171         }
1172     }
1173 }
1174
1175
1176 enum {
1177     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1178 };
1179
1180 /* Create a list of numbers of PME nodes to test */
1181 static void make_npme_list(
1182         const char *npmevalues_opt, /* Make a complete list with all
1183                                      * possibilities or a short list that keeps only
1184                                      * reasonable numbers of PME nodes                  */
1185         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1186         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1187         int         nnodes,         /* Total number of nodes to do the tests on         */
1188         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1189         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1190 {
1191     int i, npme, npp;
1192     int min_factor = 1;   /* We request that npp and npme have this minimal
1193                            * largest common factor (depends on npp)           */
1194     int nlistmax;         /* Max. list size                                   */
1195     int nlist;            /* Actual number of entries in list                 */
1196     int eNPME = 0;
1197
1198
1199     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1200     if (0 == strcmp(npmevalues_opt, "all") )
1201     {
1202         eNPME = eNpmeAll;
1203     }
1204     else if (0 == strcmp(npmevalues_opt, "subset") )
1205     {
1206         eNPME = eNpmeSubset;
1207     }
1208     else /* "auto" or "range" */
1209     {
1210         if (nnodes <= 64)
1211         {
1212             eNPME = eNpmeAll;
1213         }
1214         else if (nnodes < 128)
1215         {
1216             eNPME = eNpmeReduced;
1217         }
1218         else
1219         {
1220             eNPME = eNpmeSubset;
1221         }
1222     }
1223
1224     /* Calculate how many entries we could possibly have (in case of -npme all) */
1225     if (nnodes > 2)
1226     {
1227         nlistmax = maxPMEnodes - minPMEnodes + 3;
1228         if (0 == minPMEnodes)
1229         {
1230             nlistmax--;
1231         }
1232     }
1233     else
1234     {
1235         nlistmax = 1;
1236     }
1237
1238     /* Now make the actual list which is at most of size nlist */
1239     snew(*nPMEnodes, nlistmax);
1240     nlist = 0; /* start counting again, now the real entries in the list */
1241     for (i = 0; i < nlistmax - 2; i++)
1242     {
1243         npme = maxPMEnodes - i;
1244         npp  = nnodes-npme;
1245         switch (eNPME)
1246         {
1247             case eNpmeAll:
1248                 min_factor = 1;
1249                 break;
1250             case eNpmeReduced:
1251                 min_factor = 2;
1252                 break;
1253             case eNpmeSubset:
1254                 /* For 2d PME we want a common largest factor of at least the cube
1255                  * root of the number of PP nodes */
1256                 min_factor = (int) pow(npp, 1.0/3.0);
1257                 break;
1258             default:
1259                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1260                 break;
1261         }
1262         if (gmx_greatest_common_divisor(npp, npme) >= min_factor)
1263         {
1264             (*nPMEnodes)[nlist] = npme;
1265             nlist++;
1266         }
1267     }
1268     /* We always test 0 PME nodes and the automatic number */
1269     *nentries             = nlist + 2;
1270     (*nPMEnodes)[nlist  ] =  0;
1271     (*nPMEnodes)[nlist+1] = -1;
1272
1273     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1274     for (i = 0; i < *nentries-1; i++)
1275     {
1276         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1277     }
1278     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1279 }
1280
1281
1282 /* Allocate memory to store the performance data */
1283 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1284 {
1285     int i, j, k;
1286
1287
1288     for (k = 0; k < ntprs; k++)
1289     {
1290         snew(perfdata[k], datasets);
1291         for (i = 0; i < datasets; i++)
1292         {
1293             for (j = 0; j < repeats; j++)
1294             {
1295                 snew(perfdata[k][i].Gcycles, repeats);
1296                 snew(perfdata[k][i].ns_per_day, repeats);
1297                 snew(perfdata[k][i].PME_f_load, repeats);
1298             }
1299         }
1300     }
1301 }
1302
1303
1304 /* Check for errors on mdrun -h */
1305 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp,
1306                               const t_filenm *fnm, int nfile)
1307 {
1308     const char *fn = NULL;
1309     char       *command, *msg;
1310     int         ret;
1311
1312
1313     snew(command, length +  15);
1314     snew(msg, length + 500);
1315
1316     fprintf(stdout, "Making sure the benchmarks can be executed by running just 1 step...\n");
1317     sprintf(command, "%s -nsteps 1 -quiet", mdrun_cmd_line);
1318     fprintf(stdout, "Executing '%s' ...\n", command);
1319     ret = gmx_system_call(command);
1320
1321     if (0 != ret)
1322     {
1323         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1324          * get the error message from mdrun itself */
1325         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1326                 "mdrun for the source of the problem. Did you provide a command line\n"
1327                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1328                 "\n%s\n\n", command);
1329
1330         fprintf(stderr, "%s", msg);
1331         sep_line(fp);
1332         fprintf(fp, "%s", msg);
1333
1334         exit(ret);
1335     }
1336     fprintf(stdout, "Benchmarks can be executed!\n");
1337
1338     /* Clean up the benchmark output files we just created */
1339     fprintf(stdout, "Cleaning up ...\n");
1340     remove_if_exists(opt2fn("-bc", nfile, fnm));
1341     remove_if_exists(opt2fn("-be", nfile, fnm));
1342     remove_if_exists(opt2fn("-bcpo", nfile, fnm));
1343     remove_if_exists(opt2fn("-bg", nfile, fnm));
1344
1345     sfree(command);
1346     sfree(msg    );
1347 }
1348
1349
1350 static void do_the_tests(
1351         FILE           *fp,             /* General g_tune_pme output file         */
1352         char          **tpr_names,      /* Filenames of the input files to test   */
1353         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1354         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1355         int             npme_fixed,     /* If >= -1, test fixed number of PME
1356                                          * nodes only                             */
1357         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1358         t_perf        **perfdata,       /* Here the performace data is stored     */
1359         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1360         int             repeats,        /* Repeat each test this often            */
1361         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1362         int             nr_tprs,        /* Total number of tpr files to test      */
1363         gmx_bool        bThreads,       /* Threads or MPI?                        */
1364         char           *cmd_mpirun,     /* mpirun command string                  */
1365         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1366         char           *cmd_mdrun,      /* mdrun command string                   */
1367         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1368         const t_filenm *fnm,            /* List of filenames from command line    */
1369         int             nfile,          /* Number of files specified on the cmdl. */
1370         int             presteps,       /* DLB equilibration steps, is checked    */
1371         gmx_int64_t     cpt_steps)      /* Time step counter in the checkpoint    */
1372 {
1373     int      i, nr, k, ret, count = 0, totaltests;
1374     int     *nPMEnodes = NULL;
1375     t_perf  *pd        = NULL;
1376     int      cmdline_length;
1377     char    *command, *cmd_stub;
1378     char     buf[STRLEN];
1379     gmx_bool bResetProblem = FALSE;
1380     gmx_bool bFirst        = TRUE;
1381
1382
1383     /* This string array corresponds to the eParselog enum type at the start
1384      * of this file */
1385     const char* ParseLog[] = {
1386         "OK.",
1387         "Logfile not found!",
1388         "No timings, logfile truncated?",
1389         "Run was terminated.",
1390         "Counters were not reset properly.",
1391         "No DD grid found for these settings.",
1392         "TPX version conflict!",
1393         "mdrun was not started in parallel!",
1394         "Number of PP ranks has a prime factor that is too large.",
1395         "An error occured."
1396     };
1397     char        str_PME_f_load[13];
1398
1399
1400     /* Allocate space for the mdrun command line. 100 extra characters should
1401        be more than enough for the -npme etcetera arguments */
1402     cmdline_length =  strlen(cmd_mpirun)
1403         + strlen(cmd_np)
1404         + strlen(cmd_mdrun)
1405         + strlen(cmd_args_bench)
1406         + strlen(tpr_names[0]) + 100;
1407     snew(command, cmdline_length);
1408     snew(cmd_stub, cmdline_length);
1409
1410     /* Construct the part of the command line that stays the same for all tests: */
1411     if (bThreads)
1412     {
1413         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1414     }
1415     else
1416     {
1417         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1418     }
1419
1420     /* Create a list of numbers of PME nodes to test */
1421     if (npme_fixed < -1)
1422     {
1423         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1424                        nnodes, minPMEnodes, maxPMEnodes);
1425     }
1426     else
1427     {
1428         *pmeentries  = 1;
1429         snew(nPMEnodes, 1);
1430         nPMEnodes[0] = npme_fixed;
1431         fprintf(stderr, "Will use a fixed number of %d PME-only ranks.\n", nPMEnodes[0]);
1432     }
1433
1434     if (0 == repeats)
1435     {
1436         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1437         gmx_ffclose(fp);
1438         finalize(opt2fn("-p", nfile, fnm));
1439         exit(0);
1440     }
1441
1442     /* Allocate one dataset for each tpr input file: */
1443     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1444
1445     /*****************************************/
1446     /* Main loop over all tpr files to test: */
1447     /*****************************************/
1448     totaltests = nr_tprs*(*pmeentries)*repeats;
1449     for (k = 0; k < nr_tprs; k++)
1450     {
1451         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1452         fprintf(fp, "PME ranks      Gcycles       ns/day        PME/f    Remark\n");
1453         /* Loop over various numbers of PME nodes: */
1454         for (i = 0; i < *pmeentries; i++)
1455         {
1456             pd = &perfdata[k][i];
1457
1458             /* Loop over the repeats for each scenario: */
1459             for (nr = 0; nr < repeats; nr++)
1460             {
1461                 pd->nPMEnodes = nPMEnodes[i];
1462
1463                 /* Add -npme and -s to the command line and save it. Note that
1464                  * the -passall (if set) options requires cmd_args_bench to be
1465                  * at the end of the command line string */
1466                 snew(pd->mdrun_cmd_line, cmdline_length);
1467                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1468                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1469
1470                 /* To prevent that all benchmarks fail due to a show-stopper argument
1471                  * on the mdrun command line, we make a quick check first */
1472                 if (bFirst)
1473                 {
1474                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
1475                 }
1476                 bFirst = FALSE;
1477
1478                 /* Do a benchmark simulation: */
1479                 if (repeats > 1)
1480                 {
1481                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1482                 }
1483                 else
1484                 {
1485                     buf[0] = '\0';
1486                 }
1487                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1488                         (100.0*count)/totaltests,
1489                         k+1, nr_tprs, i+1, *pmeentries, buf);
1490                 make_backup(opt2fn("-err", nfile, fnm));
1491                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1492                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1493                 gmx_system_call(command);
1494
1495                 /* Collect the performance data from the log file; also check stderr
1496                  * for fatal errors */
1497                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1498                                     pd, nr, presteps, cpt_steps, nnodes);
1499                 if ((presteps > 0) && (ret == eParselogResetProblem))
1500                 {
1501                     bResetProblem = TRUE;
1502                 }
1503
1504                 if (-1 == pd->nPMEnodes)
1505                 {
1506                     sprintf(buf, "(%3d)", pd->guessPME);
1507                 }
1508                 else
1509                 {
1510                     sprintf(buf, "     ");
1511                 }
1512
1513                 /* Nicer output */
1514                 if (pd->PME_f_load[nr] > 0.0)
1515                 {
1516                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1517                 }
1518                 else
1519                 {
1520                     sprintf(str_PME_f_load, "%s", "         -  ");
1521                 }
1522
1523                 /* Write the data we got to disk */
1524                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1525                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1526                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1527                 {
1528                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1529                 }
1530                 fprintf(fp, "\n");
1531                 fflush(fp);
1532                 count++;
1533
1534                 /* Do some cleaning up and delete the files we do not need any more */
1535                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1536
1537                 /* If the first run with this number of processors already failed, do not try again: */
1538                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1539                 {
1540                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1541                     count += repeats-(nr+1);
1542                     break;
1543                 }
1544             } /* end of repeats loop */
1545         }     /* end of -npme loop */
1546     }         /* end of tpr file loop */
1547
1548     if (bResetProblem)
1549     {
1550         sep_line(fp);
1551         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1552         sep_line(fp);
1553     }
1554     sfree(command);
1555     sfree(cmd_stub);
1556 }
1557
1558
1559 static void check_input(
1560         int             nnodes,
1561         int             repeats,
1562         int            *ntprs,
1563         real           *rmin,
1564         real            rcoulomb,
1565         real           *rmax,
1566         real            maxPMEfraction,
1567         real            minPMEfraction,
1568         int             npme_fixed,
1569         gmx_int64_t     bench_nsteps,
1570         const t_filenm *fnm,
1571         int             nfile,
1572         int             sim_part,
1573         int             presteps,
1574         int             npargs,
1575         t_pargs        *pa)
1576 {
1577     int old;
1578
1579
1580     /* Make sure the input file exists */
1581     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1582     {
1583         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1584     }
1585
1586     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1587     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1588     {
1589         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1590                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1591     }
1592
1593     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1594     if (repeats < 0)
1595     {
1596         gmx_fatal(FARGS, "Number of repeats < 0!");
1597     }
1598
1599     /* Check number of nodes */
1600     if (nnodes < 1)
1601     {
1602         gmx_fatal(FARGS, "Number of ranks/threads must be a positive integer.");
1603     }
1604
1605     /* Automatically choose -ntpr if not set */
1606     if (*ntprs < 1)
1607     {
1608         if (nnodes < 16)
1609         {
1610             *ntprs = 1;
1611         }
1612         else
1613         {
1614             *ntprs = 3;
1615             /* Set a reasonable scaling factor for rcoulomb */
1616             if (*rmax <= 0)
1617             {
1618                 *rmax = rcoulomb * 1.2;
1619             }
1620         }
1621         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1622     }
1623     else
1624     {
1625         if (1 == *ntprs)
1626         {
1627             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1628         }
1629     }
1630
1631     /* Make shure that rmin <= rcoulomb <= rmax */
1632     if (*rmin <= 0)
1633     {
1634         *rmin = rcoulomb;
1635     }
1636     if (*rmax <= 0)
1637     {
1638         *rmax = rcoulomb;
1639     }
1640     if (!(*rmin <= *rmax) )
1641     {
1642         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1643                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1644     }
1645     /* Add test scenarios if rmin or rmax were set */
1646     if (*ntprs <= 2)
1647     {
1648         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1649         {
1650             (*ntprs)++;
1651             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1652                     *rmin, *ntprs);
1653         }
1654         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1655         {
1656             (*ntprs)++;
1657             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1658                     *rmax, *ntprs);
1659         }
1660     }
1661     old = *ntprs;
1662     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1663     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1664     {
1665         *ntprs = max(*ntprs, 2);
1666     }
1667
1668     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1669     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1670     {
1671         *ntprs = max(*ntprs, 3);
1672     }
1673
1674     if (old != *ntprs)
1675     {
1676         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1677     }
1678
1679     if (*ntprs > 1)
1680     {
1681         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1682         {
1683             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1684                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1685                     "with correspondingly adjusted PME grid settings\n");
1686             *ntprs = 1;
1687         }
1688     }
1689
1690     /* Check whether max and min fraction are within required values */
1691     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1692     {
1693         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1694     }
1695     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1696     {
1697         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1698     }
1699     if (maxPMEfraction < minPMEfraction)
1700     {
1701         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1702     }
1703
1704     /* Check whether the number of steps - if it was set - has a reasonable value */
1705     if (bench_nsteps < 0)
1706     {
1707         gmx_fatal(FARGS, "Number of steps must be positive.");
1708     }
1709
1710     if (bench_nsteps > 10000 || bench_nsteps < 100)
1711     {
1712         fprintf(stderr, "WARNING: steps=");
1713         fprintf(stderr, "%"GMX_PRId64, bench_nsteps);
1714         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1715     }
1716
1717     if (presteps < 0)
1718     {
1719         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1720     }
1721
1722     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1723     if (*ntprs > 1)
1724     {
1725         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1726         {
1727             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1728         }
1729     }
1730
1731     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1732      * only. We need to check whether the requested number of PME-only nodes
1733      * makes sense. */
1734     if (npme_fixed > -1)
1735     {
1736         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1737         if (2*npme_fixed > nnodes)
1738         {
1739             gmx_fatal(FARGS, "Cannot have more than %d PME-only ranks for a total of %d ranks (you chose %d).\n",
1740                       nnodes/2, nnodes, npme_fixed);
1741         }
1742         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1743         {
1744             fprintf(stderr, "WARNING: Only %g percent of the ranks are assigned as PME-only ranks.\n",
1745                     100.0*((real)npme_fixed / (real)nnodes));
1746         }
1747         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1748         {
1749             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1750                     "      fixed number of PME-only ranks is requested with -fix.\n");
1751         }
1752     }
1753 }
1754
1755
1756 /* Returns TRUE when "opt" is needed at launch time */
1757 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1758 {
1759     if (0 == strncmp(opt, "-swap", 5))
1760     {
1761         return bSet;
1762     }
1763
1764     /* Apart from the input .tpr and the output log files we need all options that
1765      * were set on the command line and that do not start with -b */
1766     if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
1767            || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
1768     {
1769         return FALSE;
1770     }
1771
1772     return bSet;
1773 }
1774
1775
1776 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1777 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1778 {
1779     /* Apart from the input .tpr, all files starting with "-b" are for
1780      * _b_enchmark files exclusively */
1781     if (0 == strncmp(opt, "-s", 2))
1782     {
1783         return FALSE;
1784     }
1785
1786     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1787     {
1788         if (!bOptional || bSet)
1789         {
1790             return TRUE;
1791         }
1792         else
1793         {
1794             return FALSE;
1795         }
1796     }
1797     else
1798     {
1799         if (bIsOutput)
1800         {
1801             return FALSE;
1802         }
1803         else
1804         {
1805             if (bSet) /* These are additional input files like -cpi -ei */
1806             {
1807                 return TRUE;
1808             }
1809             else
1810             {
1811                 return FALSE;
1812             }
1813         }
1814     }
1815 }
1816
1817
1818 /* Adds 'buf' to 'str' */
1819 static void add_to_string(char **str, char *buf)
1820 {
1821     int len;
1822
1823
1824     len = strlen(*str) + strlen(buf) + 1;
1825     srenew(*str, len);
1826     strcat(*str, buf);
1827 }
1828
1829
1830 /* Create the command line for the benchmark as well as for the real run */
1831 static void create_command_line_snippets(
1832         gmx_bool  bAppendFiles,
1833         gmx_bool  bKeepAndNumCPT,
1834         gmx_bool  bResetHWay,
1835         int       presteps,
1836         int       nfile,
1837         t_filenm  fnm[],
1838         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1839         char     *cmd_args_launch[], /* command line arguments for simulation run */
1840         char      extra_args[])      /* Add this to the end of the command line */
1841 {
1842     int         i;
1843     char       *opt;
1844     const char *name;
1845     char        strbuf[STRLEN];
1846
1847
1848     /* strlen needs at least '\0' as a string: */
1849     snew(*cmd_args_bench, 1);
1850     snew(*cmd_args_launch, 1);
1851     *cmd_args_launch[0] = '\0';
1852     *cmd_args_bench[0]  = '\0';
1853
1854
1855     /*******************************************/
1856     /* 1. Process other command line arguments */
1857     /*******************************************/
1858     if (presteps > 0)
1859     {
1860         /* Add equilibration steps to benchmark options */
1861         sprintf(strbuf, "-resetstep %d ", presteps);
1862         add_to_string(cmd_args_bench, strbuf);
1863     }
1864     /* These switches take effect only at launch time */
1865     if (FALSE == bAppendFiles)
1866     {
1867         add_to_string(cmd_args_launch, "-noappend ");
1868     }
1869     if (bKeepAndNumCPT)
1870     {
1871         add_to_string(cmd_args_launch, "-cpnum ");
1872     }
1873     if (bResetHWay)
1874     {
1875         add_to_string(cmd_args_launch, "-resethway ");
1876     }
1877
1878     /********************/
1879     /* 2. Process files */
1880     /********************/
1881     for (i = 0; i < nfile; i++)
1882     {
1883         opt  = (char *)fnm[i].opt;
1884         name = opt2fn(opt, nfile, fnm);
1885
1886         /* Strbuf contains the options, now let's sort out where we need that */
1887         sprintf(strbuf, "%s %s ", opt, name);
1888
1889         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1890         {
1891             /* All options starting with -b* need the 'b' removed,
1892              * therefore overwrite strbuf */
1893             if (0 == strncmp(opt, "-b", 2))
1894             {
1895                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1896             }
1897
1898             add_to_string(cmd_args_bench, strbuf);
1899         }
1900
1901         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1902         {
1903             add_to_string(cmd_args_launch, strbuf);
1904         }
1905     }
1906
1907     add_to_string(cmd_args_bench, extra_args);
1908     add_to_string(cmd_args_launch, extra_args);
1909 }
1910
1911
1912 /* Set option opt */
1913 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1914 {
1915     int i;
1916
1917     for (i = 0; (i < nfile); i++)
1918     {
1919         if (strcmp(opt, fnm[i].opt) == 0)
1920         {
1921             fnm[i].flag |= ffSET;
1922         }
1923     }
1924 }
1925
1926
1927 /* This routine inspects the tpr file and ...
1928  * 1. checks for output files that get triggered by a tpr option. These output
1929  *    files are marked as 'set' to allow for a proper cleanup after each
1930  *    tuning run.
1931  * 2. returns the PME:PP load ratio
1932  * 3. returns rcoulomb from the tpr */
1933 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1934 {
1935     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1936     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1937     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1938     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1939     gmx_bool     bSwap;     /* Is water/ion position swapping requested?      */
1940     t_inputrec   ir;
1941     t_state      state;
1942     gmx_mtop_t   mtop;
1943
1944
1945     /* Check tpr file for options that trigger extra output files */
1946     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1947     bPull = (epullNO != ir.ePull);
1948     bFree = (efepNO  != ir.efep );
1949     bNM   = (eiNM    == ir.eI   );
1950     bSwap = (eswapNO != ir.eSwapCoords);
1951     bTpi  = EI_TPI(ir.eI);
1952
1953     /* Set these output files on the tuning command-line */
1954     if (bPull)
1955     {
1956         setopt("-pf", nfile, fnm);
1957         setopt("-px", nfile, fnm);
1958     }
1959     if (bFree)
1960     {
1961         setopt("-dhdl", nfile, fnm);
1962     }
1963     if (bTpi)
1964     {
1965         setopt("-tpi", nfile, fnm);
1966         setopt("-tpid", nfile, fnm);
1967     }
1968     if (bNM)
1969     {
1970         setopt("-mtx", nfile, fnm);
1971     }
1972     if (bSwap)
1973     {
1974         setopt("-swap", nfile, fnm);
1975     }
1976
1977     *rcoulomb = ir.rcoulomb;
1978
1979     /* Return the estimate for the number of PME nodes */
1980     return pme_load_estimate(&mtop, &ir, state.box);
1981 }
1982
1983
1984 static void couple_files_options(int nfile, t_filenm fnm[])
1985 {
1986     int      i;
1987     gmx_bool bSet, bBench;
1988     char    *opt;
1989     char     buf[20];
1990
1991
1992     for (i = 0; i < nfile; i++)
1993     {
1994         opt    = (char *)fnm[i].opt;
1995         bSet   = ((fnm[i].flag & ffSET) != 0);
1996         bBench = (0 == strncmp(opt, "-b", 2));
1997
1998         /* Check optional files */
1999         /* If e.g. -eo is set, then -beo also needs to be set */
2000         if (is_optional(&fnm[i]) && bSet && !bBench)
2001         {
2002             sprintf(buf, "-b%s", &opt[1]);
2003             setopt(buf, nfile, fnm);
2004         }
2005         /* If -beo is set, then -eo also needs to be! */
2006         if (is_optional(&fnm[i]) && bSet && bBench)
2007         {
2008             sprintf(buf, "-%s", &opt[2]);
2009             setopt(buf, nfile, fnm);
2010         }
2011     }
2012 }
2013
2014
2015 #define BENCHSTEPS (1000)
2016
2017 int gmx_tune_pme(int argc, char *argv[])
2018 {
2019     const char     *desc[] = {
2020         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of ranks, [THISMODULE] systematically",
2021         "times [gmx-mdrun] with various numbers of PME-only ranks and determines",
2022         "which setting is fastest. It will also test whether performance can",
2023         "be enhanced by shifting load from the reciprocal to the real space",
2024         "part of the Ewald sum. ",
2025         "Simply pass your [TT].tpr[tt] file to [THISMODULE] together with other options",
2026         "for [gmx-mdrun] as needed.[PAR]",
2027         "Which executables are used can be set in the environment variables",
2028         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2029         "will be used as defaults. Note that for certain MPI frameworks you",
2030         "need to provide a machine- or hostfile. This can also be passed",
2031         "via the MPIRUN variable, e.g.[PAR]",
2032         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2033         "Please call [THISMODULE] with the normal options you would pass to",
2034         "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
2035         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2036         "to repeat each test several times to get better statistics. [PAR]",
2037         "[THISMODULE] can test various real space / reciprocal space workloads",
2038         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2039         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2040         "Typically, the first test (number 0) will be with the settings from the input",
2041         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2042         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2043         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2044         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2045         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2046         "if you just seek the optimal number of PME-only ranks; in that case",
2047         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2048         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2049         "MD systems. The dynamic load balancing needs about 100 time steps",
2050         "to adapt to local load imbalances, therefore the time step counters",
2051         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2052         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2053         "From the 'DD' load imbalance entries in the md.log output file you",
2054         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2055         "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2056         "After calling [gmx-mdrun] several times, detailed performance information",
2057         "is available in the output file [TT]perf.out[tt].",
2058         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2059         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2060         "If you want the simulation to be started automatically with the",
2061         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2062     };
2063
2064     int             nnodes         = 1;
2065     int             repeats        = 2;
2066     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2067     real            maxPMEfraction = 0.50;
2068     real            minPMEfraction = 0.25;
2069     int             maxPMEnodes, minPMEnodes;
2070     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2071     float           guessPMEnodes;
2072     int             npme_fixed     = -2;              /* If >= -1, use only this number
2073                                                        * of PME-only nodes                */
2074     int             ntprs          = 0;
2075     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2076     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2077     gmx_bool        bScaleRvdw     = TRUE;
2078     gmx_int64_t     bench_nsteps   = BENCHSTEPS;
2079     gmx_int64_t     new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2080     gmx_int64_t     cpt_steps      = 0;   /* Step counter in .cpt input file   */
2081     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2082     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2083     gmx_bool        bLaunch        = FALSE;
2084     char           *ExtraArgs      = NULL;
2085     char          **tpr_names      = NULL;
2086     const char     *simulation_tpr = NULL;
2087     int             best_npme, best_tpr;
2088     int             sim_part = 1; /* For benchmarks with checkpoint files */
2089     char            bbuf[STRLEN];
2090
2091     /* Default program names if nothing else is found */
2092     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2093     char         *cmd_args_bench, *cmd_args_launch;
2094     char         *cmd_np = NULL;
2095
2096     t_perf      **perfdata = NULL;
2097     t_inputinfo  *info;
2098     int           i;
2099     FILE         *fp;
2100     t_commrec    *cr;
2101
2102     /* Print out how long the tuning took */
2103     double          seconds;
2104
2105     static t_filenm fnm[] = {
2106         /* g_tune_pme */
2107         { efOUT, "-p",      "perf",     ffWRITE },
2108         { efLOG, "-err",    "bencherr", ffWRITE },
2109         { efTPX, "-so",     "tuned",    ffWRITE },
2110         /* mdrun: */
2111         { efTPX, NULL,      NULL,       ffREAD },
2112         { efTRN, "-o",      NULL,       ffWRITE },
2113         { efCOMPRESSED, "-x", NULL,     ffOPTWR },
2114         { efCPT, "-cpi",    NULL,       ffOPTRD },
2115         { efCPT, "-cpo",    NULL,       ffOPTWR },
2116         { efSTO, "-c",      "confout",  ffWRITE },
2117         { efEDR, "-e",      "ener",     ffWRITE },
2118         { efLOG, "-g",      "md",       ffWRITE },
2119         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2120         { efXVG, "-field",  "field",    ffOPTWR },
2121         { efXVG, "-table",  "table",    ffOPTRD },
2122         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2123         { efXVG, "-tablep", "tablep",   ffOPTRD },
2124         { efXVG, "-tableb", "table",    ffOPTRD },
2125         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2126         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2127         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2128         { efEDI, "-ei",     "sam",      ffOPTRD },
2129         { efXVG, "-eo",     "edsam",    ffOPTWR },
2130         { efXVG, "-devout", "deviatie", ffOPTWR },
2131         { efXVG, "-runav",  "runaver",  ffOPTWR },
2132         { efXVG, "-px",     "pullx",    ffOPTWR },
2133         { efXVG, "-pf",     "pullf",    ffOPTWR },
2134         { efXVG, "-ro",     "rotation", ffOPTWR },
2135         { efLOG, "-ra",     "rotangles", ffOPTWR },
2136         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2137         { efLOG, "-rt",     "rottorque", ffOPTWR },
2138         { efMTX, "-mtx",    "nm",       ffOPTWR },
2139         { efNDX, "-dn",     "dipole",   ffOPTWR },
2140         { efXVG, "-swap",   "swapions", ffOPTWR },
2141         /* Output files that are deleted after each benchmark run */
2142         { efTRN, "-bo",     "bench",    ffWRITE },
2143         { efXTC, "-bx",     "bench",    ffWRITE },
2144         { efCPT, "-bcpo",   "bench",    ffWRITE },
2145         { efSTO, "-bc",     "bench",    ffWRITE },
2146         { efEDR, "-be",     "bench",    ffWRITE },
2147         { efLOG, "-bg",     "bench",    ffWRITE },
2148         { efXVG, "-beo",    "benchedo", ffOPTWR },
2149         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2150         { efXVG, "-bfield", "benchfld", ffOPTWR },
2151         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2152         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2153         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2154         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2155         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2156         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2157         { efXVG, "-bro",    "benchrot", ffOPTWR },
2158         { efLOG, "-bra",    "benchrota", ffOPTWR },
2159         { efLOG, "-brs",    "benchrots", ffOPTWR },
2160         { efLOG, "-brt",    "benchrott", ffOPTWR },
2161         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2162         { efNDX, "-bdn",    "bench",    ffOPTWR },
2163         { efXVG, "-bswap",  "benchswp", ffOPTWR }
2164     };
2165
2166     gmx_bool        bThreads     = FALSE;
2167
2168     int             nthreads = 1;
2169
2170     const char     *procstring[] =
2171     { NULL, "-np", "-n", "none", NULL };
2172     const char     *npmevalues_opt[] =
2173     { NULL, "auto", "all", "subset", NULL };
2174
2175     gmx_bool     bAppendFiles          = TRUE;
2176     gmx_bool     bKeepAndNumCPT        = FALSE;
2177     gmx_bool     bResetCountersHalfWay = FALSE;
2178     gmx_bool     bBenchmark            = TRUE;
2179
2180     output_env_t oenv = NULL;
2181
2182     t_pargs      pa[] = {
2183         /***********************/
2184         /* g_tune_pme options: */
2185         /***********************/
2186         { "-np",       FALSE, etINT,  {&nnodes},
2187           "Number of ranks to run the tests on (must be > 2 for separate PME ranks)" },
2188         { "-npstring", FALSE, etENUM, {procstring},
2189           "Specify the number of ranks to [TT]$MPIRUN[tt] using this string"},
2190         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2191           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2192         { "-r",        FALSE, etINT,  {&repeats},
2193           "Repeat each test this often" },
2194         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2195           "Max fraction of PME ranks to test with" },
2196         { "-min",      FALSE, etREAL, {&minPMEfraction},
2197           "Min fraction of PME ranks to test with" },
2198         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2199           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2200           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2201         { "-fix",      FALSE, etINT,  {&npme_fixed},
2202           "If >= -1, do not vary the number of PME-only ranks, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2203         { "-rmax",     FALSE, etREAL, {&rmax},
2204           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2205         { "-rmin",     FALSE, etREAL, {&rmin},
2206           "If >0, minimal rcoulomb for -ntpr>1" },
2207         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2208           "Scale rvdw along with rcoulomb"},
2209         { "-ntpr",     FALSE, etINT,  {&ntprs},
2210           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2211           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2212         { "-steps",    FALSE, etINT64, {&bench_nsteps},
2213           "Take timings for this many steps in the benchmark runs" },
2214         { "-resetstep", FALSE, etINT,  {&presteps},
2215           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2216         { "-simsteps", FALSE, etINT64, {&new_sim_nsteps},
2217           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2218         { "-launch",   FALSE, etBOOL, {&bLaunch},
2219           "Launch the real simulation after optimization" },
2220         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2221           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2222         /******************/
2223         /* mdrun options: */
2224         /******************/
2225         /* We let g_tune_pme parse and understand these options, because we need to
2226          * prevent that they appear on the mdrun command line for the benchmarks */
2227         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2228           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2229         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2230           "Keep and number checkpoint files (launch only)" },
2231         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2232           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2233     };
2234
2235 #define NFILE asize(fnm)
2236
2237     seconds = gmx_gettime();
2238
2239     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2240                            NFILE, fnm, asize(pa), pa, asize(desc), desc,
2241                            0, NULL, &oenv))
2242     {
2243         return 0;
2244     }
2245
2246     /* Store the remaining unparsed command line entries in a string which
2247      * is then attached to the mdrun command line */
2248     snew(ExtraArgs, 1);
2249     ExtraArgs[0] = '\0';
2250     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2251     {
2252         add_to_string(&ExtraArgs, argv[i]);
2253         add_to_string(&ExtraArgs, " ");
2254     }
2255
2256     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2257     {
2258         bThreads = TRUE;
2259         if (opt2parg_bSet("-npstring", asize(pa), pa))
2260         {
2261             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2262         }
2263
2264         if (nnodes > 1)
2265         {
2266             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2267         }
2268         /* and now we just set this; a bit of an ugly hack*/
2269         nnodes = nthreads;
2270     }
2271     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2272     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2273
2274     /* Automatically set -beo options if -eo is set etc. */
2275     couple_files_options(NFILE, fnm);
2276
2277     /* Construct the command line arguments for benchmark runs
2278      * as well as for the simulation run */
2279     if (bThreads)
2280     {
2281         sprintf(bbuf, " -ntmpi %d ", nthreads);
2282     }
2283     else
2284     {
2285         /* This string will be used for MPI runs and will appear after the
2286          * mpirun command. */
2287         if (strcmp(procstring[0], "none") != 0)
2288         {
2289             sprintf(bbuf, " %s %d ", procstring[0], nnodes);
2290         }
2291         else
2292         {
2293             sprintf(bbuf, " ");
2294         }
2295     }
2296
2297     cmd_np = bbuf;
2298
2299     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2300                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2301
2302     /* Read in checkpoint file if requested */
2303     sim_part = 1;
2304     if (opt2bSet("-cpi", NFILE, fnm))
2305     {
2306         snew(cr, 1);
2307         cr->duty = DUTY_PP; /* makes the following routine happy */
2308         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2309                                         &sim_part, &cpt_steps, cr,
2310                                         FALSE, NFILE, fnm, NULL, NULL);
2311         sfree(cr);
2312         sim_part++;
2313         /* sim_part will now be 1 if no checkpoint file was found */
2314         if (sim_part <= 1)
2315         {
2316             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2317         }
2318     }
2319
2320     /* Open performance output file and write header info */
2321     fp = gmx_ffopen(opt2fn("-p", NFILE, fnm), "w");
2322
2323     /* Make a quick consistency check of command line parameters */
2324     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2325                 maxPMEfraction, minPMEfraction, npme_fixed,
2326                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2327                 asize(pa), pa);
2328
2329     /* Determine the maximum and minimum number of PME nodes to test,
2330      * the actual list of settings is build in do_the_tests(). */
2331     if ((nnodes > 2) && (npme_fixed < -1))
2332     {
2333         if (0 == strcmp(npmevalues_opt[0], "auto"))
2334         {
2335             /* Determine the npme range automatically based on the PME:PP load guess */
2336             if (guessPMEratio > 1.0)
2337             {
2338                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2339                 maxPMEnodes = nnodes/2;
2340                 minPMEnodes = nnodes/2;
2341             }
2342             else
2343             {
2344                 /* PME : PP load is in the range 0..1, let's test around the guess */
2345                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2346                 minPMEnodes   = floor(0.7*guessPMEnodes);
2347                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2348                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2349             }
2350         }
2351         else
2352         {
2353             /* Determine the npme range based on user input */
2354             maxPMEnodes = floor(maxPMEfraction*nnodes);
2355             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2356             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2357             if (maxPMEnodes != minPMEnodes)
2358             {
2359                 fprintf(stdout, "- %d ", maxPMEnodes);
2360             }
2361             fprintf(stdout, "PME-only ranks.\n  Note that the automatic number of PME-only ranks and no separate PME ranks are always tested.\n");
2362         }
2363     }
2364     else
2365     {
2366         maxPMEnodes = 0;
2367         minPMEnodes = 0;
2368     }
2369
2370     /* Get the commands we need to set up the runs from environment variables */
2371     get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2372     if (bBenchmark && repeats > 0)
2373     {
2374         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
2375     }
2376
2377     /* Print some header info to file */
2378     sep_line(fp);
2379     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2380     sep_line(fp);
2381     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2382     if (!bThreads)
2383     {
2384         fprintf(fp, "Number of ranks         : %d\n", nnodes);
2385         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2386         if (strcmp(procstring[0], "none") != 0)
2387         {
2388             fprintf(fp, "Passing # of ranks via  : %s\n", procstring[0]);
2389         }
2390         else
2391         {
2392             fprintf(fp, "Not setting number of ranks in system call\n");
2393         }
2394     }
2395     else
2396     {
2397         fprintf(fp, "Number of threads       : %d\n", nnodes);
2398     }
2399
2400     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2401     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2402     fprintf(fp, "Benchmark steps         : ");
2403     fprintf(fp, "%"GMX_PRId64, bench_nsteps);
2404     fprintf(fp, "\n");
2405     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2406     if (sim_part > 1)
2407     {
2408         fprintf(fp, "Checkpoint time step    : ");
2409         fprintf(fp, "%"GMX_PRId64, cpt_steps);
2410         fprintf(fp, "\n");
2411     }
2412     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2413
2414     if (new_sim_nsteps >= 0)
2415     {
2416         bOverwrite = TRUE;
2417         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2418         fprintf(stderr, "%"GMX_PRId64, new_sim_nsteps+cpt_steps);
2419         fprintf(stderr, " steps.\n");
2420         fprintf(fp, "Simulation steps        : ");
2421         fprintf(fp, "%"GMX_PRId64, new_sim_nsteps);
2422         fprintf(fp, "\n");
2423     }
2424     if (repeats > 1)
2425     {
2426         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2427     }
2428
2429     if (npme_fixed >= -1)
2430     {
2431         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2432     }
2433
2434     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2435     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2436
2437     /* Allocate memory for the inputinfo struct: */
2438     snew(info, 1);
2439     info->nr_inputfiles = ntprs;
2440     for (i = 0; i < ntprs; i++)
2441     {
2442         snew(info->rcoulomb, ntprs);
2443         snew(info->rvdw, ntprs);
2444         snew(info->rlist, ntprs);
2445         snew(info->rlistlong, ntprs);
2446         snew(info->nkx, ntprs);
2447         snew(info->nky, ntprs);
2448         snew(info->nkz, ntprs);
2449         snew(info->fsx, ntprs);
2450         snew(info->fsy, ntprs);
2451         snew(info->fsz, ntprs);
2452     }
2453     /* Make alternative tpr files to test: */
2454     snew(tpr_names, ntprs);
2455     for (i = 0; i < ntprs; i++)
2456     {
2457         snew(tpr_names[i], STRLEN);
2458     }
2459
2460     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2461      * different grids could be found. */
2462     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2463                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2464
2465     /********************************************************************************/
2466     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2467     /********************************************************************************/
2468     snew(perfdata, ntprs);
2469     if (bBenchmark)
2470     {
2471         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2472                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2473                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2474
2475         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
2476
2477         /* Analyse the results and give a suggestion for optimal settings: */
2478         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2479                                 repeats, info, &best_tpr, &best_npme);
2480
2481         /* Take the best-performing tpr file and enlarge nsteps to original value */
2482         if (bKeepTPR && !bOverwrite)
2483         {
2484             simulation_tpr = opt2fn("-s", NFILE, fnm);
2485         }
2486         else
2487         {
2488             simulation_tpr = opt2fn("-so", NFILE, fnm);
2489             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2490                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2491         }
2492
2493         /* Let's get rid of the temporary benchmark input files */
2494         for (i = 0; i < ntprs; i++)
2495         {
2496             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2497             remove(tpr_names[i]);
2498         }
2499
2500         /* Now start the real simulation if the user requested it ... */
2501         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2502                           cmd_args_launch, simulation_tpr, best_npme);
2503     }
2504     gmx_ffclose(fp);
2505
2506     /* ... or simply print the performance results to screen: */
2507     if (!bLaunch)
2508     {
2509         finalize(opt2fn("-p", NFILE, fnm));
2510     }
2511
2512     return 0;
2513 }