src/gromacs/gmxana/gmx_tune_pme.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39
  40 #include <time.h>
  41 #ifdef HAVE_SYS_TIME_H
  42 #include <sys/time.h>
  43 #endif
  44
  45
  46
  47 #include "gromacs/commandline/pargs.h"
  48 #include "typedefs.h"
  49 #include "types/commrec.h"
  50 #include "gromacs/utility/smalloc.h"
  51 #include "vec.h"
  52 #include "copyrite.h"
  53 #include "gromacs/fileio/tpxio.h"
  54 #include "string2.h"
  55 #include "readinp.h"
  56 #include "calcgrid.h"
  57 #include "checkpoint.h"
  58 #include "macros.h"
  59 #include "gmx_ana.h"
  60 #include "names.h"
  61 #include "perf_est.h"
  62 #include "inputrec.h"
  63 #include "gromacs/timing/walltime_accounting.h"
  64 #include "gromacs/math/utilities.h"
  65
  66
  67 /* Enum for situations that can occur during log file parsing, the
  68  * corresponding string entries can be found in do_the_tests() in
  69  * const char* ParseLog[] */
  70 enum {
  71     eParselogOK,
  72     eParselogNotFound,
  73     eParselogNoPerfData,
  74     eParselogTerm,
  75     eParselogResetProblem,
  76     eParselogNoDDGrid,
  77     eParselogTPXVersion,
  78     eParselogNotParallel,
  79     eParselogLargePrimeFactor,
  80     eParselogFatal,
  81     eParselogNr
  82 };
  83
  84
  85 typedef struct
  86 {
  87     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  88     int     nx, ny, nz;   /* DD grid */
  89     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  90     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  91     double  Gcycles_Av;
  92     float  *ns_per_day;
  93     float   ns_per_day_Av;
  94     float  *PME_f_load;     /* PME mesh/force load average*/
  95     float   PME_f_load_Av;  /* Average average ;) ... */
  96     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  97 } t_perf;
  98
  99
 100 typedef struct
 101 {
 102     int             nr_inputfiles;  /* The number of tpr and mdp input files */
 103     gmx_int64_t     orig_sim_steps; /* Number of steps to be done in the real simulation */
 104     gmx_int64_t     orig_init_step; /* Init step for the real simulation */
 105     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 106     real           *rvdw;           /* The vdW radii */
 107     real           *rlist;          /* Neighbourlist cutoff radius */
 108     real           *rlistlong;
 109     int            *nkx, *nky, *nkz;
 110     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 111 } t_inputinfo;
 112
 113
 114 static void sep_line(FILE *fp)
 115 {
 116     fprintf(fp, "\n------------------------------------------------------------\n");
 117 }
 118
 119
 120 /* Wrapper for system calls */
 121 static int gmx_system_call(char *command)
 122 {
 123 #ifdef GMX_NO_SYSTEM
 124     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 125 #else
 126     return ( system(command) );
 127 #endif
 128 }
 129
 130
 131 /* Check if string starts with substring */
 132 static gmx_bool str_starts(const char *string, const char *substring)
 133 {
 134     return ( strncmp(string, substring, strlen(substring)) == 0);
 135 }
 136
 137
 138 static void cleandata(t_perf *perfdata, int test_nr)
 139 {
 140     perfdata->Gcycles[test_nr]    = 0.0;
 141     perfdata->ns_per_day[test_nr] = 0.0;
 142     perfdata->PME_f_load[test_nr] = 0.0;
 143
 144     return;
 145 }
 146
 147
 148 static gmx_bool is_equal(real a, real b)
 149 {
 150     real diff, eps = 1.0e-7;
 151
 152
 153     diff = a - b;
 154
 155     if (diff < 0.0)
 156     {
 157         diff = -diff;
 158     }
 159
 160     if (diff < eps)
 161     {
 162         return TRUE;
 163     }
 164     else
 165     {
 166         return FALSE;
 167     }
 168 }
 169
 170
 171 static void remove_if_exists(const char *fn)
 172 {
 173     if (gmx_fexist(fn))
 174     {
 175         fprintf(stdout, "Deleting %s\n", fn);
 176         remove(fn);
 177     }
 178 }
 179
 180
 181 static void finalize(const char *fn_out)
 182 {
 183     char  buf[STRLEN];
 184     FILE *fp;
 185
 186
 187     fp = fopen(fn_out, "r");
 188     fprintf(stdout, "\n\n");
 189
 190     while (fgets(buf, STRLEN-1, fp) != NULL)
 191     {
 192         fprintf(stdout, "%s", buf);
 193     }
 194     fclose(fp);
 195     fprintf(stdout, "\n\n");
 196 }
 197
 198
 199 enum {
 200     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 201 };
 202
 203 static int parse_logfile(const char *logfile, const char *errfile,
 204                          t_perf *perfdata, int test_nr, int presteps, gmx_int64_t cpt_steps,
 205                          int nnodes)
 206 {
 207     FILE           *fp;
 208     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 209     const char      matchstrdd[]  = "Domain decomposition grid";
 210     const char      matchstrcr[]  = "resetting all time and cycle counters";
 211     const char      matchstrbal[] = "Average PME mesh/force load:";
 212     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 213     const char      errSIG[]      = "signal, stopping at the next";
 214     int             iFound;
 215     float           dum1, dum2, dum3, dum4;
 216     int             ndum;
 217     int             npme;
 218     gmx_int64_t     resetsteps     = -1;
 219     gmx_bool        bFoundResetStr = FALSE;
 220     gmx_bool        bResetChecked  = FALSE;
 221
 222
 223     if (!gmx_fexist(logfile))
 224     {
 225         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 226         cleandata(perfdata, test_nr);
 227         return eParselogNotFound;
 228     }
 229
 230     fp = fopen(logfile, "r");
 231     perfdata->PME_f_load[test_nr] = -1.0;
 232     perfdata->guessPME            = -1;
 233
 234     iFound = eFoundNothing;
 235     if (1 == nnodes)
 236     {
 237         iFound = eFoundDDStr; /* Skip some case statements */
 238     }
 239
 240     while (fgets(line, STRLEN, fp) != NULL)
 241     {
 242         /* Remove leading spaces */
 243         ltrim(line);
 244
 245         /* Check for TERM and INT signals from user: */
 246         if (strstr(line, errSIG) != NULL)
 247         {
 248             fclose(fp);
 249             cleandata(perfdata, test_nr);
 250             return eParselogTerm;
 251         }
 252
 253         /* Check whether cycle resetting  worked */
 254         if (presteps > 0 && !bFoundResetStr)
 255         {
 256             if (strstr(line, matchstrcr) != NULL)
 257             {
 258                 sprintf(dumstring, "step %s", "%"GMX_SCNd64);
 259                 sscanf(line, dumstring, &resetsteps);
 260                 bFoundResetStr = TRUE;
 261                 if (resetsteps == presteps+cpt_steps)
 262                 {
 263                     bResetChecked = TRUE;
 264                 }
 265                 else
 266                 {
 267                     sprintf(dumstring, "%"GMX_PRId64, resetsteps);
 268                     sprintf(dumstring2, "%"GMX_PRId64, presteps+cpt_steps);
 269                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 270                             "         though they were supposed to be reset at step %s!\n",
 271                             dumstring, dumstring2);
 272                 }
 273             }
 274         }
 275
 276         /* Look for strings that appear in a certain order in the log file: */
 277         switch (iFound)
 278         {
 279             case eFoundNothing:
 280                 /* Look for domain decomp grid and separate PME nodes: */
 281                 if (str_starts(line, matchstrdd))
 282                 {
 283                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 284                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 285                     if (perfdata->nPMEnodes == -1)
 286                     {
 287                         perfdata->guessPME = npme;
 288                     }
 289                     else if (perfdata->nPMEnodes != npme)
 290                     {
 291                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 292                     }
 293                     iFound = eFoundDDStr;
 294                 }
 295                 /* Catch a few errors that might have occured: */
 296                 else if (str_starts(line, "There is no domain decomposition for"))
 297                 {
 298                     fclose(fp);
 299                     return eParselogNoDDGrid;
 300                 }
 301                 else if (str_starts(line, "The number of nodes you selected"))
 302                 {
 303                     fclose(fp);
 304                     return eParselogLargePrimeFactor;
 305                 }
 306                 else if (str_starts(line, "reading tpx file"))
 307                 {
 308                     fclose(fp);
 309                     return eParselogTPXVersion;
 310                 }
 311                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 312                 {
 313                     fclose(fp);
 314                     return eParselogNotParallel;
 315                 }
 316                 break;
 317             case eFoundDDStr:
 318                 /* Look for PME mesh/force balance (not necessarily present, though) */
 319                 if (str_starts(line, matchstrbal))
 320                 {
 321                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 322                 }
 323                 /* Look for matchstring */
 324                 if (str_starts(line, matchstring))
 325                 {
 326                     iFound = eFoundAccountingStr;
 327                 }
 328                 break;
 329             case eFoundAccountingStr:
 330                 /* Already found matchstring - look for cycle data */
 331                 if (str_starts(line, "Total  "))
 332                 {
 333                     sscanf(line, "Total %lf", &(perfdata->Gcycles[test_nr]));
 334                     iFound = eFoundCycleStr;
 335                 }
 336                 break;
 337             case eFoundCycleStr:
 338                 /* Already found cycle data - look for remaining performance info and return */
 339                 if (str_starts(line, "Performance:"))
 340                 {
 341                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 342                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 343                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 344                     fclose(fp);
 345                     if (bResetChecked || presteps == 0)
 346                     {
 347                         return eParselogOK;
 348                     }
 349                     else
 350                     {
 351                         return eParselogResetProblem;
 352                     }
 353                 }
 354                 break;
 355         }
 356     } /* while */
 357
 358     /* Close the log file */
 359     fclose(fp);
 360
 361     /* Check why there is no performance data in the log file.
 362      * Did a fatal errors occur? */
 363     if (gmx_fexist(errfile))
 364     {
 365         fp = fopen(errfile, "r");
 366         while (fgets(line, STRLEN, fp) != NULL)
 367         {
 368             if (str_starts(line, "Fatal error:") )
 369             {
 370                 if (fgets(line, STRLEN, fp) != NULL)
 371                 {
 372                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 373                             "%s\n", line);
 374                 }
 375                 fclose(fp);
 376                 cleandata(perfdata, test_nr);
 377                 return eParselogFatal;
 378             }
 379         }
 380         fclose(fp);
 381     }
 382     else
 383     {
 384         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 385     }
 386
 387     /* Giving up ... we could not find out why there is no performance data in
 388      * the log file. */
 389     fprintf(stdout, "No performance data in log file.\n");
 390     cleandata(perfdata, test_nr);
 391
 392     return eParselogNoPerfData;
 393 }
 394
 395
 396 static gmx_bool analyze_data(
 397         FILE         *fp,
 398         const char   *fn,
 399         t_perf      **perfdata,
 400         int           nnodes,
 401         int           ntprs,
 402         int           ntests,
 403         int           nrepeats,
 404         t_inputinfo  *info,
 405         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 406         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 407 {
 408     int      i, j, k;
 409     int      line  = 0, line_win = -1;
 410     int      k_win = -1, i_win = -1, winPME;
 411     double   s     = 0.0; /* standard deviation */
 412     t_perf  *pd;
 413     char     strbuf[STRLEN];
 414     char     str_PME_f_load[13];
 415     gmx_bool bCanUseOrigTPR;
 416     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 417
 418
 419     if (nrepeats > 1)
 420     {
 421         sep_line(fp);
 422         fprintf(fp, "Summary of successful runs:\n");
 423         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 424         if (nnodes > 1)
 425         {
 426             fprintf(fp, "    DD grid");
 427         }
 428         fprintf(fp, "\n");
 429     }
 430
 431
 432     for (k = 0; k < ntprs; k++)
 433     {
 434         for (i = 0; i < ntests; i++)
 435         {
 436             /* Select the right dataset: */
 437             pd = &(perfdata[k][i]);
 438
 439             pd->Gcycles_Av    = 0.0;
 440             pd->PME_f_load_Av = 0.0;
 441             pd->ns_per_day_Av = 0.0;
 442
 443             if (pd->nPMEnodes == -1)
 444             {
 445                 sprintf(strbuf, "(%3d)", pd->guessPME);
 446             }
 447             else
 448             {
 449                 sprintf(strbuf, "     ");
 450             }
 451
 452             /* Get the average run time of a setting */
 453             for (j = 0; j < nrepeats; j++)
 454             {
 455                 pd->Gcycles_Av    += pd->Gcycles[j];
 456                 pd->PME_f_load_Av += pd->PME_f_load[j];
 457             }
 458             pd->Gcycles_Av    /= nrepeats;
 459             pd->PME_f_load_Av /= nrepeats;
 460
 461             for (j = 0; j < nrepeats; j++)
 462             {
 463                 if (pd->ns_per_day[j] > 0.0)
 464                 {
 465                     pd->ns_per_day_Av += pd->ns_per_day[j];
 466                 }
 467                 else
 468                 {
 469                     /* Somehow the performance number was not aquired for this run,
 470                      * therefor set the average to some negative value: */
 471                     pd->ns_per_day_Av = -1.0f*nrepeats;
 472                     break;
 473                 }
 474             }
 475             pd->ns_per_day_Av /= nrepeats;
 476
 477             /* Nicer output: */
 478             if (pd->PME_f_load_Av > 0.0)
 479             {
 480                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 481             }
 482             else
 483             {
 484                 sprintf(str_PME_f_load, "%s", "         -  ");
 485             }
 486
 487
 488             /* We assume we had a successful run if both averages are positive */
 489             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 490             {
 491                 /* Output statistics if repeats were done */
 492                 if (nrepeats > 1)
 493                 {
 494                     /* Calculate the standard deviation */
 495                     s = 0.0;
 496                     for (j = 0; j < nrepeats; j++)
 497                     {
 498                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 499                     }
 500                     s /= (nrepeats - 1);
 501                     s  = sqrt(s);
 502
 503                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 504                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 505                             pd->ns_per_day_Av, str_PME_f_load);
 506                     if (nnodes > 1)
 507                     {
 508                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 509                     }
 510                     fprintf(fp, "\n");
 511                 }
 512                 /* Store the index of the best run found so far in 'winner': */
 513                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 514                 {
 515                     k_win    = k;
 516                     i_win    = i;
 517                     line_win = line;
 518                 }
 519                 line++;
 520             }
 521         }
 522     }
 523
 524     if (k_win == -1)
 525     {
 526         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 527     }
 528
 529     sep_line(fp);
 530
 531     winPME = perfdata[k_win][i_win].nPMEnodes;
 532
 533     if (1 == ntests)
 534     {
 535         /* We stuck to a fixed number of PME-only nodes */
 536         sprintf(strbuf, "settings No. %d", k_win);
 537     }
 538     else
 539     {
 540         /* We have optimized the number of PME-only nodes */
 541         if (winPME == -1)
 542         {
 543             sprintf(strbuf, "%s", "the automatic number of PME nodes");
 544         }
 545         else
 546         {
 547             sprintf(strbuf, "%d PME nodes", winPME);
 548         }
 549     }
 550     fprintf(fp, "Best performance was achieved with %s", strbuf);
 551     if ((nrepeats > 1) && (ntests > 1))
 552     {
 553         fprintf(fp, " (see line %d)", line_win);
 554     }
 555     fprintf(fp, "\n");
 556
 557     /* Only mention settings if they were modified: */
 558     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 559     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 560     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 561                      info->nky[k_win] == info->nky[0] &&
 562                      info->nkz[k_win] == info->nkz[0]);
 563
 564     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 565     {
 566         fprintf(fp, "Optimized PME settings:\n");
 567         bCanUseOrigTPR = FALSE;
 568     }
 569     else
 570     {
 571         bCanUseOrigTPR = TRUE;
 572     }
 573
 574     if (bRefinedCoul)
 575     {
 576         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 577     }
 578
 579     if (bRefinedVdW)
 580     {
 581         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 582     }
 583
 584     if (bRefinedGrid)
 585     {
 586         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 587                 info->nkx[0], info->nky[0], info->nkz[0]);
 588     }
 589
 590     if (bCanUseOrigTPR && ntprs > 1)
 591     {
 592         fprintf(fp, "and original PME settings.\n");
 593     }
 594
 595     fflush(fp);
 596
 597     /* Return the index of the mdp file that showed the highest performance
 598      * and the optimal number of PME nodes */
 599     *index_tpr    = k_win;
 600     *npme_optimal = winPME;
 601
 602     return bCanUseOrigTPR;
 603 }
 604
 605
 606 /* Get the commands we need to set up the runs from environment variables */
 607 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char *cmd_mdrun[])
 608 {
 609     char      *cp;
 610     FILE      *fp;
 611     const char def_mpirun[]   = "mpirun";
 612     const char def_mdrun[]    = "mdrun";
 613
 614     const char empty_mpirun[] = "";
 615
 616     /* Get the commands we need to set up the runs from environment variables */
 617     if (!bThreads)
 618     {
 619         if ( (cp = getenv("MPIRUN")) != NULL)
 620         {
 621             *cmd_mpirun = strdup(cp);
 622         }
 623         else
 624         {
 625             *cmd_mpirun = strdup(def_mpirun);
 626         }
 627     }
 628     else
 629     {
 630         *cmd_mpirun = strdup(empty_mpirun);
 631     }
 632
 633     if ( (cp = getenv("MDRUN" )) != NULL)
 634     {
 635         *cmd_mdrun  = strdup(cp);
 636     }
 637     else
 638     {
 639         *cmd_mdrun  = strdup(def_mdrun);
 640     }
 641 }
 642
 643 /* Check that the commands will run mdrun (perhaps via mpirun) by
 644  * running a very quick test simulation. Requires MPI environment to
 645  * be available if applicable. */
 646 static void check_mdrun_works(gmx_bool    bThreads,
 647                               const char *cmd_mpirun,
 648                               const char *cmd_np,
 649                               const char *cmd_mdrun)
 650 {
 651     char      *command = NULL;
 652     char      *cp;
 653     char       line[STRLEN];
 654     FILE      *fp;
 655     const char filename[]     = "benchtest.log";
 656
 657     /* This string should always be identical to the one in copyrite.c,
 658      * gmx_print_version_info() in the defined(GMX_MPI) section */
 659     const char match_mpi[]    = "MPI library:        MPI";
 660     const char match_mdrun[]  = "Executable: ";
 661     gmx_bool   bMdrun         = FALSE;
 662     gmx_bool   bMPI           = FALSE;
 663
 664     /* Run a small test to see whether mpirun + mdrun work  */
 665     fprintf(stdout, "Making sure that mdrun can be executed. ");
 666     if (bThreads)
 667     {
 668         snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 669         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 670     }
 671     else
 672     {
 673         snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 674         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 675     }
 676     fprintf(stdout, "Trying '%s' ... ", command);
 677     make_backup(filename);
 678     gmx_system_call(command);
 679
 680     /* Check if we find the characteristic string in the output: */
 681     if (!gmx_fexist(filename))
 682     {
 683         gmx_fatal(FARGS, "Output from test run could not be found.");
 684     }
 685
 686     fp = fopen(filename, "r");
 687     /* We need to scan the whole output file, since sometimes the queuing system
 688      * also writes stuff to stdout/err */
 689     while (!feof(fp) )
 690     {
 691         cp = fgets(line, STRLEN, fp);
 692         if (cp != NULL)
 693         {
 694             if (str_starts(line, match_mdrun) )
 695             {
 696                 bMdrun = TRUE;
 697             }
 698             if (str_starts(line, match_mpi) )
 699             {
 700                 bMPI = TRUE;
 701             }
 702         }
 703     }
 704     fclose(fp);
 705
 706     if (bThreads)
 707     {
 708         if (bMPI)
 709         {
 710             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 711                       "(%s)\n"
 712                       "seems to have been compiled with MPI instead.",
 713                       *cmd_mdrun);
 714         }
 715     }
 716     else
 717     {
 718         if (bMdrun && !bMPI)
 719         {
 720             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 721                       "(%s)\n"
 722                       "seems to have been compiled without MPI support.",
 723                       *cmd_mdrun);
 724         }
 725     }
 726
 727     if (!bMdrun)
 728     {
 729         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 730                   filename);
 731     }
 732
 733     fprintf(stdout, "passed.\n");
 734
 735     /* Clean up ... */
 736     remove(filename);
 737     sfree(command);
 738 }
 739
 740
 741 static void launch_simulation(
 742         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 743         FILE       *fp,             /* General log file */
 744         gmx_bool    bThreads,       /* whether to use threads */
 745         char       *cmd_mpirun,     /* Command for mpirun */
 746         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 747         char       *cmd_mdrun,      /* Command for mdrun */
 748         char       *args_for_mdrun, /* Arguments for mdrun */
 749         const char *simulation_tpr, /* This tpr will be simulated */
 750         int         nPMEnodes)      /* Number of PME nodes to use */
 751 {
 752     char  *command;
 753
 754
 755     /* Make enough space for the system call command,
 756      * (100 extra chars for -npme ... etc. options should suffice): */
 757     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 758
 759     /* Note that the -passall options requires args_for_mdrun to be at the end
 760      * of the command line string */
 761     if (bThreads)
 762     {
 763         sprintf(command, "%s%s-npme %d -s %s %s",
 764                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 765     }
 766     else
 767     {
 768         sprintf(command, "%s%s%s -npme %d -s %s %s",
 769                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 770     }
 771
 772     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 773     sep_line(fp);
 774     fflush(fp);
 775
 776     /* Now the real thing! */
 777     if (bLaunch)
 778     {
 779         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 780         sep_line(stdout);
 781         fflush(stdout);
 782         gmx_system_call(command);
 783     }
 784 }
 785
 786
 787 static void modify_PMEsettings(
 788         gmx_int64_t     simsteps,    /* Set this value as number of time steps */
 789         gmx_int64_t     init_step,   /* Set this value as init_step */
 790         const char     *fn_best_tpr, /* tpr file with the best performance */
 791         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 792 {
 793     t_inputrec   *ir;
 794     t_state       state;
 795     gmx_mtop_t    mtop;
 796     char          buf[200];
 797
 798     snew(ir, 1);
 799     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 800
 801     /* Reset nsteps and init_step to the value of the input .tpr file */
 802     ir->nsteps    = simsteps;
 803     ir->init_step = init_step;
 804
 805     /* Write the tpr file which will be launched */
 806     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, "%"GMX_PRId64);
 807     fprintf(stdout, buf, ir->nsteps);
 808     fflush(stdout);
 809     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 810
 811     sfree(ir);
 812 }
 813
 814
 815 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 816
 817 /* Make additional TPR files with more computational load for the
 818  * direct space processors: */
 819 static void make_benchmark_tprs(
 820         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 821         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 822         gmx_int64_t     benchsteps,      /* Number of time steps for benchmark runs       */
 823         gmx_int64_t     statesteps,      /* Step counter in checkpoint file               */
 824         real            rmin,            /* Minimal Coulomb radius                        */
 825         real            rmax,            /* Maximal Coulomb radius                        */
 826         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 827         int            *ntprs,           /* No. of TPRs to write, each with a different
 828                                             rcoulomb and fourierspacing                   */
 829         t_inputinfo    *info,            /* Contains information about mdp file options   */
 830         FILE           *fp)              /* Write the output here                         */
 831 {
 832     int           i, j, d;
 833     t_inputrec   *ir;
 834     t_state       state;
 835     gmx_mtop_t    mtop;
 836     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 837     char          buf[200];
 838     rvec          box_size;
 839     gmx_bool      bNote = FALSE;
 840     real          add;              /* Add this to rcoul for the next test    */
 841     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 842     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 843
 844
 845     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 846             *ntprs > 1 ? "s" : "", "%"GMX_PRId64, benchsteps > 1 ? "s" : "");
 847     fprintf(stdout, buf, benchsteps);
 848     if (statesteps > 0)
 849     {
 850         sprintf(buf, " (adding %s steps from checkpoint file)", "%"GMX_PRId64);
 851         fprintf(stdout, buf, statesteps);
 852         benchsteps += statesteps;
 853     }
 854     fprintf(stdout, ".\n");
 855
 856
 857     snew(ir, 1);
 858     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 859
 860     /* Check if some kind of PME was chosen */
 861     if (EEL_PME(ir->coulombtype) == FALSE)
 862     {
 863         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 864                   EELTYPE(eelPME));
 865     }
 866
 867     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 868     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 869           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 870     {
 871         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 872                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 873     }
 874     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 875     else if (ir->rcoulomb > ir->rlist)
 876     {
 877         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 878                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 879     }
 880
 881     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 882     {
 883         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 884         bScaleRvdw = FALSE;
 885     }
 886
 887     /* Reduce the number of steps for the benchmarks */
 888     info->orig_sim_steps = ir->nsteps;
 889     ir->nsteps           = benchsteps;
 890     /* We must not use init_step from the input tpr file for the benchmarks */
 891     info->orig_init_step = ir->init_step;
 892     ir->init_step        = 0;
 893
 894     /* For PME-switch potentials, keep the radial distance of the buffer region */
 895     nlist_buffer   = ir->rlist - ir->rcoulomb;
 896
 897     /* Determine length of triclinic box vectors */
 898     for (d = 0; d < DIM; d++)
 899     {
 900         box_size[d] = 0;
 901         for (i = 0; i < DIM; i++)
 902         {
 903             box_size[d] += state.box[d][i]*state.box[d][i];
 904         }
 905         box_size[d] = sqrt(box_size[d]);
 906     }
 907
 908     if (ir->fourier_spacing > 0)
 909     {
 910         info->fsx[0] = ir->fourier_spacing;
 911         info->fsy[0] = ir->fourier_spacing;
 912         info->fsz[0] = ir->fourier_spacing;
 913     }
 914     else
 915     {
 916         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 917         info->fsx[0] = box_size[XX]/ir->nkx;
 918         info->fsy[0] = box_size[YY]/ir->nky;
 919         info->fsz[0] = box_size[ZZ]/ir->nkz;
 920     }
 921
 922     /* If no value for the fourierspacing was provided on the command line, we
 923      * use the reconstruction from the tpr file */
 924     if (ir->fourier_spacing > 0)
 925     {
 926         /* Use the spacing from the tpr */
 927         fourierspacing = ir->fourier_spacing;
 928     }
 929     else
 930     {
 931         /* Use the maximum observed spacing */
 932         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 933     }
 934
 935     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 936
 937     /* For performance comparisons the number of particles is useful to have */
 938     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 939
 940     /* Print information about settings of which some are potentially modified: */
 941     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 942     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 943             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 944     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 945     if (ir_vdw_switched(ir))
 946     {
 947         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 948     }
 949     if (EPME_SWITCHED(ir->coulombtype))
 950     {
 951         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 952     }
 953     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 954     {
 955         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 956     }
 957
 958     /* Print a descriptive line about the tpr settings tested */
 959     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 960     fprintf(fp, " No.   scaling  rcoulomb");
 961     fprintf(fp, "  nkx  nky  nkz");
 962     fprintf(fp, "   spacing");
 963     if (evdwCUT == ir->vdwtype)
 964     {
 965         fprintf(fp, "      rvdw");
 966     }
 967     if (EPME_SWITCHED(ir->coulombtype))
 968     {
 969         fprintf(fp, "     rlist");
 970     }
 971     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 972     {
 973         fprintf(fp, " rlistlong");
 974     }
 975     fprintf(fp, "  tpr file\n");
 976
 977     /* Loop to create the requested number of tpr input files */
 978     for (j = 0; j < *ntprs; j++)
 979     {
 980         /* The first .tpr is the provided one, just need to modify nsteps,
 981          * so skip the following block */
 982         if (j != 0)
 983         {
 984             /* Determine which Coulomb radii rc to use in the benchmarks */
 985             add = (rmax-rmin)/(*ntprs-1);
 986             if (is_equal(rmin, info->rcoulomb[0]))
 987             {
 988                 ir->rcoulomb = rmin + j*add;
 989             }
 990             else if (is_equal(rmax, info->rcoulomb[0]))
 991             {
 992                 ir->rcoulomb = rmin + (j-1)*add;
 993             }
 994             else
 995             {
 996                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 997                 add          = (rmax-rmin)/(*ntprs-2);
 998                 ir->rcoulomb = rmin + (j-1)*add;
 999             }
1000
1001             /* Determine the scaling factor fac */
1002             fac = ir->rcoulomb/info->rcoulomb[0];
1003
1004             /* Scale the Fourier grid spacing */
1005             ir->nkx = ir->nky = ir->nkz = 0;
1006             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
1007
1008             /* Adjust other radii since various conditions need to be fulfilled */
1009             if (eelPME == ir->coulombtype)
1010             {
1011                 /* plain PME, rcoulomb must be equal to rlist */
1012                 ir->rlist = ir->rcoulomb;
1013             }
1014             else
1015             {
1016                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1017                 ir->rlist = ir->rcoulomb + nlist_buffer;
1018             }
1019
1020             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1021             {
1022                 if (ecutsVERLET == ir->cutoff_scheme)
1023                 {
1024                     /* With Verlet, the van der Waals radius must always equal the Coulomb radius */
1025                     ir->rvdw = ir->rcoulomb;
1026                 }
1027                 else
1028                 {
1029                     /* For vdw cutoff, rvdw >= rlist */
1030                     ir->rvdw = max(info->rvdw[0], ir->rlist);
1031                 }
1032             }
1033
1034             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1035
1036         } /* end of "if (j != 0)" */
1037
1038         /* for j==0: Save the original settings
1039          * for j >0: Save modified radii and Fourier grids */
1040         info->rcoulomb[j]  = ir->rcoulomb;
1041         info->rvdw[j]      = ir->rvdw;
1042         info->nkx[j]       = ir->nkx;
1043         info->nky[j]       = ir->nky;
1044         info->nkz[j]       = ir->nkz;
1045         info->rlist[j]     = ir->rlist;
1046         info->rlistlong[j] = ir->rlistlong;
1047         info->fsx[j]       = fac*fourierspacing;
1048         info->fsy[j]       = fac*fourierspacing;
1049         info->fsz[j]       = fac*fourierspacing;
1050
1051         /* Write the benchmark tpr file */
1052         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1053         sprintf(buf, "_bench%.2d.tpr", j);
1054         strcat(fn_bench_tprs[j], buf);
1055         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1056         fprintf(stdout, "%"GMX_PRId64, ir->nsteps);
1057         if (j > 0)
1058         {
1059             fprintf(stdout, ", scaling factor %f\n", fac);
1060         }
1061         else
1062         {
1063             fprintf(stdout, ", unmodified settings\n");
1064         }
1065
1066         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1067
1068         /* Write information about modified tpr settings to log file */
1069         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1070         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1071         fprintf(fp, " %9f ", info->fsx[j]);
1072         if (evdwCUT == ir->vdwtype)
1073         {
1074             fprintf(fp, "%10f", ir->rvdw);
1075         }
1076         if (EPME_SWITCHED(ir->coulombtype))
1077         {
1078             fprintf(fp, "%10f", ir->rlist);
1079         }
1080         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1081         {
1082             fprintf(fp, "%10f", ir->rlistlong);
1083         }
1084         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1085
1086         /* Make it clear to the user that some additional settings were modified */
1087         if (!is_equal(ir->rvdw, info->rvdw[0])
1088             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1089         {
1090             bNote = TRUE;
1091         }
1092     }
1093     if (bNote)
1094     {
1095         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1096                 "other input settings were also changed (see table above).\n"
1097                 "Please check if the modified settings are appropriate.\n");
1098     }
1099     fflush(stdout);
1100     fflush(fp);
1101     sfree(ir);
1102 }
1103
1104
1105 /* Rename the files we want to keep to some meaningful filename and
1106  * delete the rest */
1107 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1108                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1109 {
1110     char        numstring[STRLEN];
1111     char        newfilename[STRLEN];
1112     const char *fn = NULL;
1113     int         i;
1114     const char *opt;
1115
1116
1117     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1118
1119     for (i = 0; i < nfile; i++)
1120     {
1121         opt = (char *)fnm[i].opt;
1122         if (strcmp(opt, "-p") == 0)
1123         {
1124             /* do nothing; keep this file */
1125             ;
1126         }
1127         else if (strcmp(opt, "-bg") == 0)
1128         {
1129             /* Give the log file a nice name so one can later see which parameters were used */
1130             numstring[0] = '\0';
1131             if (nr > 0)
1132             {
1133                 sprintf(numstring, "_%d", nr);
1134             }
1135             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1136             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1137             {
1138                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1139                 make_backup(newfilename);
1140                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1141             }
1142         }
1143         else if (strcmp(opt, "-err") == 0)
1144         {
1145             /* This file contains the output of stderr. We want to keep it in
1146              * cases where there have been problems. */
1147             fn           = opt2fn(opt, nfile, fnm);
1148             numstring[0] = '\0';
1149             if (nr > 0)
1150             {
1151                 sprintf(numstring, "_%d", nr);
1152             }
1153             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1154             if (gmx_fexist(fn))
1155             {
1156                 if (bKeepStderr)
1157                 {
1158                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1159                     make_backup(newfilename);
1160                     rename(fn, newfilename);
1161                 }
1162                 else
1163                 {
1164                     fprintf(stdout, "Deleting %s\n", fn);
1165                     remove(fn);
1166                 }
1167             }
1168         }
1169         /* Delete the files which are created for each benchmark run: (options -b*) */
1170         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1171         {
1172             remove_if_exists(opt2fn(opt, nfile, fnm));
1173         }
1174     }
1175 }
1176
1177
1178 enum {
1179     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1180 };
1181
1182 /* Create a list of numbers of PME nodes to test */
1183 static void make_npme_list(
1184         const char *npmevalues_opt, /* Make a complete list with all
1185                                      * possibilities or a short list that keeps only
1186                                      * reasonable numbers of PME nodes                  */
1187         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1188         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1189         int         nnodes,         /* Total number of nodes to do the tests on         */
1190         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1191         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1192 {
1193     int i, npme, npp;
1194     int min_factor = 1;   /* We request that npp and npme have this minimal
1195                            * largest common factor (depends on npp)           */
1196     int nlistmax;         /* Max. list size                                   */
1197     int nlist;            /* Actual number of entries in list                 */
1198     int eNPME = 0;
1199
1200
1201     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1202     if (0 == strcmp(npmevalues_opt, "all") )
1203     {
1204         eNPME = eNpmeAll;
1205     }
1206     else if (0 == strcmp(npmevalues_opt, "subset") )
1207     {
1208         eNPME = eNpmeSubset;
1209     }
1210     else /* "auto" or "range" */
1211     {
1212         if (nnodes <= 64)
1213         {
1214             eNPME = eNpmeAll;
1215         }
1216         else if (nnodes < 128)
1217         {
1218             eNPME = eNpmeReduced;
1219         }
1220         else
1221         {
1222             eNPME = eNpmeSubset;
1223         }
1224     }
1225
1226     /* Calculate how many entries we could possibly have (in case of -npme all) */
1227     if (nnodes > 2)
1228     {
1229         nlistmax = maxPMEnodes - minPMEnodes + 3;
1230         if (0 == minPMEnodes)
1231         {
1232             nlistmax--;
1233         }
1234     }
1235     else
1236     {
1237         nlistmax = 1;
1238     }
1239
1240     /* Now make the actual list which is at most of size nlist */
1241     snew(*nPMEnodes, nlistmax);
1242     nlist = 0; /* start counting again, now the real entries in the list */
1243     for (i = 0; i < nlistmax - 2; i++)
1244     {
1245         npme = maxPMEnodes - i;
1246         npp  = nnodes-npme;
1247         switch (eNPME)
1248         {
1249             case eNpmeAll:
1250                 min_factor = 1;
1251                 break;
1252             case eNpmeReduced:
1253                 min_factor = 2;
1254                 break;
1255             case eNpmeSubset:
1256                 /* For 2d PME we want a common largest factor of at least the cube
1257                  * root of the number of PP nodes */
1258                 min_factor = (int) pow(npp, 1.0/3.0);
1259                 break;
1260             default:
1261                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1262                 break;
1263         }
1264         if (gmx_greatest_common_divisor(npp, npme) >= min_factor)
1265         {
1266             (*nPMEnodes)[nlist] = npme;
1267             nlist++;
1268         }
1269     }
1270     /* We always test 0 PME nodes and the automatic number */
1271     *nentries             = nlist + 2;
1272     (*nPMEnodes)[nlist  ] =  0;
1273     (*nPMEnodes)[nlist+1] = -1;
1274
1275     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1276     for (i = 0; i < *nentries-1; i++)
1277     {
1278         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1279     }
1280     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1281 }
1282
1283
1284 /* Allocate memory to store the performance data */
1285 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1286 {
1287     int i, j, k;
1288
1289
1290     for (k = 0; k < ntprs; k++)
1291     {
1292         snew(perfdata[k], datasets);
1293         for (i = 0; i < datasets; i++)
1294         {
1295             for (j = 0; j < repeats; j++)
1296             {
1297                 snew(perfdata[k][i].Gcycles, repeats);
1298                 snew(perfdata[k][i].ns_per_day, repeats);
1299                 snew(perfdata[k][i].PME_f_load, repeats);
1300             }
1301         }
1302     }
1303 }
1304
1305
1306 /* Check for errors on mdrun -h */
1307 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp,
1308                               const t_filenm *fnm, int nfile)
1309 {
1310     const char *fn = NULL;
1311     char       *command, *msg;
1312     int         ret;
1313
1314
1315     snew(command, length +  15);
1316     snew(msg, length + 500);
1317
1318     fprintf(stdout, "Making sure the benchmarks can be executed by running just 1 step...\n");
1319     sprintf(command, "%s -nsteps 1 -quiet", mdrun_cmd_line);
1320     fprintf(stdout, "Executing '%s' ...\n", command);
1321     ret = gmx_system_call(command);
1322
1323     if (0 != ret)
1324     {
1325         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1326          * get the error message from mdrun itself */
1327         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1328                 "mdrun for the source of the problem. Did you provide a command line\n"
1329                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1330                 "\n%s\n\n", command);
1331
1332         fprintf(stderr, "%s", msg);
1333         sep_line(fp);
1334         fprintf(fp, "%s", msg);
1335
1336         exit(ret);
1337     }
1338     fprintf(stdout, "Benchmarks can be executed!\n");
1339
1340     /* Clean up the benchmark output files we just created */
1341     fprintf(stdout, "Cleaning up ...\n");
1342     remove_if_exists(opt2fn("-bc", nfile, fnm));
1343     remove_if_exists(opt2fn("-be", nfile, fnm));
1344     remove_if_exists(opt2fn("-bcpo", nfile, fnm));
1345     remove_if_exists(opt2fn("-bg", nfile, fnm));
1346
1347     sfree(command);
1348     sfree(msg    );
1349 }
1350
1351
1352 static void do_the_tests(
1353         FILE           *fp,             /* General g_tune_pme output file         */
1354         char          **tpr_names,      /* Filenames of the input files to test   */
1355         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1356         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1357         int             npme_fixed,     /* If >= -1, test fixed number of PME
1358                                          * nodes only                             */
1359         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1360         t_perf        **perfdata,       /* Here the performace data is stored     */
1361         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1362         int             repeats,        /* Repeat each test this often            */
1363         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1364         int             nr_tprs,        /* Total number of tpr files to test      */
1365         gmx_bool        bThreads,       /* Threads or MPI?                        */
1366         char           *cmd_mpirun,     /* mpirun command string                  */
1367         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1368         char           *cmd_mdrun,      /* mdrun command string                   */
1369         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1370         const t_filenm *fnm,            /* List of filenames from command line    */
1371         int             nfile,          /* Number of files specified on the cmdl. */
1372         int             presteps,       /* DLB equilibration steps, is checked    */
1373         gmx_int64_t     cpt_steps)      /* Time step counter in the checkpoint    */
1374 {
1375     int      i, nr, k, ret, count = 0, totaltests;
1376     int     *nPMEnodes = NULL;
1377     t_perf  *pd        = NULL;
1378     int      cmdline_length;
1379     char    *command, *cmd_stub;
1380     char     buf[STRLEN];
1381     gmx_bool bResetProblem = FALSE;
1382     gmx_bool bFirst        = TRUE;
1383
1384
1385     /* This string array corresponds to the eParselog enum type at the start
1386      * of this file */
1387     const char* ParseLog[] = {
1388         "OK.",
1389         "Logfile not found!",
1390         "No timings, logfile truncated?",
1391         "Run was terminated.",
1392         "Counters were not reset properly.",
1393         "No DD grid found for these settings.",
1394         "TPX version conflict!",
1395         "mdrun was not started in parallel!",
1396         "Number of PP nodes has a prime factor that is too large.",
1397         "An error occured."
1398     };
1399     char        str_PME_f_load[13];
1400
1401
1402     /* Allocate space for the mdrun command line. 100 extra characters should
1403        be more than enough for the -npme etcetera arguments */
1404     cmdline_length =  strlen(cmd_mpirun)
1405         + strlen(cmd_np)
1406         + strlen(cmd_mdrun)
1407         + strlen(cmd_args_bench)
1408         + strlen(tpr_names[0]) + 100;
1409     snew(command, cmdline_length);
1410     snew(cmd_stub, cmdline_length);
1411
1412     /* Construct the part of the command line that stays the same for all tests: */
1413     if (bThreads)
1414     {
1415         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1416     }
1417     else
1418     {
1419         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1420     }
1421
1422     /* Create a list of numbers of PME nodes to test */
1423     if (npme_fixed < -1)
1424     {
1425         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1426                        nnodes, minPMEnodes, maxPMEnodes);
1427     }
1428     else
1429     {
1430         *pmeentries  = 1;
1431         snew(nPMEnodes, 1);
1432         nPMEnodes[0] = npme_fixed;
1433         fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
1434     }
1435
1436     if (0 == repeats)
1437     {
1438         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1439         gmx_ffclose(fp);
1440         finalize(opt2fn("-p", nfile, fnm));
1441         exit(0);
1442     }
1443
1444     /* Allocate one dataset for each tpr input file: */
1445     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1446
1447     /*****************************************/
1448     /* Main loop over all tpr files to test: */
1449     /*****************************************/
1450     totaltests = nr_tprs*(*pmeentries)*repeats;
1451     for (k = 0; k < nr_tprs; k++)
1452     {
1453         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1454         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1455         /* Loop over various numbers of PME nodes: */
1456         for (i = 0; i < *pmeentries; i++)
1457         {
1458             pd = &perfdata[k][i];
1459
1460             /* Loop over the repeats for each scenario: */
1461             for (nr = 0; nr < repeats; nr++)
1462             {
1463                 pd->nPMEnodes = nPMEnodes[i];
1464
1465                 /* Add -npme and -s to the command line and save it. Note that
1466                  * the -passall (if set) options requires cmd_args_bench to be
1467                  * at the end of the command line string */
1468                 snew(pd->mdrun_cmd_line, cmdline_length);
1469                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1470                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1471
1472                 /* To prevent that all benchmarks fail due to a show-stopper argument
1473                  * on the mdrun command line, we make a quick check first */
1474                 if (bFirst)
1475                 {
1476                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp, fnm, nfile);
1477                 }
1478                 bFirst = FALSE;
1479
1480                 /* Do a benchmark simulation: */
1481                 if (repeats > 1)
1482                 {
1483                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1484                 }
1485                 else
1486                 {
1487                     buf[0] = '\0';
1488                 }
1489                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1490                         (100.0*count)/totaltests,
1491                         k+1, nr_tprs, i+1, *pmeentries, buf);
1492                 make_backup(opt2fn("-err", nfile, fnm));
1493                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1494                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1495                 gmx_system_call(command);
1496
1497                 /* Collect the performance data from the log file; also check stderr
1498                  * for fatal errors */
1499                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1500                                     pd, nr, presteps, cpt_steps, nnodes);
1501                 if ((presteps > 0) && (ret == eParselogResetProblem))
1502                 {
1503                     bResetProblem = TRUE;
1504                 }
1505
1506                 if (-1 == pd->nPMEnodes)
1507                 {
1508                     sprintf(buf, "(%3d)", pd->guessPME);
1509                 }
1510                 else
1511                 {
1512                     sprintf(buf, "     ");
1513                 }
1514
1515                 /* Nicer output */
1516                 if (pd->PME_f_load[nr] > 0.0)
1517                 {
1518                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1519                 }
1520                 else
1521                 {
1522                     sprintf(str_PME_f_load, "%s", "         -  ");
1523                 }
1524
1525                 /* Write the data we got to disk */
1526                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1527                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1528                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1529                 {
1530                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1531                 }
1532                 fprintf(fp, "\n");
1533                 fflush(fp);
1534                 count++;
1535
1536                 /* Do some cleaning up and delete the files we do not need any more */
1537                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1538
1539                 /* If the first run with this number of processors already failed, do not try again: */
1540                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1541                 {
1542                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1543                     count += repeats-(nr+1);
1544                     break;
1545                 }
1546             } /* end of repeats loop */
1547         }     /* end of -npme loop */
1548     }         /* end of tpr file loop */
1549
1550     if (bResetProblem)
1551     {
1552         sep_line(fp);
1553         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1554         sep_line(fp);
1555     }
1556     sfree(command);
1557     sfree(cmd_stub);
1558 }
1559
1560
1561 static void check_input(
1562         int             nnodes,
1563         int             repeats,
1564         int            *ntprs,
1565         real           *rmin,
1566         real            rcoulomb,
1567         real           *rmax,
1568         real            maxPMEfraction,
1569         real            minPMEfraction,
1570         int             npme_fixed,
1571         gmx_int64_t     bench_nsteps,
1572         const t_filenm *fnm,
1573         int             nfile,
1574         int             sim_part,
1575         int             presteps,
1576         int             npargs,
1577         t_pargs        *pa)
1578 {
1579     int old;
1580
1581
1582     /* Make sure the input file exists */
1583     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1584     {
1585         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1586     }
1587
1588     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1589     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1590     {
1591         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1592                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1593     }
1594
1595     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1596     if (repeats < 0)
1597     {
1598         gmx_fatal(FARGS, "Number of repeats < 0!");
1599     }
1600
1601     /* Check number of nodes */
1602     if (nnodes < 1)
1603     {
1604         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1605     }
1606
1607     /* Automatically choose -ntpr if not set */
1608     if (*ntprs < 1)
1609     {
1610         if (nnodes < 16)
1611         {
1612             *ntprs = 1;
1613         }
1614         else
1615         {
1616             *ntprs = 3;
1617             /* Set a reasonable scaling factor for rcoulomb */
1618             if (*rmax <= 0)
1619             {
1620                 *rmax = rcoulomb * 1.2;
1621             }
1622         }
1623         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1624     }
1625     else
1626     {
1627         if (1 == *ntprs)
1628         {
1629             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1630         }
1631     }
1632
1633     /* Make shure that rmin <= rcoulomb <= rmax */
1634     if (*rmin <= 0)
1635     {
1636         *rmin = rcoulomb;
1637     }
1638     if (*rmax <= 0)
1639     {
1640         *rmax = rcoulomb;
1641     }
1642     if (!(*rmin <= *rmax) )
1643     {
1644         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1645                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1646     }
1647     /* Add test scenarios if rmin or rmax were set */
1648     if (*ntprs <= 2)
1649     {
1650         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1651         {
1652             (*ntprs)++;
1653             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1654                     *rmin, *ntprs);
1655         }
1656         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1657         {
1658             (*ntprs)++;
1659             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1660                     *rmax, *ntprs);
1661         }
1662     }
1663     old = *ntprs;
1664     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1665     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1666     {
1667         *ntprs = max(*ntprs, 2);
1668     }
1669
1670     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1671     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1672     {
1673         *ntprs = max(*ntprs, 3);
1674     }
1675
1676     if (old != *ntprs)
1677     {
1678         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1679     }
1680
1681     if (*ntprs > 1)
1682     {
1683         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1684         {
1685             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1686                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1687                     "with correspondingly adjusted PME grid settings\n");
1688             *ntprs = 1;
1689         }
1690     }
1691
1692     /* Check whether max and min fraction are within required values */
1693     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1694     {
1695         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1696     }
1697     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1698     {
1699         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1700     }
1701     if (maxPMEfraction < minPMEfraction)
1702     {
1703         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1704     }
1705
1706     /* Check whether the number of steps - if it was set - has a reasonable value */
1707     if (bench_nsteps < 0)
1708     {
1709         gmx_fatal(FARGS, "Number of steps must be positive.");
1710     }
1711
1712     if (bench_nsteps > 10000 || bench_nsteps < 100)
1713     {
1714         fprintf(stderr, "WARNING: steps=");
1715         fprintf(stderr, "%"GMX_PRId64, bench_nsteps);
1716         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1717     }
1718
1719     if (presteps < 0)
1720     {
1721         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1722     }
1723
1724     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1725     if (*ntprs > 1)
1726     {
1727         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1728         {
1729             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1730         }
1731     }
1732
1733     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1734      * only. We need to check whether the requested number of PME-only nodes
1735      * makes sense. */
1736     if (npme_fixed > -1)
1737     {
1738         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1739         if (2*npme_fixed > nnodes)
1740         {
1741             gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
1742                       nnodes/2, nnodes, npme_fixed);
1743         }
1744         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1745         {
1746             fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
1747                     100.0*((real)npme_fixed / (real)nnodes));
1748         }
1749         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1750         {
1751             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1752                     "      fixed number of PME-only nodes is requested with -fix.\n");
1753         }
1754     }
1755 }
1756
1757
1758 /* Returns TRUE when "opt" is needed at launch time */
1759 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1760 {
1761     if (0 == strncmp(opt, "-swap", 5))
1762     {
1763         return bSet;
1764     }
1765
1766     /* Apart from the input .tpr and the output log files we need all options that
1767      * were set on the command line and that do not start with -b */
1768     if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
1769            || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
1770     {
1771         return FALSE;
1772     }
1773
1774     return bSet;
1775 }
1776
1777
1778 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1779 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1780 {
1781     /* Apart from the input .tpr, all files starting with "-b" are for
1782      * _b_enchmark files exclusively */
1783     if (0 == strncmp(opt, "-s", 2))
1784     {
1785         return FALSE;
1786     }
1787
1788     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1789     {
1790         if (!bOptional || bSet)
1791         {
1792             return TRUE;
1793         }
1794         else
1795         {
1796             return FALSE;
1797         }
1798     }
1799     else
1800     {
1801         if (bIsOutput)
1802         {
1803             return FALSE;
1804         }
1805         else
1806         {
1807             if (bSet) /* These are additional input files like -cpi -ei */
1808             {
1809                 return TRUE;
1810             }
1811             else
1812             {
1813                 return FALSE;
1814             }
1815         }
1816     }
1817 }
1818
1819
1820 /* Adds 'buf' to 'str' */
1821 static void add_to_string(char **str, char *buf)
1822 {
1823     int len;
1824
1825
1826     len = strlen(*str) + strlen(buf) + 1;
1827     srenew(*str, len);
1828     strcat(*str, buf);
1829 }
1830
1831
1832 /* Create the command line for the benchmark as well as for the real run */
1833 static void create_command_line_snippets(
1834         gmx_bool  bAppendFiles,
1835         gmx_bool  bKeepAndNumCPT,
1836         gmx_bool  bResetHWay,
1837         int       presteps,
1838         int       nfile,
1839         t_filenm  fnm[],
1840         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1841         char     *cmd_args_launch[], /* command line arguments for simulation run */
1842         char      extra_args[])      /* Add this to the end of the command line */
1843 {
1844     int         i;
1845     char       *opt;
1846     const char *name;
1847     char        strbuf[STRLEN];
1848
1849
1850     /* strlen needs at least '\0' as a string: */
1851     snew(*cmd_args_bench, 1);
1852     snew(*cmd_args_launch, 1);
1853     *cmd_args_launch[0] = '\0';
1854     *cmd_args_bench[0]  = '\0';
1855
1856
1857     /*******************************************/
1858     /* 1. Process other command line arguments */
1859     /*******************************************/
1860     if (presteps > 0)
1861     {
1862         /* Add equilibration steps to benchmark options */
1863         sprintf(strbuf, "-resetstep %d ", presteps);
1864         add_to_string(cmd_args_bench, strbuf);
1865     }
1866     /* These switches take effect only at launch time */
1867     if (FALSE == bAppendFiles)
1868     {
1869         add_to_string(cmd_args_launch, "-noappend ");
1870     }
1871     if (bKeepAndNumCPT)
1872     {
1873         add_to_string(cmd_args_launch, "-cpnum ");
1874     }
1875     if (bResetHWay)
1876     {
1877         add_to_string(cmd_args_launch, "-resethway ");
1878     }
1879
1880     /********************/
1881     /* 2. Process files */
1882     /********************/
1883     for (i = 0; i < nfile; i++)
1884     {
1885         opt  = (char *)fnm[i].opt;
1886         name = opt2fn(opt, nfile, fnm);
1887
1888         /* Strbuf contains the options, now let's sort out where we need that */
1889         sprintf(strbuf, "%s %s ", opt, name);
1890
1891         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1892         {
1893             /* All options starting with -b* need the 'b' removed,
1894              * therefore overwrite strbuf */
1895             if (0 == strncmp(opt, "-b", 2))
1896             {
1897                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1898             }
1899
1900             add_to_string(cmd_args_bench, strbuf);
1901         }
1902
1903         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1904         {
1905             add_to_string(cmd_args_launch, strbuf);
1906         }
1907     }
1908
1909     add_to_string(cmd_args_bench, extra_args);
1910     add_to_string(cmd_args_launch, extra_args);
1911 }
1912
1913
1914 /* Set option opt */
1915 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1916 {
1917     int i;
1918
1919     for (i = 0; (i < nfile); i++)
1920     {
1921         if (strcmp(opt, fnm[i].opt) == 0)
1922         {
1923             fnm[i].flag |= ffSET;
1924         }
1925     }
1926 }
1927
1928
1929 /* This routine inspects the tpr file and ...
1930  * 1. checks for output files that get triggered by a tpr option. These output
1931  *    files are marked as 'set' to allow for a proper cleanup after each
1932  *    tuning run.
1933  * 2. returns the PME:PP load ratio
1934  * 3. returns rcoulomb from the tpr */
1935 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1936 {
1937     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1938     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1939     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1940     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1941     gmx_bool     bSwap;     /* Is water/ion position swapping requested?      */
1942     t_inputrec   ir;
1943     t_state      state;
1944     gmx_mtop_t   mtop;
1945
1946
1947     /* Check tpr file for options that trigger extra output files */
1948     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1949     bPull = (epullNO != ir.ePull);
1950     bFree = (efepNO  != ir.efep );
1951     bNM   = (eiNM    == ir.eI   );
1952     bSwap = (eswapNO != ir.eSwapCoords);
1953     bTpi  = EI_TPI(ir.eI);
1954
1955     /* Set these output files on the tuning command-line */
1956     if (bPull)
1957     {
1958         setopt("-pf", nfile, fnm);
1959         setopt("-px", nfile, fnm);
1960     }
1961     if (bFree)
1962     {
1963         setopt("-dhdl", nfile, fnm);
1964     }
1965     if (bTpi)
1966     {
1967         setopt("-tpi", nfile, fnm);
1968         setopt("-tpid", nfile, fnm);
1969     }
1970     if (bNM)
1971     {
1972         setopt("-mtx", nfile, fnm);
1973     }
1974     if (bSwap)
1975     {
1976         setopt("-swap", nfile, fnm);
1977     }
1978
1979     *rcoulomb = ir.rcoulomb;
1980
1981     /* Return the estimate for the number of PME nodes */
1982     return pme_load_estimate(&mtop, &ir, state.box);
1983 }
1984
1985
1986 static void couple_files_options(int nfile, t_filenm fnm[])
1987 {
1988     int      i;
1989     gmx_bool bSet, bBench;
1990     char    *opt;
1991     char     buf[20];
1992
1993
1994     for (i = 0; i < nfile; i++)
1995     {
1996         opt    = (char *)fnm[i].opt;
1997         bSet   = ((fnm[i].flag & ffSET) != 0);
1998         bBench = (0 == strncmp(opt, "-b", 2));
1999
2000         /* Check optional files */
2001         /* If e.g. -eo is set, then -beo also needs to be set */
2002         if (is_optional(&fnm[i]) && bSet && !bBench)
2003         {
2004             sprintf(buf, "-b%s", &opt[1]);
2005             setopt(buf, nfile, fnm);
2006         }
2007         /* If -beo is set, then -eo also needs to be! */
2008         if (is_optional(&fnm[i]) && bSet && bBench)
2009         {
2010             sprintf(buf, "-%s", &opt[2]);
2011             setopt(buf, nfile, fnm);
2012         }
2013     }
2014 }
2015
2016
2017 #define BENCHSTEPS (1000)
2018
2019 int gmx_tune_pme(int argc, char *argv[])
2020 {
2021     const char     *desc[] = {
2022         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, [THISMODULE] systematically",
2023         "times [gmx-mdrun] with various numbers of PME-only nodes and determines",
2024         "which setting is fastest. It will also test whether performance can",
2025         "be enhanced by shifting load from the reciprocal to the real space",
2026         "part of the Ewald sum. ",
2027         "Simply pass your [TT].tpr[tt] file to [THISMODULE] together with other options",
2028         "for [gmx-mdrun] as needed.[PAR]",
2029         "Which executables are used can be set in the environment variables",
2030         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2031         "will be used as defaults. Note that for certain MPI frameworks you",
2032         "need to provide a machine- or hostfile. This can also be passed",
2033         "via the MPIRUN variable, e.g.[PAR]",
2034         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2035         "Please call [THISMODULE] with the normal options you would pass to",
2036         "[gmx-mdrun] and add [TT]-np[tt] for the number of processors to perform the",
2037         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2038         "to repeat each test several times to get better statistics. [PAR]",
2039         "[THISMODULE] can test various real space / reciprocal space workloads",
2040         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2041         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2042         "Typically, the first test (number 0) will be with the settings from the input",
2043         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2044         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2045         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2046         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2047         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2048         "if you just seek the optimal number of PME-only nodes; in that case",
2049         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2050         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2051         "MD systems. The dynamic load balancing needs about 100 time steps",
2052         "to adapt to local load imbalances, therefore the time step counters",
2053         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2054         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2055         "From the 'DD' load imbalance entries in the md.log output file you",
2056         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2057         "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2058         "After calling [gmx-mdrun] several times, detailed performance information",
2059         "is available in the output file [TT]perf.out[tt].",
2060         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2061         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2062         "If you want the simulation to be started automatically with the",
2063         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2064     };
2065
2066     int             nnodes         = 1;
2067     int             repeats        = 2;
2068     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2069     real            maxPMEfraction = 0.50;
2070     real            minPMEfraction = 0.25;
2071     int             maxPMEnodes, minPMEnodes;
2072     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2073     float           guessPMEnodes;
2074     int             npme_fixed     = -2;              /* If >= -1, use only this number
2075                                                        * of PME-only nodes                */
2076     int             ntprs          = 0;
2077     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2078     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2079     gmx_bool        bScaleRvdw     = TRUE;
2080     gmx_int64_t     bench_nsteps   = BENCHSTEPS;
2081     gmx_int64_t     new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2082     gmx_int64_t     cpt_steps      = 0;   /* Step counter in .cpt input file   */
2083     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2084     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2085     gmx_bool        bLaunch        = FALSE;
2086     char           *ExtraArgs      = NULL;
2087     char          **tpr_names      = NULL;
2088     const char     *simulation_tpr = NULL;
2089     int             best_npme, best_tpr;
2090     int             sim_part = 1; /* For benchmarks with checkpoint files */
2091     char            bbuf[STRLEN];
2092
2093     /* Default program names if nothing else is found */
2094     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2095     char         *cmd_args_bench, *cmd_args_launch;
2096     char         *cmd_np = NULL;
2097
2098     t_perf      **perfdata = NULL;
2099     t_inputinfo  *info;
2100     int           i;
2101     FILE         *fp;
2102     t_commrec    *cr;
2103
2104     /* Print out how long the tuning took */
2105     double          seconds;
2106
2107     static t_filenm fnm[] = {
2108         /* g_tune_pme */
2109         { efOUT, "-p",      "perf",     ffWRITE },
2110         { efLOG, "-err",    "bencherr", ffWRITE },
2111         { efTPX, "-so",     "tuned",    ffWRITE },
2112         /* mdrun: */
2113         { efTPX, NULL,      NULL,       ffREAD },
2114         { efTRN, "-o",      NULL,       ffWRITE },
2115         { efCOMPRESSED, "-x", NULL,     ffOPTWR },
2116         { efCPT, "-cpi",    NULL,       ffOPTRD },
2117         { efCPT, "-cpo",    NULL,       ffOPTWR },
2118         { efSTO, "-c",      "confout",  ffWRITE },
2119         { efEDR, "-e",      "ener",     ffWRITE },
2120         { efLOG, "-g",      "md",       ffWRITE },
2121         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2122         { efXVG, "-field",  "field",    ffOPTWR },
2123         { efXVG, "-table",  "table",    ffOPTRD },
2124         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2125         { efXVG, "-tablep", "tablep",   ffOPTRD },
2126         { efXVG, "-tableb", "table",    ffOPTRD },
2127         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2128         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2129         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2130         { efEDI, "-ei",     "sam",      ffOPTRD },
2131         { efXVG, "-eo",     "edsam",    ffOPTWR },
2132         { efXVG, "-devout", "deviatie", ffOPTWR },
2133         { efXVG, "-runav",  "runaver",  ffOPTWR },
2134         { efXVG, "-px",     "pullx",    ffOPTWR },
2135         { efXVG, "-pf",     "pullf",    ffOPTWR },
2136         { efXVG, "-ro",     "rotation", ffOPTWR },
2137         { efLOG, "-ra",     "rotangles", ffOPTWR },
2138         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2139         { efLOG, "-rt",     "rottorque", ffOPTWR },
2140         { efMTX, "-mtx",    "nm",       ffOPTWR },
2141         { efNDX, "-dn",     "dipole",   ffOPTWR },
2142         { efXVG, "-swap",   "swapions", ffOPTWR },
2143         /* Output files that are deleted after each benchmark run */
2144         { efTRN, "-bo",     "bench",    ffWRITE },
2145         { efXTC, "-bx",     "bench",    ffWRITE },
2146         { efCPT, "-bcpo",   "bench",    ffWRITE },
2147         { efSTO, "-bc",     "bench",    ffWRITE },
2148         { efEDR, "-be",     "bench",    ffWRITE },
2149         { efLOG, "-bg",     "bench",    ffWRITE },
2150         { efXVG, "-beo",    "benchedo", ffOPTWR },
2151         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2152         { efXVG, "-bfield", "benchfld", ffOPTWR },
2153         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2154         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2155         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2156         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2157         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2158         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2159         { efXVG, "-bro",    "benchrot", ffOPTWR },
2160         { efLOG, "-bra",    "benchrota", ffOPTWR },
2161         { efLOG, "-brs",    "benchrots", ffOPTWR },
2162         { efLOG, "-brt",    "benchrott", ffOPTWR },
2163         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2164         { efNDX, "-bdn",    "bench",    ffOPTWR },
2165         { efXVG, "-bswap",  "benchswp", ffOPTWR }
2166     };
2167
2168     gmx_bool        bThreads     = FALSE;
2169
2170     int             nthreads = 1;
2171
2172     const char     *procstring[] =
2173     { NULL, "-np", "-n", "none", NULL };
2174     const char     *npmevalues_opt[] =
2175     { NULL, "auto", "all", "subset", NULL };
2176
2177     gmx_bool     bAppendFiles          = TRUE;
2178     gmx_bool     bKeepAndNumCPT        = FALSE;
2179     gmx_bool     bResetCountersHalfWay = FALSE;
2180     gmx_bool     bBenchmark            = TRUE;
2181
2182     output_env_t oenv = NULL;
2183
2184     t_pargs      pa[] = {
2185         /***********************/
2186         /* g_tune_pme options: */
2187         /***********************/
2188         { "-np",       FALSE, etINT,  {&nnodes},
2189           "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
2190         { "-npstring", FALSE, etENUM, {procstring},
2191           "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
2192         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2193           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2194         { "-r",        FALSE, etINT,  {&repeats},
2195           "Repeat each test this often" },
2196         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2197           "Max fraction of PME nodes to test with" },
2198         { "-min",      FALSE, etREAL, {&minPMEfraction},
2199           "Min fraction of PME nodes to test with" },
2200         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2201           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2202           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2203         { "-fix",      FALSE, etINT,  {&npme_fixed},
2204           "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2205         { "-rmax",     FALSE, etREAL, {&rmax},
2206           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2207         { "-rmin",     FALSE, etREAL, {&rmin},
2208           "If >0, minimal rcoulomb for -ntpr>1" },
2209         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2210           "Scale rvdw along with rcoulomb"},
2211         { "-ntpr",     FALSE, etINT,  {&ntprs},
2212           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2213           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2214         { "-steps",    FALSE, etINT64, {&bench_nsteps},
2215           "Take timings for this many steps in the benchmark runs" },
2216         { "-resetstep", FALSE, etINT,  {&presteps},
2217           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2218         { "-simsteps", FALSE, etINT64, {&new_sim_nsteps},
2219           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2220         { "-launch",   FALSE, etBOOL, {&bLaunch},
2221           "Launch the real simulation after optimization" },
2222         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2223           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2224         /******************/
2225         /* mdrun options: */
2226         /******************/
2227         /* We let g_tune_pme parse and understand these options, because we need to
2228          * prevent that they appear on the mdrun command line for the benchmarks */
2229         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2230           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2231         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2232           "Keep and number checkpoint files (launch only)" },
2233         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2234           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2235     };
2236
2237 #define NFILE asize(fnm)
2238
2239     seconds = gmx_gettime();
2240
2241     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2242                            NFILE, fnm, asize(pa), pa, asize(desc), desc,
2243                            0, NULL, &oenv))
2244     {
2245         return 0;
2246     }
2247
2248     /* Store the remaining unparsed command line entries in a string which
2249      * is then attached to the mdrun command line */
2250     snew(ExtraArgs, 1);
2251     ExtraArgs[0] = '\0';
2252     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2253     {
2254         add_to_string(&ExtraArgs, argv[i]);
2255         add_to_string(&ExtraArgs, " ");
2256     }
2257
2258     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2259     {
2260         bThreads = TRUE;
2261         if (opt2parg_bSet("-npstring", asize(pa), pa))
2262         {
2263             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2264         }
2265
2266         if (nnodes > 1)
2267         {
2268             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2269         }
2270         /* and now we just set this; a bit of an ugly hack*/
2271         nnodes = nthreads;
2272     }
2273     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2274     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2275
2276     /* Automatically set -beo options if -eo is set etc. */
2277     couple_files_options(NFILE, fnm);
2278
2279     /* Construct the command line arguments for benchmark runs
2280      * as well as for the simulation run */
2281     if (bThreads)
2282     {
2283         sprintf(bbuf, " -ntmpi %d ", nthreads);
2284     }
2285     else
2286     {
2287         /* This string will be used for MPI runs and will appear after the
2288          * mpirun command. */
2289         if (strcmp(procstring[0], "none") != 0)
2290         {
2291             sprintf(bbuf, " %s %d ", procstring[0], nnodes);
2292         }
2293         else
2294         {
2295             sprintf(bbuf, " ");
2296         }
2297     }
2298
2299     cmd_np = bbuf;
2300
2301     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2302                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2303
2304     /* Read in checkpoint file if requested */
2305     sim_part = 1;
2306     if (opt2bSet("-cpi", NFILE, fnm))
2307     {
2308         snew(cr, 1);
2309         cr->duty = DUTY_PP; /* makes the following routine happy */
2310         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2311                                         &sim_part, &cpt_steps, cr,
2312                                         FALSE, NFILE, fnm, NULL, NULL);
2313         sfree(cr);
2314         sim_part++;
2315         /* sim_part will now be 1 if no checkpoint file was found */
2316         if (sim_part <= 1)
2317         {
2318             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2319         }
2320     }
2321
2322     /* Open performance output file and write header info */
2323     fp = gmx_ffopen(opt2fn("-p", NFILE, fnm), "w");
2324
2325     /* Make a quick consistency check of command line parameters */
2326     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2327                 maxPMEfraction, minPMEfraction, npme_fixed,
2328                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2329                 asize(pa), pa);
2330
2331     /* Determine the maximum and minimum number of PME nodes to test,
2332      * the actual list of settings is build in do_the_tests(). */
2333     if ((nnodes > 2) && (npme_fixed < -1))
2334     {
2335         if (0 == strcmp(npmevalues_opt[0], "auto"))
2336         {
2337             /* Determine the npme range automatically based on the PME:PP load guess */
2338             if (guessPMEratio > 1.0)
2339             {
2340                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2341                 maxPMEnodes = nnodes/2;
2342                 minPMEnodes = nnodes/2;
2343             }
2344             else
2345             {
2346                 /* PME : PP load is in the range 0..1, let's test around the guess */
2347                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2348                 minPMEnodes   = floor(0.7*guessPMEnodes);
2349                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2350                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2351             }
2352         }
2353         else
2354         {
2355             /* Determine the npme range based on user input */
2356             maxPMEnodes = floor(maxPMEfraction*nnodes);
2357             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2358             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2359             if (maxPMEnodes != minPMEnodes)
2360             {
2361                 fprintf(stdout, "- %d ", maxPMEnodes);
2362             }
2363             fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2364         }
2365     }
2366     else
2367     {
2368         maxPMEnodes = 0;
2369         minPMEnodes = 0;
2370     }
2371
2372     /* Get the commands we need to set up the runs from environment variables */
2373     get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2374     if (bBenchmark && repeats > 0)
2375     {
2376         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
2377     }
2378
2379     /* Print some header info to file */
2380     sep_line(fp);
2381     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2382     sep_line(fp);
2383     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2384     if (!bThreads)
2385     {
2386         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2387         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2388         if (strcmp(procstring[0], "none") != 0)
2389         {
2390             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2391         }
2392         else
2393         {
2394             fprintf(fp, "Not setting number of nodes in system call\n");
2395         }
2396     }
2397     else
2398     {
2399         fprintf(fp, "Number of threads       : %d\n", nnodes);
2400     }
2401
2402     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2403     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2404     fprintf(fp, "Benchmark steps         : ");
2405     fprintf(fp, "%"GMX_PRId64, bench_nsteps);
2406     fprintf(fp, "\n");
2407     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2408     if (sim_part > 1)
2409     {
2410         fprintf(fp, "Checkpoint time step    : ");
2411         fprintf(fp, "%"GMX_PRId64, cpt_steps);
2412         fprintf(fp, "\n");
2413     }
2414     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2415
2416     if (new_sim_nsteps >= 0)
2417     {
2418         bOverwrite = TRUE;
2419         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2420         fprintf(stderr, "%"GMX_PRId64, new_sim_nsteps+cpt_steps);
2421         fprintf(stderr, " steps.\n");
2422         fprintf(fp, "Simulation steps        : ");
2423         fprintf(fp, "%"GMX_PRId64, new_sim_nsteps);
2424         fprintf(fp, "\n");
2425     }
2426     if (repeats > 1)
2427     {
2428         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2429     }
2430
2431     if (npme_fixed >= -1)
2432     {
2433         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2434     }
2435
2436     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2437     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2438
2439     /* Allocate memory for the inputinfo struct: */
2440     snew(info, 1);
2441     info->nr_inputfiles = ntprs;
2442     for (i = 0; i < ntprs; i++)
2443     {
2444         snew(info->rcoulomb, ntprs);
2445         snew(info->rvdw, ntprs);
2446         snew(info->rlist, ntprs);
2447         snew(info->rlistlong, ntprs);
2448         snew(info->nkx, ntprs);
2449         snew(info->nky, ntprs);
2450         snew(info->nkz, ntprs);
2451         snew(info->fsx, ntprs);
2452         snew(info->fsy, ntprs);
2453         snew(info->fsz, ntprs);
2454     }
2455     /* Make alternative tpr files to test: */
2456     snew(tpr_names, ntprs);
2457     for (i = 0; i < ntprs; i++)
2458     {
2459         snew(tpr_names[i], STRLEN);
2460     }
2461
2462     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2463      * different grids could be found. */
2464     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2465                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2466
2467     /********************************************************************************/
2468     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2469     /********************************************************************************/
2470     snew(perfdata, ntprs);
2471     if (bBenchmark)
2472     {
2473         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2474                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2475                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2476
2477         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
2478
2479         /* Analyse the results and give a suggestion for optimal settings: */
2480         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2481                                 repeats, info, &best_tpr, &best_npme);
2482
2483         /* Take the best-performing tpr file and enlarge nsteps to original value */
2484         if (bKeepTPR && !bOverwrite)
2485         {
2486             simulation_tpr = opt2fn("-s", NFILE, fnm);
2487         }
2488         else
2489         {
2490             simulation_tpr = opt2fn("-so", NFILE, fnm);
2491             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2492                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2493         }
2494
2495         /* Let's get rid of the temporary benchmark input files */
2496         for (i = 0; i < ntprs; i++)
2497         {
2498             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2499             remove(tpr_names[i]);
2500         }
2501
2502         /* Now start the real simulation if the user requested it ... */
2503         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2504                           cmd_args_launch, simulation_tpr, best_npme);
2505     }
2506     gmx_ffclose(fp);
2507
2508     /* ... or simply print the performance results to screen: */
2509     if (!bLaunch)
2510     {
2511         finalize(opt2fn("-p", NFILE, fnm));
2512     }
2513
2514     return 0;
2515 }