src/tools/gmx_tune_pme.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #ifdef HAVE_CONFIG_H
  39 #include <config.h>
  40 #endif
  41
  42
  43 #include <time.h>
  44 #ifdef HAVE_SYS_TIME_H
  45 #include <sys/time.h>
  46 #endif
  47
  48
  49
  50 #include "statutil.h"
  51 #include "typedefs.h"
  52 #include "smalloc.h"
  53 #include "vec.h"
  54 #include "copyrite.h"
  55 #include "statutil.h"
  56 #include "tpxio.h"
  57 #include "string2.h"
  58 #include "readinp.h"
  59 #include "calcgrid.h"
  60 #include "checkpoint.h"
  61 #include "gmx_ana.h"
  62 #include "names.h"
  63 #include "perf_est.h"
  64
  65
  66
  67 /* Enum for situations that can occur during log file parsing, the
  68  * corresponding string entries can be found in do_the_tests() in
  69  * const char* ParseLog[] */
  70 enum {
  71     eParselogOK,
  72     eParselogNotFound,
  73     eParselogNoPerfData,
  74     eParselogTerm,
  75     eParselogResetProblem,
  76     eParselogNoDDGrid,
  77     eParselogTPXVersion,
  78     eParselogNotParallel,
  79     eParselogFatal,
  80     eParselogNr
  81 };
  82
  83
  84 typedef struct
  85 {
  86     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  87     int     nx, ny, nz;   /* DD grid */
  88     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  89     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  90     double  Gcycles_Av;
  91     float  *ns_per_day;
  92     float   ns_per_day_Av;
  93     float  *PME_f_load;     /* PME mesh/force load average*/
  94     float   PME_f_load_Av;  /* Average average ;) ... */
  95     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  96 } t_perf;
  97
  98
  99 typedef struct
 100 {
 101     int             nr_inputfiles;  /* The number of tpr and mdp input files */
 102     gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
 103     gmx_large_int_t orig_init_step; /* Init step for the real simulation */
 104     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 105     real           *rvdw;           /* The vdW radii */
 106     real           *rlist;          /* Neighbourlist cutoff radius */
 107     real           *rlistlong;
 108     int            *nkx, *nky, *nkz;
 109     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 110 } t_inputinfo;
 111
 112
 113 static void sep_line(FILE *fp)
 114 {
 115     fprintf(fp, "\n------------------------------------------------------------\n");
 116 }
 117
 118
 119 /* Wrapper for system calls */
 120 static int gmx_system_call(char *command)
 121 {
 122 #ifdef GMX_NO_SYSTEM
 123     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 124 #else
 125     return ( system(command) );
 126 #endif
 127 }
 128
 129
 130 /* Check if string starts with substring */
 131 static gmx_bool str_starts(const char *string, const char *substring)
 132 {
 133     return ( strncmp(string, substring, strlen(substring)) == 0);
 134 }
 135
 136
 137 static void cleandata(t_perf *perfdata, int test_nr)
 138 {
 139     perfdata->Gcycles[test_nr]    = 0.0;
 140     perfdata->ns_per_day[test_nr] = 0.0;
 141     perfdata->PME_f_load[test_nr] = 0.0;
 142
 143     return;
 144 }
 145
 146
 147 static gmx_bool is_equal(real a, real b)
 148 {
 149     real diff, eps = 1.0e-7;
 150
 151
 152     diff = a - b;
 153
 154     if (diff < 0.0)
 155     {
 156         diff = -diff;
 157     }
 158
 159     if (diff < eps)
 160     {
 161         return TRUE;
 162     }
 163     else
 164     {
 165         return FALSE;
 166     }
 167 }
 168
 169
 170 static void finalize(const char *fn_out)
 171 {
 172     char  buf[STRLEN];
 173     FILE *fp;
 174
 175
 176     fp = fopen(fn_out, "r");
 177     fprintf(stdout, "\n\n");
 178
 179     while (fgets(buf, STRLEN-1, fp) != NULL)
 180     {
 181         fprintf(stdout, "%s", buf);
 182     }
 183     fclose(fp);
 184     fprintf(stdout, "\n\n");
 185 }
 186
 187
 188 enum {
 189     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 190 };
 191
 192 static int parse_logfile(const char *logfile, const char *errfile,
 193                          t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
 194                          int nnodes)
 195 {
 196     FILE           *fp;
 197     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 198     const char      matchstrdd[]  = "Domain decomposition grid";
 199     const char      matchstrcr[]  = "resetting all time and cycle counters";
 200     const char      matchstrbal[] = "Average PME mesh/force load:";
 201     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 202     const char      errSIG[]      = "signal, stopping at the next";
 203     int             iFound;
 204     int             procs;
 205     float           dum1, dum2, dum3, dum4;
 206     int             ndum;
 207     int             npme;
 208     gmx_large_int_t resetsteps     = -1;
 209     gmx_bool        bFoundResetStr = FALSE;
 210     gmx_bool        bResetChecked  = FALSE;
 211
 212
 213     if (!gmx_fexist(logfile))
 214     {
 215         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 216         cleandata(perfdata, test_nr);
 217         return eParselogNotFound;
 218     }
 219
 220     fp = fopen(logfile, "r");
 221     perfdata->PME_f_load[test_nr] = -1.0;
 222     perfdata->guessPME            = -1;
 223
 224     iFound = eFoundNothing;
 225     if (1 == nnodes)
 226     {
 227         iFound = eFoundDDStr; /* Skip some case statements */
 228     }
 229
 230     while (fgets(line, STRLEN, fp) != NULL)
 231     {
 232         /* Remove leading spaces */
 233         ltrim(line);
 234
 235         /* Check for TERM and INT signals from user: */
 236         if (strstr(line, errSIG) != NULL)
 237         {
 238             fclose(fp);
 239             cleandata(perfdata, test_nr);
 240             return eParselogTerm;
 241         }
 242
 243         /* Check whether cycle resetting  worked */
 244         if (presteps > 0 && !bFoundResetStr)
 245         {
 246             if (strstr(line, matchstrcr) != NULL)
 247             {
 248                 sprintf(dumstring, "step %s", gmx_large_int_pfmt);
 249                 sscanf(line, dumstring, &resetsteps);
 250                 bFoundResetStr = TRUE;
 251                 if (resetsteps == presteps+cpt_steps)
 252                 {
 253                     bResetChecked = TRUE;
 254                 }
 255                 else
 256                 {
 257                     sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
 258                     sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
 259                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 260                             "         though they were supposed to be reset at step %s!\n",
 261                             dumstring, dumstring2);
 262                 }
 263             }
 264         }
 265
 266         /* Look for strings that appear in a certain order in the log file: */
 267         switch (iFound)
 268         {
 269             case eFoundNothing:
 270                 /* Look for domain decomp grid and separate PME nodes: */
 271                 if (str_starts(line, matchstrdd))
 272                 {
 273                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 274                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 275                     if (perfdata->nPMEnodes == -1)
 276                     {
 277                         perfdata->guessPME = npme;
 278                     }
 279                     else if (perfdata->nPMEnodes != npme)
 280                     {
 281                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 282                     }
 283                     iFound = eFoundDDStr;
 284                 }
 285                 /* Catch a few errors that might have occured: */
 286                 else if (str_starts(line, "There is no domain decomposition for"))
 287                 {
 288                     fclose(fp);
 289                     return eParselogNoDDGrid;
 290                 }
 291                 else if (str_starts(line, "reading tpx file"))
 292                 {
 293                     fclose(fp);
 294                     return eParselogTPXVersion;
 295                 }
 296                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 297                 {
 298                     fclose(fp);
 299                     return eParselogNotParallel;
 300                 }
 301                 break;
 302             case eFoundDDStr:
 303                 /* Look for PME mesh/force balance (not necessarily present, though) */
 304                 if (str_starts(line, matchstrbal))
 305                 {
 306                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 307                 }
 308                 /* Look for matchstring */
 309                 if (str_starts(line, matchstring))
 310                 {
 311                     iFound = eFoundAccountingStr;
 312                 }
 313                 break;
 314             case eFoundAccountingStr:
 315                 /* Already found matchstring - look for cycle data */
 316                 if (str_starts(line, "Total  "))
 317                 {
 318                     sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
 319                     iFound = eFoundCycleStr;
 320                 }
 321                 break;
 322             case eFoundCycleStr:
 323                 /* Already found cycle data - look for remaining performance info and return */
 324                 if (str_starts(line, "Performance:"))
 325                 {
 326                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 327                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 328                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 329                     fclose(fp);
 330                     if (bResetChecked || presteps == 0)
 331                     {
 332                         return eParselogOK;
 333                     }
 334                     else
 335                     {
 336                         return eParselogResetProblem;
 337                     }
 338                 }
 339                 break;
 340         }
 341     } /* while */
 342
 343     /* Close the log file */
 344     fclose(fp);
 345
 346     /* Check why there is no performance data in the log file.
 347      * Did a fatal errors occur? */
 348     if (gmx_fexist(errfile))
 349     {
 350         fp = fopen(errfile, "r");
 351         while (fgets(line, STRLEN, fp) != NULL)
 352         {
 353             if (str_starts(line, "Fatal error:") )
 354             {
 355                 if (fgets(line, STRLEN, fp) != NULL)
 356                 {
 357                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 358                             "%s\n", line);
 359                 }
 360                 fclose(fp);
 361                 cleandata(perfdata, test_nr);
 362                 return eParselogFatal;
 363             }
 364         }
 365         fclose(fp);
 366     }
 367     else
 368     {
 369         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 370     }
 371
 372     /* Giving up ... we could not find out why there is no performance data in
 373      * the log file. */
 374     fprintf(stdout, "No performance data in log file.\n");
 375     cleandata(perfdata, test_nr);
 376
 377     return eParselogNoPerfData;
 378 }
 379
 380
 381 static gmx_bool analyze_data(
 382         FILE         *fp,
 383         const char   *fn,
 384         t_perf      **perfdata,
 385         int           nnodes,
 386         int           ntprs,
 387         int           ntests,
 388         int           nrepeats,
 389         t_inputinfo  *info,
 390         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 391         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 392 {
 393     int      i, j, k;
 394     int      line  = 0, line_win = -1;
 395     int      k_win = -1, i_win = -1, winPME;
 396     double   s     = 0.0; /* standard deviation */
 397     t_perf  *pd;
 398     char     strbuf[STRLEN];
 399     char     str_PME_f_load[13];
 400     gmx_bool bCanUseOrigTPR;
 401     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 402
 403
 404     if (nrepeats > 1)
 405     {
 406         sep_line(fp);
 407         fprintf(fp, "Summary of successful runs:\n");
 408         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 409         if (nnodes > 1)
 410         {
 411             fprintf(fp, "    DD grid");
 412         }
 413         fprintf(fp, "\n");
 414     }
 415
 416
 417     for (k = 0; k < ntprs; k++)
 418     {
 419         for (i = 0; i < ntests; i++)
 420         {
 421             /* Select the right dataset: */
 422             pd = &(perfdata[k][i]);
 423
 424             pd->Gcycles_Av    = 0.0;
 425             pd->PME_f_load_Av = 0.0;
 426             pd->ns_per_day_Av = 0.0;
 427
 428             if (pd->nPMEnodes == -1)
 429             {
 430                 sprintf(strbuf, "(%3d)", pd->guessPME);
 431             }
 432             else
 433             {
 434                 sprintf(strbuf, "     ");
 435             }
 436
 437             /* Get the average run time of a setting */
 438             for (j = 0; j < nrepeats; j++)
 439             {
 440                 pd->Gcycles_Av    += pd->Gcycles[j];
 441                 pd->PME_f_load_Av += pd->PME_f_load[j];
 442             }
 443             pd->Gcycles_Av    /= nrepeats;
 444             pd->PME_f_load_Av /= nrepeats;
 445
 446             for (j = 0; j < nrepeats; j++)
 447             {
 448                 if (pd->ns_per_day[j] > 0.0)
 449                 {
 450                     pd->ns_per_day_Av += pd->ns_per_day[j];
 451                 }
 452                 else
 453                 {
 454                     /* Somehow the performance number was not aquired for this run,
 455                      * therefor set the average to some negative value: */
 456                     pd->ns_per_day_Av = -1.0f*nrepeats;
 457                     break;
 458                 }
 459             }
 460             pd->ns_per_day_Av /= nrepeats;
 461
 462             /* Nicer output: */
 463             if (pd->PME_f_load_Av > 0.0)
 464             {
 465                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 466             }
 467             else
 468             {
 469                 sprintf(str_PME_f_load, "%s", "         -  ");
 470             }
 471
 472
 473             /* We assume we had a successful run if both averages are positive */
 474             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 475             {
 476                 /* Output statistics if repeats were done */
 477                 if (nrepeats > 1)
 478                 {
 479                     /* Calculate the standard deviation */
 480                     s = 0.0;
 481                     for (j = 0; j < nrepeats; j++)
 482                     {
 483                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 484                     }
 485                     s /= (nrepeats - 1);
 486                     s  = sqrt(s);
 487
 488                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 489                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 490                             pd->ns_per_day_Av, str_PME_f_load);
 491                     if (nnodes > 1)
 492                     {
 493                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 494                     }
 495                     fprintf(fp, "\n");
 496                 }
 497                 /* Store the index of the best run found so far in 'winner': */
 498                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 499                 {
 500                     k_win    = k;
 501                     i_win    = i;
 502                     line_win = line;
 503                 }
 504                 line++;
 505             }
 506         }
 507     }
 508
 509     if (k_win == -1)
 510     {
 511         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 512     }
 513
 514     sep_line(fp);
 515
 516     winPME = perfdata[k_win][i_win].nPMEnodes;
 517
 518     if (1 == ntests)
 519     {
 520         /* We stuck to a fixed number of PME-only nodes */
 521         sprintf(strbuf, "settings No. %d", k_win);
 522     }
 523     else
 524     {
 525         /* We have optimized the number of PME-only nodes */
 526         if (winPME == -1)
 527         {
 528             sprintf(strbuf, "%s", "the automatic number of PME nodes");
 529         }
 530         else
 531         {
 532             sprintf(strbuf, "%d PME nodes", winPME);
 533         }
 534     }
 535     fprintf(fp, "Best performance was achieved with %s", strbuf);
 536     if ((nrepeats > 1) && (ntests > 1))
 537     {
 538         fprintf(fp, " (see line %d)", line_win);
 539     }
 540     fprintf(fp, "\n");
 541
 542     /* Only mention settings if they were modified: */
 543     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 544     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 545     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 546                      info->nky[k_win] == info->nky[0] &&
 547                      info->nkz[k_win] == info->nkz[0]);
 548
 549     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 550     {
 551         fprintf(fp, "Optimized PME settings:\n");
 552         bCanUseOrigTPR = FALSE;
 553     }
 554     else
 555     {
 556         bCanUseOrigTPR = TRUE;
 557     }
 558
 559     if (bRefinedCoul)
 560     {
 561         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 562     }
 563
 564     if (bRefinedVdW)
 565     {
 566         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 567     }
 568
 569     if (bRefinedGrid)
 570     {
 571         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 572                 info->nkx[0], info->nky[0], info->nkz[0]);
 573     }
 574
 575     if (bCanUseOrigTPR && ntprs > 1)
 576     {
 577         fprintf(fp, "and original PME settings.\n");
 578     }
 579
 580     fflush(fp);
 581
 582     /* Return the index of the mdp file that showed the highest performance
 583      * and the optimal number of PME nodes */
 584     *index_tpr    = k_win;
 585     *npme_optimal = winPME;
 586
 587     return bCanUseOrigTPR;
 588 }
 589
 590
 591 /* Get the commands we need to set up the runs from environment variables */
 592 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char cmd_np[],
 593                               char *cmd_mdrun[], int repeats)
 594 {
 595     char      *command = NULL;
 596     char      *cp;
 597     char      *cp2;
 598     char       line[STRLEN];
 599     FILE      *fp;
 600     const char def_mpirun[]   = "mpirun";
 601     const char def_mdrun[]    = "mdrun";
 602     const char filename[]     = "benchtest.log";
 603
 604     /* This string should always be identical to the one in copyrite.c,
 605      * gmx_print_version_info() in the defined(GMX_MPI) section */
 606     const char match_mpi[]    = "MPI library:        MPI";
 607     const char match_mdrun[]  = "Program: ";
 608     const char empty_mpirun[] = "";
 609     gmx_bool   bMdrun         = FALSE;
 610     gmx_bool   bMPI           = FALSE;
 611
 612
 613     /* Get the commands we need to set up the runs from environment variables */
 614     if (!bThreads)
 615     {
 616         if ( (cp = getenv("MPIRUN")) != NULL)
 617         {
 618             *cmd_mpirun = strdup(cp);
 619         }
 620         else
 621         {
 622             *cmd_mpirun = strdup(def_mpirun);
 623         }
 624     }
 625     else
 626     {
 627         *cmd_mpirun = strdup(empty_mpirun);
 628     }
 629
 630     if ( (cp = getenv("MDRUN" )) != NULL)
 631     {
 632         *cmd_mdrun  = strdup(cp);
 633     }
 634     else
 635     {
 636         *cmd_mdrun  = strdup(def_mdrun);
 637     }
 638
 639
 640     /* If no simulations have to be performed, we are done here */
 641     if (repeats <= 0)
 642     {
 643         return;
 644     }
 645
 646     /* Run a small test to see whether mpirun + mdrun work  */
 647     fprintf(stdout, "Making sure that mdrun can be executed. ");
 648     if (bThreads)
 649     {
 650         snew(command, strlen(*cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 651         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", *cmd_mdrun, cmd_np, filename);
 652     }
 653     else
 654     {
 655         snew(command, strlen(*cmd_mpirun) + strlen(cmd_np) + strlen(*cmd_mdrun) + strlen(filename) + 50);
 656         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", *cmd_mpirun, cmd_np, *cmd_mdrun, filename);
 657     }
 658     fprintf(stdout, "Trying '%s' ... ", command);
 659     make_backup(filename);
 660     gmx_system_call(command);
 661
 662     /* Check if we find the characteristic string in the output: */
 663     if (!gmx_fexist(filename))
 664     {
 665         gmx_fatal(FARGS, "Output from test run could not be found.");
 666     }
 667
 668     fp = fopen(filename, "r");
 669     /* We need to scan the whole output file, since sometimes the queuing system
 670      * also writes stuff to stdout/err */
 671     while (!feof(fp) )
 672     {
 673         cp2 = fgets(line, STRLEN, fp);
 674         if (cp2 != NULL)
 675         {
 676             if (str_starts(line, match_mdrun) )
 677             {
 678                 bMdrun = TRUE;
 679             }
 680             if (str_starts(line, match_mpi) )
 681             {
 682                 bMPI = TRUE;
 683             }
 684         }
 685     }
 686     fclose(fp);
 687
 688     if (bThreads)
 689     {
 690         if (bMPI)
 691         {
 692             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 693                       "(%s)\n"
 694                       "seems to have been compiled with MPI instead.",
 695                       *cmd_mdrun);
 696         }
 697     }
 698     else
 699     {
 700         if (bMdrun && !bMPI)
 701         {
 702             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 703                       "(%s)\n"
 704                       "seems to have been compiled without MPI support.",
 705                       *cmd_mdrun);
 706         }
 707     }
 708
 709     if (!bMdrun)
 710     {
 711         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 712                   filename);
 713     }
 714
 715     fprintf(stdout, "passed.\n");
 716
 717     /* Clean up ... */
 718     remove(filename);
 719     sfree(command);
 720 }
 721
 722
 723 static void launch_simulation(
 724         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 725         FILE       *fp,             /* General log file */
 726         gmx_bool    bThreads,       /* whether to use threads */
 727         char       *cmd_mpirun,     /* Command for mpirun */
 728         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 729         char       *cmd_mdrun,      /* Command for mdrun */
 730         char       *args_for_mdrun, /* Arguments for mdrun */
 731         const char *simulation_tpr, /* This tpr will be simulated */
 732         int         nPMEnodes)      /* Number of PME nodes to use */
 733 {
 734     char  *command;
 735
 736
 737     /* Make enough space for the system call command,
 738      * (100 extra chars for -npme ... etc. options should suffice): */
 739     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 740
 741     /* Note that the -passall options requires args_for_mdrun to be at the end
 742      * of the command line string */
 743     if (bThreads)
 744     {
 745         sprintf(command, "%s%s-npme %d -s %s %s",
 746                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 747     }
 748     else
 749     {
 750         sprintf(command, "%s%s%s -npme %d -s %s %s",
 751                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 752     }
 753
 754     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 755     sep_line(fp);
 756     fflush(fp);
 757
 758     /* Now the real thing! */
 759     if (bLaunch)
 760     {
 761         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 762         sep_line(stdout);
 763         fflush(stdout);
 764         gmx_system_call(command);
 765     }
 766 }
 767
 768
 769 static void modify_PMEsettings(
 770         gmx_large_int_t simsteps,    /* Set this value as number of time steps */
 771         gmx_large_int_t init_step,   /* Set this value as init_step */
 772         const char     *fn_best_tpr, /* tpr file with the best performance */
 773         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 774 {
 775     t_inputrec   *ir;
 776     t_state       state;
 777     gmx_mtop_t    mtop;
 778     char          buf[200];
 779
 780     snew(ir, 1);
 781     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 782
 783     /* Reset nsteps and init_step to the value of the input .tpr file */
 784     ir->nsteps    = simsteps;
 785     ir->init_step = init_step;
 786
 787     /* Write the tpr file which will be launched */
 788     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt);
 789     fprintf(stdout, buf, ir->nsteps);
 790     fflush(stdout);
 791     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 792
 793     sfree(ir);
 794 }
 795
 796
 797 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 798
 799 /* Make additional TPR files with more computational load for the
 800  * direct space processors: */
 801 static void make_benchmark_tprs(
 802         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 803         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 804         gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs       */
 805         gmx_large_int_t statesteps,      /* Step counter in checkpoint file               */
 806         real            rmin,            /* Minimal Coulomb radius                        */
 807         real            rmax,            /* Maximal Coulomb radius                        */
 808         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 809         int            *ntprs,           /* No. of TPRs to write, each with a different
 810                                             rcoulomb and fourierspacing                   */
 811         t_inputinfo    *info,            /* Contains information about mdp file options   */
 812         FILE           *fp)              /* Write the output here                         */
 813 {
 814     int           i, j, d;
 815     t_inputrec   *ir;
 816     t_state       state;
 817     gmx_mtop_t    mtop;
 818     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 819     char          buf[200];
 820     rvec          box_size;
 821     gmx_bool      bNote = FALSE;
 822     real          add;              /* Add this to rcoul for the next test    */
 823     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 824     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 825
 826
 827     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 828             *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
 829     fprintf(stdout, buf, benchsteps);
 830     if (statesteps > 0)
 831     {
 832         sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
 833         fprintf(stdout, buf, statesteps);
 834         benchsteps += statesteps;
 835     }
 836     fprintf(stdout, ".\n");
 837
 838
 839     snew(ir, 1);
 840     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 841
 842     /* Check if some kind of PME was chosen */
 843     if (EEL_PME(ir->coulombtype) == FALSE)
 844     {
 845         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 846                   EELTYPE(eelPME));
 847     }
 848
 849     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 850     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 851           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 852     {
 853         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 854                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 855     }
 856     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 857     else if (ir->rcoulomb > ir->rlist)
 858     {
 859         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 860                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 861     }
 862
 863     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 864     {
 865         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 866         bScaleRvdw = FALSE;
 867     }
 868
 869     /* Reduce the number of steps for the benchmarks */
 870     info->orig_sim_steps = ir->nsteps;
 871     ir->nsteps           = benchsteps;
 872     /* We must not use init_step from the input tpr file for the benchmarks */
 873     info->orig_init_step = ir->init_step;
 874     ir->init_step        = 0;
 875
 876     /* For PME-switch potentials, keep the radial distance of the buffer region */
 877     nlist_buffer   = ir->rlist - ir->rcoulomb;
 878
 879     /* Determine length of triclinic box vectors */
 880     for (d = 0; d < DIM; d++)
 881     {
 882         box_size[d] = 0;
 883         for (i = 0; i < DIM; i++)
 884         {
 885             box_size[d] += state.box[d][i]*state.box[d][i];
 886         }
 887         box_size[d] = sqrt(box_size[d]);
 888     }
 889
 890     if (ir->fourier_spacing > 0)
 891     {
 892         info->fsx[0] = ir->fourier_spacing;
 893         info->fsy[0] = ir->fourier_spacing;
 894         info->fsz[0] = ir->fourier_spacing;
 895     }
 896     else
 897     {
 898         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 899         info->fsx[0] = box_size[XX]/ir->nkx;
 900         info->fsy[0] = box_size[YY]/ir->nky;
 901         info->fsz[0] = box_size[ZZ]/ir->nkz;
 902     }
 903
 904     /* If no value for the fourierspacing was provided on the command line, we
 905      * use the reconstruction from the tpr file */
 906     if (ir->fourier_spacing > 0)
 907     {
 908         /* Use the spacing from the tpr */
 909         fourierspacing = ir->fourier_spacing;
 910     }
 911     else
 912     {
 913         /* Use the maximum observed spacing */
 914         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 915     }
 916
 917     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 918
 919     /* For performance comparisons the number of particles is useful to have */
 920     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 921
 922     /* Print information about settings of which some are potentially modified: */
 923     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 924     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 925             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 926     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 927     if (EVDW_SWITCHED(ir->vdwtype))
 928     {
 929         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 930     }
 931     if (EPME_SWITCHED(ir->coulombtype))
 932     {
 933         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 934     }
 935     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 936     {
 937         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 938     }
 939
 940     /* Print a descriptive line about the tpr settings tested */
 941     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 942     fprintf(fp, " No.   scaling  rcoulomb");
 943     fprintf(fp, "  nkx  nky  nkz");
 944     fprintf(fp, "   spacing");
 945     if (evdwCUT == ir->vdwtype)
 946     {
 947         fprintf(fp, "      rvdw");
 948     }
 949     if (EPME_SWITCHED(ir->coulombtype))
 950     {
 951         fprintf(fp, "     rlist");
 952     }
 953     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 954     {
 955         fprintf(fp, " rlistlong");
 956     }
 957     fprintf(fp, "  tpr file\n");
 958
 959     /* Loop to create the requested number of tpr input files */
 960     for (j = 0; j < *ntprs; j++)
 961     {
 962         /* The first .tpr is the provided one, just need to modify nsteps,
 963          * so skip the following block */
 964         if (j != 0)
 965         {
 966             /* Determine which Coulomb radii rc to use in the benchmarks */
 967             add = (rmax-rmin)/(*ntprs-1);
 968             if (is_equal(rmin, info->rcoulomb[0]))
 969             {
 970                 ir->rcoulomb = rmin + j*add;
 971             }
 972             else if (is_equal(rmax, info->rcoulomb[0]))
 973             {
 974                 ir->rcoulomb = rmin + (j-1)*add;
 975             }
 976             else
 977             {
 978                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 979                 add          = (rmax-rmin)/(*ntprs-2);
 980                 ir->rcoulomb = rmin + (j-1)*add;
 981             }
 982
 983             /* Determine the scaling factor fac */
 984             fac = ir->rcoulomb/info->rcoulomb[0];
 985
 986             /* Scale the Fourier grid spacing */
 987             ir->nkx = ir->nky = ir->nkz = 0;
 988             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
 989
 990             /* Adjust other radii since various conditions neet to be fulfilled */
 991             if (eelPME == ir->coulombtype)
 992             {
 993                 /* plain PME, rcoulomb must be equal to rlist */
 994                 ir->rlist = ir->rcoulomb;
 995             }
 996             else
 997             {
 998                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
 999                 ir->rlist = ir->rcoulomb + nlist_buffer;
1000             }
1001
1002             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1003             {
1004                 /* For vdw cutoff, rvdw >= rlist */
1005                 ir->rvdw = max(info->rvdw[0], ir->rlist);
1006             }
1007
1008             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1009
1010         } /* end of "if (j != 0)" */
1011
1012         /* for j==0: Save the original settings
1013          * for j >0: Save modified radii and Fourier grids */
1014         info->rcoulomb[j]  = ir->rcoulomb;
1015         info->rvdw[j]      = ir->rvdw;
1016         info->nkx[j]       = ir->nkx;
1017         info->nky[j]       = ir->nky;
1018         info->nkz[j]       = ir->nkz;
1019         info->rlist[j]     = ir->rlist;
1020         info->rlistlong[j] = ir->rlistlong;
1021         info->fsx[j]       = fac*fourierspacing;
1022         info->fsy[j]       = fac*fourierspacing;
1023         info->fsz[j]       = fac*fourierspacing;
1024
1025         /* Write the benchmark tpr file */
1026         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1027         sprintf(buf, "_bench%.2d.tpr", j);
1028         strcat(fn_bench_tprs[j], buf);
1029         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1030         fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
1031         if (j > 0)
1032         {
1033             fprintf(stdout, ", scaling factor %f\n", fac);
1034         }
1035         else
1036         {
1037             fprintf(stdout, ", unmodified settings\n");
1038         }
1039
1040         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1041
1042         /* Write information about modified tpr settings to log file */
1043         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1044         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1045         fprintf(fp, " %9f ", info->fsx[j]);
1046         if (evdwCUT == ir->vdwtype)
1047         {
1048             fprintf(fp, "%10f", ir->rvdw);
1049         }
1050         if (EPME_SWITCHED(ir->coulombtype))
1051         {
1052             fprintf(fp, "%10f", ir->rlist);
1053         }
1054         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1055         {
1056             fprintf(fp, "%10f", ir->rlistlong);
1057         }
1058         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1059
1060         /* Make it clear to the user that some additional settings were modified */
1061         if (!is_equal(ir->rvdw, info->rvdw[0])
1062             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1063         {
1064             bNote = TRUE;
1065         }
1066     }
1067     if (bNote)
1068     {
1069         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1070                 "other input settings were also changed (see table above).\n"
1071                 "Please check if the modified settings are appropriate.\n");
1072     }
1073     fflush(stdout);
1074     fflush(fp);
1075     sfree(ir);
1076 }
1077
1078
1079 /* Rename the files we want to keep to some meaningful filename and
1080  * delete the rest */
1081 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1082                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1083 {
1084     char        numstring[STRLEN];
1085     char        newfilename[STRLEN];
1086     const char *fn = NULL;
1087     int         i;
1088     const char *opt;
1089
1090
1091     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1092
1093     for (i = 0; i < nfile; i++)
1094     {
1095         opt = (char *)fnm[i].opt;
1096         if (strcmp(opt, "-p") == 0)
1097         {
1098             /* do nothing; keep this file */
1099             ;
1100         }
1101         else if (strcmp(opt, "-bg") == 0)
1102         {
1103             /* Give the log file a nice name so one can later see which parameters were used */
1104             numstring[0] = '\0';
1105             if (nr > 0)
1106             {
1107                 sprintf(numstring, "_%d", nr);
1108             }
1109             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1110             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1111             {
1112                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1113                 make_backup(newfilename);
1114                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1115             }
1116         }
1117         else if (strcmp(opt, "-err") == 0)
1118         {
1119             /* This file contains the output of stderr. We want to keep it in
1120              * cases where there have been problems. */
1121             fn           = opt2fn(opt, nfile, fnm);
1122             numstring[0] = '\0';
1123             if (nr > 0)
1124             {
1125                 sprintf(numstring, "_%d", nr);
1126             }
1127             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1128             if (gmx_fexist(fn))
1129             {
1130                 if (bKeepStderr)
1131                 {
1132                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1133                     make_backup(newfilename);
1134                     rename(fn, newfilename);
1135                 }
1136                 else
1137                 {
1138                     fprintf(stdout, "Deleting %s\n", fn);
1139                     remove(fn);
1140                 }
1141             }
1142         }
1143         /* Delete the files which are created for each benchmark run: (options -b*) */
1144         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1145         {
1146             fn = opt2fn(opt, nfile, fnm);
1147             if (gmx_fexist(fn))
1148             {
1149                 fprintf(stdout, "Deleting %s\n", fn);
1150                 remove(fn);
1151             }
1152         }
1153     }
1154 }
1155
1156
1157 /* Returns the largest common factor of n1 and n2 */
1158 static int largest_common_factor(int n1, int n2)
1159 {
1160     int factor, nmax;
1161
1162     nmax = min(n1, n2);
1163     for (factor = nmax; factor > 0; factor--)
1164     {
1165         if (0 == (n1 % factor) && 0 == (n2 % factor) )
1166         {
1167             return(factor);
1168         }
1169     }
1170     return 0; /* one for the compiler */
1171 }
1172
1173 enum {
1174     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1175 };
1176
1177 /* Create a list of numbers of PME nodes to test */
1178 static void make_npme_list(
1179         const char *npmevalues_opt, /* Make a complete list with all
1180                                      * possibilities or a short list that keeps only
1181                                      * reasonable numbers of PME nodes                  */
1182         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1183         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1184         int         nnodes,         /* Total number of nodes to do the tests on         */
1185         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1186         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1187 {
1188     int i, npme, npp;
1189     int min_factor = 1;   /* We request that npp and npme have this minimal
1190                            * largest common factor (depends on npp)           */
1191     int nlistmax;         /* Max. list size                                   */
1192     int nlist;            /* Actual number of entries in list                 */
1193     int eNPME = 0;
1194
1195
1196     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1197     if (0 == strcmp(npmevalues_opt, "all") )
1198     {
1199         eNPME = eNpmeAll;
1200     }
1201     else if (0 == strcmp(npmevalues_opt, "subset") )
1202     {
1203         eNPME = eNpmeSubset;
1204     }
1205     else /* "auto" or "range" */
1206     {
1207         if (nnodes <= 64)
1208         {
1209             eNPME = eNpmeAll;
1210         }
1211         else if (nnodes < 128)
1212         {
1213             eNPME = eNpmeReduced;
1214         }
1215         else
1216         {
1217             eNPME = eNpmeSubset;
1218         }
1219     }
1220
1221     /* Calculate how many entries we could possibly have (in case of -npme all) */
1222     if (nnodes > 2)
1223     {
1224         nlistmax = maxPMEnodes - minPMEnodes + 3;
1225         if (0 == minPMEnodes)
1226         {
1227             nlistmax--;
1228         }
1229     }
1230     else
1231     {
1232         nlistmax = 1;
1233     }
1234
1235     /* Now make the actual list which is at most of size nlist */
1236     snew(*nPMEnodes, nlistmax);
1237     nlist = 0; /* start counting again, now the real entries in the list */
1238     for (i = 0; i < nlistmax - 2; i++)
1239     {
1240         npme = maxPMEnodes - i;
1241         npp  = nnodes-npme;
1242         switch (eNPME)
1243         {
1244             case eNpmeAll:
1245                 min_factor = 1;
1246                 break;
1247             case eNpmeReduced:
1248                 min_factor = 2;
1249                 break;
1250             case eNpmeSubset:
1251                 /* For 2d PME we want a common largest factor of at least the cube
1252                  * root of the number of PP nodes */
1253                 min_factor = (int) pow(npp, 1.0/3.0);
1254                 break;
1255             default:
1256                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1257                 break;
1258         }
1259         if (largest_common_factor(npp, npme) >= min_factor)
1260         {
1261             (*nPMEnodes)[nlist] = npme;
1262             nlist++;
1263         }
1264     }
1265     /* We always test 0 PME nodes and the automatic number */
1266     *nentries             = nlist + 2;
1267     (*nPMEnodes)[nlist  ] =  0;
1268     (*nPMEnodes)[nlist+1] = -1;
1269
1270     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1271     for (i = 0; i < *nentries-1; i++)
1272     {
1273         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1274     }
1275     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1276 }
1277
1278
1279 /* Allocate memory to store the performance data */
1280 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1281 {
1282     int i, j, k;
1283
1284
1285     for (k = 0; k < ntprs; k++)
1286     {
1287         snew(perfdata[k], datasets);
1288         for (i = 0; i < datasets; i++)
1289         {
1290             for (j = 0; j < repeats; j++)
1291             {
1292                 snew(perfdata[k][i].Gcycles, repeats);
1293                 snew(perfdata[k][i].ns_per_day, repeats);
1294                 snew(perfdata[k][i].PME_f_load, repeats);
1295             }
1296         }
1297     }
1298 }
1299
1300
1301 /* Check for errors on mdrun -h */
1302 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
1303 {
1304     char *command, *msg;
1305     int   ret;
1306
1307
1308     snew(command, length +  15);
1309     snew(msg, length + 500);
1310
1311     fprintf(stdout, "Making shure the benchmarks can be executed ...\n");
1312     sprintf(command, "%s-h -quiet", mdrun_cmd_line);
1313     ret = gmx_system_call(command);
1314
1315     if (0 != ret)
1316     {
1317         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1318          * get the error message from mdrun itself */
1319         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1320                 "mdrun for the source of the problem. Did you provide a command line\n"
1321                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1322                 "\n%s\n\n", command);
1323
1324         fprintf(stderr, "%s", msg);
1325         sep_line(fp);
1326         fprintf(fp, "%s", msg);
1327
1328         exit(ret);
1329     }
1330
1331     sfree(command);
1332     sfree(msg    );
1333 }
1334
1335
1336 static void do_the_tests(
1337         FILE           *fp,             /* General g_tune_pme output file         */
1338         char          **tpr_names,      /* Filenames of the input files to test   */
1339         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1340         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1341         int             npme_fixed,     /* If >= -1, test fixed number of PME
1342                                          * nodes only                             */
1343         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1344         t_perf        **perfdata,       /* Here the performace data is stored     */
1345         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1346         int             repeats,        /* Repeat each test this often            */
1347         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1348         int             nr_tprs,        /* Total number of tpr files to test      */
1349         gmx_bool        bThreads,       /* Threads or MPI?                        */
1350         char           *cmd_mpirun,     /* mpirun command string                  */
1351         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1352         char           *cmd_mdrun,      /* mdrun command string                   */
1353         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1354         const t_filenm *fnm,            /* List of filenames from command line    */
1355         int             nfile,          /* Number of files specified on the cmdl. */
1356         int             presteps,       /* DLB equilibration steps, is checked    */
1357         gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint    */
1358 {
1359     int      i, nr, k, ret, count = 0, totaltests;
1360     int     *nPMEnodes = NULL;
1361     t_perf  *pd        = NULL;
1362     int      cmdline_length;
1363     char    *command, *cmd_stub;
1364     char     buf[STRLEN];
1365     gmx_bool bResetProblem = FALSE;
1366     gmx_bool bFirst        = TRUE;
1367
1368
1369     /* This string array corresponds to the eParselog enum type at the start
1370      * of this file */
1371     const char* ParseLog[] = {
1372         "OK.",
1373         "Logfile not found!",
1374         "No timings, logfile truncated?",
1375         "Run was terminated.",
1376         "Counters were not reset properly.",
1377         "No DD grid found for these settings.",
1378         "TPX version conflict!",
1379         "mdrun was not started in parallel!",
1380         "An error occured."
1381     };
1382     char        str_PME_f_load[13];
1383
1384
1385     /* Allocate space for the mdrun command line. 100 extra characters should
1386        be more than enough for the -npme etcetera arguments */
1387     cmdline_length =  strlen(cmd_mpirun)
1388         + strlen(cmd_np)
1389         + strlen(cmd_mdrun)
1390         + strlen(cmd_args_bench)
1391         + strlen(tpr_names[0]) + 100;
1392     snew(command, cmdline_length);
1393     snew(cmd_stub, cmdline_length);
1394
1395     /* Construct the part of the command line that stays the same for all tests: */
1396     if (bThreads)
1397     {
1398         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1399     }
1400     else
1401     {
1402         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1403     }
1404
1405     /* Create a list of numbers of PME nodes to test */
1406     if (npme_fixed < -1)
1407     {
1408         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1409                        nnodes, minPMEnodes, maxPMEnodes);
1410     }
1411     else
1412     {
1413         *pmeentries  = 1;
1414         snew(nPMEnodes, 1);
1415         nPMEnodes[0] = npme_fixed;
1416         fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
1417     }
1418
1419     if (0 == repeats)
1420     {
1421         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1422         ffclose(fp);
1423         finalize(opt2fn("-p", nfile, fnm));
1424         exit(0);
1425     }
1426
1427     /* Allocate one dataset for each tpr input file: */
1428     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1429
1430     /*****************************************/
1431     /* Main loop over all tpr files to test: */
1432     /*****************************************/
1433     totaltests = nr_tprs*(*pmeentries)*repeats;
1434     for (k = 0; k < nr_tprs; k++)
1435     {
1436         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1437         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1438         /* Loop over various numbers of PME nodes: */
1439         for (i = 0; i < *pmeentries; i++)
1440         {
1441             pd = &perfdata[k][i];
1442
1443             /* Loop over the repeats for each scenario: */
1444             for (nr = 0; nr < repeats; nr++)
1445             {
1446                 pd->nPMEnodes = nPMEnodes[i];
1447
1448                 /* Add -npme and -s to the command line and save it. Note that
1449                  * the -passall (if set) options requires cmd_args_bench to be
1450                  * at the end of the command line string */
1451                 snew(pd->mdrun_cmd_line, cmdline_length);
1452                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1453                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1454
1455                 /* To prevent that all benchmarks fail due to a show-stopper argument
1456                  * on the mdrun command line, we make a quick check with mdrun -h first */
1457                 if (bFirst)
1458                 {
1459                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
1460                 }
1461                 bFirst = FALSE;
1462
1463                 /* Do a benchmark simulation: */
1464                 if (repeats > 1)
1465                 {
1466                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1467                 }
1468                 else
1469                 {
1470                     buf[0] = '\0';
1471                 }
1472                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1473                         (100.0*count)/totaltests,
1474                         k+1, nr_tprs, i+1, *pmeentries, buf);
1475                 make_backup(opt2fn("-err", nfile, fnm));
1476                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1477                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1478                 gmx_system_call(command);
1479
1480                 /* Collect the performance data from the log file; also check stderr
1481                  * for fatal errors */
1482                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1483                                     pd, nr, presteps, cpt_steps, nnodes);
1484                 if ((presteps > 0) && (ret == eParselogResetProblem))
1485                 {
1486                     bResetProblem = TRUE;
1487                 }
1488
1489                 if (-1 == pd->nPMEnodes)
1490                 {
1491                     sprintf(buf, "(%3d)", pd->guessPME);
1492                 }
1493                 else
1494                 {
1495                     sprintf(buf, "     ");
1496                 }
1497
1498                 /* Nicer output */
1499                 if (pd->PME_f_load[nr] > 0.0)
1500                 {
1501                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1502                 }
1503                 else
1504                 {
1505                     sprintf(str_PME_f_load, "%s", "         -  ");
1506                 }
1507
1508                 /* Write the data we got to disk */
1509                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1510                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1511                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1512                 {
1513                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1514                 }
1515                 fprintf(fp, "\n");
1516                 fflush(fp);
1517                 count++;
1518
1519                 /* Do some cleaning up and delete the files we do not need any more */
1520                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1521
1522                 /* If the first run with this number of processors already failed, do not try again: */
1523                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1524                 {
1525                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1526                     count += repeats-(nr+1);
1527                     break;
1528                 }
1529             } /* end of repeats loop */
1530         }     /* end of -npme loop */
1531     }         /* end of tpr file loop */
1532
1533     if (bResetProblem)
1534     {
1535         sep_line(fp);
1536         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1537         sep_line(fp);
1538     }
1539     sfree(command);
1540     sfree(cmd_stub);
1541 }
1542
1543
1544 static void check_input(
1545         int             nnodes,
1546         int             repeats,
1547         int            *ntprs,
1548         real           *rmin,
1549         real            rcoulomb,
1550         real           *rmax,
1551         real            maxPMEfraction,
1552         real            minPMEfraction,
1553         int             npme_fixed,
1554         gmx_large_int_t bench_nsteps,
1555         const t_filenm *fnm,
1556         int             nfile,
1557         int             sim_part,
1558         int             presteps,
1559         int             npargs,
1560         t_pargs        *pa)
1561 {
1562     int old;
1563
1564
1565     /* Make sure the input file exists */
1566     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1567     {
1568         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1569     }
1570
1571     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1572     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1573     {
1574         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1575                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1576     }
1577
1578     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1579     if (repeats < 0)
1580     {
1581         gmx_fatal(FARGS, "Number of repeats < 0!");
1582     }
1583
1584     /* Check number of nodes */
1585     if (nnodes < 1)
1586     {
1587         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1588     }
1589
1590     /* Automatically choose -ntpr if not set */
1591     if (*ntprs < 1)
1592     {
1593         if (nnodes < 16)
1594         {
1595             *ntprs = 1;
1596         }
1597         else
1598         {
1599             *ntprs = 3;
1600             /* Set a reasonable scaling factor for rcoulomb */
1601             if (*rmax <= 0)
1602             {
1603                 *rmax = rcoulomb * 1.2;
1604             }
1605         }
1606         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1607     }
1608     else
1609     {
1610         if (1 == *ntprs)
1611         {
1612             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1613         }
1614     }
1615
1616     /* Make shure that rmin <= rcoulomb <= rmax */
1617     if (*rmin <= 0)
1618     {
1619         *rmin = rcoulomb;
1620     }
1621     if (*rmax <= 0)
1622     {
1623         *rmax = rcoulomb;
1624     }
1625     if (!(*rmin <= *rmax) )
1626     {
1627         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1628                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1629     }
1630     /* Add test scenarios if rmin or rmax were set */
1631     if (*ntprs <= 2)
1632     {
1633         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1634         {
1635             (*ntprs)++;
1636             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1637                     *rmin, *ntprs);
1638         }
1639         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1640         {
1641             (*ntprs)++;
1642             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1643                     *rmax, *ntprs);
1644         }
1645     }
1646     old = *ntprs;
1647     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1648     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1649     {
1650         *ntprs = max(*ntprs, 2);
1651     }
1652
1653     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1654     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1655     {
1656         *ntprs = max(*ntprs, 3);
1657     }
1658
1659     if (old != *ntprs)
1660     {
1661         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1662     }
1663
1664     if (*ntprs > 1)
1665     {
1666         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1667         {
1668             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1669                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1670                     "with correspondingly adjusted PME grid settings\n");
1671             *ntprs = 1;
1672         }
1673     }
1674
1675     /* Check whether max and min fraction are within required values */
1676     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1677     {
1678         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1679     }
1680     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1681     {
1682         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1683     }
1684     if (maxPMEfraction < minPMEfraction)
1685     {
1686         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1687     }
1688
1689     /* Check whether the number of steps - if it was set - has a reasonable value */
1690     if (bench_nsteps < 0)
1691     {
1692         gmx_fatal(FARGS, "Number of steps must be positive.");
1693     }
1694
1695     if (bench_nsteps > 10000 || bench_nsteps < 100)
1696     {
1697         fprintf(stderr, "WARNING: steps=");
1698         fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
1699         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1700     }
1701
1702     if (presteps < 0)
1703     {
1704         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1705     }
1706
1707     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1708     if (*ntprs > 1)
1709     {
1710         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1711         {
1712             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1713         }
1714     }
1715
1716     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1717      * only. We need to check whether the requested number of PME-only nodes
1718      * makes sense. */
1719     if (npme_fixed > -1)
1720     {
1721         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1722         if (2*npme_fixed > nnodes)
1723         {
1724             gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
1725                       nnodes/2, nnodes, npme_fixed);
1726         }
1727         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1728         {
1729             fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
1730                     100.0*((real)npme_fixed / (real)nnodes));
1731         }
1732         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1733         {
1734             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1735                     "      fixed number of PME-only nodes is requested with -fix.\n");
1736         }
1737     }
1738 }
1739
1740
1741 /* Returns TRUE when "opt" is needed at launch time */
1742 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1743 {
1744     /* Apart from the input .tpr and the error log we need all options that were set
1745      * on the command line and that do not start with -b */
1746     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2) || 0 == strncmp(opt, "-err", 4))
1747     {
1748         return FALSE;
1749     }
1750
1751     return bSet;
1752 }
1753
1754
1755 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1756 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1757 {
1758     /* Apart from the input .tpr, all files starting with "-b" are for
1759      * _b_enchmark files exclusively */
1760     if (0 == strncmp(opt, "-s", 2))
1761     {
1762         return FALSE;
1763     }
1764
1765     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1766     {
1767         if (!bOptional || bSet)
1768         {
1769             return TRUE;
1770         }
1771         else
1772         {
1773             return FALSE;
1774         }
1775     }
1776     else
1777     {
1778         if (bIsOutput)
1779         {
1780             return FALSE;
1781         }
1782         else
1783         {
1784             if (bSet) /* These are additional input files like -cpi -ei */
1785             {
1786                 return TRUE;
1787             }
1788             else
1789             {
1790                 return FALSE;
1791             }
1792         }
1793     }
1794 }
1795
1796
1797 /* Adds 'buf' to 'str' */
1798 static void add_to_string(char **str, char *buf)
1799 {
1800     int len;
1801
1802
1803     len = strlen(*str) + strlen(buf) + 1;
1804     srenew(*str, len);
1805     strcat(*str, buf);
1806 }
1807
1808
1809 /* Create the command line for the benchmark as well as for the real run */
1810 static void create_command_line_snippets(
1811         gmx_bool  bAppendFiles,
1812         gmx_bool  bKeepAndNumCPT,
1813         gmx_bool  bResetHWay,
1814         int       presteps,
1815         int       nfile,
1816         t_filenm  fnm[],
1817         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1818         char     *cmd_args_launch[], /* command line arguments for simulation run */
1819         char      extra_args[])      /* Add this to the end of the command line */
1820 {
1821     int         i;
1822     char       *opt;
1823     const char *name;
1824     char        strbuf[STRLEN];
1825
1826
1827     /* strlen needs at least '\0' as a string: */
1828     snew(*cmd_args_bench, 1);
1829     snew(*cmd_args_launch, 1);
1830     *cmd_args_launch[0] = '\0';
1831     *cmd_args_bench[0]  = '\0';
1832
1833
1834     /*******************************************/
1835     /* 1. Process other command line arguments */
1836     /*******************************************/
1837     if (presteps > 0)
1838     {
1839         /* Add equilibration steps to benchmark options */
1840         sprintf(strbuf, "-resetstep %d ", presteps);
1841         add_to_string(cmd_args_bench, strbuf);
1842     }
1843     /* These switches take effect only at launch time */
1844     if (FALSE == bAppendFiles)
1845     {
1846         add_to_string(cmd_args_launch, "-noappend ");
1847     }
1848     if (bKeepAndNumCPT)
1849     {
1850         add_to_string(cmd_args_launch, "-cpnum ");
1851     }
1852     if (bResetHWay)
1853     {
1854         add_to_string(cmd_args_launch, "-resethway ");
1855     }
1856
1857     /********************/
1858     /* 2. Process files */
1859     /********************/
1860     for (i = 0; i < nfile; i++)
1861     {
1862         opt  = (char *)fnm[i].opt;
1863         name = opt2fn(opt, nfile, fnm);
1864
1865         /* Strbuf contains the options, now let's sort out where we need that */
1866         sprintf(strbuf, "%s %s ", opt, name);
1867
1868         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1869         {
1870             /* All options starting with -b* need the 'b' removed,
1871              * therefore overwrite strbuf */
1872             if (0 == strncmp(opt, "-b", 2))
1873             {
1874                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1875             }
1876
1877             add_to_string(cmd_args_bench, strbuf);
1878         }
1879
1880         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1881         {
1882             add_to_string(cmd_args_launch, strbuf);
1883         }
1884     }
1885
1886     add_to_string(cmd_args_bench, extra_args);
1887     add_to_string(cmd_args_launch, extra_args);
1888 }
1889
1890
1891 /* Set option opt */
1892 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1893 {
1894     int i;
1895
1896     for (i = 0; (i < nfile); i++)
1897     {
1898         if (strcmp(opt, fnm[i].opt) == 0)
1899         {
1900             fnm[i].flag |= ffSET;
1901         }
1902     }
1903 }
1904
1905
1906 /* This routine inspects the tpr file and ...
1907  * 1. checks for output files that get triggered by a tpr option. These output
1908  *    files are marked as 'set' to allow for a proper cleanup after each
1909  *    tuning run.
1910  * 2. returns the PME:PP load ratio
1911  * 3. returns rcoulomb from the tpr */
1912 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1913 {
1914     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1915     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1916     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1917     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1918     t_inputrec   ir;
1919     t_state      state;
1920     gmx_mtop_t   mtop;
1921
1922
1923     /* Check tpr file for options that trigger extra output files */
1924     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1925     bPull = (epullNO != ir.ePull);
1926     bFree = (efepNO  != ir.efep );
1927     bNM   = (eiNM    == ir.eI   );
1928     bTpi  = EI_TPI(ir.eI);
1929
1930     /* Set these output files on the tuning command-line */
1931     if (bPull)
1932     {
1933         setopt("-pf", nfile, fnm);
1934         setopt("-px", nfile, fnm);
1935     }
1936     if (bFree)
1937     {
1938         setopt("-dhdl", nfile, fnm);
1939     }
1940     if (bTpi)
1941     {
1942         setopt("-tpi", nfile, fnm);
1943         setopt("-tpid", nfile, fnm);
1944     }
1945     if (bNM)
1946     {
1947         setopt("-mtx", nfile, fnm);
1948     }
1949
1950     *rcoulomb = ir.rcoulomb;
1951
1952     /* Return the estimate for the number of PME nodes */
1953     return pme_load_estimate(&mtop, &ir, state.box);
1954 }
1955
1956
1957 static void couple_files_options(int nfile, t_filenm fnm[])
1958 {
1959     int      i;
1960     gmx_bool bSet, bBench;
1961     char    *opt;
1962     char     buf[20];
1963
1964
1965     for (i = 0; i < nfile; i++)
1966     {
1967         opt    = (char *)fnm[i].opt;
1968         bSet   = ((fnm[i].flag & ffSET) != 0);
1969         bBench = (0 == strncmp(opt, "-b", 2));
1970
1971         /* Check optional files */
1972         /* If e.g. -eo is set, then -beo also needs to be set */
1973         if (is_optional(&fnm[i]) && bSet && !bBench)
1974         {
1975             sprintf(buf, "-b%s", &opt[1]);
1976             setopt(buf, nfile, fnm);
1977         }
1978         /* If -beo is set, then -eo also needs to be! */
1979         if (is_optional(&fnm[i]) && bSet && bBench)
1980         {
1981             sprintf(buf, "-%s", &opt[2]);
1982             setopt(buf, nfile, fnm);
1983         }
1984     }
1985 }
1986
1987
1988 static double gettime()
1989 {
1990 #ifdef HAVE_GETTIMEOFDAY
1991     struct timeval t;
1992     double         seconds;
1993
1994     gettimeofday(&t, NULL);
1995
1996     seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
1997
1998     return seconds;
1999 #else
2000     double  seconds;
2001
2002     seconds = time(NULL);
2003
2004     return seconds;
2005 #endif
2006 }
2007
2008
2009 #define BENCHSTEPS (1000)
2010
2011 int gmx_tune_pme(int argc, char *argv[])
2012 {
2013     const char     *desc[] = {
2014         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically",
2015         "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines",
2016         "which setting is fastest. It will also test whether performance can",
2017         "be enhanced by shifting load from the reciprocal to the real space",
2018         "part of the Ewald sum. ",
2019         "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options",
2020         "for [TT]mdrun[tt] as needed.[PAR]",
2021         "Which executables are used can be set in the environment variables",
2022         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2023         "will be used as defaults. Note that for certain MPI frameworks you",
2024         "need to provide a machine- or hostfile. This can also be passed",
2025         "via the MPIRUN variable, e.g.[PAR]",
2026         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2027         "Please call [TT]g_tune_pme[tt] with the normal options you would pass to",
2028         "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the",
2029         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2030         "to repeat each test several times to get better statistics. [PAR]",
2031         "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads",
2032         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2033         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2034         "Typically, the first test (number 0) will be with the settings from the input",
2035         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2036         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2037         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2038         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2039         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2040         "if you just seek the optimal number of PME-only nodes; in that case",
2041         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2042         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2043         "MD systems. The dynamic load balancing needs about 100 time steps",
2044         "to adapt to local load imbalances, therefore the time step counters",
2045         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2046         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2047         "From the 'DD' load imbalance entries in the md.log output file you",
2048         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2049         "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2050         "After calling [TT]mdrun[tt] several times, detailed performance information",
2051         "is available in the output file [TT]perf.out.[tt] ",
2052         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2053         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2054         "If you want the simulation to be started automatically with the",
2055         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2056     };
2057
2058     int             nnodes         = 1;
2059     int             repeats        = 2;
2060     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2061     real            maxPMEfraction = 0.50;
2062     real            minPMEfraction = 0.25;
2063     int             maxPMEnodes, minPMEnodes;
2064     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2065     float           guessPMEnodes;
2066     int             npme_fixed     = -2;              /* If >= -1, use only this number
2067                                                        * of PME-only nodes                */
2068     int             ntprs          = 0;
2069     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2070     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2071     gmx_bool        bScaleRvdw     = TRUE;
2072     gmx_large_int_t bench_nsteps   = BENCHSTEPS;
2073     gmx_large_int_t new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2074     gmx_large_int_t cpt_steps      = 0;   /* Step counter in .cpt input file   */
2075     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2076     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2077     gmx_bool        bLaunch        = FALSE;
2078     char           *ExtraArgs      = NULL;
2079     char          **tpr_names      = NULL;
2080     const char     *simulation_tpr = NULL;
2081     int             best_npme, best_tpr;
2082     int             sim_part = 1; /* For benchmarks with checkpoint files */
2083     char            bbuf[STRLEN];
2084
2085     /* Default program names if nothing else is found */
2086     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2087     char         *cmd_args_bench, *cmd_args_launch;
2088     char         *cmd_np = NULL;
2089
2090     t_perf      **perfdata = NULL;
2091     t_inputinfo  *info;
2092     int           i;
2093     FILE         *fp;
2094     t_commrec    *cr;
2095
2096     /* Print out how long the tuning took */
2097     double          seconds;
2098
2099     static t_filenm fnm[] = {
2100         /* g_tune_pme */
2101         { efOUT, "-p",      "perf",     ffWRITE },
2102         { efLOG, "-err",    "bencherr", ffWRITE },
2103         { efTPX, "-so",     "tuned",    ffWRITE },
2104         /* mdrun: */
2105         { efTPX, NULL,      NULL,       ffREAD },
2106         { efTRN, "-o",      NULL,       ffWRITE },
2107         { efXTC, "-x",      NULL,       ffOPTWR },
2108         { efCPT, "-cpi",    NULL,       ffOPTRD },
2109         { efCPT, "-cpo",    NULL,       ffOPTWR },
2110         { efSTO, "-c",      "confout",  ffWRITE },
2111         { efEDR, "-e",      "ener",     ffWRITE },
2112         { efLOG, "-g",      "md",       ffWRITE },
2113         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2114         { efXVG, "-field",  "field",    ffOPTWR },
2115         { efXVG, "-table",  "table",    ffOPTRD },
2116         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2117         { efXVG, "-tablep", "tablep",   ffOPTRD },
2118         { efXVG, "-tableb", "table",    ffOPTRD },
2119         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2120         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2121         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2122         { efEDI, "-ei",     "sam",      ffOPTRD },
2123         { efXVG, "-eo",     "edsam",    ffOPTWR },
2124         { efGCT, "-j",      "wham",     ffOPTRD },
2125         { efGCT, "-jo",     "bam",      ffOPTWR },
2126         { efXVG, "-ffout",  "gct",      ffOPTWR },
2127         { efXVG, "-devout", "deviatie", ffOPTWR },
2128         { efXVG, "-runav",  "runaver",  ffOPTWR },
2129         { efXVG, "-px",     "pullx",    ffOPTWR },
2130         { efXVG, "-pf",     "pullf",    ffOPTWR },
2131         { efXVG, "-ro",     "rotation", ffOPTWR },
2132         { efLOG, "-ra",     "rotangles", ffOPTWR },
2133         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2134         { efLOG, "-rt",     "rottorque", ffOPTWR },
2135         { efMTX, "-mtx",    "nm",       ffOPTWR },
2136         { efNDX, "-dn",     "dipole",   ffOPTWR },
2137         /* Output files that are deleted after each benchmark run */
2138         { efTRN, "-bo",     "bench",    ffWRITE },
2139         { efXTC, "-bx",     "bench",    ffWRITE },
2140         { efCPT, "-bcpo",   "bench",    ffWRITE },
2141         { efSTO, "-bc",     "bench",    ffWRITE },
2142         { efEDR, "-be",     "bench",    ffWRITE },
2143         { efLOG, "-bg",     "bench",    ffWRITE },
2144         { efXVG, "-beo",    "benchedo", ffOPTWR },
2145         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2146         { efXVG, "-bfield", "benchfld", ffOPTWR },
2147         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2148         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2149         { efGCT, "-bjo",    "bench",    ffOPTWR },
2150         { efXVG, "-bffout", "benchgct", ffOPTWR },
2151         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2152         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2153         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2154         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2155         { efXVG, "-bro",    "benchrot", ffOPTWR },
2156         { efLOG, "-bra",    "benchrota", ffOPTWR },
2157         { efLOG, "-brs",    "benchrots", ffOPTWR },
2158         { efLOG, "-brt",    "benchrott", ffOPTWR },
2159         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2160         { efNDX, "-bdn",    "bench",    ffOPTWR }
2161     };
2162
2163     gmx_bool        bThreads     = FALSE;
2164
2165     int             nthreads = 1;
2166
2167     const char     *procstring[] =
2168     { NULL, "-np", "-n", "none", NULL };
2169     const char     *npmevalues_opt[] =
2170     { NULL, "auto", "all", "subset", NULL };
2171
2172     gmx_bool     bAppendFiles          = TRUE;
2173     gmx_bool     bKeepAndNumCPT        = FALSE;
2174     gmx_bool     bResetCountersHalfWay = FALSE;
2175     gmx_bool     bBenchmark            = TRUE;
2176
2177     output_env_t oenv = NULL;
2178
2179     t_pargs      pa[] = {
2180         /***********************/
2181         /* g_tune_pme options: */
2182         /***********************/
2183         { "-np",       FALSE, etINT,  {&nnodes},
2184           "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
2185         { "-npstring", FALSE, etENUM, {procstring},
2186           "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
2187         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2188           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2189         { "-r",        FALSE, etINT,  {&repeats},
2190           "Repeat each test this often" },
2191         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2192           "Max fraction of PME nodes to test with" },
2193         { "-min",      FALSE, etREAL, {&minPMEfraction},
2194           "Min fraction of PME nodes to test with" },
2195         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2196           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2197           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2198         { "-fix",      FALSE, etINT,  {&npme_fixed},
2199           "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2200         { "-rmax",     FALSE, etREAL, {&rmax},
2201           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2202         { "-rmin",     FALSE, etREAL, {&rmin},
2203           "If >0, minimal rcoulomb for -ntpr>1" },
2204         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2205           "Scale rvdw along with rcoulomb"},
2206         { "-ntpr",     FALSE, etINT,  {&ntprs},
2207           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2208           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2209         { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
2210           "Take timings for this many steps in the benchmark runs" },
2211         { "-resetstep", FALSE, etINT,  {&presteps},
2212           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2213         { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
2214           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2215         { "-launch",   FALSE, etBOOL, {&bLaunch},
2216           "Launch the real simulation after optimization" },
2217         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2218           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2219         /******************/
2220         /* mdrun options: */
2221         /******************/
2222         /* We let g_tune_pme parse and understand these options, because we need to
2223          * prevent that they appear on the mdrun command line for the benchmarks */
2224         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2225           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2226         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2227           "Keep and number checkpoint files (launch only)" },
2228         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2229           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2230     };
2231
2232
2233 #define NFILE asize(fnm)
2234
2235     CopyRight(stderr, argv[0]);
2236
2237     seconds = gettime();
2238
2239     parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2240                       NFILE, fnm, asize(pa), pa, asize(desc), desc,
2241                       0, NULL, &oenv);
2242
2243     /* Store the remaining unparsed command line entries in a string which
2244      * is then attached to the mdrun command line */
2245     snew(ExtraArgs, 1);
2246     ExtraArgs[0] = '\0';
2247     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2248     {
2249         add_to_string(&ExtraArgs, argv[i]);
2250         add_to_string(&ExtraArgs, " ");
2251     }
2252
2253     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2254     {
2255         bThreads = TRUE;
2256         if (opt2parg_bSet("-npstring", asize(pa), pa))
2257         {
2258             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2259         }
2260
2261         if (nnodes > 1)
2262         {
2263             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2264         }
2265         /* and now we just set this; a bit of an ugly hack*/
2266         nnodes = nthreads;
2267     }
2268     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2269     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2270
2271     /* Automatically set -beo options if -eo is set etc. */
2272     couple_files_options(NFILE, fnm);
2273
2274     /* Construct the command line arguments for benchmark runs
2275      * as well as for the simulation run */
2276     if (bThreads)
2277     {
2278         sprintf(bbuf, " -ntmpi %d ", nthreads);
2279     }
2280     else
2281     {
2282         sprintf(bbuf, " -np %d ", nnodes);
2283     }
2284
2285     cmd_np = bbuf;
2286
2287     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2288                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2289
2290     /* Read in checkpoint file if requested */
2291     sim_part = 1;
2292     if (opt2bSet("-cpi", NFILE, fnm))
2293     {
2294         snew(cr, 1);
2295         cr->duty = DUTY_PP; /* makes the following routine happy */
2296         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2297                                         &sim_part, &cpt_steps, cr,
2298                                         FALSE, NFILE, fnm, NULL, NULL);
2299         sfree(cr);
2300         sim_part++;
2301         /* sim_part will now be 1 if no checkpoint file was found */
2302         if (sim_part <= 1)
2303         {
2304             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2305         }
2306     }
2307
2308     /* Open performance output file and write header info */
2309     fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
2310
2311     /* Make a quick consistency check of command line parameters */
2312     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2313                 maxPMEfraction, minPMEfraction, npme_fixed,
2314                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2315                 asize(pa), pa);
2316
2317     /* Determine the maximum and minimum number of PME nodes to test,
2318      * the actual list of settings is build in do_the_tests(). */
2319     if ((nnodes > 2) && (npme_fixed < -1))
2320     {
2321         if (0 == strcmp(npmevalues_opt[0], "auto"))
2322         {
2323             /* Determine the npme range automatically based on the PME:PP load guess */
2324             if (guessPMEratio > 1.0)
2325             {
2326                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2327                 maxPMEnodes = nnodes/2;
2328                 minPMEnodes = nnodes/2;
2329             }
2330             else
2331             {
2332                 /* PME : PP load is in the range 0..1, let's test around the guess */
2333                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2334                 minPMEnodes   = floor(0.7*guessPMEnodes);
2335                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2336                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2337             }
2338         }
2339         else
2340         {
2341             /* Determine the npme range based on user input */
2342             maxPMEnodes = floor(maxPMEfraction*nnodes);
2343             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2344             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2345             if (maxPMEnodes != minPMEnodes)
2346             {
2347                 fprintf(stdout, "- %d ", maxPMEnodes);
2348             }
2349             fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2350         }
2351     }
2352     else
2353     {
2354         maxPMEnodes = 0;
2355         minPMEnodes = 0;
2356     }
2357
2358     /* Get the commands we need to set up the runs from environment variables */
2359     get_program_paths(bThreads, &cmd_mpirun, cmd_np, &cmd_mdrun, repeats);
2360
2361     /* Print some header info to file */
2362     sep_line(fp);
2363     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2364     sep_line(fp);
2365     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2366     if (!bThreads)
2367     {
2368         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2369         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2370         if (strcmp(procstring[0], "none") != 0)
2371         {
2372             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2373         }
2374         else
2375         {
2376             fprintf(fp, "Not setting number of nodes in system call\n");
2377         }
2378     }
2379     else
2380     {
2381         fprintf(fp, "Number of threads       : %d\n", nnodes);
2382     }
2383
2384     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2385     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2386     fprintf(fp, "Benchmark steps         : ");
2387     fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
2388     fprintf(fp, "\n");
2389     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2390     if (sim_part > 1)
2391     {
2392         fprintf(fp, "Checkpoint time step    : ");
2393         fprintf(fp, gmx_large_int_pfmt, cpt_steps);
2394         fprintf(fp, "\n");
2395     }
2396     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2397
2398     if (new_sim_nsteps >= 0)
2399     {
2400         bOverwrite = TRUE;
2401         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2402         fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
2403         fprintf(stderr, " steps.\n");
2404         fprintf(fp, "Simulation steps        : ");
2405         fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
2406         fprintf(fp, "\n");
2407     }
2408     if (repeats > 1)
2409     {
2410         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2411     }
2412
2413     if (npme_fixed >= -1)
2414     {
2415         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2416     }
2417
2418     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2419     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2420
2421     /* Allocate memory for the inputinfo struct: */
2422     snew(info, 1);
2423     info->nr_inputfiles = ntprs;
2424     for (i = 0; i < ntprs; i++)
2425     {
2426         snew(info->rcoulomb, ntprs);
2427         snew(info->rvdw, ntprs);
2428         snew(info->rlist, ntprs);
2429         snew(info->rlistlong, ntprs);
2430         snew(info->nkx, ntprs);
2431         snew(info->nky, ntprs);
2432         snew(info->nkz, ntprs);
2433         snew(info->fsx, ntprs);
2434         snew(info->fsy, ntprs);
2435         snew(info->fsz, ntprs);
2436     }
2437     /* Make alternative tpr files to test: */
2438     snew(tpr_names, ntprs);
2439     for (i = 0; i < ntprs; i++)
2440     {
2441         snew(tpr_names[i], STRLEN);
2442     }
2443
2444     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2445      * different grids could be found. */
2446     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2447                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2448
2449     /********************************************************************************/
2450     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2451     /********************************************************************************/
2452     snew(perfdata, ntprs);
2453     if (bBenchmark)
2454     {
2455         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2456                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2457                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2458
2459         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0);
2460
2461         /* Analyse the results and give a suggestion for optimal settings: */
2462         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2463                                 repeats, info, &best_tpr, &best_npme);
2464
2465         /* Take the best-performing tpr file and enlarge nsteps to original value */
2466         if (bKeepTPR && !bOverwrite)
2467         {
2468             simulation_tpr = opt2fn("-s", NFILE, fnm);
2469         }
2470         else
2471         {
2472             simulation_tpr = opt2fn("-so", NFILE, fnm);
2473             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2474                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2475         }
2476
2477         /* Let's get rid of the temporary benchmark input files */
2478         for (i = 0; i < ntprs; i++)
2479         {
2480             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2481             remove(tpr_names[i]);
2482         }
2483
2484         /* Now start the real simulation if the user requested it ... */
2485         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2486                           cmd_args_launch, simulation_tpr, best_npme);
2487     }
2488     ffclose(fp);
2489
2490     /* ... or simply print the performance results to screen: */
2491     if (!bLaunch)
2492     {
2493         finalize(opt2fn("-p", NFILE, fnm));
2494     }
2495
2496     return 0;
2497 }