src/gromacs/gmxana/gmx_tune_pme.c

   1 /*
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2008, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38
  39 #include <time.h>
  40 #ifdef HAVE_SYS_TIME_H
  41 #include <sys/time.h>
  42 #endif
  43
  44
  45
  46 #include "statutil.h"
  47 #include "typedefs.h"
  48 #include "smalloc.h"
  49 #include "vec.h"
  50 #include "copyrite.h"
  51 #include "statutil.h"
  52 #include "tpxio.h"
  53 #include "string2.h"
  54 #include "readinp.h"
  55 #include "calcgrid.h"
  56 #include "checkpoint.h"
  57 #include "macros.h"
  58 #include "gmx_ana.h"
  59 #include "names.h"
  60 #include "perf_est.h"
  61 #include "sim_util.h"
  62
  63
  64 /* Enum for situations that can occur during log file parsing, the
  65  * corresponding string entries can be found in do_the_tests() in
  66  * const char* ParseLog[] */
  67 enum {
  68     eParselogOK,
  69     eParselogNotFound,
  70     eParselogNoPerfData,
  71     eParselogTerm,
  72     eParselogResetProblem,
  73     eParselogNoDDGrid,
  74     eParselogTPXVersion,
  75     eParselogNotParallel,
  76     eParselogLargePrimeFactor,
  77     eParselogFatal,
  78     eParselogNr
  79 };
  80
  81
  82 typedef struct
  83 {
  84     int     nPMEnodes;    /* number of PME-only nodes used in this test */
  85     int     nx, ny, nz;   /* DD grid */
  86     int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
  87     double *Gcycles;      /* This can contain more than one value if doing multiple tests */
  88     double  Gcycles_Av;
  89     float  *ns_per_day;
  90     float   ns_per_day_Av;
  91     float  *PME_f_load;     /* PME mesh/force load average*/
  92     float   PME_f_load_Av;  /* Average average ;) ... */
  93     char   *mdrun_cmd_line; /* Mdrun command line used for this test */
  94 } t_perf;
  95
  96
  97 typedef struct
  98 {
  99     int             nr_inputfiles;  /* The number of tpr and mdp input files */
 100     gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
 101     gmx_large_int_t orig_init_step; /* Init step for the real simulation */
 102     real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
 103     real           *rvdw;           /* The vdW radii */
 104     real           *rlist;          /* Neighbourlist cutoff radius */
 105     real           *rlistlong;
 106     int            *nkx, *nky, *nkz;
 107     real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
 108 } t_inputinfo;
 109
 110
 111 static void sep_line(FILE *fp)
 112 {
 113     fprintf(fp, "\n------------------------------------------------------------\n");
 114 }
 115
 116
 117 /* Wrapper for system calls */
 118 static int gmx_system_call(char *command)
 119 {
 120 #ifdef GMX_NO_SYSTEM
 121     gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
 122 #else
 123     return ( system(command) );
 124 #endif
 125 }
 126
 127
 128 /* Check if string starts with substring */
 129 static gmx_bool str_starts(const char *string, const char *substring)
 130 {
 131     return ( strncmp(string, substring, strlen(substring)) == 0);
 132 }
 133
 134
 135 static void cleandata(t_perf *perfdata, int test_nr)
 136 {
 137     perfdata->Gcycles[test_nr]    = 0.0;
 138     perfdata->ns_per_day[test_nr] = 0.0;
 139     perfdata->PME_f_load[test_nr] = 0.0;
 140
 141     return;
 142 }
 143
 144
 145 static gmx_bool is_equal(real a, real b)
 146 {
 147     real diff, eps = 1.0e-7;
 148
 149
 150     diff = a - b;
 151
 152     if (diff < 0.0)
 153     {
 154         diff = -diff;
 155     }
 156
 157     if (diff < eps)
 158     {
 159         return TRUE;
 160     }
 161     else
 162     {
 163         return FALSE;
 164     }
 165 }
 166
 167
 168 static void finalize(const char *fn_out)
 169 {
 170     char  buf[STRLEN];
 171     FILE *fp;
 172
 173
 174     fp = fopen(fn_out, "r");
 175     fprintf(stdout, "\n\n");
 176
 177     while (fgets(buf, STRLEN-1, fp) != NULL)
 178     {
 179         fprintf(stdout, "%s", buf);
 180     }
 181     fclose(fp);
 182     fprintf(stdout, "\n\n");
 183 }
 184
 185
 186 enum {
 187     eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
 188 };
 189
 190 static int parse_logfile(const char *logfile, const char *errfile,
 191                          t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
 192                          int nnodes)
 193 {
 194     FILE           *fp;
 195     char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
 196     const char      matchstrdd[]  = "Domain decomposition grid";
 197     const char      matchstrcr[]  = "resetting all time and cycle counters";
 198     const char      matchstrbal[] = "Average PME mesh/force load:";
 199     const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
 200     const char      errSIG[]      = "signal, stopping at the next";
 201     int             iFound;
 202     int             procs;
 203     float           dum1, dum2, dum3, dum4;
 204     int             ndum;
 205     int             npme;
 206     gmx_large_int_t resetsteps     = -1;
 207     gmx_bool        bFoundResetStr = FALSE;
 208     gmx_bool        bResetChecked  = FALSE;
 209
 210
 211     if (!gmx_fexist(logfile))
 212     {
 213         fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
 214         cleandata(perfdata, test_nr);
 215         return eParselogNotFound;
 216     }
 217
 218     fp = fopen(logfile, "r");
 219     perfdata->PME_f_load[test_nr] = -1.0;
 220     perfdata->guessPME            = -1;
 221
 222     iFound = eFoundNothing;
 223     if (1 == nnodes)
 224     {
 225         iFound = eFoundDDStr; /* Skip some case statements */
 226     }
 227
 228     while (fgets(line, STRLEN, fp) != NULL)
 229     {
 230         /* Remove leading spaces */
 231         ltrim(line);
 232
 233         /* Check for TERM and INT signals from user: */
 234         if (strstr(line, errSIG) != NULL)
 235         {
 236             fclose(fp);
 237             cleandata(perfdata, test_nr);
 238             return eParselogTerm;
 239         }
 240
 241         /* Check whether cycle resetting  worked */
 242         if (presteps > 0 && !bFoundResetStr)
 243         {
 244             if (strstr(line, matchstrcr) != NULL)
 245             {
 246                 sprintf(dumstring, "step %s", gmx_large_int_pfmt);
 247                 sscanf(line, dumstring, &resetsteps);
 248                 bFoundResetStr = TRUE;
 249                 if (resetsteps == presteps+cpt_steps)
 250                 {
 251                     bResetChecked = TRUE;
 252                 }
 253                 else
 254                 {
 255                     sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
 256                     sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
 257                     fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
 258                             "         though they were supposed to be reset at step %s!\n",
 259                             dumstring, dumstring2);
 260                 }
 261             }
 262         }
 263
 264         /* Look for strings that appear in a certain order in the log file: */
 265         switch (iFound)
 266         {
 267             case eFoundNothing:
 268                 /* Look for domain decomp grid and separate PME nodes: */
 269                 if (str_starts(line, matchstrdd))
 270                 {
 271                     sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
 272                            &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
 273                     if (perfdata->nPMEnodes == -1)
 274                     {
 275                         perfdata->guessPME = npme;
 276                     }
 277                     else if (perfdata->nPMEnodes != npme)
 278                     {
 279                         gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
 280                     }
 281                     iFound = eFoundDDStr;
 282                 }
 283                 /* Catch a few errors that might have occured: */
 284                 else if (str_starts(line, "There is no domain decomposition for"))
 285                 {
 286                     fclose(fp);
 287                     return eParselogNoDDGrid;
 288                 }
 289                 else if (str_starts(line, "The number of nodes you selected"))
 290                 {
 291                     fclose(fp);
 292                     return eParselogLargePrimeFactor;
 293                 }
 294                 else if (str_starts(line, "reading tpx file"))
 295                 {
 296                     fclose(fp);
 297                     return eParselogTPXVersion;
 298                 }
 299                 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
 300                 {
 301                     fclose(fp);
 302                     return eParselogNotParallel;
 303                 }
 304                 break;
 305             case eFoundDDStr:
 306                 /* Look for PME mesh/force balance (not necessarily present, though) */
 307                 if (str_starts(line, matchstrbal))
 308                 {
 309                     sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
 310                 }
 311                 /* Look for matchstring */
 312                 if (str_starts(line, matchstring))
 313                 {
 314                     iFound = eFoundAccountingStr;
 315                 }
 316                 break;
 317             case eFoundAccountingStr:
 318                 /* Already found matchstring - look for cycle data */
 319                 if (str_starts(line, "Total  "))
 320                 {
 321                     sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
 322                     iFound = eFoundCycleStr;
 323                 }
 324                 break;
 325             case eFoundCycleStr:
 326                 /* Already found cycle data - look for remaining performance info and return */
 327                 if (str_starts(line, "Performance:"))
 328                 {
 329                     ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
 330                     /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
 331                     perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
 332                     fclose(fp);
 333                     if (bResetChecked || presteps == 0)
 334                     {
 335                         return eParselogOK;
 336                     }
 337                     else
 338                     {
 339                         return eParselogResetProblem;
 340                     }
 341                 }
 342                 break;
 343         }
 344     } /* while */
 345
 346     /* Close the log file */
 347     fclose(fp);
 348
 349     /* Check why there is no performance data in the log file.
 350      * Did a fatal errors occur? */
 351     if (gmx_fexist(errfile))
 352     {
 353         fp = fopen(errfile, "r");
 354         while (fgets(line, STRLEN, fp) != NULL)
 355         {
 356             if (str_starts(line, "Fatal error:") )
 357             {
 358                 if (fgets(line, STRLEN, fp) != NULL)
 359                 {
 360                     fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
 361                             "%s\n", line);
 362                 }
 363                 fclose(fp);
 364                 cleandata(perfdata, test_nr);
 365                 return eParselogFatal;
 366             }
 367         }
 368         fclose(fp);
 369     }
 370     else
 371     {
 372         fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
 373     }
 374
 375     /* Giving up ... we could not find out why there is no performance data in
 376      * the log file. */
 377     fprintf(stdout, "No performance data in log file.\n");
 378     cleandata(perfdata, test_nr);
 379
 380     return eParselogNoPerfData;
 381 }
 382
 383
 384 static gmx_bool analyze_data(
 385         FILE         *fp,
 386         const char   *fn,
 387         t_perf      **perfdata,
 388         int           nnodes,
 389         int           ntprs,
 390         int           ntests,
 391         int           nrepeats,
 392         t_inputinfo  *info,
 393         int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
 394         int          *npme_optimal) /* OUT: Optimal number of PME nodes */
 395 {
 396     int      i, j, k;
 397     int      line  = 0, line_win = -1;
 398     int      k_win = -1, i_win = -1, winPME;
 399     double   s     = 0.0; /* standard deviation */
 400     t_perf  *pd;
 401     char     strbuf[STRLEN];
 402     char     str_PME_f_load[13];
 403     gmx_bool bCanUseOrigTPR;
 404     gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
 405
 406
 407     if (nrepeats > 1)
 408     {
 409         sep_line(fp);
 410         fprintf(fp, "Summary of successful runs:\n");
 411         fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
 412         if (nnodes > 1)
 413         {
 414             fprintf(fp, "    DD grid");
 415         }
 416         fprintf(fp, "\n");
 417     }
 418
 419
 420     for (k = 0; k < ntprs; k++)
 421     {
 422         for (i = 0; i < ntests; i++)
 423         {
 424             /* Select the right dataset: */
 425             pd = &(perfdata[k][i]);
 426
 427             pd->Gcycles_Av    = 0.0;
 428             pd->PME_f_load_Av = 0.0;
 429             pd->ns_per_day_Av = 0.0;
 430
 431             if (pd->nPMEnodes == -1)
 432             {
 433                 sprintf(strbuf, "(%3d)", pd->guessPME);
 434             }
 435             else
 436             {
 437                 sprintf(strbuf, "     ");
 438             }
 439
 440             /* Get the average run time of a setting */
 441             for (j = 0; j < nrepeats; j++)
 442             {
 443                 pd->Gcycles_Av    += pd->Gcycles[j];
 444                 pd->PME_f_load_Av += pd->PME_f_load[j];
 445             }
 446             pd->Gcycles_Av    /= nrepeats;
 447             pd->PME_f_load_Av /= nrepeats;
 448
 449             for (j = 0; j < nrepeats; j++)
 450             {
 451                 if (pd->ns_per_day[j] > 0.0)
 452                 {
 453                     pd->ns_per_day_Av += pd->ns_per_day[j];
 454                 }
 455                 else
 456                 {
 457                     /* Somehow the performance number was not aquired for this run,
 458                      * therefor set the average to some negative value: */
 459                     pd->ns_per_day_Av = -1.0f*nrepeats;
 460                     break;
 461                 }
 462             }
 463             pd->ns_per_day_Av /= nrepeats;
 464
 465             /* Nicer output: */
 466             if (pd->PME_f_load_Av > 0.0)
 467             {
 468                 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
 469             }
 470             else
 471             {
 472                 sprintf(str_PME_f_load, "%s", "         -  ");
 473             }
 474
 475
 476             /* We assume we had a successful run if both averages are positive */
 477             if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
 478             {
 479                 /* Output statistics if repeats were done */
 480                 if (nrepeats > 1)
 481                 {
 482                     /* Calculate the standard deviation */
 483                     s = 0.0;
 484                     for (j = 0; j < nrepeats; j++)
 485                     {
 486                         s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
 487                     }
 488                     s /= (nrepeats - 1);
 489                     s  = sqrt(s);
 490
 491                     fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
 492                             line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
 493                             pd->ns_per_day_Av, str_PME_f_load);
 494                     if (nnodes > 1)
 495                     {
 496                         fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
 497                     }
 498                     fprintf(fp, "\n");
 499                 }
 500                 /* Store the index of the best run found so far in 'winner': */
 501                 if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
 502                 {
 503                     k_win    = k;
 504                     i_win    = i;
 505                     line_win = line;
 506                 }
 507                 line++;
 508             }
 509         }
 510     }
 511
 512     if (k_win == -1)
 513     {
 514         gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
 515     }
 516
 517     sep_line(fp);
 518
 519     winPME = perfdata[k_win][i_win].nPMEnodes;
 520
 521     if (1 == ntests)
 522     {
 523         /* We stuck to a fixed number of PME-only nodes */
 524         sprintf(strbuf, "settings No. %d", k_win);
 525     }
 526     else
 527     {
 528         /* We have optimized the number of PME-only nodes */
 529         if (winPME == -1)
 530         {
 531             sprintf(strbuf, "%s", "the automatic number of PME nodes");
 532         }
 533         else
 534         {
 535             sprintf(strbuf, "%d PME nodes", winPME);
 536         }
 537     }
 538     fprintf(fp, "Best performance was achieved with %s", strbuf);
 539     if ((nrepeats > 1) && (ntests > 1))
 540     {
 541         fprintf(fp, " (see line %d)", line_win);
 542     }
 543     fprintf(fp, "\n");
 544
 545     /* Only mention settings if they were modified: */
 546     bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
 547     bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
 548     bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
 549                      info->nky[k_win] == info->nky[0] &&
 550                      info->nkz[k_win] == info->nkz[0]);
 551
 552     if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
 553     {
 554         fprintf(fp, "Optimized PME settings:\n");
 555         bCanUseOrigTPR = FALSE;
 556     }
 557     else
 558     {
 559         bCanUseOrigTPR = TRUE;
 560     }
 561
 562     if (bRefinedCoul)
 563     {
 564         fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
 565     }
 566
 567     if (bRefinedVdW)
 568     {
 569         fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
 570     }
 571
 572     if (bRefinedGrid)
 573     {
 574         fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
 575                 info->nkx[0], info->nky[0], info->nkz[0]);
 576     }
 577
 578     if (bCanUseOrigTPR && ntprs > 1)
 579     {
 580         fprintf(fp, "and original PME settings.\n");
 581     }
 582
 583     fflush(fp);
 584
 585     /* Return the index of the mdp file that showed the highest performance
 586      * and the optimal number of PME nodes */
 587     *index_tpr    = k_win;
 588     *npme_optimal = winPME;
 589
 590     return bCanUseOrigTPR;
 591 }
 592
 593
 594 /* Get the commands we need to set up the runs from environment variables */
 595 static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char *cmd_mdrun[])
 596 {
 597     char      *cp;
 598     FILE      *fp;
 599     const char def_mpirun[]   = "mpirun";
 600     const char def_mdrun[]    = "mdrun";
 601
 602     const char empty_mpirun[] = "";
 603
 604     /* Get the commands we need to set up the runs from environment variables */
 605     if (!bThreads)
 606     {
 607         if ( (cp = getenv("MPIRUN")) != NULL)
 608         {
 609             *cmd_mpirun = strdup(cp);
 610         }
 611         else
 612         {
 613             *cmd_mpirun = strdup(def_mpirun);
 614         }
 615     }
 616     else
 617     {
 618         *cmd_mpirun = strdup(empty_mpirun);
 619     }
 620
 621     if ( (cp = getenv("MDRUN" )) != NULL)
 622     {
 623         *cmd_mdrun  = strdup(cp);
 624     }
 625     else
 626     {
 627         *cmd_mdrun  = strdup(def_mdrun);
 628     }
 629 }
 630
 631 /* Check that the commands will run mdrun (perhaps via mpirun) by
 632  * running a very quick test simulation. Requires MPI environment to
 633  * be available if applicable. */
 634 static void check_mdrun_works(gmx_bool    bThreads,
 635                               const char *cmd_mpirun,
 636                               const char *cmd_np,
 637                               const char *cmd_mdrun)
 638 {
 639     char      *command = NULL;
 640     char      *cp;
 641     char       line[STRLEN];
 642     FILE      *fp;
 643     const char filename[]     = "benchtest.log";
 644
 645     /* This string should always be identical to the one in copyrite.c,
 646      * gmx_print_version_info() in the defined(GMX_MPI) section */
 647     const char match_mpi[]    = "MPI library:        MPI";
 648     const char match_mdrun[]  = "Program: ";
 649     gmx_bool   bMdrun         = FALSE;
 650     gmx_bool   bMPI           = FALSE;
 651
 652     /* Run a small test to see whether mpirun + mdrun work  */
 653     fprintf(stdout, "Making sure that mdrun can be executed. ");
 654     if (bThreads)
 655     {
 656         snew(command, strlen(cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
 657         sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
 658     }
 659     else
 660     {
 661         snew(command, strlen(cmd_mpirun) + strlen(cmd_np) + strlen(cmd_mdrun) + strlen(filename) + 50);
 662         sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
 663     }
 664     fprintf(stdout, "Trying '%s' ... ", command);
 665     make_backup(filename);
 666     gmx_system_call(command);
 667
 668     /* Check if we find the characteristic string in the output: */
 669     if (!gmx_fexist(filename))
 670     {
 671         gmx_fatal(FARGS, "Output from test run could not be found.");
 672     }
 673
 674     fp = fopen(filename, "r");
 675     /* We need to scan the whole output file, since sometimes the queuing system
 676      * also writes stuff to stdout/err */
 677     while (!feof(fp) )
 678     {
 679         cp = fgets(line, STRLEN, fp);
 680         if (cp != NULL)
 681         {
 682             if (str_starts(line, match_mdrun) )
 683             {
 684                 bMdrun = TRUE;
 685             }
 686             if (str_starts(line, match_mpi) )
 687             {
 688                 bMPI = TRUE;
 689             }
 690         }
 691     }
 692     fclose(fp);
 693
 694     if (bThreads)
 695     {
 696         if (bMPI)
 697         {
 698             gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
 699                       "(%s)\n"
 700                       "seems to have been compiled with MPI instead.",
 701                       *cmd_mdrun);
 702         }
 703     }
 704     else
 705     {
 706         if (bMdrun && !bMPI)
 707         {
 708             gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
 709                       "(%s)\n"
 710                       "seems to have been compiled without MPI support.",
 711                       *cmd_mdrun);
 712         }
 713     }
 714
 715     if (!bMdrun)
 716     {
 717         gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
 718                   filename);
 719     }
 720
 721     fprintf(stdout, "passed.\n");
 722
 723     /* Clean up ... */
 724     remove(filename);
 725     sfree(command);
 726 }
 727
 728
 729 static void launch_simulation(
 730         gmx_bool    bLaunch,        /* Should the simulation be launched? */
 731         FILE       *fp,             /* General log file */
 732         gmx_bool    bThreads,       /* whether to use threads */
 733         char       *cmd_mpirun,     /* Command for mpirun */
 734         char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
 735         char       *cmd_mdrun,      /* Command for mdrun */
 736         char       *args_for_mdrun, /* Arguments for mdrun */
 737         const char *simulation_tpr, /* This tpr will be simulated */
 738         int         nPMEnodes)      /* Number of PME nodes to use */
 739 {
 740     char  *command;
 741
 742
 743     /* Make enough space for the system call command,
 744      * (100 extra chars for -npme ... etc. options should suffice): */
 745     snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
 746
 747     /* Note that the -passall options requires args_for_mdrun to be at the end
 748      * of the command line string */
 749     if (bThreads)
 750     {
 751         sprintf(command, "%s%s-npme %d -s %s %s",
 752                 cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
 753     }
 754     else
 755     {
 756         sprintf(command, "%s%s%s -npme %d -s %s %s",
 757                 cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
 758     }
 759
 760     fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
 761     sep_line(fp);
 762     fflush(fp);
 763
 764     /* Now the real thing! */
 765     if (bLaunch)
 766     {
 767         fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
 768         sep_line(stdout);
 769         fflush(stdout);
 770         gmx_system_call(command);
 771     }
 772 }
 773
 774
 775 static void modify_PMEsettings(
 776         gmx_large_int_t simsteps,    /* Set this value as number of time steps */
 777         gmx_large_int_t init_step,   /* Set this value as init_step */
 778         const char     *fn_best_tpr, /* tpr file with the best performance */
 779         const char     *fn_sim_tpr)  /* name of tpr file to be launched */
 780 {
 781     t_inputrec   *ir;
 782     t_state       state;
 783     gmx_mtop_t    mtop;
 784     char          buf[200];
 785
 786     snew(ir, 1);
 787     read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
 788
 789     /* Reset nsteps and init_step to the value of the input .tpr file */
 790     ir->nsteps    = simsteps;
 791     ir->init_step = init_step;
 792
 793     /* Write the tpr file which will be launched */
 794     sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, gmx_large_int_pfmt);
 795     fprintf(stdout, buf, ir->nsteps);
 796     fflush(stdout);
 797     write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
 798
 799     sfree(ir);
 800 }
 801
 802
 803 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 804
 805 /* Make additional TPR files with more computational load for the
 806  * direct space processors: */
 807 static void make_benchmark_tprs(
 808         const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
 809         char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
 810         gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs       */
 811         gmx_large_int_t statesteps,      /* Step counter in checkpoint file               */
 812         real            rmin,            /* Minimal Coulomb radius                        */
 813         real            rmax,            /* Maximal Coulomb radius                        */
 814         real            bScaleRvdw,      /* Scale rvdw along with rcoulomb                */
 815         int            *ntprs,           /* No. of TPRs to write, each with a different
 816                                             rcoulomb and fourierspacing                   */
 817         t_inputinfo    *info,            /* Contains information about mdp file options   */
 818         FILE           *fp)              /* Write the output here                         */
 819 {
 820     int           i, j, d;
 821     t_inputrec   *ir;
 822     t_state       state;
 823     gmx_mtop_t    mtop;
 824     real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
 825     char          buf[200];
 826     rvec          box_size;
 827     gmx_bool      bNote = FALSE;
 828     real          add;              /* Add this to rcoul for the next test    */
 829     real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
 830     real          fourierspacing;   /* Basic fourierspacing from tpr          */
 831
 832
 833     sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
 834             *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
 835     fprintf(stdout, buf, benchsteps);
 836     if (statesteps > 0)
 837     {
 838         sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
 839         fprintf(stdout, buf, statesteps);
 840         benchsteps += statesteps;
 841     }
 842     fprintf(stdout, ".\n");
 843
 844
 845     snew(ir, 1);
 846     read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
 847
 848     /* Check if some kind of PME was chosen */
 849     if (EEL_PME(ir->coulombtype) == FALSE)
 850     {
 851         gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
 852                   EELTYPE(eelPME));
 853     }
 854
 855     /* Check if rcoulomb == rlist, which is necessary for plain PME. */
 856     if (  (ir->cutoff_scheme != ecutsVERLET) &&
 857           (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
 858     {
 859         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
 860                   EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
 861     }
 862     /* For other PME types, rcoulomb is allowed to be smaller than rlist */
 863     else if (ir->rcoulomb > ir->rlist)
 864     {
 865         gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
 866                   EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
 867     }
 868
 869     if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
 870     {
 871         fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
 872         bScaleRvdw = FALSE;
 873     }
 874
 875     /* Reduce the number of steps for the benchmarks */
 876     info->orig_sim_steps = ir->nsteps;
 877     ir->nsteps           = benchsteps;
 878     /* We must not use init_step from the input tpr file for the benchmarks */
 879     info->orig_init_step = ir->init_step;
 880     ir->init_step        = 0;
 881
 882     /* For PME-switch potentials, keep the radial distance of the buffer region */
 883     nlist_buffer   = ir->rlist - ir->rcoulomb;
 884
 885     /* Determine length of triclinic box vectors */
 886     for (d = 0; d < DIM; d++)
 887     {
 888         box_size[d] = 0;
 889         for (i = 0; i < DIM; i++)
 890         {
 891             box_size[d] += state.box[d][i]*state.box[d][i];
 892         }
 893         box_size[d] = sqrt(box_size[d]);
 894     }
 895
 896     if (ir->fourier_spacing > 0)
 897     {
 898         info->fsx[0] = ir->fourier_spacing;
 899         info->fsy[0] = ir->fourier_spacing;
 900         info->fsz[0] = ir->fourier_spacing;
 901     }
 902     else
 903     {
 904         /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
 905         info->fsx[0] = box_size[XX]/ir->nkx;
 906         info->fsy[0] = box_size[YY]/ir->nky;
 907         info->fsz[0] = box_size[ZZ]/ir->nkz;
 908     }
 909
 910     /* If no value for the fourierspacing was provided on the command line, we
 911      * use the reconstruction from the tpr file */
 912     if (ir->fourier_spacing > 0)
 913     {
 914         /* Use the spacing from the tpr */
 915         fourierspacing = ir->fourier_spacing;
 916     }
 917     else
 918     {
 919         /* Use the maximum observed spacing */
 920         fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
 921     }
 922
 923     fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
 924
 925     /* For performance comparisons the number of particles is useful to have */
 926     fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
 927
 928     /* Print information about settings of which some are potentially modified: */
 929     fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
 930     fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
 931             box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
 932     fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
 933     if (EVDW_SWITCHED(ir->vdwtype))
 934     {
 935         fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
 936     }
 937     if (EPME_SWITCHED(ir->coulombtype))
 938     {
 939         fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
 940     }
 941     if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
 942     {
 943         fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
 944     }
 945
 946     /* Print a descriptive line about the tpr settings tested */
 947     fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
 948     fprintf(fp, " No.   scaling  rcoulomb");
 949     fprintf(fp, "  nkx  nky  nkz");
 950     fprintf(fp, "   spacing");
 951     if (evdwCUT == ir->vdwtype)
 952     {
 953         fprintf(fp, "      rvdw");
 954     }
 955     if (EPME_SWITCHED(ir->coulombtype))
 956     {
 957         fprintf(fp, "     rlist");
 958     }
 959     if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
 960     {
 961         fprintf(fp, " rlistlong");
 962     }
 963     fprintf(fp, "  tpr file\n");
 964
 965     /* Loop to create the requested number of tpr input files */
 966     for (j = 0; j < *ntprs; j++)
 967     {
 968         /* The first .tpr is the provided one, just need to modify nsteps,
 969          * so skip the following block */
 970         if (j != 0)
 971         {
 972             /* Determine which Coulomb radii rc to use in the benchmarks */
 973             add = (rmax-rmin)/(*ntprs-1);
 974             if (is_equal(rmin, info->rcoulomb[0]))
 975             {
 976                 ir->rcoulomb = rmin + j*add;
 977             }
 978             else if (is_equal(rmax, info->rcoulomb[0]))
 979             {
 980                 ir->rcoulomb = rmin + (j-1)*add;
 981             }
 982             else
 983             {
 984                 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
 985                 add          = (rmax-rmin)/(*ntprs-2);
 986                 ir->rcoulomb = rmin + (j-1)*add;
 987             }
 988
 989             /* Determine the scaling factor fac */
 990             fac = ir->rcoulomb/info->rcoulomb[0];
 991
 992             /* Scale the Fourier grid spacing */
 993             ir->nkx = ir->nky = ir->nkz = 0;
 994             calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
 995
 996             /* Adjust other radii since various conditions neet to be fulfilled */
 997             if (eelPME == ir->coulombtype)
 998             {
 999                 /* plain PME, rcoulomb must be equal to rlist */
1000                 ir->rlist = ir->rcoulomb;
1001             }
1002             else
1003             {
1004                 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1005                 ir->rlist = ir->rcoulomb + nlist_buffer;
1006             }
1007
1008             if (bScaleRvdw && evdwCUT == ir->vdwtype)
1009             {
1010                 /* For vdw cutoff, rvdw >= rlist */
1011                 ir->rvdw = max(info->rvdw[0], ir->rlist);
1012             }
1013
1014             ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
1015
1016         } /* end of "if (j != 0)" */
1017
1018         /* for j==0: Save the original settings
1019          * for j >0: Save modified radii and Fourier grids */
1020         info->rcoulomb[j]  = ir->rcoulomb;
1021         info->rvdw[j]      = ir->rvdw;
1022         info->nkx[j]       = ir->nkx;
1023         info->nky[j]       = ir->nky;
1024         info->nkz[j]       = ir->nkz;
1025         info->rlist[j]     = ir->rlist;
1026         info->rlistlong[j] = ir->rlistlong;
1027         info->fsx[j]       = fac*fourierspacing;
1028         info->fsy[j]       = fac*fourierspacing;
1029         info->fsz[j]       = fac*fourierspacing;
1030
1031         /* Write the benchmark tpr file */
1032         strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
1033         sprintf(buf, "_bench%.2d.tpr", j);
1034         strcat(fn_bench_tprs[j], buf);
1035         fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1036         fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
1037         if (j > 0)
1038         {
1039             fprintf(stdout, ", scaling factor %f\n", fac);
1040         }
1041         else
1042         {
1043             fprintf(stdout, ", unmodified settings\n");
1044         }
1045
1046         write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1047
1048         /* Write information about modified tpr settings to log file */
1049         fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1050         fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1051         fprintf(fp, " %9f ", info->fsx[j]);
1052         if (evdwCUT == ir->vdwtype)
1053         {
1054             fprintf(fp, "%10f", ir->rvdw);
1055         }
1056         if (EPME_SWITCHED(ir->coulombtype))
1057         {
1058             fprintf(fp, "%10f", ir->rlist);
1059         }
1060         if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
1061         {
1062             fprintf(fp, "%10f", ir->rlistlong);
1063         }
1064         fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
1065
1066         /* Make it clear to the user that some additional settings were modified */
1067         if (!is_equal(ir->rvdw, info->rvdw[0])
1068             || !is_equal(ir->rlistlong, info->rlistlong[0]) )
1069         {
1070             bNote = TRUE;
1071         }
1072     }
1073     if (bNote)
1074     {
1075         fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1076                 "other input settings were also changed (see table above).\n"
1077                 "Please check if the modified settings are appropriate.\n");
1078     }
1079     fflush(stdout);
1080     fflush(fp);
1081     sfree(ir);
1082 }
1083
1084
1085 /* Rename the files we want to keep to some meaningful filename and
1086  * delete the rest */
1087 static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
1088                     int nPMEnodes, int nr, gmx_bool bKeepStderr)
1089 {
1090     char        numstring[STRLEN];
1091     char        newfilename[STRLEN];
1092     const char *fn = NULL;
1093     int         i;
1094     const char *opt;
1095
1096
1097     fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1098
1099     for (i = 0; i < nfile; i++)
1100     {
1101         opt = (char *)fnm[i].opt;
1102         if (strcmp(opt, "-p") == 0)
1103         {
1104             /* do nothing; keep this file */
1105             ;
1106         }
1107         else if (strcmp(opt, "-bg") == 0)
1108         {
1109             /* Give the log file a nice name so one can later see which parameters were used */
1110             numstring[0] = '\0';
1111             if (nr > 0)
1112             {
1113                 sprintf(numstring, "_%d", nr);
1114             }
1115             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1116             if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1117             {
1118                 fprintf(stdout, "renaming log file to %s\n", newfilename);
1119                 make_backup(newfilename);
1120                 rename(opt2fn("-bg", nfile, fnm), newfilename);
1121             }
1122         }
1123         else if (strcmp(opt, "-err") == 0)
1124         {
1125             /* This file contains the output of stderr. We want to keep it in
1126              * cases where there have been problems. */
1127             fn           = opt2fn(opt, nfile, fnm);
1128             numstring[0] = '\0';
1129             if (nr > 0)
1130             {
1131                 sprintf(numstring, "_%d", nr);
1132             }
1133             sprintf(newfilename, "%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1134             if (gmx_fexist(fn))
1135             {
1136                 if (bKeepStderr)
1137                 {
1138                     fprintf(stdout, "Saving stderr output in %s\n", newfilename);
1139                     make_backup(newfilename);
1140                     rename(fn, newfilename);
1141                 }
1142                 else
1143                 {
1144                     fprintf(stdout, "Deleting %s\n", fn);
1145                     remove(fn);
1146                 }
1147             }
1148         }
1149         /* Delete the files which are created for each benchmark run: (options -b*) */
1150         else if ( (0 == strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])) )
1151         {
1152             fn = opt2fn(opt, nfile, fnm);
1153             if (gmx_fexist(fn))
1154             {
1155                 fprintf(stdout, "Deleting %s\n", fn);
1156                 remove(fn);
1157             }
1158         }
1159     }
1160 }
1161
1162
1163 /* Returns the largest common factor of n1 and n2 */
1164 static int largest_common_factor(int n1, int n2)
1165 {
1166     int factor, nmax;
1167
1168     nmax = min(n1, n2);
1169     for (factor = nmax; factor > 0; factor--)
1170     {
1171         if (0 == (n1 % factor) && 0 == (n2 % factor) )
1172         {
1173             return(factor);
1174         }
1175     }
1176     return 0; /* one for the compiler */
1177 }
1178
1179 enum {
1180     eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
1181 };
1182
1183 /* Create a list of numbers of PME nodes to test */
1184 static void make_npme_list(
1185         const char *npmevalues_opt, /* Make a complete list with all
1186                                      * possibilities or a short list that keeps only
1187                                      * reasonable numbers of PME nodes                  */
1188         int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
1189         int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
1190         int         nnodes,         /* Total number of nodes to do the tests on         */
1191         int         minPMEnodes,    /* Minimum number of PME nodes                      */
1192         int         maxPMEnodes)    /* Maximum number of PME nodes                      */
1193 {
1194     int i, npme, npp;
1195     int min_factor = 1;   /* We request that npp and npme have this minimal
1196                            * largest common factor (depends on npp)           */
1197     int nlistmax;         /* Max. list size                                   */
1198     int nlist;            /* Actual number of entries in list                 */
1199     int eNPME = 0;
1200
1201
1202     /* Do we need to check all possible values for -npme or is a reduced list enough? */
1203     if (0 == strcmp(npmevalues_opt, "all") )
1204     {
1205         eNPME = eNpmeAll;
1206     }
1207     else if (0 == strcmp(npmevalues_opt, "subset") )
1208     {
1209         eNPME = eNpmeSubset;
1210     }
1211     else /* "auto" or "range" */
1212     {
1213         if (nnodes <= 64)
1214         {
1215             eNPME = eNpmeAll;
1216         }
1217         else if (nnodes < 128)
1218         {
1219             eNPME = eNpmeReduced;
1220         }
1221         else
1222         {
1223             eNPME = eNpmeSubset;
1224         }
1225     }
1226
1227     /* Calculate how many entries we could possibly have (in case of -npme all) */
1228     if (nnodes > 2)
1229     {
1230         nlistmax = maxPMEnodes - minPMEnodes + 3;
1231         if (0 == minPMEnodes)
1232         {
1233             nlistmax--;
1234         }
1235     }
1236     else
1237     {
1238         nlistmax = 1;
1239     }
1240
1241     /* Now make the actual list which is at most of size nlist */
1242     snew(*nPMEnodes, nlistmax);
1243     nlist = 0; /* start counting again, now the real entries in the list */
1244     for (i = 0; i < nlistmax - 2; i++)
1245     {
1246         npme = maxPMEnodes - i;
1247         npp  = nnodes-npme;
1248         switch (eNPME)
1249         {
1250             case eNpmeAll:
1251                 min_factor = 1;
1252                 break;
1253             case eNpmeReduced:
1254                 min_factor = 2;
1255                 break;
1256             case eNpmeSubset:
1257                 /* For 2d PME we want a common largest factor of at least the cube
1258                  * root of the number of PP nodes */
1259                 min_factor = (int) pow(npp, 1.0/3.0);
1260                 break;
1261             default:
1262                 gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1263                 break;
1264         }
1265         if (largest_common_factor(npp, npme) >= min_factor)
1266         {
1267             (*nPMEnodes)[nlist] = npme;
1268             nlist++;
1269         }
1270     }
1271     /* We always test 0 PME nodes and the automatic number */
1272     *nentries             = nlist + 2;
1273     (*nPMEnodes)[nlist  ] =  0;
1274     (*nPMEnodes)[nlist+1] = -1;
1275
1276     fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1277     for (i = 0; i < *nentries-1; i++)
1278     {
1279         fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1280     }
1281     fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
1282 }
1283
1284
1285 /* Allocate memory to store the performance data */
1286 static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
1287 {
1288     int i, j, k;
1289
1290
1291     for (k = 0; k < ntprs; k++)
1292     {
1293         snew(perfdata[k], datasets);
1294         for (i = 0; i < datasets; i++)
1295         {
1296             for (j = 0; j < repeats; j++)
1297             {
1298                 snew(perfdata[k][i].Gcycles, repeats);
1299                 snew(perfdata[k][i].ns_per_day, repeats);
1300                 snew(perfdata[k][i].PME_f_load, repeats);
1301             }
1302         }
1303     }
1304 }
1305
1306
1307 /* Check for errors on mdrun -h */
1308 static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
1309 {
1310     char *command, *msg;
1311     int   ret;
1312
1313
1314     snew(command, length +  15);
1315     snew(msg, length + 500);
1316
1317     fprintf(stdout, "Making sure the benchmarks can be executed ...\n");
1318     /* FIXME: mdrun -h no longer actually does anything useful.
1319      * It unconditionally prints the help, ignoring all other options. */
1320     sprintf(command, "%s-h -quiet", mdrun_cmd_line);
1321     ret = gmx_system_call(command);
1322
1323     if (0 != ret)
1324     {
1325         /* To prevent confusion, do not again issue a gmx_fatal here since we already
1326          * get the error message from mdrun itself */
1327         sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
1328                 "mdrun for the source of the problem. Did you provide a command line\n"
1329                 "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
1330                 "\n%s\n\n", command);
1331
1332         fprintf(stderr, "%s", msg);
1333         sep_line(fp);
1334         fprintf(fp, "%s", msg);
1335
1336         exit(ret);
1337     }
1338
1339     sfree(command);
1340     sfree(msg    );
1341 }
1342
1343
1344 static void do_the_tests(
1345         FILE           *fp,             /* General g_tune_pme output file         */
1346         char          **tpr_names,      /* Filenames of the input files to test   */
1347         int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
1348         int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
1349         int             npme_fixed,     /* If >= -1, test fixed number of PME
1350                                          * nodes only                             */
1351         const char     *npmevalues_opt, /* Which -npme values should be tested    */
1352         t_perf        **perfdata,       /* Here the performace data is stored     */
1353         int            *pmeentries,     /* Entries in the nPMEnodes list          */
1354         int             repeats,        /* Repeat each test this often            */
1355         int             nnodes,         /* Total number of nodes = nPP + nPME     */
1356         int             nr_tprs,        /* Total number of tpr files to test      */
1357         gmx_bool        bThreads,       /* Threads or MPI?                        */
1358         char           *cmd_mpirun,     /* mpirun command string                  */
1359         char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
1360         char           *cmd_mdrun,      /* mdrun command string                   */
1361         char           *cmd_args_bench, /* arguments for mdrun in a string        */
1362         const t_filenm *fnm,            /* List of filenames from command line    */
1363         int             nfile,          /* Number of files specified on the cmdl. */
1364         int             presteps,       /* DLB equilibration steps, is checked    */
1365         gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint    */
1366 {
1367     int      i, nr, k, ret, count = 0, totaltests;
1368     int     *nPMEnodes = NULL;
1369     t_perf  *pd        = NULL;
1370     int      cmdline_length;
1371     char    *command, *cmd_stub;
1372     char     buf[STRLEN];
1373     gmx_bool bResetProblem = FALSE;
1374     gmx_bool bFirst        = TRUE;
1375
1376
1377     /* This string array corresponds to the eParselog enum type at the start
1378      * of this file */
1379     const char* ParseLog[] = {
1380         "OK.",
1381         "Logfile not found!",
1382         "No timings, logfile truncated?",
1383         "Run was terminated.",
1384         "Counters were not reset properly.",
1385         "No DD grid found for these settings.",
1386         "TPX version conflict!",
1387         "mdrun was not started in parallel!",
1388         "Number of PP nodes has a prime factor that is too large.",
1389         "An error occured."
1390     };
1391     char        str_PME_f_load[13];
1392
1393
1394     /* Allocate space for the mdrun command line. 100 extra characters should
1395        be more than enough for the -npme etcetera arguments */
1396     cmdline_length =  strlen(cmd_mpirun)
1397         + strlen(cmd_np)
1398         + strlen(cmd_mdrun)
1399         + strlen(cmd_args_bench)
1400         + strlen(tpr_names[0]) + 100;
1401     snew(command, cmdline_length);
1402     snew(cmd_stub, cmdline_length);
1403
1404     /* Construct the part of the command line that stays the same for all tests: */
1405     if (bThreads)
1406     {
1407         sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1408     }
1409     else
1410     {
1411         sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1412     }
1413
1414     /* Create a list of numbers of PME nodes to test */
1415     if (npme_fixed < -1)
1416     {
1417         make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
1418                        nnodes, minPMEnodes, maxPMEnodes);
1419     }
1420     else
1421     {
1422         *pmeentries  = 1;
1423         snew(nPMEnodes, 1);
1424         nPMEnodes[0] = npme_fixed;
1425         fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
1426     }
1427
1428     if (0 == repeats)
1429     {
1430         fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1431         ffclose(fp);
1432         finalize(opt2fn("-p", nfile, fnm));
1433         exit(0);
1434     }
1435
1436     /* Allocate one dataset for each tpr input file: */
1437     init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1438
1439     /*****************************************/
1440     /* Main loop over all tpr files to test: */
1441     /*****************************************/
1442     totaltests = nr_tprs*(*pmeentries)*repeats;
1443     for (k = 0; k < nr_tprs; k++)
1444     {
1445         fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1446         fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
1447         /* Loop over various numbers of PME nodes: */
1448         for (i = 0; i < *pmeentries; i++)
1449         {
1450             pd = &perfdata[k][i];
1451
1452             /* Loop over the repeats for each scenario: */
1453             for (nr = 0; nr < repeats; nr++)
1454             {
1455                 pd->nPMEnodes = nPMEnodes[i];
1456
1457                 /* Add -npme and -s to the command line and save it. Note that
1458                  * the -passall (if set) options requires cmd_args_bench to be
1459                  * at the end of the command line string */
1460                 snew(pd->mdrun_cmd_line, cmdline_length);
1461                 sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
1462                         cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
1463
1464                 /* To prevent that all benchmarks fail due to a show-stopper argument
1465                  * on the mdrun command line, we make a quick check with mdrun -h first */
1466                 if (bFirst)
1467                 {
1468                     make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
1469                 }
1470                 bFirst = FALSE;
1471
1472                 /* Do a benchmark simulation: */
1473                 if (repeats > 1)
1474                 {
1475                     sprintf(buf, ", pass %d/%d", nr+1, repeats);
1476                 }
1477                 else
1478                 {
1479                     buf[0] = '\0';
1480                 }
1481                 fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1482                         (100.0*count)/totaltests,
1483                         k+1, nr_tprs, i+1, *pmeentries, buf);
1484                 make_backup(opt2fn("-err", nfile, fnm));
1485                 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1486                 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1487                 gmx_system_call(command);
1488
1489                 /* Collect the performance data from the log file; also check stderr
1490                  * for fatal errors */
1491                 ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
1492                                     pd, nr, presteps, cpt_steps, nnodes);
1493                 if ((presteps > 0) && (ret == eParselogResetProblem))
1494                 {
1495                     bResetProblem = TRUE;
1496                 }
1497
1498                 if (-1 == pd->nPMEnodes)
1499                 {
1500                     sprintf(buf, "(%3d)", pd->guessPME);
1501                 }
1502                 else
1503                 {
1504                     sprintf(buf, "     ");
1505                 }
1506
1507                 /* Nicer output */
1508                 if (pd->PME_f_load[nr] > 0.0)
1509                 {
1510                     sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1511                 }
1512                 else
1513                 {
1514                     sprintf(str_PME_f_load, "%s", "         -  ");
1515                 }
1516
1517                 /* Write the data we got to disk */
1518                 fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
1519                         buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
1520                 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
1521                 {
1522                     fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1523                 }
1524                 fprintf(fp, "\n");
1525                 fflush(fp);
1526                 count++;
1527
1528                 /* Do some cleaning up and delete the files we do not need any more */
1529                 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1530
1531                 /* If the first run with this number of processors already failed, do not try again: */
1532                 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1533                 {
1534                     fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
1535                     count += repeats-(nr+1);
1536                     break;
1537                 }
1538             } /* end of repeats loop */
1539         }     /* end of -npme loop */
1540     }         /* end of tpr file loop */
1541
1542     if (bResetProblem)
1543     {
1544         sep_line(fp);
1545         fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1546         sep_line(fp);
1547     }
1548     sfree(command);
1549     sfree(cmd_stub);
1550 }
1551
1552
1553 static void check_input(
1554         int             nnodes,
1555         int             repeats,
1556         int            *ntprs,
1557         real           *rmin,
1558         real            rcoulomb,
1559         real           *rmax,
1560         real            maxPMEfraction,
1561         real            minPMEfraction,
1562         int             npme_fixed,
1563         gmx_large_int_t bench_nsteps,
1564         const t_filenm *fnm,
1565         int             nfile,
1566         int             sim_part,
1567         int             presteps,
1568         int             npargs,
1569         t_pargs        *pa)
1570 {
1571     int old;
1572
1573
1574     /* Make sure the input file exists */
1575     if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1576     {
1577         gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1578     }
1579
1580     /* Make sure that the checkpoint file is not overwritten during benchmarking */
1581     if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
1582     {
1583         gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
1584                   "The checkpoint input file must not be overwritten during the benchmarks.\n");
1585     }
1586
1587     /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1588     if (repeats < 0)
1589     {
1590         gmx_fatal(FARGS, "Number of repeats < 0!");
1591     }
1592
1593     /* Check number of nodes */
1594     if (nnodes < 1)
1595     {
1596         gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
1597     }
1598
1599     /* Automatically choose -ntpr if not set */
1600     if (*ntprs < 1)
1601     {
1602         if (nnodes < 16)
1603         {
1604             *ntprs = 1;
1605         }
1606         else
1607         {
1608             *ntprs = 3;
1609             /* Set a reasonable scaling factor for rcoulomb */
1610             if (*rmax <= 0)
1611             {
1612                 *rmax = rcoulomb * 1.2;
1613             }
1614         }
1615         fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1616     }
1617     else
1618     {
1619         if (1 == *ntprs)
1620         {
1621             fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1622         }
1623     }
1624
1625     /* Make shure that rmin <= rcoulomb <= rmax */
1626     if (*rmin <= 0)
1627     {
1628         *rmin = rcoulomb;
1629     }
1630     if (*rmax <= 0)
1631     {
1632         *rmax = rcoulomb;
1633     }
1634     if (!(*rmin <= *rmax) )
1635     {
1636         gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
1637                   "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
1638     }
1639     /* Add test scenarios if rmin or rmax were set */
1640     if (*ntprs <= 2)
1641     {
1642         if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
1643         {
1644             (*ntprs)++;
1645             fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
1646                     *rmin, *ntprs);
1647         }
1648         if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
1649         {
1650             (*ntprs)++;
1651             fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
1652                     *rmax, *ntprs);
1653         }
1654     }
1655     old = *ntprs;
1656     /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1657     if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
1658     {
1659         *ntprs = max(*ntprs, 2);
1660     }
1661
1662     /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1663     if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
1664     {
1665         *ntprs = max(*ntprs, 3);
1666     }
1667
1668     if (old != *ntprs)
1669     {
1670         fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1671     }
1672
1673     if (*ntprs > 1)
1674     {
1675         if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
1676         {
1677             fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1678                     "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1679                     "with correspondingly adjusted PME grid settings\n");
1680             *ntprs = 1;
1681         }
1682     }
1683
1684     /* Check whether max and min fraction are within required values */
1685     if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1686     {
1687         gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1688     }
1689     if (minPMEfraction > 0.5 || minPMEfraction < 0)
1690     {
1691         gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1692     }
1693     if (maxPMEfraction < minPMEfraction)
1694     {
1695         gmx_fatal(FARGS, "-max must be larger or equal to -min");
1696     }
1697
1698     /* Check whether the number of steps - if it was set - has a reasonable value */
1699     if (bench_nsteps < 0)
1700     {
1701         gmx_fatal(FARGS, "Number of steps must be positive.");
1702     }
1703
1704     if (bench_nsteps > 10000 || bench_nsteps < 100)
1705     {
1706         fprintf(stderr, "WARNING: steps=");
1707         fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
1708         fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
1709     }
1710
1711     if (presteps < 0)
1712     {
1713         gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1714     }
1715
1716     /* Check for rcoulomb scaling if more than one .tpr file is tested */
1717     if (*ntprs > 1)
1718     {
1719         if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
1720         {
1721             fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
1722         }
1723     }
1724
1725     /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1726      * only. We need to check whether the requested number of PME-only nodes
1727      * makes sense. */
1728     if (npme_fixed > -1)
1729     {
1730         /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1731         if (2*npme_fixed > nnodes)
1732         {
1733             gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
1734                       nnodes/2, nnodes, npme_fixed);
1735         }
1736         if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
1737         {
1738             fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
1739                     100.0*((real)npme_fixed / (real)nnodes));
1740         }
1741         if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1742         {
1743             fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
1744                     "      fixed number of PME-only nodes is requested with -fix.\n");
1745         }
1746     }
1747 }
1748
1749
1750 /* Returns TRUE when "opt" is needed at launch time */
1751 static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
1752 {
1753     /* Apart from the input .tpr and the output log files we need all options that
1754      * were set on the command line and that do not start with -b */
1755     if    (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2)
1756            || 0 == strncmp(opt, "-err", 4) || 0 == strncmp(opt, "-p", 2) )
1757     {
1758         return FALSE;
1759     }
1760
1761     return bSet;
1762 }
1763
1764
1765 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1766 static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1767 {
1768     /* Apart from the input .tpr, all files starting with "-b" are for
1769      * _b_enchmark files exclusively */
1770     if (0 == strncmp(opt, "-s", 2))
1771     {
1772         return FALSE;
1773     }
1774
1775     if (0 == strncmp(opt, "-b", 2) || 0 == strncmp(opt, "-s", 2))
1776     {
1777         if (!bOptional || bSet)
1778         {
1779             return TRUE;
1780         }
1781         else
1782         {
1783             return FALSE;
1784         }
1785     }
1786     else
1787     {
1788         if (bIsOutput)
1789         {
1790             return FALSE;
1791         }
1792         else
1793         {
1794             if (bSet) /* These are additional input files like -cpi -ei */
1795             {
1796                 return TRUE;
1797             }
1798             else
1799             {
1800                 return FALSE;
1801             }
1802         }
1803     }
1804 }
1805
1806
1807 /* Adds 'buf' to 'str' */
1808 static void add_to_string(char **str, char *buf)
1809 {
1810     int len;
1811
1812
1813     len = strlen(*str) + strlen(buf) + 1;
1814     srenew(*str, len);
1815     strcat(*str, buf);
1816 }
1817
1818
1819 /* Create the command line for the benchmark as well as for the real run */
1820 static void create_command_line_snippets(
1821         gmx_bool  bAppendFiles,
1822         gmx_bool  bKeepAndNumCPT,
1823         gmx_bool  bResetHWay,
1824         int       presteps,
1825         int       nfile,
1826         t_filenm  fnm[],
1827         char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
1828         char     *cmd_args_launch[], /* command line arguments for simulation run */
1829         char      extra_args[])      /* Add this to the end of the command line */
1830 {
1831     int         i;
1832     char       *opt;
1833     const char *name;
1834     char        strbuf[STRLEN];
1835
1836
1837     /* strlen needs at least '\0' as a string: */
1838     snew(*cmd_args_bench, 1);
1839     snew(*cmd_args_launch, 1);
1840     *cmd_args_launch[0] = '\0';
1841     *cmd_args_bench[0]  = '\0';
1842
1843
1844     /*******************************************/
1845     /* 1. Process other command line arguments */
1846     /*******************************************/
1847     if (presteps > 0)
1848     {
1849         /* Add equilibration steps to benchmark options */
1850         sprintf(strbuf, "-resetstep %d ", presteps);
1851         add_to_string(cmd_args_bench, strbuf);
1852     }
1853     /* These switches take effect only at launch time */
1854     if (FALSE == bAppendFiles)
1855     {
1856         add_to_string(cmd_args_launch, "-noappend ");
1857     }
1858     if (bKeepAndNumCPT)
1859     {
1860         add_to_string(cmd_args_launch, "-cpnum ");
1861     }
1862     if (bResetHWay)
1863     {
1864         add_to_string(cmd_args_launch, "-resethway ");
1865     }
1866
1867     /********************/
1868     /* 2. Process files */
1869     /********************/
1870     for (i = 0; i < nfile; i++)
1871     {
1872         opt  = (char *)fnm[i].opt;
1873         name = opt2fn(opt, nfile, fnm);
1874
1875         /* Strbuf contains the options, now let's sort out where we need that */
1876         sprintf(strbuf, "%s %s ", opt, name);
1877
1878         if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])) )
1879         {
1880             /* All options starting with -b* need the 'b' removed,
1881              * therefore overwrite strbuf */
1882             if (0 == strncmp(opt, "-b", 2))
1883             {
1884                 sprintf(strbuf, "-%s %s ", &opt[2], name);
1885             }
1886
1887             add_to_string(cmd_args_bench, strbuf);
1888         }
1889
1890         if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)) )
1891         {
1892             add_to_string(cmd_args_launch, strbuf);
1893         }
1894     }
1895
1896     add_to_string(cmd_args_bench, extra_args);
1897     add_to_string(cmd_args_launch, extra_args);
1898 }
1899
1900
1901 /* Set option opt */
1902 static void setopt(const char *opt, int nfile, t_filenm fnm[])
1903 {
1904     int i;
1905
1906     for (i = 0; (i < nfile); i++)
1907     {
1908         if (strcmp(opt, fnm[i].opt) == 0)
1909         {
1910             fnm[i].flag |= ffSET;
1911         }
1912     }
1913 }
1914
1915
1916 /* This routine inspects the tpr file and ...
1917  * 1. checks for output files that get triggered by a tpr option. These output
1918  *    files are marked as 'set' to allow for a proper cleanup after each
1919  *    tuning run.
1920  * 2. returns the PME:PP load ratio
1921  * 3. returns rcoulomb from the tpr */
1922 static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
1923 {
1924     gmx_bool     bPull;     /* Is pulling requested in .tpr file?             */
1925     gmx_bool     bTpi;      /* Is test particle insertion requested?          */
1926     gmx_bool     bFree;     /* Is a free energy simulation requested?         */
1927     gmx_bool     bNM;       /* Is a normal mode analysis requested?           */
1928     t_inputrec   ir;
1929     t_state      state;
1930     gmx_mtop_t   mtop;
1931
1932
1933     /* Check tpr file for options that trigger extra output files */
1934     read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
1935     bPull = (epullNO != ir.ePull);
1936     bFree = (efepNO  != ir.efep );
1937     bNM   = (eiNM    == ir.eI   );
1938     bTpi  = EI_TPI(ir.eI);
1939
1940     /* Set these output files on the tuning command-line */
1941     if (bPull)
1942     {
1943         setopt("-pf", nfile, fnm);
1944         setopt("-px", nfile, fnm);
1945     }
1946     if (bFree)
1947     {
1948         setopt("-dhdl", nfile, fnm);
1949     }
1950     if (bTpi)
1951     {
1952         setopt("-tpi", nfile, fnm);
1953         setopt("-tpid", nfile, fnm);
1954     }
1955     if (bNM)
1956     {
1957         setopt("-mtx", nfile, fnm);
1958     }
1959
1960     *rcoulomb = ir.rcoulomb;
1961
1962     /* Return the estimate for the number of PME nodes */
1963     return pme_load_estimate(&mtop, &ir, state.box);
1964 }
1965
1966
1967 static void couple_files_options(int nfile, t_filenm fnm[])
1968 {
1969     int      i;
1970     gmx_bool bSet, bBench;
1971     char    *opt;
1972     char     buf[20];
1973
1974
1975     for (i = 0; i < nfile; i++)
1976     {
1977         opt    = (char *)fnm[i].opt;
1978         bSet   = ((fnm[i].flag & ffSET) != 0);
1979         bBench = (0 == strncmp(opt, "-b", 2));
1980
1981         /* Check optional files */
1982         /* If e.g. -eo is set, then -beo also needs to be set */
1983         if (is_optional(&fnm[i]) && bSet && !bBench)
1984         {
1985             sprintf(buf, "-b%s", &opt[1]);
1986             setopt(buf, nfile, fnm);
1987         }
1988         /* If -beo is set, then -eo also needs to be! */
1989         if (is_optional(&fnm[i]) && bSet && bBench)
1990         {
1991             sprintf(buf, "-%s", &opt[2]);
1992             setopt(buf, nfile, fnm);
1993         }
1994     }
1995 }
1996
1997
1998 #define BENCHSTEPS (1000)
1999
2000 int gmx_tune_pme(int argc, char *argv[])
2001 {
2002     const char     *desc[] = {
2003         "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically",
2004         "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines",
2005         "which setting is fastest. It will also test whether performance can",
2006         "be enhanced by shifting load from the reciprocal to the real space",
2007         "part of the Ewald sum. ",
2008         "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options",
2009         "for [TT]mdrun[tt] as needed.[PAR]",
2010         "Which executables are used can be set in the environment variables",
2011         "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
2012         "will be used as defaults. Note that for certain MPI frameworks you",
2013         "need to provide a machine- or hostfile. This can also be passed",
2014         "via the MPIRUN variable, e.g.[PAR]",
2015         "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
2016         "Please call [TT]g_tune_pme[tt] with the normal options you would pass to",
2017         "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the",
2018         "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2019         "to repeat each test several times to get better statistics. [PAR]",
2020         "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads",
2021         "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
2022         "written with enlarged cutoffs and smaller Fourier grids respectively.",
2023         "Typically, the first test (number 0) will be with the settings from the input",
2024         "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2025         "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
2026         "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2027         "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
2028         "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2029         "if you just seek the optimal number of PME-only nodes; in that case",
2030         "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
2031         "For the benchmark runs, the default of 1000 time steps should suffice for most",
2032         "MD systems. The dynamic load balancing needs about 100 time steps",
2033         "to adapt to local load imbalances, therefore the time step counters",
2034         "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2035         "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
2036         "From the 'DD' load imbalance entries in the md.log output file you",
2037         "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
2038         "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2039         "After calling [TT]mdrun[tt] several times, detailed performance information",
2040         "is available in the output file [TT]perf.out.[tt] ",
2041         "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2042         "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
2043         "If you want the simulation to be started automatically with the",
2044         "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2045     };
2046
2047     int             nnodes         = 1;
2048     int             repeats        = 2;
2049     int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
2050     real            maxPMEfraction = 0.50;
2051     real            minPMEfraction = 0.25;
2052     int             maxPMEnodes, minPMEnodes;
2053     float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
2054     float           guessPMEnodes;
2055     int             npme_fixed     = -2;              /* If >= -1, use only this number
2056                                                        * of PME-only nodes                */
2057     int             ntprs          = 0;
2058     real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2059     real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
2060     gmx_bool        bScaleRvdw     = TRUE;
2061     gmx_large_int_t bench_nsteps   = BENCHSTEPS;
2062     gmx_large_int_t new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
2063     gmx_large_int_t cpt_steps      = 0;   /* Step counter in .cpt input file   */
2064     int             presteps       = 100; /* Do a full cycle reset after presteps steps */
2065     gmx_bool        bOverwrite     = FALSE, bKeepTPR;
2066     gmx_bool        bLaunch        = FALSE;
2067     char           *ExtraArgs      = NULL;
2068     char          **tpr_names      = NULL;
2069     const char     *simulation_tpr = NULL;
2070     int             best_npme, best_tpr;
2071     int             sim_part = 1; /* For benchmarks with checkpoint files */
2072     char            bbuf[STRLEN];
2073
2074     /* Default program names if nothing else is found */
2075     char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
2076     char         *cmd_args_bench, *cmd_args_launch;
2077     char         *cmd_np = NULL;
2078
2079     t_perf      **perfdata = NULL;
2080     t_inputinfo  *info;
2081     int           i;
2082     FILE         *fp;
2083     t_commrec    *cr;
2084
2085     /* Print out how long the tuning took */
2086     double          seconds;
2087
2088     static t_filenm fnm[] = {
2089         /* g_tune_pme */
2090         { efOUT, "-p",      "perf",     ffWRITE },
2091         { efLOG, "-err",    "bencherr", ffWRITE },
2092         { efTPX, "-so",     "tuned",    ffWRITE },
2093         /* mdrun: */
2094         { efTPX, NULL,      NULL,       ffREAD },
2095         { efTRN, "-o",      NULL,       ffWRITE },
2096         { efXTC, "-x",      NULL,       ffOPTWR },
2097         { efCPT, "-cpi",    NULL,       ffOPTRD },
2098         { efCPT, "-cpo",    NULL,       ffOPTWR },
2099         { efSTO, "-c",      "confout",  ffWRITE },
2100         { efEDR, "-e",      "ener",     ffWRITE },
2101         { efLOG, "-g",      "md",       ffWRITE },
2102         { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
2103         { efXVG, "-field",  "field",    ffOPTWR },
2104         { efXVG, "-table",  "table",    ffOPTRD },
2105         { efXVG, "-tabletf", "tabletf",   ffOPTRD },
2106         { efXVG, "-tablep", "tablep",   ffOPTRD },
2107         { efXVG, "-tableb", "table",    ffOPTRD },
2108         { efTRX, "-rerun",  "rerun",    ffOPTRD },
2109         { efXVG, "-tpi",    "tpi",      ffOPTWR },
2110         { efXVG, "-tpid",   "tpidist",  ffOPTWR },
2111         { efEDI, "-ei",     "sam",      ffOPTRD },
2112         { efXVG, "-eo",     "edsam",    ffOPTWR },
2113         { efXVG, "-devout", "deviatie", ffOPTWR },
2114         { efXVG, "-runav",  "runaver",  ffOPTWR },
2115         { efXVG, "-px",     "pullx",    ffOPTWR },
2116         { efXVG, "-pf",     "pullf",    ffOPTWR },
2117         { efXVG, "-ro",     "rotation", ffOPTWR },
2118         { efLOG, "-ra",     "rotangles", ffOPTWR },
2119         { efLOG, "-rs",     "rotslabs", ffOPTWR },
2120         { efLOG, "-rt",     "rottorque", ffOPTWR },
2121         { efMTX, "-mtx",    "nm",       ffOPTWR },
2122         { efNDX, "-dn",     "dipole",   ffOPTWR },
2123         /* Output files that are deleted after each benchmark run */
2124         { efTRN, "-bo",     "bench",    ffWRITE },
2125         { efXTC, "-bx",     "bench",    ffWRITE },
2126         { efCPT, "-bcpo",   "bench",    ffWRITE },
2127         { efSTO, "-bc",     "bench",    ffWRITE },
2128         { efEDR, "-be",     "bench",    ffWRITE },
2129         { efLOG, "-bg",     "bench",    ffWRITE },
2130         { efXVG, "-beo",    "benchedo", ffOPTWR },
2131         { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
2132         { efXVG, "-bfield", "benchfld", ffOPTWR },
2133         { efXVG, "-btpi",   "benchtpi", ffOPTWR },
2134         { efXVG, "-btpid",  "benchtpid", ffOPTWR },
2135         { efXVG, "-bdevout", "benchdev", ffOPTWR },
2136         { efXVG, "-brunav", "benchrnav", ffOPTWR },
2137         { efXVG, "-bpx",    "benchpx",  ffOPTWR },
2138         { efXVG, "-bpf",    "benchpf",  ffOPTWR },
2139         { efXVG, "-bro",    "benchrot", ffOPTWR },
2140         { efLOG, "-bra",    "benchrota", ffOPTWR },
2141         { efLOG, "-brs",    "benchrots", ffOPTWR },
2142         { efLOG, "-brt",    "benchrott", ffOPTWR },
2143         { efMTX, "-bmtx",   "benchn",   ffOPTWR },
2144         { efNDX, "-bdn",    "bench",    ffOPTWR }
2145     };
2146
2147     gmx_bool        bThreads     = FALSE;
2148
2149     int             nthreads = 1;
2150
2151     const char     *procstring[] =
2152     { NULL, "-np", "-n", "none", NULL };
2153     const char     *npmevalues_opt[] =
2154     { NULL, "auto", "all", "subset", NULL };
2155
2156     gmx_bool     bAppendFiles          = TRUE;
2157     gmx_bool     bKeepAndNumCPT        = FALSE;
2158     gmx_bool     bResetCountersHalfWay = FALSE;
2159     gmx_bool     bBenchmark            = TRUE;
2160
2161     output_env_t oenv = NULL;
2162
2163     t_pargs      pa[] = {
2164         /***********************/
2165         /* g_tune_pme options: */
2166         /***********************/
2167         { "-np",       FALSE, etINT,  {&nnodes},
2168           "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
2169         { "-npstring", FALSE, etENUM, {procstring},
2170           "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
2171         { "-ntmpi",    FALSE, etINT,  {&nthreads},
2172           "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
2173         { "-r",        FALSE, etINT,  {&repeats},
2174           "Repeat each test this often" },
2175         { "-max",      FALSE, etREAL, {&maxPMEfraction},
2176           "Max fraction of PME nodes to test with" },
2177         { "-min",      FALSE, etREAL, {&minPMEfraction},
2178           "Min fraction of PME nodes to test with" },
2179         { "-npme",     FALSE, etENUM, {npmevalues_opt},
2180           "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
2181           "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
2182         { "-fix",      FALSE, etINT,  {&npme_fixed},
2183           "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
2184         { "-rmax",     FALSE, etREAL, {&rmax},
2185           "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
2186         { "-rmin",     FALSE, etREAL, {&rmin},
2187           "If >0, minimal rcoulomb for -ntpr>1" },
2188         { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
2189           "Scale rvdw along with rcoulomb"},
2190         { "-ntpr",     FALSE, etINT,  {&ntprs},
2191           "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
2192           "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
2193         { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
2194           "Take timings for this many steps in the benchmark runs" },
2195         { "-resetstep", FALSE, etINT,  {&presteps},
2196           "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
2197         { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
2198           "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
2199         { "-launch",   FALSE, etBOOL, {&bLaunch},
2200           "Launch the real simulation after optimization" },
2201         { "-bench",    FALSE, etBOOL, {&bBenchmark},
2202           "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
2203         /******************/
2204         /* mdrun options: */
2205         /******************/
2206         /* We let g_tune_pme parse and understand these options, because we need to
2207          * prevent that they appear on the mdrun command line for the benchmarks */
2208         { "-append",   FALSE, etBOOL, {&bAppendFiles},
2209           "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
2210         { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
2211           "Keep and number checkpoint files (launch only)" },
2212         { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
2213           "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
2214     };
2215
2216 #define NFILE asize(fnm)
2217
2218     seconds = gmx_gettime();
2219
2220     if (!parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
2221                            NFILE, fnm, asize(pa), pa, asize(desc), desc,
2222                            0, NULL, &oenv))
2223     {
2224         return 0;
2225     }
2226
2227     /* Store the remaining unparsed command line entries in a string which
2228      * is then attached to the mdrun command line */
2229     snew(ExtraArgs, 1);
2230     ExtraArgs[0] = '\0';
2231     for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2232     {
2233         add_to_string(&ExtraArgs, argv[i]);
2234         add_to_string(&ExtraArgs, " ");
2235     }
2236
2237     if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2238     {
2239         bThreads = TRUE;
2240         if (opt2parg_bSet("-npstring", asize(pa), pa))
2241         {
2242             fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2243         }
2244
2245         if (nnodes > 1)
2246         {
2247             gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2248         }
2249         /* and now we just set this; a bit of an ugly hack*/
2250         nnodes = nthreads;
2251     }
2252     /* Check for PME:PP ratio and whether tpr triggers additional output files */
2253     guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2254
2255     /* Automatically set -beo options if -eo is set etc. */
2256     couple_files_options(NFILE, fnm);
2257
2258     /* Construct the command line arguments for benchmark runs
2259      * as well as for the simulation run */
2260     if (bThreads)
2261     {
2262         sprintf(bbuf, " -ntmpi %d ", nthreads);
2263     }
2264     else
2265     {
2266         /* This string will be used for MPI runs and will appear after the
2267          * mpirun command. */
2268         if (strcmp(procstring[0], "none") != 0)
2269         {
2270             sprintf(bbuf, " %s %d ", procstring[0], nnodes);
2271         }
2272         else
2273         {
2274             sprintf(bbuf, " ");
2275         }
2276     }
2277
2278     cmd_np = bbuf;
2279
2280     create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
2281                                  NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
2282
2283     /* Read in checkpoint file if requested */
2284     sim_part = 1;
2285     if (opt2bSet("-cpi", NFILE, fnm))
2286     {
2287         snew(cr, 1);
2288         cr->duty = DUTY_PP; /* makes the following routine happy */
2289         read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
2290                                         &sim_part, &cpt_steps, cr,
2291                                         FALSE, NFILE, fnm, NULL, NULL);
2292         sfree(cr);
2293         sim_part++;
2294         /* sim_part will now be 1 if no checkpoint file was found */
2295         if (sim_part <= 1)
2296         {
2297             gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
2298         }
2299     }
2300
2301     /* Open performance output file and write header info */
2302     fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
2303
2304     /* Make a quick consistency check of command line parameters */
2305     check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
2306                 maxPMEfraction, minPMEfraction, npme_fixed,
2307                 bench_nsteps, fnm, NFILE, sim_part, presteps,
2308                 asize(pa), pa);
2309
2310     /* Determine the maximum and minimum number of PME nodes to test,
2311      * the actual list of settings is build in do_the_tests(). */
2312     if ((nnodes > 2) && (npme_fixed < -1))
2313     {
2314         if (0 == strcmp(npmevalues_opt[0], "auto"))
2315         {
2316             /* Determine the npme range automatically based on the PME:PP load guess */
2317             if (guessPMEratio > 1.0)
2318             {
2319                 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2320                 maxPMEnodes = nnodes/2;
2321                 minPMEnodes = nnodes/2;
2322             }
2323             else
2324             {
2325                 /* PME : PP load is in the range 0..1, let's test around the guess */
2326                 guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
2327                 minPMEnodes   = floor(0.7*guessPMEnodes);
2328                 maxPMEnodes   =  ceil(1.6*guessPMEnodes);
2329                 maxPMEnodes   = min(maxPMEnodes, nnodes/2);
2330             }
2331         }
2332         else
2333         {
2334             /* Determine the npme range based on user input */
2335             maxPMEnodes = floor(maxPMEfraction*nnodes);
2336             minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
2337             fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2338             if (maxPMEnodes != minPMEnodes)
2339             {
2340                 fprintf(stdout, "- %d ", maxPMEnodes);
2341             }
2342             fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
2343         }
2344     }
2345     else
2346     {
2347         maxPMEnodes = 0;
2348         minPMEnodes = 0;
2349     }
2350
2351     /* Get the commands we need to set up the runs from environment variables */
2352     get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2353     if (bBenchmark && repeats > 0)
2354     {
2355         check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun);
2356     }
2357
2358     /* Print some header info to file */
2359     sep_line(fp);
2360     fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
2361     sep_line(fp);
2362     fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
2363     if (!bThreads)
2364     {
2365         fprintf(fp, "Number of nodes         : %d\n", nnodes);
2366         fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
2367         if (strcmp(procstring[0], "none") != 0)
2368         {
2369             fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
2370         }
2371         else
2372         {
2373             fprintf(fp, "Not setting number of nodes in system call\n");
2374         }
2375     }
2376     else
2377     {
2378         fprintf(fp, "Number of threads       : %d\n", nnodes);
2379     }
2380
2381     fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
2382     fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
2383     fprintf(fp, "Benchmark steps         : ");
2384     fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
2385     fprintf(fp, "\n");
2386     fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2387     if (sim_part > 1)
2388     {
2389         fprintf(fp, "Checkpoint time step    : ");
2390         fprintf(fp, gmx_large_int_pfmt, cpt_steps);
2391         fprintf(fp, "\n");
2392     }
2393     fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2394
2395     if (new_sim_nsteps >= 0)
2396     {
2397         bOverwrite = TRUE;
2398         fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2399         fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
2400         fprintf(stderr, " steps.\n");
2401         fprintf(fp, "Simulation steps        : ");
2402         fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
2403         fprintf(fp, "\n");
2404     }
2405     if (repeats > 1)
2406     {
2407         fprintf(fp, "Repeats for each test   : %d\n", repeats);
2408     }
2409
2410     if (npme_fixed >= -1)
2411     {
2412         fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
2413     }
2414
2415     fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
2416     fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
2417
2418     /* Allocate memory for the inputinfo struct: */
2419     snew(info, 1);
2420     info->nr_inputfiles = ntprs;
2421     for (i = 0; i < ntprs; i++)
2422     {
2423         snew(info->rcoulomb, ntprs);
2424         snew(info->rvdw, ntprs);
2425         snew(info->rlist, ntprs);
2426         snew(info->rlistlong, ntprs);
2427         snew(info->nkx, ntprs);
2428         snew(info->nky, ntprs);
2429         snew(info->nkz, ntprs);
2430         snew(info->fsx, ntprs);
2431         snew(info->fsy, ntprs);
2432         snew(info->fsz, ntprs);
2433     }
2434     /* Make alternative tpr files to test: */
2435     snew(tpr_names, ntprs);
2436     for (i = 0; i < ntprs; i++)
2437     {
2438         snew(tpr_names[i], STRLEN);
2439     }
2440
2441     /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2442      * different grids could be found. */
2443     make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
2444                         cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
2445
2446     /********************************************************************************/
2447     /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
2448     /********************************************************************************/
2449     snew(perfdata, ntprs);
2450     if (bBenchmark)
2451     {
2452         do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
2453                      repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2454                      cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
2455
2456         fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0);
2457
2458         /* Analyse the results and give a suggestion for optimal settings: */
2459         bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
2460                                 repeats, info, &best_tpr, &best_npme);
2461
2462         /* Take the best-performing tpr file and enlarge nsteps to original value */
2463         if (bKeepTPR && !bOverwrite)
2464         {
2465             simulation_tpr = opt2fn("-s", NFILE, fnm);
2466         }
2467         else
2468         {
2469             simulation_tpr = opt2fn("-so", NFILE, fnm);
2470             modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
2471                                info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
2472         }
2473
2474         /* Let's get rid of the temporary benchmark input files */
2475         for (i = 0; i < ntprs; i++)
2476         {
2477             fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2478             remove(tpr_names[i]);
2479         }
2480
2481         /* Now start the real simulation if the user requested it ... */
2482         launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
2483                           cmd_args_launch, simulation_tpr, best_npme);
2484     }
2485     ffclose(fp);
2486
2487     /* ... or simply print the performance results to screen: */
2488     if (!bLaunch)
2489     {
2490         finalize(opt2fn("-p", NFILE, fnm));
2491     }
2492
2493     return 0;
2494 }