2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
51 #ifdef HAVE_SYS_TIME_H
52 # include <sys/time.h>
55 #include "gromacs/commandline/pargs.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/fft/calcgrid.h"
58 #include "gromacs/fileio/checkpoint.h"
59 #include "gromacs/fileio/tpxio.h"
60 #include "gromacs/math/utilities.h"
61 #include "gromacs/math/vec.h"
62 #include "gromacs/mdlib/perf_est.h"
63 #include "gromacs/mdtypes/commrec.h"
64 #include "gromacs/mdtypes/inputrec.h"
65 #include "gromacs/mdtypes/md_enums.h"
66 #include "gromacs/mdtypes/state.h"
67 #include "gromacs/pbcutil/pbc.h"
68 #include "gromacs/taskassignment/usergpuids.h"
69 #include "gromacs/timing/walltime_accounting.h"
70 #include "gromacs/topology/topology.h"
71 #include "gromacs/utility/arraysize.h"
72 #include "gromacs/utility/baseversion.h"
73 #include "gromacs/utility/cstringutil.h"
74 #include "gromacs/utility/fatalerror.h"
75 #include "gromacs/utility/futil.h"
76 #include "gromacs/utility/gmxassert.h"
77 #include "gromacs/utility/path.h"
78 #include "gromacs/utility/smalloc.h"
79 #include "gromacs/utility/stringutil.h"
81 /* Enum for situations that can occur during log file parsing, the
82 * corresponding string entries can be found in do_the_tests() in
83 * const char* ParseLog[] */
84 /* TODO clean up CamelCasing of these enum names */
91 eParselogResetProblem,
95 eParselogLargePrimeFactor,
96 eParselogMismatchOfNumberOfPPRanksAndAvailableGPUs,
105 int nPMEnodes; /* number of PME-only nodes used in this test */
106 int nx, ny, nz; /* DD grid */
107 int guessPME; /* if nPMEnodes == -1, this is the guessed number of PME nodes */
108 double* Gcycles; /* This can contain more than one value if doing multiple tests */
112 float* PME_f_load; /* PME mesh/force load average*/
113 float PME_f_load_Av; /* Average average ;) ... */
114 char* mdrun_cmd_line; /* Mdrun command line used for this test */
120 int nr_inputfiles; /* The number of tpr and mdp input files */
121 int64_t orig_sim_steps; /* Number of steps to be done in the real simulation */
122 int64_t orig_init_step; /* Init step for the real simulation */
123 real* rcoulomb; /* The coulomb radii [0...nr_inputfiles] */
124 real* rvdw; /* The vdW radii */
125 real* rlist; /* Neighbourlist cutoff radius */
126 int * nkx, *nky, *nkz;
127 real * fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
131 static void sep_line(FILE* fp)
133 fprintf(fp, "\n------------------------------------------------------------\n");
137 /* Wrapper for system calls */
138 static int gmx_system_call(char* command)
140 return (system(command));
144 /* Check if string starts with substring */
145 static gmx_bool str_starts(const char* string, const char* substring)
147 return (std::strncmp(string, substring, std::strlen(substring)) == 0);
151 static void cleandata(t_perf* perfdata, int test_nr)
153 perfdata->Gcycles[test_nr] = 0.0;
154 perfdata->ns_per_day[test_nr] = 0.0;
155 perfdata->PME_f_load[test_nr] = 0.0;
159 static void remove_if_exists(const char* fn)
163 fprintf(stdout, "Deleting %s\n", fn);
169 static void finalize(const char* fn_out)
175 fp = fopen(fn_out, "r");
176 fprintf(stdout, "\n\n");
178 while (fgets(buf, STRLEN - 1, fp) != nullptr)
180 fprintf(stdout, "%s", buf);
183 fprintf(stdout, "\n\n");
195 static int parse_logfile(const char* logfile,
204 char line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
205 const char matchstrdd[] = "Domain decomposition grid";
206 const char matchstrcr[] = "resetting all time and cycle counters";
207 const char matchstrbal[] = "Average PME mesh/force load:";
208 const char matchstring[] = "R E A L C Y C L E A N D T I M E A C C O U N T I N G";
209 const char errSIG[] = "signal, stopping at the next";
211 float dum1, dum2, dum3, dum4;
214 int64_t resetsteps = -1;
215 gmx_bool bFoundResetStr = FALSE;
216 gmx_bool bResetChecked = FALSE;
219 if (!gmx_fexist(logfile))
221 fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
222 cleandata(perfdata, test_nr);
223 return eParselogNotFound;
226 fp = fopen(logfile, "r");
227 perfdata->PME_f_load[test_nr] = -1.0;
228 perfdata->guessPME = -1;
230 iFound = eFoundNothing;
233 iFound = eFoundDDStr; /* Skip some case statements */
236 while (fgets(line, STRLEN, fp) != nullptr)
238 /* Remove leading spaces */
241 /* Check for TERM and INT signals from user: */
242 if (std::strstr(line, errSIG) != nullptr)
245 cleandata(perfdata, test_nr);
246 return eParselogTerm;
249 /* Check whether cycle resetting worked */
250 if (presteps > 0 && !bFoundResetStr)
252 if (std::strstr(line, matchstrcr) != nullptr)
254 sprintf(dumstring, "step %s", "%" SCNd64);
255 sscanf(line, dumstring, &resetsteps);
256 bFoundResetStr = TRUE;
257 if (resetsteps == presteps + cpt_steps)
259 bResetChecked = TRUE;
263 sprintf(dumstring, "%" PRId64, resetsteps);
264 sprintf(dumstring2, "%" PRId64, presteps + cpt_steps);
266 "WARNING: Time step counters were reset at step %s,\n"
267 " though they were supposed to be reset at step %s!\n",
274 /* Look for strings that appear in a certain order in the log file: */
278 /* Look for domain decomp grid and separate PME nodes: */
279 if (str_starts(line, matchstrdd))
282 "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
287 if (perfdata->nPMEnodes == -1)
289 perfdata->guessPME = npme;
291 else if (perfdata->nPMEnodes != npme)
294 "PME ranks from command line and output file are not identical");
296 iFound = eFoundDDStr;
298 /* Catch a few errors that might have occurred: */
299 else if (str_starts(line, "There is no domain decomposition for"))
302 return eParselogNoDDGrid;
304 else if (str_starts(line, "The number of ranks you selected"))
307 return eParselogLargePrimeFactor;
309 else if (str_starts(line, "reading tpx file"))
312 return eParselogTPXVersion;
314 else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
317 return eParselogNotParallel;
321 /* Even after the "Domain decomposition grid" string was found,
322 * it could be that mdrun had to quit due to some error. */
323 if (str_starts(line, "Incorrect launch configuration: mismatching number of"))
326 return eParselogMismatchOfNumberOfPPRanksAndAvailableGPUs;
328 else if (str_starts(line, "Some of the requested GPUs do not exist"))
331 return eParselogGpuProblem;
333 /* Look for PME mesh/force balance (not necessarily present, though) */
334 else if (str_starts(line, matchstrbal))
336 sscanf(&line[std::strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
338 /* Look for matchstring */
339 else if (str_starts(line, matchstring))
341 iFound = eFoundAccountingStr;
344 case eFoundAccountingStr:
345 /* Already found matchstring - look for cycle data */
346 if (str_starts(line, "Total "))
348 sscanf(line, "Total %*f %lf", &(perfdata->Gcycles[test_nr]));
349 iFound = eFoundCycleStr;
353 /* Already found cycle data - look for remaining performance info and return */
354 if (str_starts(line, "Performance:"))
356 ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
357 /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
358 perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
360 if (bResetChecked || presteps == 0)
366 return eParselogResetProblem;
373 /* Close the log file */
376 /* Check why there is no performance data in the log file.
377 * Did a fatal errors occur? */
378 if (gmx_fexist(errfile))
380 fp = fopen(errfile, "r");
381 while (fgets(line, STRLEN, fp) != nullptr)
383 if (str_starts(line, "Fatal error:"))
385 if (fgets(line, STRLEN, fp) != nullptr)
388 "\nWARNING: An error occurred during this benchmark:\n"
393 cleandata(perfdata, test_nr);
394 return eParselogFatal;
401 fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
404 /* Giving up ... we could not find out why there is no performance data in
406 fprintf(stdout, "No performance data in log file.\n");
407 cleandata(perfdata, test_nr);
409 return eParselogNoPerfData;
413 static gmx_bool analyze_data(FILE* fp,
421 int* index_tpr, /* OUT: Nr of mdp file with best settings */
422 int* npme_optimal) /* OUT: Optimal number of PME nodes */
425 int line = 0, line_win = -1;
426 int k_win = -1, i_win = -1, winPME;
427 double s = 0.0; /* standard deviation */
430 char str_PME_f_load[13];
431 gmx_bool bCanUseOrigTPR;
432 gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
438 fprintf(fp, "Summary of successful runs:\n");
439 fprintf(fp, "Line tpr PME ranks Gcycles Av. Std.dev. ns/day PME/f");
442 fprintf(fp, " DD grid");
448 for (k = 0; k < ntprs; k++)
450 for (i = 0; i < ntests; i++)
452 /* Select the right dataset: */
453 pd = &(perfdata[k][i]);
455 pd->Gcycles_Av = 0.0;
456 pd->PME_f_load_Av = 0.0;
457 pd->ns_per_day_Av = 0.0;
459 if (pd->nPMEnodes == -1)
461 sprintf(strbuf, "(%3d)", pd->guessPME);
465 sprintf(strbuf, " ");
468 /* Get the average run time of a setting */
469 for (j = 0; j < nrepeats; j++)
471 pd->Gcycles_Av += pd->Gcycles[j];
472 pd->PME_f_load_Av += pd->PME_f_load[j];
474 pd->Gcycles_Av /= nrepeats;
475 pd->PME_f_load_Av /= nrepeats;
477 for (j = 0; j < nrepeats; j++)
479 if (pd->ns_per_day[j] > 0.0)
481 pd->ns_per_day_Av += pd->ns_per_day[j];
485 /* Somehow the performance number was not aquired for this run,
486 * therefor set the average to some negative value: */
487 pd->ns_per_day_Av = -1.0F * nrepeats;
491 pd->ns_per_day_Av /= nrepeats;
494 if (pd->PME_f_load_Av > 0.0)
496 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
500 sprintf(str_PME_f_load, "%s", " - ");
504 /* We assume we had a successful run if both averages are positive */
505 if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
507 /* Output statistics if repeats were done */
510 /* Calculate the standard deviation */
512 for (j = 0; j < nrepeats; j++)
514 s += gmx::square(pd->Gcycles[j] - pd->Gcycles_Av);
520 "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
531 fprintf(fp, " %3d %3d %3d", pd->nx, pd->ny, pd->nz);
535 /* Store the index of the best run found so far in 'winner': */
536 if ((k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av))
549 gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
554 winPME = perfdata[k_win][i_win].nPMEnodes;
558 /* We stuck to a fixed number of PME-only nodes */
559 sprintf(strbuf, "settings No. %d", k_win);
563 /* We have optimized the number of PME-only nodes */
566 sprintf(strbuf, "%s", "the automatic number of PME ranks");
570 sprintf(strbuf, "%d PME ranks", winPME);
573 fprintf(fp, "Best performance was achieved with %s", strbuf);
574 if ((nrepeats > 1) && (ntests > 1))
576 fprintf(fp, " (see line %d)", line_win);
580 /* Only mention settings if they were modified: */
581 bRefinedCoul = !gmx_within_tol(info->rcoulomb[k_win], info->rcoulomb[0], GMX_REAL_EPS);
582 bRefinedVdW = !gmx_within_tol(info->rvdw[k_win], info->rvdw[0], GMX_REAL_EPS);
583 bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] && info->nky[k_win] == info->nky[0]
584 && info->nkz[k_win] == info->nkz[0]);
586 if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
588 fprintf(fp, "Optimized PME settings:\n");
589 bCanUseOrigTPR = FALSE;
593 bCanUseOrigTPR = TRUE;
598 fprintf(fp, " New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
603 fprintf(fp, " New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
609 " New Fourier grid xyz: %d %d %d (was %d %d %d)\n",
618 if (bCanUseOrigTPR && ntprs > 1)
620 fprintf(fp, "and original PME settings.\n");
625 /* Return the index of the mdp file that showed the highest performance
626 * and the optimal number of PME nodes */
628 *npme_optimal = winPME;
630 return bCanUseOrigTPR;
634 /* Get the commands we need to set up the runs from environment variables */
635 static void get_program_paths(gmx_bool bThreads, char* cmd_mpirun[], char* cmd_mdrun[])
638 const char def_mpirun[] = "mpirun";
640 const char empty_mpirun[] = "";
642 /* Get the commands we need to set up the runs from environment variables */
645 if ((cp = getenv("MPIRUN")) != nullptr)
647 *cmd_mpirun = gmx_strdup(cp);
651 *cmd_mpirun = gmx_strdup(def_mpirun);
656 *cmd_mpirun = gmx_strdup(empty_mpirun);
659 if (*cmd_mdrun == nullptr)
661 /* The use of MDRUN is deprecated, but made available in 5.1
662 for backward compatibility. It may be removed in a future
664 if ((cp = getenv("MDRUN")) != nullptr)
666 *cmd_mdrun = gmx_strdup(cp);
670 gmx_fatal(FARGS, "The way to call mdrun must be set in the -mdrun command-line flag.");
675 /* Check that the commands will run mdrun (perhaps via mpirun) by
676 * running a very quick test simulation. Requires MPI environment or
677 * GPU support to be available if applicable. */
678 /* TODO implement feature to parse the log file to get the list of
679 compatible GPUs from mdrun, if the user of gmx tune-pme has not
681 static void check_mdrun_works(gmx_bool bThreads,
682 const char* cmd_mpirun,
684 const char* cmd_mdrun,
685 gmx_bool bNeedGpuSupport)
687 char* command = nullptr;
691 const char filename[] = "benchtest.log";
693 /* This string should always be identical to the one in copyrite.c,
694 * gmx_print_version_info() in the GMX_MPI section */
695 const char match_mpi[] = "MPI library: MPI";
696 const char match_mdrun[] = "Executable: ";
697 const char match_nogpu[] = "GPU support: disabled";
698 gmx_bool bMdrun = FALSE;
699 gmx_bool bMPI = FALSE;
700 gmx_bool bHaveGpuSupport = TRUE;
702 /* Run a small test to see whether mpirun + mdrun work */
703 fprintf(stdout, "Making sure that mdrun can be executed. ");
706 snew(command, std::strlen(cmd_mdrun) + std::strlen(cmd_np) + std::strlen(filename) + 50);
707 sprintf(command, "%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mdrun, cmd_np, filename);
712 std::strlen(cmd_mpirun) + std::strlen(cmd_np) + std::strlen(cmd_mdrun)
713 + std::strlen(filename) + 50);
714 sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun, cmd_np, cmd_mdrun, filename);
716 fprintf(stdout, "Trying '%s' ... ", command);
717 make_backup(filename);
718 gmx_system_call(command);
720 /* Check if we find the characteristic string in the output: */
721 if (!gmx_fexist(filename))
723 gmx_fatal(FARGS, "Output from test run could not be found.");
726 fp = fopen(filename, "r");
727 /* We need to scan the whole output file, since sometimes the queuing system
728 * also writes stuff to stdout/err */
731 cp = fgets(line, STRLEN, fp);
734 if (str_starts(line, match_mdrun))
738 if (str_starts(line, match_mpi))
742 if (str_starts(line, match_nogpu))
744 bHaveGpuSupport = FALSE;
755 "Need a threaded version of mdrun. This one\n"
757 "seems to have been compiled with MPI instead.",
766 "Need an MPI-enabled version of mdrun. This one\n"
768 "seems to have been compiled without MPI support.",
775 gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!", filename);
778 if (bNeedGpuSupport && !bHaveGpuSupport)
780 gmx_fatal(FARGS, "The mdrun executable did not have the expected GPU support.");
783 fprintf(stdout, "passed.\n");
790 /* Handles the no-GPU case by emitting an empty string. */
791 static std::string make_gpu_id_command_line(const char* eligible_gpu_ids)
793 /* If the user has given no eligible GPU IDs, or we're trying the
794 * default behaviour, then there is nothing for tune_pme to give
795 * to mdrun -gpu_id */
796 if (eligible_gpu_ids != nullptr)
798 return gmx::formatString("-gpu_id %s", eligible_gpu_ids);
802 return std::string();
805 static void launch_simulation(gmx_bool bLaunch, /* Should the simulation be launched? */
806 FILE* fp, /* General log file */
807 gmx_bool bThreads, /* whether to use threads */
808 char* cmd_mpirun, /* Command for mpirun */
809 char* cmd_np, /* Switch for -np or -ntmpi or empty */
810 char* cmd_mdrun, /* Command for mdrun */
811 char* args_for_mdrun, /* Arguments for mdrun */
812 const char* simulation_tpr, /* This tpr will be simulated */
813 int nPMEnodes, /* Number of PME ranks to use */
814 const char* eligible_gpu_ids) /* Available GPU IDs for
815 * constructing mdrun command lines */
820 /* Make enough space for the system call command,
821 * (200 extra chars for -npme ... etc. options should suffice): */
823 std::strlen(cmd_mpirun) + std::strlen(cmd_mdrun) + std::strlen(cmd_np)
824 + std::strlen(args_for_mdrun) + std::strlen(simulation_tpr) + 200);
826 auto cmd_gpu_ids = make_gpu_id_command_line(eligible_gpu_ids);
828 /* Note that the -passall options requires args_for_mdrun to be at the end
829 * of the command line string */
833 "%s%s-npme %d -s %s %s %s",
839 cmd_gpu_ids.c_str());
844 "%s%s%s -npme %d -s %s %s %s",
851 cmd_gpu_ids.c_str());
854 fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
858 /* Now the real thing! */
861 fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
864 gmx_system_call(command);
869 static void modify_PMEsettings(int64_t simsteps, /* Set this value as number of time steps */
870 int64_t init_step, /* Set this value as init_step */
871 const char* fn_best_tpr, /* tpr file with the best performance */
872 const char* fn_sim_tpr) /* name of tpr file to be launched */
878 t_inputrec irInstance;
879 t_inputrec* ir = &irInstance;
880 read_tpx_state(fn_best_tpr, ir, &state, &mtop);
882 /* Reset nsteps and init_step to the value of the input .tpr file */
883 ir->nsteps = simsteps;
884 ir->init_step = init_step;
886 /* Write the tpr file which will be launched */
887 sprintf(buf, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr, "%" PRId64);
888 fprintf(stdout, buf, ir->nsteps);
890 write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
893 static gmx_bool can_scale_rvdw(VanDerWaalsType vdwtype)
895 return (VanDerWaalsType::Cut == vdwtype || VanDerWaalsType::Pme == vdwtype);
898 #define EPME_SWITCHED(e) \
899 ((e) == CoulombInteractionType::PmeSwitch || (e) == CoulombInteractionType::PmeUserSwitch)
901 /* Make additional TPR files with more computational load for the
902 * direct space processors: */
903 static void make_benchmark_tprs(const char* fn_sim_tpr, /* READ : User-provided tpr file */
904 char* fn_bench_tprs[], /* WRITE: Names of benchmark tpr files */
905 int64_t benchsteps, /* Number of time steps for benchmark runs */
906 int64_t statesteps, /* Step counter in checkpoint file */
907 real rmin, /* Minimal Coulomb radius */
908 real rmax, /* Maximal Coulomb radius */
909 bool bScaleRvdw, /* Scale rvdw along with rcoulomb */
910 const int* ntprs, /* No. of TPRs to write, each with a different
911 rcoulomb and fourierspacing */
912 t_inputinfo* info, /* Contains information about mdp file options */
913 FILE* fp) /* Write the output here */
918 real nlist_buffer; /* Thickness of the buffer regions for PME-switch potentials */
921 gmx_bool bNote = FALSE;
922 real add; /* Add this to rcoul for the next test */
923 real fac = 1.0; /* Scaling factor for Coulomb radius */
924 real fourierspacing; /* Basic fourierspacing from tpr */
928 "Making benchmark tpr file%s with %s time step%s",
929 *ntprs > 1 ? "s" : "",
931 benchsteps > 1 ? "s" : "");
932 fprintf(stdout, buf, benchsteps);
935 sprintf(buf, " (adding %s steps from checkpoint file)", "%" PRId64);
936 fprintf(stdout, buf, statesteps);
937 benchsteps += statesteps;
939 fprintf(stdout, ".\n");
941 t_inputrec irInstance;
942 t_inputrec* ir = &irInstance;
943 read_tpx_state(fn_sim_tpr, ir, &state, &mtop);
945 /* Check if some kind of PME was chosen */
946 if (EEL_PME(ir->coulombtype) == FALSE)
949 "Can only do optimizations for simulations with %s electrostatics.",
950 enumValueToString(CoulombInteractionType::Pme));
953 /* Check if rcoulomb == rlist, which is necessary for plain PME. */
954 if ((ir->cutoff_scheme != CutoffScheme::Verlet)
955 && (CoulombInteractionType::Pme == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
958 "%s requires rcoulomb (%f) to be equal to rlist (%f).",
959 enumValueToString(CoulombInteractionType::Pme),
963 /* For other PME types, rcoulomb is allowed to be smaller than rlist */
964 else if (ir->rcoulomb > ir->rlist)
967 "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
968 enumValueToString(ir->coulombtype),
973 if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
975 fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
979 /* Reduce the number of steps for the benchmarks */
980 info->orig_sim_steps = ir->nsteps;
981 ir->nsteps = benchsteps;
982 /* We must not use init_step from the input tpr file for the benchmarks */
983 info->orig_init_step = ir->init_step;
986 /* For PME-switch potentials, keep the radial distance of the buffer region */
987 nlist_buffer = ir->rlist - ir->rcoulomb;
989 /* Determine length of triclinic box vectors */
990 for (d = 0; d < DIM; d++)
993 for (i = 0; i < DIM; i++)
995 box_size[d] += state.box[d][i] * state.box[d][i];
997 box_size[d] = std::sqrt(box_size[d]);
1000 if (ir->fourier_spacing > 0)
1002 info->fsx[0] = ir->fourier_spacing;
1003 info->fsy[0] = ir->fourier_spacing;
1004 info->fsz[0] = ir->fourier_spacing;
1008 /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
1009 info->fsx[0] = box_size[XX] / ir->nkx;
1010 info->fsy[0] = box_size[YY] / ir->nky;
1011 info->fsz[0] = box_size[ZZ] / ir->nkz;
1014 /* If no value for the fourierspacing was provided on the command line, we
1015 * use the reconstruction from the tpr file */
1016 if (ir->fourier_spacing > 0)
1018 /* Use the spacing from the tpr */
1019 fourierspacing = ir->fourier_spacing;
1023 /* Use the maximum observed spacing */
1024 fourierspacing = std::max(std::max(info->fsx[0], info->fsy[0]), info->fsz[0]);
1027 fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
1029 /* For performance comparisons the number of particles is useful to have */
1030 fprintf(fp, " Number of particles : %d\n", mtop.natoms);
1032 /* Print information about settings of which some are potentially modified: */
1033 fprintf(fp, " Coulomb type : %s\n", enumValueToString(ir->coulombtype));
1035 " Grid spacing x y z : %f %f %f\n",
1036 box_size[XX] / ir->nkx,
1037 box_size[YY] / ir->nky,
1038 box_size[ZZ] / ir->nkz);
1039 fprintf(fp, " Van der Waals type : %s\n", enumValueToString(ir->vdwtype));
1040 if (ir_vdw_switched(ir))
1042 fprintf(fp, " rvdw_switch : %f nm\n", ir->rvdw_switch);
1044 if (EPME_SWITCHED(ir->coulombtype))
1046 fprintf(fp, " rlist : %f nm\n", ir->rlist);
1049 /* Print a descriptive line about the tpr settings tested */
1050 fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
1051 fprintf(fp, " No. scaling rcoulomb");
1052 fprintf(fp, " nkx nky nkz");
1053 fprintf(fp, " spacing");
1054 if (can_scale_rvdw(ir->vdwtype))
1056 fprintf(fp, " rvdw");
1058 if (EPME_SWITCHED(ir->coulombtype))
1060 fprintf(fp, " rlist");
1062 fprintf(fp, " tpr file\n");
1064 /* Loop to create the requested number of tpr input files */
1065 for (j = 0; j < *ntprs; j++)
1067 /* The first .tpr is the provided one, just need to modify nsteps,
1068 * so skip the following block */
1071 /* Determine which Coulomb radii rc to use in the benchmarks */
1072 add = (rmax - rmin) / (*ntprs - 1);
1073 if (gmx_within_tol(rmin, info->rcoulomb[0], GMX_REAL_EPS))
1075 ir->rcoulomb = rmin + j * add;
1077 else if (gmx_within_tol(rmax, info->rcoulomb[0], GMX_REAL_EPS))
1079 ir->rcoulomb = rmin + (j - 1) * add;
1083 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
1084 add = (rmax - rmin) / (*ntprs - 2);
1085 ir->rcoulomb = rmin + (j - 1) * add;
1088 /* Determine the scaling factor fac */
1089 fac = ir->rcoulomb / info->rcoulomb[0];
1091 /* Scale the Fourier grid spacing */
1092 ir->nkx = ir->nky = ir->nkz = 0;
1093 calcFftGrid(nullptr,
1095 fourierspacing * fac,
1096 minimalPmeGridSize(ir->pme_order),
1101 /* Adjust other radii since various conditions need to be fulfilled */
1102 if (CoulombInteractionType::Pme == ir->coulombtype)
1104 /* plain PME, rcoulomb must be equal to rlist TODO only in the group scheme? */
1105 ir->rlist = ir->rcoulomb;
1109 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1110 ir->rlist = ir->rcoulomb + nlist_buffer;
1113 if (bScaleRvdw && can_scale_rvdw(ir->vdwtype))
1115 if (CutoffScheme::Verlet == ir->cutoff_scheme || VanDerWaalsType::Pme == ir->vdwtype)
1117 /* With either the Verlet cutoff-scheme or LJ-PME,
1118 the van der Waals radius must always equal the
1120 ir->rvdw = ir->rcoulomb;
1124 /* For vdw cutoff, rvdw >= rlist */
1125 ir->rvdw = std::max(info->rvdw[0], ir->rlist);
1128 } /* end of "if (j != 0)" */
1130 /* for j==0: Save the original settings
1131 * for j >0: Save modified radii and Fourier grids */
1132 info->rcoulomb[j] = ir->rcoulomb;
1133 info->rvdw[j] = ir->rvdw;
1134 info->nkx[j] = ir->nkx;
1135 info->nky[j] = ir->nky;
1136 info->nkz[j] = ir->nkz;
1137 info->rlist[j] = ir->rlist;
1138 info->fsx[j] = fac * fourierspacing;
1139 info->fsy[j] = fac * fourierspacing;
1140 info->fsz[j] = fac * fourierspacing;
1142 /* Write the benchmark tpr file */
1143 fn_bench_tprs[j] = gmx_strdup(
1144 gmx::Path::concatenateBeforeExtension(fn_sim_tpr, gmx::formatString("_bench%.2d", j))
1147 fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
1148 fprintf(stdout, "%" PRId64, ir->nsteps);
1151 fprintf(stdout, ", scaling factor %f\n", fac);
1155 fprintf(stdout, ", unmodified settings\n");
1158 write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
1160 /* Write information about modified tpr settings to log file */
1161 fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
1162 fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
1163 fprintf(fp, " %9f ", info->fsx[j]);
1164 if (can_scale_rvdw(ir->vdwtype))
1166 fprintf(fp, "%10f", ir->rvdw);
1168 if (EPME_SWITCHED(ir->coulombtype))
1170 fprintf(fp, "%10f", ir->rlist);
1172 fprintf(fp, " %-14s\n", fn_bench_tprs[j]);
1174 /* Make it clear to the user that some additional settings were modified */
1175 if (!gmx_within_tol(ir->rvdw, info->rvdw[0], GMX_REAL_EPS)
1176 || !gmx_within_tol(ir->rlist, info->rlist[0], GMX_REAL_EPS))
1184 "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1185 "other input settings were also changed (see table above).\n"
1186 "Please check if the modified settings are appropriate.\n");
1193 /* Rename the files we want to keep to some meaningful filename and
1194 * delete the rest */
1195 static void cleanup(const t_filenm* fnm, int nfile, int k, int nnodes, int nPMEnodes, int nr, gmx_bool bKeepStderr)
1197 char numstring[STRLEN];
1198 const char* fn = nullptr;
1203 fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
1205 for (i = 0; i < nfile; i++)
1207 opt = const_cast<char*>(fnm[i].opt);
1208 if (std::strcmp(opt, "-p") == 0)
1210 /* do nothing; keep this file */
1212 else if (std::strcmp(opt, "-bg") == 0)
1214 /* Give the log file a nice name so one can later see which parameters were used */
1215 numstring[0] = '\0';
1218 sprintf(numstring, "_%d", nr);
1220 std::string newfilename = gmx::formatString(
1221 "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile, fnm), k, nnodes, nPMEnodes, numstring);
1222 if (gmx_fexist(opt2fn("-bg", nfile, fnm)))
1224 fprintf(stdout, "renaming log file to %s\n", newfilename.c_str());
1225 make_backup(newfilename);
1226 rename(opt2fn("-bg", nfile, fnm), newfilename.c_str());
1229 else if (std::strcmp(opt, "-err") == 0)
1231 /* This file contains the output of stderr. We want to keep it in
1232 * cases where there have been problems. */
1233 fn = opt2fn(opt, nfile, fnm);
1234 numstring[0] = '\0';
1237 sprintf(numstring, "_%d", nr);
1239 std::string newfilename =
1240 gmx::formatString("%s_no%d_np%d_npme%d%s", fn, k, nnodes, nPMEnodes, numstring);
1245 fprintf(stdout, "Saving stderr output in %s\n", newfilename.c_str());
1246 make_backup(newfilename);
1247 rename(fn, newfilename.c_str());
1251 fprintf(stdout, "Deleting %s\n", fn);
1256 /* Delete the files which are created for each benchmark run: (options -b*) */
1257 else if ((0 == std::strncmp(opt, "-b", 2)) && (opt2bSet(opt, nfile, fnm) || !is_optional(&fnm[i])))
1259 remove_if_exists(opt2fn(opt, nfile, fnm));
1274 /* Create a list of numbers of PME nodes to test */
1275 static void make_npme_list(const char* npmevalues_opt, /* Make a complete list with all
1276 * possibilities or a short list that keeps
1277 * only reasonable numbers of PME nodes */
1278 int* nentries, /* Number of entries we put in the nPMEnodes list */
1279 int* nPMEnodes[], /* Each entry contains the value for -npme */
1280 int nnodes, /* Total number of nodes to do the tests on */
1281 int minPMEnodes, /* Minimum number of PME nodes */
1282 int maxPMEnodes) /* Maximum number of PME nodes */
1285 int min_factor = 1; /* We request that npp and npme have this minimal
1286 * largest common factor (depends on npp) */
1287 int nlistmax; /* Max. list size */
1288 int nlist; /* Actual number of entries in list */
1292 /* Do we need to check all possible values for -npme or is a reduced list enough? */
1293 if (!std::strcmp(npmevalues_opt, "all"))
1297 else if (!std::strcmp(npmevalues_opt, "subset"))
1299 eNPME = eNpmeSubset;
1301 else /* "auto" or "range" */
1307 else if (nnodes < 128)
1309 eNPME = eNpmeReduced;
1313 eNPME = eNpmeSubset;
1317 /* Calculate how many entries we could possibly have (in case of -npme all) */
1320 nlistmax = maxPMEnodes - minPMEnodes + 3;
1321 if (0 == minPMEnodes)
1331 /* Now make the actual list which is at most of size nlist */
1332 snew(*nPMEnodes, nlistmax);
1333 nlist = 0; /* start counting again, now the real entries in the list */
1334 for (i = 0; i < nlistmax - 2; i++)
1336 npme = maxPMEnodes - i;
1337 npp = nnodes - npme;
1340 case eNpmeAll: min_factor = 1; break;
1341 case eNpmeReduced: min_factor = 2; break;
1343 /* For 2d PME we want a common largest factor of at least the cube
1344 * root of the number of PP nodes */
1345 min_factor = static_cast<int>(std::cbrt(npp));
1347 default: gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
1349 if (std::gcd(npp, npme) >= min_factor)
1351 (*nPMEnodes)[nlist] = npme;
1355 /* We always test 0 PME nodes and the automatic number */
1356 *nentries = nlist + 2;
1357 (*nPMEnodes)[nlist] = 0;
1358 (*nPMEnodes)[nlist + 1] = -1;
1360 fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
1361 for (i = 0; i < *nentries - 1; i++)
1363 fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
1365 fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries - 1]);
1369 /* Allocate memory to store the performance data */
1370 static void init_perfdata(t_perf* perfdata[], int ntprs, int datasets, int repeats)
1375 for (k = 0; k < ntprs; k++)
1377 snew(perfdata[k], datasets);
1378 for (i = 0; i < datasets; i++)
1380 for (j = 0; j < repeats; j++)
1382 snew(perfdata[k][i].Gcycles, repeats);
1383 snew(perfdata[k][i].ns_per_day, repeats);
1384 snew(perfdata[k][i].PME_f_load, repeats);
1391 /* Check for errors on mdrun -h */
1392 static void make_sure_it_runs(char* mdrun_cmd_line, int length, FILE* fp, const t_filenm* fnm, int nfile)
1394 char *command, *msg;
1397 snew(command, length + 15);
1398 snew(msg, length + 500);
1400 fprintf(stdout, "Making sure the benchmarks can be executed by running just 1 step...\n");
1401 sprintf(command, "%s -nsteps 1 -quiet", mdrun_cmd_line);
1402 fprintf(stdout, "Executing '%s' ...\n", command);
1403 ret = gmx_system_call(command);
1407 /* To prevent confusion, do not again issue a gmx_fatal here since we already
1408 * get the error message from mdrun itself */
1410 "Cannot run the first benchmark simulation! Please check the error message of\n"
1411 "mdrun for the source of the problem. Did you provide a command line\n"
1412 "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
1413 "sure your command line should work, you can bypass this check with \n"
1414 "gmx tune_pme -nocheck. The failing command was:\n"
1418 fprintf(stderr, "%s", msg);
1420 fprintf(fp, "%s", msg);
1424 fprintf(stdout, "Benchmarks can be executed!\n");
1426 /* Clean up the benchmark output files we just created */
1427 fprintf(stdout, "Cleaning up ...\n");
1428 remove_if_exists(opt2fn("-bc", nfile, fnm));
1429 remove_if_exists(opt2fn("-be", nfile, fnm));
1430 remove_if_exists(opt2fn("-bcpo", nfile, fnm));
1431 remove_if_exists(opt2fn("-bg", nfile, fnm));
1432 remove_if_exists(opt2fn("-bo", nfile, fnm));
1433 remove_if_exists(opt2fn("-bx", nfile, fnm));
1439 static void do_the_tests(FILE* fp, /* General tune_pme output file */
1440 char** tpr_names, /* Filenames of the input files to test */
1441 int maxPMEnodes, /* Max fraction of nodes to use for PME */
1442 int minPMEnodes, /* Min fraction of nodes to use for PME */
1443 int npme_fixed, /* If >= -1, test fixed number of PME
1445 const char* npmevalues_opt, /* Which -npme values should be tested */
1446 t_perf** perfdata, /* Here the performace data is stored */
1447 int* pmeentries, /* Entries in the nPMEnodes list */
1448 int repeats, /* Repeat each test this often */
1449 int nnodes, /* Total number of nodes = nPP + nPME */
1450 int nr_tprs, /* Total number of tpr files to test */
1451 gmx_bool bThreads, /* Threads or MPI? */
1452 char* cmd_mpirun, /* mpirun command string */
1453 char* cmd_np, /* "-np", "-n", whatever mpirun needs */
1454 char* cmd_mdrun, /* mdrun command string */
1455 char* cmd_args_bench, /* arguments for mdrun in a string */
1456 const t_filenm* fnm, /* List of filenames from command line */
1457 int nfile, /* Number of files specified on the cmdl. */
1458 int presteps, /* DLB equilibration steps, is checked */
1459 int64_t cpt_steps, /* Time step counter in the checkpoint */
1460 gmx_bool bCheck, /* Check whether benchmark mdrun works */
1461 const char* eligible_gpu_ids) /* GPU IDs for
1462 * constructing mdrun command lines */
1464 int i, nr, k, ret, count = 0, totaltests;
1465 int* nPMEnodes = nullptr;
1466 t_perf* pd = nullptr;
1468 char * command, *cmd_stub;
1470 gmx_bool bResetProblem = FALSE;
1471 gmx_bool bFirst = TRUE;
1473 /* This string array corresponds to the eParselog enum type at the start
1475 const char* ParseLog[] = { "OK.",
1476 "Logfile not found!",
1477 "No timings, logfile truncated?",
1478 "Run was terminated.",
1479 "Counters were not reset properly.",
1480 "No DD grid found for these settings.",
1481 "TPX version conflict!",
1482 "mdrun was not started in parallel!",
1483 "Number of PP ranks has a prime factor that is too large.",
1484 "The number of PP ranks did not suit the number of GPUs.",
1485 "Some GPUs were not detected or are incompatible.",
1486 "An error occurred." };
1487 char str_PME_f_load[13];
1490 /* Allocate space for the mdrun command line. 100 extra characters should
1491 be more than enough for the -npme etcetera arguments */
1492 cmdline_length = std::strlen(cmd_mpirun) + std::strlen(cmd_np) + std::strlen(cmd_mdrun)
1493 + std::strlen(cmd_args_bench) + std::strlen(tpr_names[0]) + 100;
1494 snew(command, cmdline_length);
1495 snew(cmd_stub, cmdline_length);
1497 /* Construct the part of the command line that stays the same for all tests: */
1500 sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
1504 sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
1507 /* Create a list of numbers of PME nodes to test */
1508 if (npme_fixed < -1)
1510 make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes, nnodes, minPMEnodes, maxPMEnodes);
1516 nPMEnodes[0] = npme_fixed;
1517 fprintf(stderr, "Will use a fixed number of %d PME-only ranks.\n", nPMEnodes[0]);
1522 fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1524 finalize(opt2fn("-p", nfile, fnm));
1528 /* Allocate one dataset for each tpr input file: */
1529 init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
1531 /*****************************************/
1532 /* Main loop over all tpr files to test: */
1533 /*****************************************/
1534 totaltests = nr_tprs * (*pmeentries) * repeats;
1535 for (k = 0; k < nr_tprs; k++)
1537 fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
1538 fprintf(fp, "PME ranks Gcycles ns/day PME/f Remark\n");
1539 /* Loop over various numbers of PME nodes: */
1540 for (i = 0; i < *pmeentries; i++)
1542 pd = &perfdata[k][i];
1544 auto cmd_gpu_ids = make_gpu_id_command_line(eligible_gpu_ids);
1546 /* Loop over the repeats for each scenario: */
1547 for (nr = 0; nr < repeats; nr++)
1549 pd->nPMEnodes = nPMEnodes[i];
1551 /* Add -npme and -s to the command line and save it. Note that
1552 * the -passall (if set) options requires cmd_args_bench to be
1553 * at the end of the command line string */
1554 snew(pd->mdrun_cmd_line, cmdline_length);
1555 sprintf(pd->mdrun_cmd_line,
1556 "%s-npme %d -s %s %s %s",
1561 cmd_gpu_ids.c_str());
1563 /* To prevent that all benchmarks fail due to a show-stopper argument
1564 * on the mdrun command line, we make a quick check first.
1565 * This check can be turned off in cases where the automatically chosen
1566 * number of PME-only ranks leads to a number of PP ranks for which no
1567 * decomposition can be found (e.g. for large prime numbers) */
1568 if (bFirst && bCheck)
1570 /* TODO This check is really for a functional
1571 * .tpr, and if we need it, it should take place
1572 * for every .tpr, and the logic for it should be
1573 * immediately inside the loop over k, not in
1574 * this inner loop. */
1575 char* temporary_cmd_line;
1577 snew(temporary_cmd_line, cmdline_length);
1578 /* TODO -npme 0 is more likely to succeed at low
1579 parallelism than the default of -npme -1, but
1580 is more likely to fail at the scaling limit
1581 when the PP domains may be too small. "mpirun
1582 -np 1 mdrun" is probably a reasonable thing to
1583 do for this check, but it'll be easier to
1584 implement that after some refactoring of how
1585 the number of MPI ranks is managed. */
1586 sprintf(temporary_cmd_line, "%s-npme 0 -nb cpu -s %s %s", cmd_stub, tpr_names[k], cmd_args_bench);
1587 make_sure_it_runs(temporary_cmd_line, cmdline_length, fp, fnm, nfile);
1591 /* Do a benchmark simulation: */
1594 sprintf(buf, ", pass %d/%d", nr + 1, repeats);
1601 "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1602 (100.0 * count) / totaltests,
1608 make_backup(opt2fn("-err", nfile, fnm));
1609 sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
1610 fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
1611 gmx_system_call(command);
1613 /* Collect the performance data from the log file; also check stderr
1614 * for fatal errors */
1615 ret = parse_logfile(
1616 opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm), pd, nr, presteps, cpt_steps, nnodes);
1617 if ((presteps > 0) && (ret == eParselogResetProblem))
1619 bResetProblem = TRUE;
1622 if (-1 == pd->nPMEnodes)
1624 sprintf(buf, "(%3d)", pd->guessPME);
1632 if (pd->PME_f_load[nr] > 0.0)
1634 sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
1638 sprintf(str_PME_f_load, "%s", " - ");
1641 /* Write the data we got to disk */
1643 "%4d%s %12.3f %12.3f %s %s",
1650 if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound))
1652 fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
1658 /* Do some cleaning up and delete the files we do not need any more */
1659 cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
1661 /* If the first run with this number of processors already failed, do not try again: */
1662 if (pd->Gcycles[0] <= 0.0 && repeats > 1)
1665 "Skipping remaining passes of unsuccessful setting, see log file for "
1667 count += repeats - (nr + 1);
1670 } /* end of repeats loop */
1671 } /* end of -npme loop */
1672 } /* end of tpr file loop */
1677 fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
1685 static void check_input(int nnodes,
1691 real maxPMEfraction,
1692 real minPMEfraction,
1694 int64_t bench_nsteps,
1695 const t_filenm* fnm,
1705 /* Make sure the input file exists */
1706 if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
1708 gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
1711 /* Make sure that the checkpoint file is not overwritten during benchmarking */
1712 if ((0 == std::strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm))) && (sim_part > 1))
1715 "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not "
1717 "The checkpoint input file must not be overwritten during the benchmarks.\n");
1720 /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1723 gmx_fatal(FARGS, "Number of repeats < 0!");
1726 /* Check number of nodes */
1729 gmx_fatal(FARGS, "Number of ranks/threads must be a positive integer.");
1732 /* Automatically choose -ntpr if not set */
1742 /* Set a reasonable scaling factor for rcoulomb */
1745 *rmax = rcoulomb * 1.2;
1748 fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
1755 "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1759 /* Make shure that rmin <= rcoulomb <= rmax */
1768 if (!(*rmin <= *rmax))
1771 "Please choose the Coulomb radii such that rmin <= rmax.\n"
1772 "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n",
1777 /* Add test scenarios if rmin or rmax were set */
1780 if (!gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1))
1783 fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n", *rmin, *ntprs);
1785 if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && (*ntprs == 1))
1788 fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n", *rmax, *ntprs);
1792 /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1793 if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) || !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS))
1795 *ntprs = std::max(*ntprs, 2);
1798 /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1799 if (!gmx_within_tol(*rmax, rcoulomb, GMX_REAL_EPS) && !gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS))
1801 *ntprs = std::max(*ntprs, 3);
1806 fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
1811 if (gmx_within_tol(*rmin, rcoulomb, GMX_REAL_EPS)
1812 && gmx_within_tol(rcoulomb, *rmax, GMX_REAL_EPS)) /* We have just a single rc */
1815 "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1816 "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1817 "with correspondingly adjusted PME grid settings\n");
1822 /* Check whether max and min fraction are within required values */
1823 if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
1825 gmx_fatal(FARGS, "-max must be between 0 and 0.5");
1827 if (minPMEfraction > 0.5 || minPMEfraction < 0)
1829 gmx_fatal(FARGS, "-min must be between 0 and 0.5");
1831 if (maxPMEfraction < minPMEfraction)
1833 gmx_fatal(FARGS, "-max must be larger or equal to -min");
1836 /* Check whether the number of steps - if it was set - has a reasonable value */
1837 if (bench_nsteps < 0)
1839 gmx_fatal(FARGS, "Number of steps must be positive.");
1842 if (bench_nsteps > 10000 || bench_nsteps < 100)
1844 fprintf(stderr, "WARNING: steps=");
1845 fprintf(stderr, "%" PRId64, bench_nsteps);
1847 ". Are you sure you want to perform so %s steps for each benchmark?\n",
1848 (bench_nsteps < 100) ? "few" : "many");
1853 gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
1856 /* Check for rcoulomb scaling if more than one .tpr file is tested */
1859 if (*rmin / rcoulomb < 0.75 || *rmax / rcoulomb > 1.25)
1862 "WARNING: Applying extreme scaling factor. I hope you know what you are "
1867 /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1868 * only. We need to check whether the requested number of PME-only nodes
1870 if (npme_fixed > -1)
1872 /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1873 if (2 * npme_fixed > nnodes)
1876 "Cannot have more than %d PME-only ranks for a total of %d ranks (you chose "
1882 if ((npme_fixed > 0) && (5 * npme_fixed < nnodes))
1885 "WARNING: Only %g percent of the ranks are assigned as PME-only ranks.\n",
1886 (100.0 * npme_fixed) / nnodes);
1888 if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
1891 "NOTE: The -min, -max, and -npme options have no effect when a\n"
1892 " fixed number of PME-only ranks is requested with -fix.\n");
1898 /* Returns TRUE when "opt" is needed at launch time */
1899 static gmx_bool is_launch_file(char* opt, gmx_bool bSet)
1901 if (0 == std::strncmp(opt, "-swap", 5))
1906 /* Apart from the input .tpr and the output log files we need all options that
1907 * were set on the command line and that do not start with -b */
1908 if (0 == std::strncmp(opt, "-b", 2) || 0 == std::strncmp(opt, "-s", 2)
1909 || 0 == std::strncmp(opt, "-err", 4) || 0 == std::strncmp(opt, "-p", 2))
1918 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1919 static gmx_bool is_bench_file(char* opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
1921 /* Apart from the input .tpr, all files starting with "-b" are for
1922 * _b_enchmark files exclusively */
1923 if (0 == std::strncmp(opt, "-s", 2))
1928 if (0 == std::strncmp(opt, "-b", 2) || 0 == std::strncmp(opt, "-s", 2))
1930 return !bOptional || bSet;
1940 return bSet; /* These are additional input files like -cpi -ei */
1946 /* Adds 'buf' to 'str' */
1947 static void add_to_string(char** str, const char* buf)
1952 len = std::strlen(*str) + std::strlen(buf) + 1;
1954 std::strcat(*str, buf);
1958 /* Create the command line for the benchmark as well as for the real run */
1960 create_command_line_snippets(gmx_bool bAppendFiles,
1961 gmx_bool bKeepAndNumCPT,
1962 gmx_bool bResetHWay,
1966 char* cmd_args_bench[], /* command line arguments for benchmark runs */
1967 char* cmd_args_launch[], /* command line arguments for simulation run */
1968 char extra_args[], /* Add this to the end of the command line */
1969 char* deffnm) /* Default file names, or NULL if not set */
1974 char strbuf[STRLEN];
1977 /* strlen needs at least '\0' as a string: */
1978 snew(*cmd_args_bench, 1);
1979 snew(*cmd_args_launch, 1);
1980 *cmd_args_launch[0] = '\0';
1981 *cmd_args_bench[0] = '\0';
1984 /*******************************************/
1985 /* 1. Process other command line arguments */
1986 /*******************************************/
1989 /* Add equilibration steps to benchmark options */
1990 sprintf(strbuf, "-resetstep %d ", presteps);
1991 add_to_string(cmd_args_bench, strbuf);
1993 /* These switches take effect only at launch time */
1996 sprintf(strbuf, "-deffnm %s ", deffnm);
1997 add_to_string(cmd_args_launch, strbuf);
2001 add_to_string(cmd_args_launch, "-noappend ");
2005 add_to_string(cmd_args_launch, "-cpnum ");
2009 add_to_string(cmd_args_launch, "-resethway ");
2012 /********************/
2013 /* 2. Process files */
2014 /********************/
2015 for (i = 0; i < nfile; i++)
2017 opt = const_cast<char*>(fnm[i].opt);
2018 name = opt2fn(opt, nfile, fnm);
2020 /* Strbuf contains the options, now let's sort out where we need that */
2021 sprintf(strbuf, "%s %s ", opt, name);
2023 if (is_bench_file(opt, opt2bSet(opt, nfile, fnm), is_optional(&fnm[i]), is_output(&fnm[i])))
2025 /* All options starting with -b* need the 'b' removed,
2026 * therefore overwrite strbuf */
2027 if (0 == std::strncmp(opt, "-b", 2))
2029 sprintf(strbuf, "-%s %s ", &opt[2], name);
2032 add_to_string(cmd_args_bench, strbuf);
2035 if (is_launch_file(opt, opt2bSet(opt, nfile, fnm)))
2037 add_to_string(cmd_args_launch, strbuf);
2041 add_to_string(cmd_args_bench, extra_args);
2042 add_to_string(cmd_args_launch, extra_args);
2046 /* Set option opt */
2047 static void setopt(const char* opt, int nfile, t_filenm fnm[])
2051 for (i = 0; (i < nfile); i++)
2053 if (std::strcmp(opt, fnm[i].opt) == 0)
2055 fnm[i].flag |= ffSET;
2061 /* This routine inspects the tpr file and ...
2062 * 1. checks for output files that get triggered by a tpr option. These output
2063 * files are marked as 'set' to allow for a proper cleanup after each
2065 * 2. returns the PME:PP load ratio
2066 * 3. returns rcoulomb from the tpr */
2067 static float inspect_tpr(int nfile, t_filenm fnm[], real* rcoulomb)
2069 gmx_bool bTpi; /* Is test particle insertion requested? */
2070 gmx_bool bFree; /* Is a free energy simulation requested? */
2071 gmx_bool bNM; /* Is a normal mode analysis requested? */
2072 gmx_bool bSwap; /* Is water/ion position swapping requested? */
2077 /* Check tpr file for options that trigger extra output files */
2078 t_inputrec irInstance;
2079 t_inputrec* ir = &irInstance;
2080 read_tpx_state(opt2fn("-s", nfile, fnm), ir, &state, &mtop);
2081 bFree = (FreeEnergyPerturbationType::No != ir->efep);
2082 bNM = (IntegrationAlgorithm::NM == ir->eI);
2083 bSwap = (SwapType::No != ir->eSwapCoords);
2084 bTpi = EI_TPI(ir->eI);
2086 /* Set these output files on the tuning command-line */
2089 setopt("-pf", nfile, fnm);
2090 setopt("-px", nfile, fnm);
2094 setopt("-dhdl", nfile, fnm);
2098 setopt("-tpi", nfile, fnm);
2099 setopt("-tpid", nfile, fnm);
2103 setopt("-mtx", nfile, fnm);
2107 setopt("-swap", nfile, fnm);
2110 *rcoulomb = ir->rcoulomb;
2112 /* Return the estimate for the number of PME nodes */
2113 float npme = pme_load_estimate(mtop, *ir, state.box);
2118 static void couple_files_options(int nfile, t_filenm fnm[])
2121 gmx_bool bSet, bBench;
2126 for (i = 0; i < nfile; i++)
2128 opt = const_cast<char*>(fnm[i].opt);
2129 bSet = ((fnm[i].flag & ffSET) != 0);
2130 bBench = (0 == std::strncmp(opt, "-b", 2));
2132 /* Check optional files */
2133 /* If e.g. -eo is set, then -beo also needs to be set */
2134 if (is_optional(&fnm[i]) && bSet && !bBench)
2136 sprintf(buf, "-b%s", &opt[1]);
2137 setopt(buf, nfile, fnm);
2139 /* If -beo is set, then -eo also needs to be! */
2140 if (is_optional(&fnm[i]) && bSet && bBench)
2142 sprintf(buf, "-%s", &opt[2]);
2143 setopt(buf, nfile, fnm);
2149 #define BENCHSTEPS (1000)
2151 int gmx_tune_pme(int argc, char* argv[])
2153 const char* desc[] = {
2154 "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of ranks, [THISMODULE] systematically",
2155 "times [gmx-mdrun] with various numbers of PME-only ranks and determines",
2156 "which setting is fastest. It will also test whether performance can",
2157 "be enhanced by shifting load from the reciprocal to the real space",
2158 "part of the Ewald sum. ",
2159 "Simply pass your [REF].tpr[ref] file to [THISMODULE] together with other options",
2160 "for [gmx-mdrun] as needed.[PAR]",
2161 "[THISMODULE] needs to call [gmx-mdrun] and so requires that you",
2162 "specify how to call mdrun with the argument to the [TT]-mdrun[tt]",
2163 "parameter. Depending how you have built GROMACS, values such as",
2164 "'gmx mdrun', 'gmx_d mdrun', or 'gmx_mpi mdrun' might be needed.[PAR]",
2165 "The program that runs MPI programs can be set in the environment variable",
2166 "MPIRUN (defaults to 'mpirun'). Note that for certain MPI frameworks,",
2167 "you need to provide a machine- or hostfile. This can also be passed",
2168 "via the MPIRUN variable, e.g.[PAR]",
2169 "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt]",
2170 "Note that in such cases it is normally necessary to compile",
2171 "and/or run [THISMODULE] without MPI support, so that it can call",
2172 "the MPIRUN program.[PAR]",
2173 "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
2174 "check whether [gmx-mdrun] works as expected with the provided parallel settings",
2175 "if the [TT]-check[tt] option is activated (the default).",
2176 "Please call [THISMODULE] with the normal options you would pass to",
2177 "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
2178 "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2179 "to repeat each test several times to get better statistics. [PAR]",
2180 "[THISMODULE] can test various real space / reciprocal space workloads",
2181 "for you. With [TT]-ntpr[tt] you control how many extra [REF].tpr[ref] files will be",
2182 "written with enlarged cutoffs and smaller Fourier grids respectively.",
2183 "Typically, the first test (number 0) will be with the settings from the input",
2184 "[REF].tpr[ref] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2185 "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
2186 "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2187 "The remaining [REF].tpr[ref] files will have equally-spaced Coulomb radii (and Fourier ",
2188 "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2189 "if you just seek the optimal number of PME-only ranks; in that case",
2190 "your input [REF].tpr[ref] file will remain unchanged.[PAR]",
2191 "For the benchmark runs, the default of 1000 time steps should suffice for most",
2192 "MD systems. The dynamic load balancing needs about 100 time steps",
2193 "to adapt to local load imbalances, therefore the time step counters",
2194 "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2195 "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher ",
2197 "From the 'DD' load imbalance entries in the md.log output file you",
2198 "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]",
2199 "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2200 "After calling [gmx-mdrun] several times, detailed performance information",
2201 "is available in the output file [TT]perf.out[tt].",
2202 "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2203 "(options [TT]-b*[tt]), these will be automatically deleted after each test.[PAR]",
2204 "If you want the simulation to be started automatically with the",
2205 "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2206 "Basic support for GPU-enabled [TT]mdrun[tt] exists. Give a string containing the IDs",
2207 "of the GPUs that you wish to use in the optimization in the [TT]-gpu_id[tt]",
2208 "command-line argument. This works exactly like [TT]mdrun -gpu_id[tt], does not imply a ",
2210 "and merely declares the eligible set of GPU devices. [TT]gmx-tune_pme[tt] will construct ",
2212 "mdrun that use this set appropriately. [TT]gmx-tune_pme[tt] does not support",
2213 "[TT]-gputasks[tt].[PAR]",
2218 int pmeentries = 0; /* How many values for -npme do we actually test for each tpr file */
2219 real maxPMEfraction = 0.50;
2220 real minPMEfraction = 0.25;
2221 int maxPMEnodes, minPMEnodes;
2222 float guessPMEratio; /* guessed PME:PP ratio based on the tpr file */
2223 float guessPMEnodes;
2224 int npme_fixed = -2; /* If >= -1, use only this number
2225 * of PME-only nodes */
2227 real rmin = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
2228 real rcoulomb = -1.0; /* Coulomb radius as set in .tpr file */
2229 gmx_bool bScaleRvdw = TRUE;
2230 int64_t bench_nsteps = BENCHSTEPS;
2231 int64_t new_sim_nsteps = -1; /* -1 indicates: not set by the user */
2232 int64_t cpt_steps = 0; /* Step counter in .cpt input file */
2233 int presteps = 1500; /* Do a full cycle reset after presteps steps */
2234 gmx_bool bOverwrite = FALSE, bKeepTPR;
2235 gmx_bool bLaunch = FALSE;
2236 char* ExtraArgs = nullptr;
2237 char** tpr_names = nullptr;
2238 const char* simulation_tpr = nullptr;
2239 char* deffnm = nullptr;
2240 int best_npme, best_tpr;
2241 int sim_part = 1; /* For benchmarks with checkpoint files */
2245 /* Default program names if nothing else is found */
2246 char *cmd_mpirun = nullptr, *cmd_mdrun = nullptr;
2247 char *cmd_args_bench, *cmd_args_launch;
2248 char* cmd_np = nullptr;
2250 /* IDs of GPUs that are eligible for computation */
2251 char* eligible_gpu_ids = nullptr;
2253 t_perf** perfdata = nullptr;
2258 /* Print out how long the tuning took */
2261 static t_filenm fnm[] = { /* tune_pme */
2262 { efOUT, "-p", "perf", ffWRITE },
2263 { efLOG, "-err", "bencherr", ffWRITE },
2264 { efTPR, "-so", "tuned", ffWRITE },
2266 { efTPR, "-s", nullptr, ffREAD },
2267 { efTRN, "-o", nullptr, ffWRITE },
2268 { efCOMPRESSED, "-x", nullptr, ffOPTWR },
2269 { efCPT, "-cpi", nullptr, ffOPTRD },
2270 { efCPT, "-cpo", nullptr, ffOPTWR },
2271 { efSTO, "-c", "confout", ffWRITE },
2272 { efEDR, "-e", "ener", ffWRITE },
2273 { efLOG, "-g", "md", ffWRITE },
2274 { efXVG, "-dhdl", "dhdl", ffOPTWR },
2275 { efXVG, "-field", "field", ffOPTWR },
2276 { efXVG, "-table", "table", ffOPTRD },
2277 { efXVG, "-tablep", "tablep", ffOPTRD },
2278 { efXVG, "-tableb", "table", ffOPTRD },
2279 { efTRX, "-rerun", "rerun", ffOPTRD },
2280 { efXVG, "-tpi", "tpi", ffOPTWR },
2281 { efXVG, "-tpid", "tpidist", ffOPTWR },
2282 { efEDI, "-ei", "sam", ffOPTRD },
2283 { efXVG, "-eo", "edsam", ffOPTWR },
2284 { efXVG, "-px", "pullx", ffOPTWR },
2285 { efXVG, "-pf", "pullf", ffOPTWR },
2286 { efXVG, "-ro", "rotation", ffOPTWR },
2287 { efLOG, "-ra", "rotangles", ffOPTWR },
2288 { efLOG, "-rs", "rotslabs", ffOPTWR },
2289 { efLOG, "-rt", "rottorque", ffOPTWR },
2290 { efMTX, "-mtx", "nm", ffOPTWR },
2291 { efXVG, "-swap", "swapions", ffOPTWR },
2292 /* Output files that are deleted after each benchmark run */
2293 { efTRN, "-bo", "bench", ffWRITE },
2294 { efXTC, "-bx", "bench", ffWRITE },
2295 { efCPT, "-bcpo", "bench", ffWRITE },
2296 { efSTO, "-bc", "bench", ffWRITE },
2297 { efEDR, "-be", "bench", ffWRITE },
2298 { efLOG, "-bg", "bench", ffWRITE },
2299 { efXVG, "-beo", "benchedo", ffOPTWR },
2300 { efXVG, "-bdhdl", "benchdhdl", ffOPTWR },
2301 { efXVG, "-bfield", "benchfld", ffOPTWR },
2302 { efXVG, "-btpi", "benchtpi", ffOPTWR },
2303 { efXVG, "-btpid", "benchtpid", ffOPTWR },
2304 { efXVG, "-bdevout", "benchdev", ffOPTWR },
2305 { efXVG, "-brunav", "benchrnav", ffOPTWR },
2306 { efXVG, "-bpx", "benchpx", ffOPTWR },
2307 { efXVG, "-bpf", "benchpf", ffOPTWR },
2308 { efXVG, "-bro", "benchrot", ffOPTWR },
2309 { efLOG, "-bra", "benchrota", ffOPTWR },
2310 { efLOG, "-brs", "benchrots", ffOPTWR },
2311 { efLOG, "-brt", "benchrott", ffOPTWR },
2312 { efMTX, "-bmtx", "benchn", ffOPTWR },
2313 { efNDX, "-bdn", "bench", ffOPTWR },
2314 { efXVG, "-bswap", "benchswp", ffOPTWR }
2317 gmx_bool bThreads = FALSE;
2321 const char* procstring[] = { nullptr, "np", "n", "none", nullptr };
2322 const char* npmevalues_opt[] = { nullptr, "auto", "all", "subset", nullptr };
2324 gmx_bool bAppendFiles = TRUE;
2325 gmx_bool bKeepAndNumCPT = FALSE;
2326 gmx_bool bResetCountersHalfWay = FALSE;
2327 gmx_bool bBenchmark = TRUE;
2328 gmx_bool bCheck = TRUE;
2330 gmx_output_env_t* oenv = nullptr;
2333 /***********************/
2334 /* tune_pme options: */
2335 /***********************/
2340 "Command line to run a simulation, e.g. 'gmx mdrun' or 'gmx_mpi mdrun'" },
2345 "Number of ranks to run the tests on (must be > 2 for separate PME ranks)" },
2350 "Name of the [TT]$MPIRUN[tt] option that specifies the number of ranks to use ('np', or "
2351 "'n'; use 'none' if there is no such option)" },
2356 "Number of MPI-threads to run the tests on (turns MPI & mpirun off)" },
2357 { "-r", FALSE, etINT, { &repeats }, "Repeat each test this often" },
2358 { "-max", FALSE, etREAL, { &maxPMEfraction }, "Max fraction of PME ranks to test with" },
2359 { "-min", FALSE, etREAL, { &minPMEfraction }, "Min fraction of PME ranks to test with" },
2364 "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a "
2365 "reasonable subset. "
2366 "Auto neglects -min and -max and chooses reasonable values around a guess for npme "
2367 "derived from the .tpr" },
2372 "If >= -1, do not vary the number of PME-only ranks, instead use this fixed value and "
2373 "only vary rcoulomb and the PME grid spacing." },
2378 "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid "
2380 { "-rmin", FALSE, etREAL, { &rmin }, "If >0, minimal rcoulomb for -ntpr>1" },
2381 { "-scalevdw", FALSE, etBOOL, { &bScaleRvdw }, "Scale rvdw along with rcoulomb" },
2386 "Number of [REF].tpr[ref] files to benchmark. Create this many files with different "
2387 "rcoulomb scaling factors depending on -rmin and -rmax. "
2388 "If < 1, automatically choose the number of [REF].tpr[ref] files to test" },
2393 "Take timings for this many steps in the benchmark runs" },
2398 "Let dlb equilibrate this many steps before timings are taken (reset cycle counters "
2399 "after this many steps)" },
2403 { &new_sim_nsteps },
2404 "If non-negative, perform this many steps in the real run (overwrites nsteps from "
2405 "[REF].tpr[ref], add [REF].cpt[ref] steps)" },
2406 { "-launch", FALSE, etBOOL, { &bLaunch }, "Launch the real simulation after optimization" },
2411 "Run the benchmarks or just create the input [REF].tpr[ref] files?" },
2416 "Before the benchmark runs, check whether mdrun works in parallel" },
2420 { &eligible_gpu_ids },
2421 "List of unique GPU device IDs that are eligible for use" },
2422 /******************/
2423 /* mdrun options: */
2424 /******************/
2425 /* We let tune_pme parse and understand these options, because we need to
2426 * prevent that they appear on the mdrun command line for the benchmarks */
2431 "Append to previous output files when continuing from checkpoint instead of adding the "
2432 "simulation part number to all file names (for launch only)" },
2436 { &bKeepAndNumCPT },
2437 "Keep and number checkpoint files (launch only)" },
2438 { "-deffnm", FALSE, etSTR, { &deffnm }, "Set the default filenames (launch only)" },
2442 { &bResetCountersHalfWay },
2443 "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] "
2447 #define NFILE asize(fnm)
2449 seconds = gmx_gettime();
2451 if (!parse_common_args(
2452 &argc, argv, PCA_NOEXIT_ON_ARGS, NFILE, fnm, asize(pa), pa, asize(desc), desc, 0, nullptr, &oenv))
2457 // procstring[0]Â is used inside two different conditionals further down
2458 GMX_RELEASE_ASSERT(procstring[0] != nullptr, "Options inconsistency; procstring[0]Â is NULL");
2460 /* Store the remaining unparsed command line entries in a string which
2461 * is then attached to the mdrun command line */
2463 ExtraArgs[0] = '\0';
2464 for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
2466 add_to_string(&ExtraArgs, argv[i]);
2467 add_to_string(&ExtraArgs, " ");
2470 if (opt2parg_bSet("-ntmpi", asize(pa), pa))
2473 if (opt2parg_bSet("-npstring", asize(pa), pa))
2475 fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
2480 gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
2482 /* and now we just set this; a bit of an ugly hack*/
2485 /* Check for PME:PP ratio and whether tpr triggers additional output files */
2486 guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
2488 /* Automatically set -beo options if -eo is set etc. */
2489 couple_files_options(NFILE, fnm);
2491 /* Construct the command line arguments for benchmark runs
2492 * as well as for the simulation run */
2495 sprintf(bbuf, " -ntmpi %d ", nthreads);
2499 /* This string will be used for MPI runs and will appear after the
2500 * mpirun command. */
2501 if (std::strcmp(procstring[0], "none") != 0)
2503 sprintf(bbuf, " -%s %d ", procstring[0], nnodes);
2513 create_command_line_snippets(bAppendFiles,
2515 bResetCountersHalfWay,
2524 /* Prepare to use checkpoint file if requested */
2526 if (opt2bSet("-cpi", NFILE, fnm))
2528 const char* filename = opt2fn("-cpi", NFILE, fnm);
2530 read_checkpoint_part_and_step(filename, &cpt_sim_part, &cpt_steps);
2531 if (cpt_sim_part == 0)
2533 gmx_fatal(FARGS, "Checkpoint file %s could not be read!", filename);
2535 /* tune_pme will run the next part of the simulation */
2536 sim_part = cpt_sim_part + 1;
2539 /* Open performance output file and write header info */
2540 fp = gmx_ffopen(opt2fn("-p", NFILE, fnm), "w");
2542 /* Make a quick consistency check of command line parameters */
2560 /* Determine the maximum and minimum number of PME nodes to test,
2561 * the actual list of settings is build in do_the_tests(). */
2562 if ((nnodes > 2) && (npme_fixed < -1))
2564 if (0 == std::strcmp(npmevalues_opt[0], "auto"))
2566 /* Determine the npme range automatically based on the PME:PP load guess */
2567 if (guessPMEratio > 1.0)
2569 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2570 maxPMEnodes = nnodes / 2;
2571 minPMEnodes = nnodes / 2;
2575 /* PME : PP load is in the range 0..1, let's test around the guess */
2576 guessPMEnodes = static_cast<int>(nnodes / (1.0 + 1.0 / guessPMEratio));
2577 minPMEnodes = static_cast<int>(std::floor(0.7 * guessPMEnodes));
2578 maxPMEnodes = static_cast<int>(std::ceil(1.6 * guessPMEnodes));
2579 maxPMEnodes = std::min(maxPMEnodes, nnodes / 2);
2584 /* Determine the npme range based on user input */
2585 maxPMEnodes = static_cast<int>(std::floor(maxPMEfraction * nnodes));
2586 minPMEnodes = std::max(static_cast<int>(std::floor(minPMEfraction * nnodes)), 0);
2587 fprintf(stdout, "Will try runs with %d ", minPMEnodes);
2588 if (maxPMEnodes != minPMEnodes)
2590 fprintf(stdout, "- %d ", maxPMEnodes);
2593 "PME-only ranks.\n Note that the automatic number of PME-only ranks and no "
2594 "separate PME ranks are always tested.\n");
2603 /* Get the commands we need to set up the runs from environment variables */
2604 get_program_paths(bThreads, &cmd_mpirun, &cmd_mdrun);
2605 if (bBenchmark && repeats > 0)
2607 check_mdrun_works(bThreads, cmd_mpirun, cmd_np, cmd_mdrun, nullptr != eligible_gpu_ids);
2610 /* Print some header info to file */
2612 fprintf(fp, "\n P E R F O R M A N C E R E S U L T S\n");
2614 fprintf(fp, "%s for GROMACS %s\n", output_env_get_program_display_name(oenv), gmx_version());
2617 fprintf(fp, "Number of ranks : %d\n", nnodes);
2618 fprintf(fp, "The mpirun command is : %s\n", cmd_mpirun);
2619 if (std::strcmp(procstring[0], "none") != 0)
2621 fprintf(fp, "Passing # of ranks via : -%s\n", procstring[0]);
2625 fprintf(fp, "Not setting number of ranks in system call\n");
2630 fprintf(fp, "Number of threads : %d\n", nnodes);
2633 fprintf(fp, "The mdrun command is : %s\n", cmd_mdrun);
2634 fprintf(fp, "mdrun args benchmarks : %s\n", cmd_args_bench);
2635 fprintf(fp, "Benchmark steps : ");
2636 fprintf(fp, "%" PRId64, bench_nsteps);
2638 fprintf(fp, "dlb equilibration steps : %d\n", presteps);
2641 fprintf(fp, "Checkpoint time step : ");
2642 fprintf(fp, "%" PRId64, cpt_steps);
2645 fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
2647 if (new_sim_nsteps >= 0)
2650 fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
2651 fprintf(stderr, "%" PRId64, new_sim_nsteps + cpt_steps);
2652 fprintf(stderr, " steps.\n");
2653 fprintf(fp, "Simulation steps : ");
2654 fprintf(fp, "%" PRId64, new_sim_nsteps);
2659 fprintf(fp, "Repeats for each test : %d\n", repeats);
2662 if (npme_fixed >= -1)
2664 fprintf(fp, "Fixing -npme at : %d\n", npme_fixed);
2667 fprintf(fp, "Input file : %s\n", opt2fn("-s", NFILE, fnm));
2668 fprintf(fp, " PME/PP load estimate : %g\n", guessPMEratio);
2670 /* Allocate memory for the inputinfo struct: */
2672 info->nr_inputfiles = ntprs;
2673 for (i = 0; i < ntprs; i++)
2675 snew(info->rcoulomb, ntprs);
2676 snew(info->rvdw, ntprs);
2677 snew(info->rlist, ntprs);
2678 snew(info->nkx, ntprs);
2679 snew(info->nky, ntprs);
2680 snew(info->nkz, ntprs);
2681 snew(info->fsx, ntprs);
2682 snew(info->fsy, ntprs);
2683 snew(info->fsz, ntprs);
2685 /* Make alternative tpr files to test: */
2686 snew(tpr_names, ntprs);
2687 for (i = 0; i < ntprs; i++)
2689 snew(tpr_names[i], STRLEN);
2692 /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2693 * different grids could be found. */
2694 make_benchmark_tprs(opt2fn("-s", NFILE, fnm),
2696 bench_nsteps + presteps,
2705 /********************************************************************************/
2706 /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats */
2707 /********************************************************************************/
2708 snew(perfdata, ntprs);
2711 GMX_RELEASE_ASSERT(npmevalues_opt[0] != nullptr,
2712 "Options inconsistency; npmevalues_opt[0] is NULL");
2736 fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime() - seconds) / 60.0);
2738 /* Analyse the results and give a suggestion for optimal settings: */
2739 bKeepTPR = analyze_data(
2740 fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries, repeats, info, &best_tpr, &best_npme);
2742 /* Take the best-performing tpr file and enlarge nsteps to original value */
2743 if (bKeepTPR && !bOverwrite)
2745 simulation_tpr = opt2fn("-s", NFILE, fnm);
2749 simulation_tpr = opt2fn("-so", NFILE, fnm);
2750 modify_PMEsettings(bOverwrite ? (new_sim_nsteps + cpt_steps) : info->orig_sim_steps,
2751 info->orig_init_step,
2752 tpr_names[best_tpr],
2756 /* Let's get rid of the temporary benchmark input files */
2757 for (i = 0; i < ntprs; i++)
2759 fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
2760 remove(tpr_names[i]);
2763 /* Now start the real simulation if the user requested it ... */
2765 bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, cmd_args_launch, simulation_tpr, best_npme, eligible_gpu_ids);
2769 /* ... or simply print the performance results to screen: */
2772 finalize(opt2fn("-p", NFILE, fnm));