2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file declares functions for mdrun to call to manage the
38 * details of doing a restart (ie. reading checkpoints, appending
41 * \todo Clean up the error-prone logic here. Add doxygen.
43 * \author Berk Hess <hess@kth.se>
44 * \author Erik Lindahl <erik@kth.se>
45 * \author Mark Abraham <mark.j.abraham@gmail.com>
47 * \ingroup module_mdrunutility
52 #include "handlerestart.h"
59 #if GMX_NATIVE_WINDOWS
61 #include <sys/locking.h>
69 #include "gromacs/commandline/filenm.h"
70 #include "gromacs/compat/optional.h"
71 #include "gromacs/fileio/checkpoint.h"
72 #include "gromacs/fileio/gmxfio.h"
73 #include "gromacs/gmxlib/network.h"
74 #include "gromacs/mdrunutility/multisim.h"
75 #include "gromacs/mdtypes/commrec.h"
76 #include "gromacs/mdtypes/mdrunoptions.h"
77 #include "gromacs/utility/basedefinitions.h"
78 #include "gromacs/utility/enumerationhelpers.h"
79 #include "gromacs/utility/exceptions.h"
80 #include "gromacs/utility/fatalerror.h"
81 #include "gromacs/utility/path.h"
82 #include "gromacs/utility/smalloc.h"
83 #include "gromacs/utility/stringstream.h"
84 #include "gromacs/utility/textwriter.h"
91 /*! \brief Search for \p fnm_cp in fnm and return true iff found
93 * \todo This could be implemented sanely with a for loop. */
94 gmx_bool exist_output_file(const char *fnm_cp, int nfile, const t_filenm fnm[])
98 /* Check if the output file name stored in the checkpoint file
99 * is one of the output file names of mdrun.
103 !(is_output(&fnm[i]) && strcmp(fnm_cp, fnm[i].filenames[0].c_str()) == 0))
108 return (i < nfile && gmx_fexist(fnm_cp));
111 /*! \brief Throw when mdrun -cpi fails because previous output files are missing.
113 * If we get here, the user requested restarting from a checkpoint file, that checkpoint
114 * file was found (so it is not the first part of a new run), but we are still missing
115 * some or all checkpoint files. In this case we issue a fatal error since there are
116 * so many special cases we cannot keep track of, and better safe than sorry. */
118 throwBecauseOfMissingOutputFiles(const char *checkpointFilename,
119 ArrayRef<const gmx_file_position_t> outputfiles,
120 int nfile, const t_filenm fnm[],
121 size_t numFilesMissing)
123 StringOutputStream stream;
124 TextWriter writer(&stream);
125 writer.writeStringFormatted
126 ("Some output files listed in the checkpoint file %s are not present or not named "
127 "as the output files by the current program:)",
129 auto settings = writer.wrapperSettings();
130 auto oldIndent = settings.indent(), newIndent = 2;
132 writer.writeLine("Expected output files that are present:");
133 settings.setIndent(newIndent);
134 for (const auto &outputfile : outputfiles)
136 if (exist_output_file(outputfile.filename, nfile, fnm))
138 writer.writeLine(outputfile.filename);
141 settings.setIndent(oldIndent);
142 writer.ensureEmptyLine();
144 writer.writeLine("Expected output files that are not present or named differently:");
145 settings.setIndent(newIndent);
146 for (const auto &outputfile : outputfiles)
148 if (!exist_output_file(outputfile.filename,
151 writer.writeLine(outputfile.filename);
154 settings.setIndent(oldIndent);
156 writer.writeLineFormatted(
157 R"(To keep your simulation files safe, this simulation will not restart. Either name your
158 output files exactly the same as the previous simulation part (e.g. with -deffnm), or
159 make sure all the output files are present (e.g. run from the same directory as the
160 previous simulation part), or instruct mdrun to write new output files with mdrun -noappend.
161 In the last case, you will not be able to use appending in future for this simulation.)",
162 numFilesMissing, outputfiles.size());
163 GMX_THROW(InconsistentInputError(stream.toString()));
166 //! Return a string describing the precision of a build of GROMACS.
167 const char *precisionToString(bool isDoublePrecision)
169 return isDoublePrecision ? "double" : "mixed";
172 /*! \brief Describes how mdrun will (re)start and provides supporting
173 * functionality based on that data. */
174 class StartingBehaviorHandler
177 /*! \brief Throw unless all simulations in a multi-sim restart the same way.
179 * The restart could differ if checkpoint or other output files are
180 * not found in a consistent way across the set of multi-simulations,
181 * or are from different simulation parts.
183 * \param[in] ms Multi-sim handler.
185 * May only be called from the master rank of each simulation.
187 * \throws InconsistentInputError if either simulations restart
188 * differently, or from checkpoints from different simulation parts.
190 void ensureMultiSimBehaviorsMatch(const gmx_multisim_t *ms);
191 /*! \brief Return an optional value that describes the index
192 * of the next simulation part when not appending.
194 * \param[in] appendingBehavior Whether this simulation is appending
195 * (relevant only when restarting)
198 compat::optional<int> makeIndexOfNextPart(AppendingBehavior appendingBehavior) const;
200 //! Describes how mdrun will (re)start
201 StartingBehavior startingBehavior = StartingBehavior::NewSimulation;
202 //! When restarting from a checkpoint, contains the contents of its header
203 compat::optional<CheckpointHeaderContents> headerContents;
204 //! When restarting from a checkpoint, contains the names of expected output files
205 compat::optional < std::vector < gmx_file_position_t>> outputFiles;
209 /*! \brief Choose the starting behaviour for this simulation
211 * This routine cannot print tons of data, since it is called before
212 * the log file is opened.
214 * Note that different simulations in a multi-simulation can return
215 * values that depend on whether the respective checkpoint files are
216 * found (and other files found, when appending), and so can differ
217 * between multi-simulations. It is the caller's responsibility to
218 * detect this and react accordingly. */
219 StartingBehaviorHandler
220 chooseStartingBehavior(const AppendingBehavior appendingBehavior,
224 StartingBehaviorHandler handler;
225 if (!opt2bSet("-cpi", nfile, fnm))
227 // No need to tell the user anything
228 handler.startingBehavior = StartingBehavior::NewSimulation;
232 // A -cpi option was provided, do a restart if there is an input checkpoint file available
233 const char *checkpointFilename = opt2fn("-cpi", nfile, fnm);
234 if (!gmx_fexist(checkpointFilename))
236 // This is interpreted as the user intending a new
237 // simulation, so that scripts can call "gmx mdrun -cpi"
238 // for all simulation parts. Thus, appending cannot occur.
239 if (appendingBehavior == AppendingBehavior::Appending)
241 GMX_THROW(InconsistentInputError
242 ("Could not do a restart with appending because the checkpoint file "
243 "was not found. Either supply the name of the right checkpoint file "
244 "or do not use -append"));
246 // No need to tell the user that mdrun -cpi without a file means a new simulation
247 handler.startingBehavior = StartingBehavior::NewSimulation;
251 t_fileio *fp = gmx_fio_open(checkpointFilename, "r");
254 GMX_THROW(FileIOError
255 (formatString("Checkpoint file '%s' was found but could not be opened for "
256 "reading. Check the file permissions.", checkpointFilename)));
259 std::vector < gmx_file_position_t> outputFiles;
260 CheckpointHeaderContents headerContents =
261 read_checkpoint_simulation_part_and_filenames(fp, &outputFiles);
263 GMX_RELEASE_ASSERT(!outputFiles.empty(),
264 "The checkpoint file or its reading is broken, as no output "
265 "file information is stored in it");
266 const char *logFilename = outputFiles[0].filename;
267 GMX_RELEASE_ASSERT(Path::extensionMatches(logFilename, ftp2ext(efLOG)),
268 formatString("The checkpoint file or its reading is broken, the first "
269 "output file '%s' must be a log file with extension '%s'",
270 logFilename, ftp2ext(efLOG)).c_str());
272 if (appendingBehavior != AppendingBehavior::NoAppending)
274 // See whether appending can be done.
276 size_t numFilesMissing = std::count_if(std::begin(outputFiles), std::end(outputFiles),
277 [nfile, fnm](const auto &outputFile)
279 return !exist_output_file(outputFile.filename, nfile, fnm);
281 if (numFilesMissing != 0)
283 // Appending is not possible, because not all previous
284 // output files are present. We don't automatically switch
285 // to numbered output files either, because that prevents
286 // the user from using appending in future. If they want
287 // to restart with missing files, they need to use
289 throwBecauseOfMissingOutputFiles(checkpointFilename, outputFiles, nfile, fnm, numFilesMissing);
292 for (const auto &outputFile : outputFiles)
294 if (outputFile.offset < 0)
296 // Appending of large files is not possible unless
297 // mdrun and the filesystem can do a correct job. We
298 // don't automatically switch to numbered output files
299 // either, because the user can benefit from
300 // understanding that their infrastructure is not very
301 // suitable for running a simulation producing lots of
304 formatString("The original mdrun wrote a file called '%s' which "
305 "is larger than 2 GB, but that mdrun or the filesystem "
306 "it ran on (e.g FAT32) did not support such large files. "
307 "This simulation cannot be restarted with appending. It will "
308 "be easier for you to use mdrun on a 64-bit filesystem, but "
309 "if you choose not to, then you must run mdrun with "
310 "-noappend once your output gets large enough.",
311 outputFile.filename);
312 GMX_THROW(InconsistentInputError(message));
316 const char *logFilename = outputFiles[0].filename;
317 // If the precision does not match, we cannot continue with
318 // appending, and will switch to not appending unless
319 // instructed otherwise.
320 if (headerContents.file_version >= 13 && headerContents.double_prec != GMX_DOUBLE)
322 if (appendingBehavior == AppendingBehavior::Appending)
324 GMX_THROW(InconsistentInputError
325 (formatString("Cannot restart with appending because the previous simulation part used "
326 "%s precision which does not match the %s precision used by this build "
327 "of GROMACS. Either use matching precision or use mdrun -noappend.",
328 precisionToString(headerContents.double_prec),
329 precisionToString(GMX_DOUBLE))));
332 // If the previous log filename had a part number, then we
333 // cannot continue with appending, and will continue without
335 else if (hasSuffixFromNoAppend(logFilename))
337 if (appendingBehavior == AppendingBehavior::Appending)
339 GMX_THROW(InconsistentInputError
340 ("Cannot restart with appending because the previous simulation "
341 "part did not use appending. Either do not use mdrun -append, or "
342 "provide the correct checkpoint file."));
347 // Everything is perfect - we can do an appending restart.
348 handler = { StartingBehavior::RestartWithAppending, headerContents, outputFiles };
352 // No need to tell the user anything because the previous
353 // simulation part also didn't append and that can only happen
354 // when they ask for it.
357 GMX_RELEASE_ASSERT(appendingBehavior != AppendingBehavior::Appending, "Logic error in appending");
358 handler = { StartingBehavior::RestartWithoutAppending, headerContents, outputFiles };
362 //! Check whether chksum_file output file has a checksum that matches \c outputfile from the checkpoint.
363 void checkOutputFile(t_fileio *fileToCheck,
364 const gmx_file_position_t &outputfile)
366 /* compute md5 chksum */
367 std::array<unsigned char, 16> digest;
368 if (outputfile.checksumSize != -1)
370 if (gmx_fio_get_file_md5(fileToCheck, outputfile.offset,
371 &digest) != outputfile.checksumSize) /*at the end of the call the file position is at the end of the file*/
374 formatString("Can't read %d bytes of '%s' to compute checksum. The file "
375 "has been replaced or its contents have been modified. Cannot "
376 "do appending because of this condition.",
377 outputfile.checksumSize,
378 outputfile.filename);
379 GMX_THROW(InconsistentInputError(message));
383 /* compare md5 chksum */
384 if (outputfile.checksumSize != -1 &&
385 digest != outputfile.checksum)
389 fprintf(debug, "chksum for %s: ", outputfile.filename);
390 for (int j = 0; j < 16; j++)
392 fprintf(debug, "%02x", digest[j]);
394 fprintf(debug, "\n");
397 formatString("Checksum wrong for '%s'. The file has been replaced "
398 "or its contents have been modified. Cannot do appending "
399 "because of this condition.", outputfile.filename);
400 GMX_THROW(InconsistentInputError(message));
404 /*! \brief If supported, obtain a write lock on the log file.
406 * This wil prevent e.g. other mdrun instances from changing it while
407 * we attempt to restart with appending. */
408 void lockLogFile(t_fileio *logfio,
409 const char *logFilename)
411 /* Note that there are systems where the lock operation
412 * will succeed, but a second process can also lock the file.
413 * We should probably try to detect this.
415 #if defined __native_client__
418 #elif GMX_NATIVE_WINDOWS
419 if (_locking(fileno(gmx_fio_getfp(logfio)), _LK_NBLCK, LONG_MAX) == -1)
421 // don't initialize here: the struct order is OS dependent!
424 fl.l_whence = SEEK_SET;
429 if (fcntl(fileno(gmx_fio_getfp(logfio)), F_SETLK, &fl) == -1)
434 std::string message = "File locking is not supported on this system. "
435 "Use mdrun -noappend to restart.";
436 GMX_THROW(FileIOError(message));
438 else if (errno == EACCES || errno == EAGAIN)
441 formatString("Failed to lock: %s. Already running "
442 "simulation?", logFilename);
443 GMX_THROW(FileIOError(message));
448 formatString("Failed to lock: %s. %s.",
449 logFilename, std::strerror(errno));
450 GMX_THROW(FileIOError(message));
455 /*! \brief Prepare to append to output files.
457 * We use the file pointer positions of the output files stored in the
458 * checkpoint file and truncate the files such that any frames written
459 * after the checkpoint time are removed. All files are md5sum
460 * checked such that we can be sure that we do not truncate other
461 * (maybe important) files. The log file is locked so that we can
462 * avoid cases where another mdrun instance might still be writing to
465 prepareForAppending(const ArrayRef<const gmx_file_position_t> outputFiles,
470 // Can't check or truncate output files in general
471 // TODO do we do this elsewhere for GMX_FAHCORE?
475 // Handle the log file separately - it comes first in the list
476 // because we have already opened the log file. This ensures that
477 // we retain a lock on the open file that is never lifted after
478 // the checksum is calculated.
479 const gmx_file_position_t &logOutputFile = outputFiles[0];
480 lockLogFile(logfio, logOutputFile.filename);
481 checkOutputFile(logfio, logOutputFile);
483 if (gmx_fio_seek(logfio, logOutputFile.offset) != 0)
486 formatString("Seek error! Failed to truncate log file: %s.",
487 std::strerror(errno));
488 GMX_THROW(FileIOError(message));
491 // Now handle the remaining outputFiles
492 for (const auto &outputFile : outputFiles.subArray(1, outputFiles.size()-1))
494 t_fileio *fileToCheck = gmx_fio_open(outputFile.filename, "r+");
495 checkOutputFile(fileToCheck, outputFile);
496 gmx_fio_close(fileToCheck);
498 if (GMX_NATIVE_WINDOWS)
500 // Can't truncate output files on this platform
504 if (gmx_truncate(outputFile.filename, outputFile.offset) != 0)
507 formatString("Truncation of file %s failed. Cannot do appending "
508 "because of this failure.", outputFile.filename);
509 GMX_THROW(FileIOError(message));
514 void StartingBehaviorHandler::ensureMultiSimBehaviorsMatch(const gmx_multisim_t *ms)
518 // Trivially, the multi-sim behaviors match
522 auto startingBehaviors = gatherIntFromMultiSimulation(ms, static_cast<int>(startingBehavior));
523 bool identicalStartingBehaviors = (std::adjacent_find(std::begin(startingBehaviors),
524 std::end(startingBehaviors),
525 std::not_equal_to<>()) == std::end(startingBehaviors));
527 const EnumerationArray<StartingBehavior, std::string> behaviorStrings =
528 { { "restart with appending", "restart without appending", "new simulation" } };
529 if (!identicalStartingBehaviors)
531 std::string message = formatString(R"(
532 Multi-simulations must all start in the same way, either a new
533 simulation, a restart with appending, or a restart without appending.
534 However, the contents of the multi-simulation directories you specified
535 were inconsistent with each other. Either remove the checkpoint file
536 from each directory, or ensure each directory has a checkpoint file from
537 the same simulation part (and, if you want to append to output files,
538 ensure the old output files are present and named as they were when the
539 checkpoint file was written).
541 To help you identify which directories need attention, the %d
542 simulations wanted the following respective behaviors:
544 for (index simIndex = 0; simIndex != ssize(startingBehaviors); ++simIndex)
546 auto behavior = static_cast<StartingBehavior>(startingBehaviors[simIndex]);
547 message += formatString(" Simulation %6zd: %s\n", simIndex, behaviorStrings[behavior].c_str());
549 GMX_THROW(InconsistentInputError(message));
552 if (startingBehavior == StartingBehavior::NewSimulation)
554 // When not restarting, the behaviors are now known to match
558 // Multi-simulation restarts require that each checkpoint
559 // describes the same simulation part. If those don't match, then
560 // the simulation cannot proceed.
561 auto simulationParts = gatherIntFromMultiSimulation(ms, headerContents->simulation_part);
562 bool identicalSimulationParts = (std::adjacent_find(std::begin(simulationParts),
563 std::end(simulationParts),
564 std::not_equal_to<>()) == std::end(simulationParts));
566 if (!identicalSimulationParts)
568 std::string message = formatString(R"(
569 Multi-simulations must all start in the same way, either a new
570 simulation, a restart with appending, or a restart without appending.
571 However, the checkpoint files you specified were from different
572 simulation parts. Either remove the checkpoint file from each directory,
573 or ensure each directory has a checkpoint file from the same simulation
574 part (and, if you want to append to output files, ensure the old output
575 files are present and named as they were when the checkpoint file was
578 To help you identify which directories need attention, the %d
579 simulation checkpoint files were from the following respective
582 for (index partIndex = 0; partIndex != ssize(simulationParts); ++partIndex)
584 message += formatString(" Simulation %6zd: %d\n", partIndex, simulationParts[partIndex]);
586 GMX_THROW(InconsistentInputError(message));
590 compat::optional<int>
591 StartingBehaviorHandler::makeIndexOfNextPart(const AppendingBehavior appendingBehavior) const
593 compat::optional<int> indexOfNextPart;
595 if (startingBehavior == StartingBehavior::RestartWithoutAppending)
597 indexOfNextPart = headerContents->simulation_part + 1;
599 else if (startingBehavior == StartingBehavior::NewSimulation &&
600 appendingBehavior == AppendingBehavior::NoAppending)
605 return indexOfNextPart;
610 std::tuple<StartingBehavior, LogFilePtr>
611 handleRestart(t_commrec *cr,
612 const gmx_multisim_t *ms,
613 const AppendingBehavior appendingBehavior,
617 StartingBehaviorHandler handler;
618 LogFilePtr logFileGuard = nullptr;
620 // Make sure all ranks agree on whether the (multi-)simulation can
622 int numErrorsFound = 0;
623 std::exception_ptr exceptionPtr;
625 // Only the master rank of each simulation can do anything with
626 // output files, so it is the only one that needs to consider
627 // whether a restart might take place, and how to implement it.
632 handler = chooseStartingBehavior(appendingBehavior, nfile, fnm);
634 handler.ensureMultiSimBehaviorsMatch(ms);
636 // When not appending, prepare a suffix for the part number
637 compat::optional<int> indexOfNextPart
638 = handler.makeIndexOfNextPart(appendingBehavior);
640 // If a part suffix is used, change the file names accordingly.
643 std::string suffix = formatString(".part%04d", *indexOfNextPart);
644 add_suffix_to_output_names(fnm, nfile, suffix.c_str());
647 // Open the log file, now that it has the right name
648 logFileGuard = openLogFile(ftp2fn(efLOG, nfile, fnm),
649 handler.startingBehavior == StartingBehavior::RestartWithAppending);
651 // When appending, the other output files need special handling before opening
652 if (handler.startingBehavior == StartingBehavior::RestartWithAppending)
654 prepareForAppending(*handler.outputFiles, logFileGuard.get());
657 catch (const std::exception & /*ex*/)
659 exceptionPtr = std::current_exception();
663 // Since the master rank (perhaps of only one simulation) may have
664 // found an error condition, we now coordinate the behavior across
665 // all ranks. However, only the applicable ranks will throw a
666 // non-default exception.
668 // TODO Evolve some re-usable infrastructure for this, because it
669 // will be needed in many places while setting up simulations.
672 gmx_sumi(1, &numErrorsFound, cr);
676 gmx_sumi_sim(1, &numErrorsFound, ms);
679 gmx_bcast(sizeof(numErrorsFound), &numErrorsFound, cr);
683 // Throw in a globally coordinated way, if needed
684 if (numErrorsFound > 0)
688 std::rethrow_exception(exceptionPtr);
692 GMX_THROW(ParallelConsistencyError("Another MPI rank encountered an exception"));
697 gmx_bcast(sizeof(handler.startingBehavior), &handler.startingBehavior, cr);
700 return std::make_tuple(handler.startingBehavior, std::move(logFileGuard));