2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file declares functions for mdrun to call to manage the
38 * details of doing a restart (ie. reading checkpoints, appending
41 * \todo Clean up the error-prone logic here. Add doxygen.
43 * \author Berk Hess <hess@kth.se>
44 * \author Erik Lindahl <erik@kth.se>
45 * \author Mark Abraham <mark.j.abraham@gmail.com>
47 * \ingroup module_mdrunutility
52 #include "handlerestart.h"
60 #if GMX_NATIVE_WINDOWS
62 # include <sys/locking.h>
71 #include "gromacs/commandline/filenm.h"
72 #include "gromacs/fileio/checkpoint.h"
73 #include "gromacs/fileio/gmxfio.h"
74 #include "gromacs/mdrunutility/multisim.h"
75 #include "gromacs/mdtypes/mdrunoptions.h"
76 #include "gromacs/utility/basedefinitions.h"
77 #include "gromacs/utility/enumerationhelpers.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
80 #include "gromacs/utility/path.h"
81 #include "gromacs/utility/smalloc.h"
82 #include "gromacs/utility/stringstream.h"
83 #include "gromacs/utility/textwriter.h"
90 /*! \brief Search for \p fnm_cp in fnm and return true iff found
92 * \todo This could be implemented sanely with a for loop. */
93 gmx_bool exist_output_file(const char* fnm_cp, int nfile, const t_filenm fnm[])
97 /* Check if the output file name stored in the checkpoint file
98 * is one of the output file names of mdrun.
101 while (i < nfile && !(is_output(&fnm[i]) && strcmp(fnm_cp, fnm[i].filenames[0].c_str()) == 0))
106 return (i < nfile && gmx_fexist(fnm_cp));
109 /*! \brief Throw when mdrun -cpi fails because previous output files are missing.
111 * If we get here, the user requested restarting from a checkpoint file, that checkpoint
112 * file was found (so it is not the first part of a new run), but we are still missing
113 * some or all checkpoint files. In this case we issue a fatal error since there are
114 * so many special cases we cannot keep track of, and better safe than sorry. */
115 [[noreturn]] void throwBecauseOfMissingOutputFiles(const char* checkpointFilename,
116 ArrayRef<const gmx_file_position_t> outputfiles,
118 const t_filenm fnm[],
119 size_t numFilesMissing)
121 StringOutputStream stream;
122 TextWriter writer(&stream);
123 writer.writeStringFormatted(
124 "Some output files listed in the checkpoint file %s are not present or not named "
125 "as the output files by the current program:)",
127 auto settings = writer.wrapperSettings();
128 auto oldIndent = settings.indent(), newIndent = 2;
130 writer.writeLine("Expected output files that are present:");
131 settings.setIndent(newIndent);
132 for (const auto& outputfile : outputfiles)
134 if (exist_output_file(outputfile.filename, nfile, fnm))
136 writer.writeLine(outputfile.filename);
139 settings.setIndent(oldIndent);
140 writer.ensureEmptyLine();
142 writer.writeLine("Expected output files that are not present or named differently:");
143 settings.setIndent(newIndent);
144 for (const auto& outputfile : outputfiles)
146 if (!exist_output_file(outputfile.filename, nfile, fnm))
148 writer.writeLine(outputfile.filename);
151 settings.setIndent(oldIndent);
153 writer.writeLineFormatted(
154 R"(To keep your simulation files safe, this simulation will not restart. Either name your
155 output files exactly the same as the previous simulation part (e.g. with -deffnm), or
156 make sure all the output files are present (e.g. run from the same directory as the
157 previous simulation part), or instruct mdrun to write new output files with mdrun -noappend.
158 In the last case, you will not be able to use appending in future for this simulation.)",
159 numFilesMissing, outputfiles.size());
160 GMX_THROW(InconsistentInputError(stream.toString()));
163 //! Return a string describing the precision of a build of GROMACS.
164 const char* precisionToString(bool isDoublePrecision)
166 return isDoublePrecision ? "double" : "mixed";
169 /*! \brief Describes how mdrun will (re)start and provides supporting
170 * functionality based on that data. */
171 class StartingBehaviorHandler
174 /*! \brief Throw unless all simulations in a multi-sim restart the same way.
176 * The restart could differ if checkpoint or other output files are
177 * not found in a consistent way across the set of multi-simulations,
178 * or are from different simulation parts.
180 * \param[in] ms Multi-sim handler.
182 * May only be called from the master rank of each simulation.
184 * \throws InconsistentInputError if either simulations restart
185 * differently, or from checkpoints from different simulation parts.
187 void ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms);
188 /*! \brief Return an optional value that describes the index
189 * of the next simulation part when not appending.
191 * \param[in] appendingBehavior Whether this simulation is appending
192 * (relevant only when restarting)
195 std::optional<int> makeIndexOfNextPart(AppendingBehavior appendingBehavior) const;
197 //! Describes how mdrun will (re)start
198 StartingBehavior startingBehavior = StartingBehavior::NewSimulation;
199 //! When restarting from a checkpoint, contains the contents of its header
200 std::optional<CheckpointHeaderContents> headerContents;
201 //! When restarting from a checkpoint, contains the names of expected output files
202 std::optional<std::vector<gmx_file_position_t>> outputFiles;
205 /*! \brief Choose the starting behaviour for this simulation
207 * This routine cannot print tons of data, since it is called before
208 * the log file is opened.
210 * Note that different simulations in a multi-simulation can return
211 * values that depend on whether the respective checkpoint files are
212 * found (and other files found, when appending), and so can differ
213 * between multi-simulations. It is the caller's responsibility to
214 * detect this and react accordingly. */
215 StartingBehaviorHandler chooseStartingBehavior(const AppendingBehavior appendingBehavior,
219 StartingBehaviorHandler handler;
220 if (!opt2bSet("-cpi", nfile, fnm))
222 // No need to tell the user anything
223 handler.startingBehavior = StartingBehavior::NewSimulation;
227 // A -cpi option was provided, do a restart if there is an input checkpoint file available
228 const char* checkpointFilename = opt2fn("-cpi", nfile, fnm);
229 if (!gmx_fexist(checkpointFilename))
231 // This is interpreted as the user intending a new
232 // simulation, so that scripts can call "gmx mdrun -cpi"
233 // for all simulation parts. Thus, appending cannot occur.
234 if (appendingBehavior == AppendingBehavior::Appending)
236 GMX_THROW(InconsistentInputError(
237 "Could not do a restart with appending because the checkpoint file "
238 "was not found. Either supply the name of the right checkpoint file "
239 "or do not use -append"));
241 // No need to tell the user that mdrun -cpi without a file means a new simulation
242 handler.startingBehavior = StartingBehavior::NewSimulation;
246 t_fileio* fp = gmx_fio_open(checkpointFilename, "r");
249 GMX_THROW(FileIOError(
250 formatString("Checkpoint file '%s' was found but could not be opened for "
251 "reading. Check the file permissions.",
252 checkpointFilename)));
255 std::vector<gmx_file_position_t> outputFiles;
256 CheckpointHeaderContents headerContents =
257 read_checkpoint_simulation_part_and_filenames(fp, &outputFiles);
259 GMX_RELEASE_ASSERT(!outputFiles.empty(),
260 "The checkpoint file or its reading is broken, as no output "
261 "file information is stored in it");
262 const char* logFilename = outputFiles[0].filename;
263 GMX_RELEASE_ASSERT(Path::extensionMatches(logFilename, ftp2ext(efLOG)),
264 formatString("The checkpoint file or its reading is broken, the first "
265 "output file '%s' must be a log file with extension '%s'",
266 logFilename, ftp2ext(efLOG))
269 if (appendingBehavior != AppendingBehavior::NoAppending)
271 // See whether appending can be done.
273 size_t numFilesMissing = std::count_if(
274 std::begin(outputFiles), std::end(outputFiles), [nfile, fnm](const auto& outputFile) {
275 return !exist_output_file(outputFile.filename, nfile, fnm);
277 if (numFilesMissing != 0)
279 // Appending is not possible, because not all previous
280 // output files are present. We don't automatically switch
281 // to numbered output files either, because that prevents
282 // the user from using appending in future. If they want
283 // to restart with missing files, they need to use
285 throwBecauseOfMissingOutputFiles(checkpointFilename, outputFiles, nfile, fnm, numFilesMissing);
288 for (const auto& outputFile : outputFiles)
290 if (outputFile.offset < 0)
292 // Appending of large files is not possible unless
293 // mdrun and the filesystem can do a correct job. We
294 // don't automatically switch to numbered output files
295 // either, because the user can benefit from
296 // understanding that their infrastructure is not very
297 // suitable for running a simulation producing lots of
299 auto message = formatString(
300 "The original mdrun wrote a file called '%s' which "
301 "is larger than 2 GB, but that mdrun or the filesystem "
302 "it ran on (e.g FAT32) did not support such large files. "
303 "This simulation cannot be restarted with appending. It will "
304 "be easier for you to use mdrun on a 64-bit filesystem, but "
305 "if you choose not to, then you must run mdrun with "
306 "-noappend once your output gets large enough.",
307 outputFile.filename);
308 GMX_THROW(InconsistentInputError(message));
312 const char* logFilename = outputFiles[0].filename;
313 // If the precision does not match, we cannot continue with
314 // appending, and will switch to not appending unless
315 // instructed otherwise.
316 if (headerContents.file_version >= 13 && headerContents.double_prec != GMX_DOUBLE)
318 if (appendingBehavior == AppendingBehavior::Appending)
320 GMX_THROW(InconsistentInputError(formatString(
321 "Cannot restart with appending because the previous simulation part used "
322 "%s precision which does not match the %s precision used by this build "
323 "of GROMACS. Either use matching precision or use mdrun -noappend.",
324 precisionToString(headerContents.double_prec), precisionToString(GMX_DOUBLE))));
327 // If the previous log filename had a part number, then we
328 // cannot continue with appending, and will continue without
330 else if (hasSuffixFromNoAppend(logFilename))
332 if (appendingBehavior == AppendingBehavior::Appending)
334 GMX_THROW(InconsistentInputError(
335 "Cannot restart with appending because the previous simulation "
336 "part did not use appending. Either do not use mdrun -append, or "
337 "provide the correct checkpoint file."));
342 // Everything is perfect - we can do an appending restart.
343 handler = { StartingBehavior::RestartWithAppending, headerContents, outputFiles };
347 // No need to tell the user anything because the previous
348 // simulation part also didn't append and that can only happen
349 // when they ask for it.
352 GMX_RELEASE_ASSERT(appendingBehavior != AppendingBehavior::Appending,
353 "Logic error in appending");
354 handler = { StartingBehavior::RestartWithoutAppending, headerContents, outputFiles };
358 //! Check whether chksum_file output file has a checksum that matches \c outputfile from the checkpoint.
359 void checkOutputFile(t_fileio* fileToCheck, const gmx_file_position_t& outputfile)
361 /* compute md5 chksum */
362 std::array<unsigned char, 16> digest;
363 if (outputfile.checksumSize != -1)
365 if (gmx_fio_get_file_md5(fileToCheck, outputfile.offset, &digest)
366 != outputfile.checksumSize) /*at the end of the call the file position is at the end of the file*/
368 auto message = formatString(
369 "Can't read %d bytes of '%s' to compute checksum. The file "
370 "has been replaced or its contents have been modified. Cannot "
371 "do appending because of this condition.",
372 outputfile.checksumSize, outputfile.filename);
373 GMX_THROW(InconsistentInputError(message));
377 /* compare md5 chksum */
378 if (outputfile.checksumSize != -1 && digest != outputfile.checksum)
382 fprintf(debug, "chksum for %s: ", outputfile.filename);
383 for (int j = 0; j < 16; j++)
385 fprintf(debug, "%02x", digest[j]);
387 fprintf(debug, "\n");
389 auto message = formatString(
390 "Checksum wrong for '%s'. The file has been replaced "
391 "or its contents have been modified. Cannot do appending "
392 "because of this condition.",
393 outputfile.filename);
394 GMX_THROW(InconsistentInputError(message));
398 /*! \brief If supported, obtain a write lock on the log file.
400 * This wil prevent e.g. other mdrun instances from changing it while
401 * we attempt to restart with appending. */
402 void lockLogFile(t_fileio* logfio, const char* logFilename)
404 /* Note that there are systems where the lock operation
405 * will succeed, but a second process can also lock the file.
406 * We should probably try to detect this.
408 #if defined __native_client__
411 #elif GMX_NATIVE_WINDOWS
412 if (_locking(fileno(gmx_fio_getfp(logfio)), _LK_NBLCK, LONG_MAX) == -1)
414 // don't initialize here: the struct order is OS dependent!
417 fl.l_whence = SEEK_SET;
422 if (fcntl(fileno(gmx_fio_getfp(logfio)), F_SETLK, &fl) == -1)
427 std::string message =
428 "File locking is not supported on this system. "
429 "Use mdrun -noappend to restart.";
430 GMX_THROW(FileIOError(message));
432 else if (errno == EACCES || errno == EAGAIN)
434 auto message = formatString(
435 "Failed to lock: %s. Already running "
438 GMX_THROW(FileIOError(message));
442 auto message = formatString("Failed to lock: %s. %s.", logFilename, std::strerror(errno));
443 GMX_THROW(FileIOError(message));
448 /*! \brief Prepare to append to output files.
450 * We use the file pointer positions of the output files stored in the
451 * checkpoint file and truncate the files such that any frames written
452 * after the checkpoint time are removed. All files are md5sum
453 * checked such that we can be sure that we do not truncate other
454 * (maybe important) files. The log file is locked so that we can
455 * avoid cases where another mdrun instance might still be writing to
457 void prepareForAppending(const ArrayRef<const gmx_file_position_t> outputFiles, t_fileio* logfio)
461 // Can't check or truncate output files in general
462 // TODO do we do this elsewhere for GMX_FAHCORE?
466 // Handle the log file separately - it comes first in the list
467 // because we have already opened the log file. This ensures that
468 // we retain a lock on the open file that is never lifted after
469 // the checksum is calculated.
470 const gmx_file_position_t& logOutputFile = outputFiles[0];
471 lockLogFile(logfio, logOutputFile.filename);
472 checkOutputFile(logfio, logOutputFile);
474 if (gmx_fio_seek(logfio, logOutputFile.offset) != 0)
476 auto message = formatString("Seek error! Failed to truncate log file: %s.", std::strerror(errno));
477 GMX_THROW(FileIOError(message));
480 // Now handle the remaining outputFiles
481 for (const auto& outputFile : outputFiles.subArray(1, outputFiles.size() - 1))
483 t_fileio* fileToCheck = gmx_fio_open(outputFile.filename, "r+");
484 checkOutputFile(fileToCheck, outputFile);
485 gmx_fio_close(fileToCheck);
487 if (GMX_NATIVE_WINDOWS)
489 // Can't truncate output files on this platform
493 if (gmx_truncate(outputFile.filename, outputFile.offset) != 0)
495 auto message = formatString(
496 "Truncation of file %s failed. Cannot do appending "
497 "because of this failure.",
498 outputFile.filename);
499 GMX_THROW(FileIOError(message));
504 void StartingBehaviorHandler::ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms)
508 // Trivially, the multi-sim behaviors match
512 auto startingBehaviors = gatherIntFromMultiSimulation(ms, static_cast<int>(startingBehavior));
513 bool identicalStartingBehaviors =
514 (std::adjacent_find(std::begin(startingBehaviors), std::end(startingBehaviors),
515 std::not_equal_to<>())
516 == std::end(startingBehaviors));
518 const EnumerationArray<StartingBehavior, std::string> behaviorStrings = {
519 { "restart with appending", "restart without appending", "new simulation" }
521 if (!identicalStartingBehaviors)
523 std::string message = formatString(R"(
524 Multi-simulations must all start in the same way, either a new
525 simulation, a restart with appending, or a restart without appending.
526 However, the contents of the multi-simulation directories you specified
527 were inconsistent with each other. Either remove the checkpoint file
528 from each directory, or ensure each directory has a checkpoint file from
529 the same simulation part (and, if you want to append to output files,
530 ensure the old output files are present and named as they were when the
531 checkpoint file was written).
533 To help you identify which directories need attention, the %d
534 simulations wanted the following respective behaviors:
536 ms->numSimulations_);
537 for (index simIndex = 0; simIndex != ssize(startingBehaviors); ++simIndex)
539 auto behavior = static_cast<StartingBehavior>(startingBehaviors[simIndex]);
540 message += formatString(" Simulation %6zd: %s\n", simIndex,
541 behaviorStrings[behavior].c_str());
543 GMX_THROW(InconsistentInputError(message));
546 if (startingBehavior == StartingBehavior::NewSimulation)
548 // When not restarting, the behaviors are now known to match
552 // Multi-simulation restarts require that each checkpoint
553 // describes the same simulation part. If those don't match, then
554 // the simulation cannot proceed.
555 auto simulationParts = gatherIntFromMultiSimulation(ms, headerContents->simulation_part);
556 bool identicalSimulationParts = (std::adjacent_find(std::begin(simulationParts),
557 std::end(simulationParts), std::not_equal_to<>())
558 == std::end(simulationParts));
560 if (!identicalSimulationParts)
562 std::string message = formatString(R"(
563 Multi-simulations must all start in the same way, either a new
564 simulation, a restart with appending, or a restart without appending.
565 However, the checkpoint files you specified were from different
566 simulation parts. Either remove the checkpoint file from each directory,
567 or ensure each directory has a checkpoint file from the same simulation
568 part (and, if you want to append to output files, ensure the old output
569 files are present and named as they were when the checkpoint file was
572 To help you identify which directories need attention, the %d
573 simulation checkpoint files were from the following respective
576 ms->numSimulations_);
577 for (index partIndex = 0; partIndex != ssize(simulationParts); ++partIndex)
579 message += formatString(" Simulation %6zd: %d\n", partIndex, simulationParts[partIndex]);
581 GMX_THROW(InconsistentInputError(message));
585 std::optional<int> StartingBehaviorHandler::makeIndexOfNextPart(const AppendingBehavior appendingBehavior) const
587 std::optional<int> indexOfNextPart;
589 if (startingBehavior == StartingBehavior::RestartWithoutAppending)
591 indexOfNextPart = headerContents->simulation_part + 1;
593 else if (startingBehavior == StartingBehavior::NewSimulation
594 && appendingBehavior == AppendingBehavior::NoAppending)
599 return indexOfNextPart;
604 std::tuple<StartingBehavior, LogFilePtr> handleRestart(const bool isSimulationMaster,
605 MPI_Comm communicator,
606 const gmx_multisim_t* ms,
607 const AppendingBehavior appendingBehavior,
611 StartingBehaviorHandler handler;
612 LogFilePtr logFileGuard = nullptr;
614 // Make sure all ranks agree on whether the (multi-)simulation can
616 int numErrorsFound = 0;
617 std::exception_ptr exceptionPtr;
619 // Only the master rank of each simulation can do anything with
620 // output files, so it is the only one that needs to consider
621 // whether a restart might take place, and how to implement it.
622 if (isSimulationMaster)
626 handler = chooseStartingBehavior(appendingBehavior, nfile, fnm);
628 handler.ensureMultiSimBehaviorsMatch(ms);
630 // When not appending, prepare a suffix for the part number
631 std::optional<int> indexOfNextPart = handler.makeIndexOfNextPart(appendingBehavior);
633 // If a part suffix is used, change the file names accordingly.
636 std::string suffix = formatString(".part%04d", *indexOfNextPart);
637 add_suffix_to_output_names(fnm, nfile, suffix.c_str());
640 // Open the log file, now that it has the right name
641 logFileGuard = openLogFile(ftp2fn(efLOG, nfile, fnm),
642 handler.startingBehavior == StartingBehavior::RestartWithAppending);
644 // When appending, the other output files need special handling before opening
645 if (handler.startingBehavior == StartingBehavior::RestartWithAppending)
647 prepareForAppending(*handler.outputFiles, logFileGuard.get());
650 catch (const std::exception& /*ex*/)
652 exceptionPtr = std::current_exception();
656 // Since the master rank (perhaps of only one simulation) may have
657 // found an error condition, we now coordinate the behavior across
658 // all ranks. However, only the applicable ranks will throw a
659 // non-default exception.
661 // TODO Evolve some re-usable infrastructure for this, because it
662 // will be needed in many places while setting up simulations.
664 int reducedNumErrorsFound;
665 MPI_Allreduce(&numErrorsFound, &reducedNumErrorsFound, 1, MPI_INT, MPI_SUM, communicator);
666 numErrorsFound = reducedNumErrorsFound;
668 // There is nothing to do with no MPI or thread-MPI, as there is
669 // only one rank at this point.
670 GMX_RELEASE_ASSERT(communicator == MPI_COMM_NULL, "Must have null communicator at this point");
673 // Throw in a globally coordinated way, if needed
674 if (numErrorsFound > 0)
678 std::rethrow_exception(exceptionPtr);
682 GMX_THROW(ParallelConsistencyError("Another MPI rank encountered an exception"));
686 // Ensure all ranks agree on the starting behavior, which is easy
687 // because all simulations in a multi-simulation already agreed on
688 // the starting behavior. There is nothing to do with no
689 // MPI or thread-MPI.
691 MPI_Bcast(&handler.startingBehavior, 1, MPI_INT, 0, communicator);
694 return std::make_tuple(handler.startingBehavior, std::move(logFileGuard));