2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file declares functions for mdrun to call to manage the
38 * details of doing a restart (ie. reading checkpoints, appending
41 * \todo Clean up the error-prone logic here. Add doxygen.
43 * \author Berk Hess <hess@kth.se>
44 * \author Erik Lindahl <erik@kth.se>
45 * \author Mark Abraham <mark.j.abraham@gmail.com>
47 * \ingroup module_mdrunutility
52 #include "handlerestart.h"
60 #if GMX_NATIVE_WINDOWS
62 # include <sys/locking.h>
71 #include "gromacs/commandline/filenm.h"
72 #include "gromacs/fileio/checkpoint.h"
73 #include "gromacs/fileio/gmxfio.h"
74 #include "gromacs/mdrunutility/multisim.h"
75 #include "gromacs/mdtypes/mdrunoptions.h"
76 #include "gromacs/utility/basedefinitions.h"
77 #include "gromacs/utility/enumerationhelpers.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
80 #include "gromacs/utility/path.h"
81 #include "gromacs/utility/smalloc.h"
82 #include "gromacs/utility/stringstream.h"
83 #include "gromacs/utility/textwriter.h"
90 /*! \brief Search for \p fnm_cp in fnm and return true iff found
92 * \todo This could be implemented sanely with a for loop. */
93 gmx_bool exist_output_file(const char* fnm_cp, int nfile, const t_filenm fnm[])
97 /* Check if the output file name stored in the checkpoint file
98 * is one of the output file names of mdrun.
101 while (i < nfile && !(is_output(&fnm[i]) && strcmp(fnm_cp, fnm[i].filenames[0].c_str()) == 0))
106 return (i < nfile && gmx_fexist(fnm_cp));
109 /*! \brief Throw when mdrun -cpi fails because previous output files are missing.
111 * If we get here, the user requested restarting from a checkpoint file, that checkpoint
112 * file was found (so it is not the first part of a new run), but we are still missing
113 * some or all checkpoint files. In this case we issue a fatal error since there are
114 * so many special cases we cannot keep track of, and better safe than sorry. */
115 [[noreturn]] void throwBecauseOfMissingOutputFiles(const char* checkpointFilename,
116 ArrayRef<const gmx_file_position_t> outputfiles,
118 const t_filenm fnm[],
119 size_t numFilesMissing)
121 StringOutputStream stream;
122 TextWriter writer(&stream);
123 writer.writeLineFormatted(
124 "Some output files listed in the checkpoint file %s are not present or not named "
125 "as the output files by the current program:)",
127 auto& settings = writer.wrapperSettings();
128 auto oldIndent = settings.indent(), newIndent = 2;
130 writer.writeLine("Expected output files that are present:");
131 settings.setIndent(newIndent);
132 settings.setLineLength(78);
133 for (const auto& outputfile : outputfiles)
135 if (exist_output_file(outputfile.filename, nfile, fnm))
137 writer.writeLine(outputfile.filename);
140 settings.setIndent(oldIndent);
141 writer.ensureEmptyLine();
143 // The implementation of -deffnm does not handle properly the
144 // naming of output files that share a common suffix, such as
145 // pullx.xvg and pullf.xvg from the pull module. Such output files
146 // will be sought by the wrong name by the code that handles the
147 // restart, even though the pull module would later work out what
148 // they should have been called. Since there is a straightforward
149 // way to work around that, we help the user with that. This can
150 // be removed when gitlab issue #3875 is resolved.
151 bool missingFilesIncludedPullOutputFiles = false;
152 writer.writeLine("Expected output files that are not present or named differently:");
153 settings.setIndent(newIndent);
154 for (const auto& outputfile : outputfiles)
156 if (!exist_output_file(outputfile.filename, nfile, fnm))
158 writer.writeLine(outputfile.filename);
159 // If this was a pull file, then we have a known issue and
160 // work-around (See gitlab issue #3442).
161 if (!missingFilesIncludedPullOutputFiles
162 && (contains(outputfile.filename, "pullx")
163 || contains(outputfile.filename, "pullf")))
165 missingFilesIncludedPullOutputFiles = true;
169 if (missingFilesIncludedPullOutputFiles)
171 writer.ensureEmptyLine();
172 writer.writeLineFormatted(
173 "It appears that pull output files were not found. It is known that "
174 "using gmx mdrun -deffnm test with pulling and later "
175 "gmx mdrun -deffnm test -cpi will fail to consider the changed default "
176 "filename when checking the pull output files for restarting with "
177 "appending. You may be able to work around this by using a command like "
178 "gmx mdrun -deffnm test -px test_pullx -pf test_pullf -cpi.");
180 settings.setIndent(oldIndent);
182 writer.ensureEmptyLine();
183 writer.writeLineFormatted(
184 "To keep your simulation files safe, this simulation will not restart. "
185 "Either name your output files exactly the same as the previous simulation "
186 "part (e.g. with -deffnm or explicit naming), or make sure all the output "
187 "files are present (e.g. run from the same directory as the previous simulation "
188 "part), or instruct mdrun to write new output files with mdrun -noappend. In "
189 "the last case, you will not be able to use appending in future for this "
191 numFilesMissing, outputfiles.size());
192 GMX_THROW(InconsistentInputError(stream.toString()));
195 //! Return a string describing the precision of a build of GROMACS.
196 const char* precisionToString(bool isDoublePrecision)
198 return isDoublePrecision ? "double" : "mixed";
201 /*! \brief Describes how mdrun will (re)start and provides supporting
202 * functionality based on that data. */
203 class StartingBehaviorHandler
206 /*! \brief Throw unless all simulations in a multi-sim restart the same way.
208 * The restart could differ if checkpoint or other output files are
209 * not found in a consistent way across the set of multi-simulations,
210 * or are from different simulation parts.
212 * \param[in] ms Multi-sim handler.
214 * May only be called from the master rank of each simulation.
216 * \throws InconsistentInputError if either simulations restart
217 * differently, or from checkpoints from different simulation parts.
219 void ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms);
220 /*! \brief Return an optional value that describes the index
221 * of the next simulation part when not appending.
223 * \param[in] appendingBehavior Whether this simulation is appending
224 * (relevant only when restarting)
227 std::optional<int> makeIndexOfNextPart(AppendingBehavior appendingBehavior) const;
229 //! Describes how mdrun will (re)start
230 StartingBehavior startingBehavior = StartingBehavior::NewSimulation;
231 //! When restarting from a checkpoint, contains the contents of its header
232 std::optional<CheckpointHeaderContents> headerContents;
233 //! When restarting from a checkpoint, contains the names of expected output files
234 std::optional<std::vector<gmx_file_position_t>> outputFiles;
237 /*! \brief Choose the starting behaviour for this simulation
239 * This routine cannot print tons of data, since it is called before
240 * the log file is opened.
242 * Note that different simulations in a multi-simulation can return
243 * values that depend on whether the respective checkpoint files are
244 * found (and other files found, when appending), and so can differ
245 * between multi-simulations. It is the caller's responsibility to
246 * detect this and react accordingly. */
247 StartingBehaviorHandler chooseStartingBehavior(const AppendingBehavior appendingBehavior,
251 StartingBehaviorHandler handler;
252 if (!opt2bSet("-cpi", nfile, fnm))
254 // No need to tell the user anything
255 handler.startingBehavior = StartingBehavior::NewSimulation;
259 // A -cpi option was provided, do a restart if there is an input checkpoint file available
260 const char* checkpointFilename = opt2fn("-cpi", nfile, fnm);
261 if (!gmx_fexist(checkpointFilename))
263 // This is interpreted as the user intending a new
264 // simulation, so that scripts can call "gmx mdrun -cpi"
265 // for all simulation parts. Thus, appending cannot occur.
266 if (appendingBehavior == AppendingBehavior::Appending)
268 GMX_THROW(InconsistentInputError(
269 "Could not do a restart with appending because the checkpoint file "
270 "was not found. Either supply the name of the right checkpoint file "
271 "or do not use -append"));
273 // No need to tell the user that mdrun -cpi without a file means a new simulation
274 handler.startingBehavior = StartingBehavior::NewSimulation;
278 t_fileio* fp = gmx_fio_open(checkpointFilename, "r");
281 GMX_THROW(FileIOError(
282 formatString("Checkpoint file '%s' was found but could not be opened for "
283 "reading. Check the file permissions.",
284 checkpointFilename)));
287 std::vector<gmx_file_position_t> outputFiles;
288 CheckpointHeaderContents headerContents =
289 read_checkpoint_simulation_part_and_filenames(fp, &outputFiles);
291 GMX_RELEASE_ASSERT(!outputFiles.empty(),
292 "The checkpoint file or its reading is broken, as no output "
293 "file information is stored in it");
294 const char* logFilename = outputFiles[0].filename;
295 GMX_RELEASE_ASSERT(Path::extensionMatches(logFilename, ftp2ext(efLOG)),
296 formatString("The checkpoint file or its reading is broken, the first "
297 "output file '%s' must be a log file with extension '%s'",
298 logFilename, ftp2ext(efLOG))
301 if (appendingBehavior != AppendingBehavior::NoAppending)
303 // See whether appending can be done.
305 size_t numFilesMissing = std::count_if(
306 std::begin(outputFiles), std::end(outputFiles), [nfile, fnm](const auto& outputFile) {
307 return !exist_output_file(outputFile.filename, nfile, fnm);
309 if (numFilesMissing != 0)
311 // Appending is not possible, because not all previous
312 // output files are present. We don't automatically switch
313 // to numbered output files either, because that prevents
314 // the user from using appending in future. If they want
315 // to restart with missing files, they need to use
317 throwBecauseOfMissingOutputFiles(checkpointFilename, outputFiles, nfile, fnm, numFilesMissing);
320 for (const auto& outputFile : outputFiles)
322 if (outputFile.offset < 0)
324 // Appending of large files is not possible unless
325 // mdrun and the filesystem can do a correct job. We
326 // don't automatically switch to numbered output files
327 // either, because the user can benefit from
328 // understanding that their infrastructure is not very
329 // suitable for running a simulation producing lots of
331 auto message = formatString(
332 "The original mdrun wrote a file called '%s' which "
333 "is larger than 2 GB, but that mdrun or the filesystem "
334 "it ran on (e.g FAT32) did not support such large files. "
335 "This simulation cannot be restarted with appending. It will "
336 "be easier for you to use mdrun on a 64-bit filesystem, but "
337 "if you choose not to, then you must run mdrun with "
338 "-noappend once your output gets large enough.",
339 outputFile.filename);
340 GMX_THROW(InconsistentInputError(message));
344 const char* logFilename = outputFiles[0].filename;
345 // If the precision does not match, we cannot continue with
346 // appending, and will switch to not appending unless
347 // instructed otherwise.
348 if (headerContents.file_version >= 13 && headerContents.double_prec != GMX_DOUBLE)
350 if (appendingBehavior == AppendingBehavior::Appending)
352 GMX_THROW(InconsistentInputError(formatString(
353 "Cannot restart with appending because the previous simulation part used "
354 "%s precision which does not match the %s precision used by this build "
355 "of GROMACS. Either use matching precision or use mdrun -noappend.",
356 precisionToString(headerContents.double_prec), precisionToString(GMX_DOUBLE))));
359 // If the previous log filename had a part number, then we
360 // cannot continue with appending, and will continue without
362 else if (hasSuffixFromNoAppend(logFilename))
364 if (appendingBehavior == AppendingBehavior::Appending)
366 GMX_THROW(InconsistentInputError(
367 "Cannot restart with appending because the previous simulation "
368 "part did not use appending. Either do not use mdrun -append, or "
369 "provide the correct checkpoint file."));
374 // Everything is perfect - we can do an appending restart.
375 handler = { StartingBehavior::RestartWithAppending, headerContents, outputFiles };
379 // No need to tell the user anything because the previous
380 // simulation part also didn't append and that can only happen
381 // when they ask for it.
384 GMX_RELEASE_ASSERT(appendingBehavior != AppendingBehavior::Appending,
385 "Logic error in appending");
386 handler = { StartingBehavior::RestartWithoutAppending, headerContents, outputFiles };
390 //! Check whether chksum_file output file has a checksum that matches \c outputfile from the checkpoint.
391 void checkOutputFile(t_fileio* fileToCheck, const gmx_file_position_t& outputfile)
393 /* compute md5 chksum */
394 std::array<unsigned char, 16> digest;
395 if (outputfile.checksumSize != -1)
397 if (gmx_fio_get_file_md5(fileToCheck, outputfile.offset, &digest)
398 != outputfile.checksumSize) /*at the end of the call the file position is at the end of the file*/
400 auto message = formatString(
401 "Can't read %d bytes of '%s' to compute checksum. The file "
402 "has been replaced or its contents have been modified. Cannot "
403 "do appending because of this condition.",
404 outputfile.checksumSize, outputfile.filename);
405 GMX_THROW(InconsistentInputError(message));
409 /* compare md5 chksum */
410 if (outputfile.checksumSize != -1 && digest != outputfile.checksum)
414 fprintf(debug, "chksum for %s: ", outputfile.filename);
415 for (int j = 0; j < 16; j++)
417 fprintf(debug, "%02x", digest[j]);
419 fprintf(debug, "\n");
421 auto message = formatString(
422 "Checksum wrong for '%s'. The file has been replaced "
423 "or its contents have been modified. Cannot do appending "
424 "because of this condition.",
425 outputfile.filename);
426 GMX_THROW(InconsistentInputError(message));
430 /*! \brief If supported, obtain a write lock on the log file.
432 * This wil prevent e.g. other mdrun instances from changing it while
433 * we attempt to restart with appending. */
434 void lockLogFile(t_fileio* logfio, const char* logFilename)
436 /* Note that there are systems where the lock operation
437 * will succeed, but a second process can also lock the file.
438 * We should probably try to detect this.
440 #if defined __native_client__
443 #elif GMX_NATIVE_WINDOWS
444 if (_locking(fileno(gmx_fio_getfp(logfio)), _LK_NBLCK, LONG_MAX) == -1)
446 // don't initialize here: the struct order is OS dependent!
449 fl.l_whence = SEEK_SET;
454 if (fcntl(fileno(gmx_fio_getfp(logfio)), F_SETLK, &fl) == -1)
459 std::string message =
460 "File locking is not supported on this system. "
461 "Use mdrun -noappend to restart.";
462 GMX_THROW(FileIOError(message));
464 else if (errno == EACCES || errno == EAGAIN)
466 auto message = formatString(
467 "Failed to lock: %s. Already running "
470 GMX_THROW(FileIOError(message));
474 auto message = formatString("Failed to lock: %s. %s.", logFilename, std::strerror(errno));
475 GMX_THROW(FileIOError(message));
480 /*! \brief Prepare to append to output files.
482 * We use the file pointer positions of the output files stored in the
483 * checkpoint file and truncate the files such that any frames written
484 * after the checkpoint time are removed. All files are md5sum
485 * checked such that we can be sure that we do not truncate other
486 * (maybe important) files. The log file is locked so that we can
487 * avoid cases where another mdrun instance might still be writing to
489 void prepareForAppending(const ArrayRef<const gmx_file_position_t> outputFiles, t_fileio* logfio)
493 // Can't check or truncate output files in general
494 // TODO do we do this elsewhere for GMX_FAHCORE?
498 // Handle the log file separately - it comes first in the list
499 // because we have already opened the log file. This ensures that
500 // we retain a lock on the open file that is never lifted after
501 // the checksum is calculated.
502 const gmx_file_position_t& logOutputFile = outputFiles[0];
503 lockLogFile(logfio, logOutputFile.filename);
504 checkOutputFile(logfio, logOutputFile);
506 if (gmx_fio_seek(logfio, logOutputFile.offset) != 0)
508 auto message = formatString("Seek error! Failed to truncate log file: %s.", std::strerror(errno));
509 GMX_THROW(FileIOError(message));
512 // Now handle the remaining outputFiles
513 for (const auto& outputFile : outputFiles.subArray(1, outputFiles.size() - 1))
515 t_fileio* fileToCheck = gmx_fio_open(outputFile.filename, "r+");
516 checkOutputFile(fileToCheck, outputFile);
517 gmx_fio_close(fileToCheck);
519 if (GMX_NATIVE_WINDOWS)
521 // Can't truncate output files on this platform
525 if (gmx_truncate(outputFile.filename, outputFile.offset) != 0)
527 auto message = formatString(
528 "Truncation of file %s failed. Cannot do appending "
529 "because of this failure.",
530 outputFile.filename);
531 GMX_THROW(FileIOError(message));
536 void StartingBehaviorHandler::ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms)
540 // Trivially, the multi-sim behaviors match
544 auto startingBehaviors = gatherIntFromMultiSimulation(ms, static_cast<int>(startingBehavior));
545 bool identicalStartingBehaviors =
546 (std::adjacent_find(std::begin(startingBehaviors), std::end(startingBehaviors),
547 std::not_equal_to<>())
548 == std::end(startingBehaviors));
550 const EnumerationArray<StartingBehavior, std::string> behaviorStrings = {
551 { "restart with appending", "restart without appending", "new simulation" }
553 if (!identicalStartingBehaviors)
555 std::string message = formatString(R"(
556 Multi-simulations must all start in the same way, either a new
557 simulation, a restart with appending, or a restart without appending.
558 However, the contents of the multi-simulation directories you specified
559 were inconsistent with each other. Either remove the checkpoint file
560 from each directory, or ensure each directory has a checkpoint file from
561 the same simulation part (and, if you want to append to output files,
562 ensure the old output files are present and named as they were when the
563 checkpoint file was written).
565 To help you identify which directories need attention, the %d
566 simulations wanted the following respective behaviors:
568 ms->numSimulations_);
569 for (index simIndex = 0; simIndex != ssize(startingBehaviors); ++simIndex)
571 auto behavior = static_cast<StartingBehavior>(startingBehaviors[simIndex]);
572 message += formatString(" Simulation %6zd: %s\n", simIndex,
573 behaviorStrings[behavior].c_str());
575 GMX_THROW(InconsistentInputError(message));
578 if (startingBehavior == StartingBehavior::NewSimulation)
580 // When not restarting, the behaviors are now known to match
584 // Multi-simulation restarts require that each checkpoint
585 // describes the same simulation part. If those don't match, then
586 // the simulation cannot proceed.
587 auto simulationParts = gatherIntFromMultiSimulation(ms, headerContents->simulation_part);
588 bool identicalSimulationParts = (std::adjacent_find(std::begin(simulationParts),
589 std::end(simulationParts), std::not_equal_to<>())
590 == std::end(simulationParts));
592 if (!identicalSimulationParts)
594 std::string message = formatString(R"(
595 Multi-simulations must all start in the same way, either a new
596 simulation, a restart with appending, or a restart without appending.
597 However, the checkpoint files you specified were from different
598 simulation parts. Either remove the checkpoint file from each directory,
599 or ensure each directory has a checkpoint file from the same simulation
600 part (and, if you want to append to output files, ensure the old output
601 files are present and named as they were when the checkpoint file was
604 To help you identify which directories need attention, the %d
605 simulation checkpoint files were from the following respective
608 ms->numSimulations_);
609 for (index partIndex = 0; partIndex != ssize(simulationParts); ++partIndex)
611 message += formatString(" Simulation %6zd: %d\n", partIndex, simulationParts[partIndex]);
613 GMX_THROW(InconsistentInputError(message));
617 std::optional<int> StartingBehaviorHandler::makeIndexOfNextPart(const AppendingBehavior appendingBehavior) const
619 std::optional<int> indexOfNextPart;
621 if (startingBehavior == StartingBehavior::RestartWithoutAppending)
623 indexOfNextPart = headerContents->simulation_part + 1;
625 else if (startingBehavior == StartingBehavior::NewSimulation
626 && appendingBehavior == AppendingBehavior::NoAppending)
631 return indexOfNextPart;
636 std::tuple<StartingBehavior, LogFilePtr> handleRestart(const bool isSimulationMaster,
637 MPI_Comm communicator,
638 const gmx_multisim_t* ms,
639 const AppendingBehavior appendingBehavior,
643 StartingBehaviorHandler handler;
644 LogFilePtr logFileGuard = nullptr;
646 // Make sure all ranks agree on whether the (multi-)simulation can
648 int numErrorsFound = 0;
649 std::exception_ptr exceptionPtr;
651 // Only the master rank of each simulation can do anything with
652 // output files, so it is the only one that needs to consider
653 // whether a restart might take place, and how to implement it.
654 if (isSimulationMaster)
658 handler = chooseStartingBehavior(appendingBehavior, nfile, fnm);
660 handler.ensureMultiSimBehaviorsMatch(ms);
662 // When not appending, prepare a suffix for the part number
663 std::optional<int> indexOfNextPart = handler.makeIndexOfNextPart(appendingBehavior);
665 // If a part suffix is used, change the file names accordingly.
668 std::string suffix = formatString(".part%04d", *indexOfNextPart);
669 add_suffix_to_output_names(fnm, nfile, suffix.c_str());
672 // Open the log file, now that it has the right name
673 logFileGuard = openLogFile(ftp2fn(efLOG, nfile, fnm),
674 handler.startingBehavior == StartingBehavior::RestartWithAppending);
676 // When appending, the other output files need special handling before opening
677 if (handler.startingBehavior == StartingBehavior::RestartWithAppending)
679 prepareForAppending(*handler.outputFiles, logFileGuard.get());
682 catch (const std::exception& /*ex*/)
684 exceptionPtr = std::current_exception();
688 // Since the master rank (perhaps of only one simulation) may have
689 // found an error condition, we now coordinate the behavior across
690 // all ranks. However, only the applicable ranks will throw a
691 // non-default exception.
693 // TODO Evolve some re-usable infrastructure for this, because it
694 // will be needed in many places while setting up simulations.
696 int reducedNumErrorsFound;
697 MPI_Allreduce(&numErrorsFound, &reducedNumErrorsFound, 1, MPI_INT, MPI_SUM, communicator);
698 numErrorsFound = reducedNumErrorsFound;
700 // There is nothing to do with no MPI or thread-MPI, as there is
701 // only one rank at this point.
702 GMX_RELEASE_ASSERT(communicator == MPI_COMM_NULL, "Must have null communicator at this point");
705 // Throw in a globally coordinated way, if needed
706 if (numErrorsFound > 0)
710 std::rethrow_exception(exceptionPtr);
714 GMX_THROW(ParallelConsistencyError("Another MPI rank encountered an exception"));
718 // Ensure all ranks agree on the starting behavior, which is easy
719 // because all simulations in a multi-simulation already agreed on
720 // the starting behavior. There is nothing to do with no
721 // MPI or thread-MPI.
723 MPI_Bcast(&handler.startingBehavior, 1, MPI_INT, 0, communicator);
726 return std::make_tuple(handler.startingBehavior, std::move(logFileGuard));