2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file declares functions for mdrun to call to manage the
38 * details of doing a restart (ie. reading checkpoints, appending
41 * \todo Clean up the error-prone logic here. Add doxygen.
43 * \author Berk Hess <hess@kth.se>
44 * \author Erik Lindahl <erik@kth.se>
45 * \author Mark Abraham <mark.j.abraham@gmail.com>
47 * \ingroup module_mdrunutility
52 #include "handlerestart.h"
60 #if GMX_NATIVE_WINDOWS
62 # include <sys/locking.h>
71 #include "gromacs/commandline/filenm.h"
72 #include "gromacs/fileio/checkpoint.h"
73 #include "gromacs/fileio/gmxfio.h"
74 #include "gromacs/mdrunutility/multisim.h"
75 #include "gromacs/mdtypes/mdrunoptions.h"
76 #include "gromacs/utility/basedefinitions.h"
77 #include "gromacs/utility/enumerationhelpers.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
80 #include "gromacs/utility/path.h"
81 #include "gromacs/utility/smalloc.h"
82 #include "gromacs/utility/stringstream.h"
83 #include "gromacs/utility/textwriter.h"
90 /*! \brief Search for \p fnm_cp in fnm and return true iff found
92 * \todo This could be implemented sanely with a for loop. */
93 gmx_bool exist_output_file(const char* fnm_cp, int nfile, const t_filenm fnm[])
97 /* Check if the output file name stored in the checkpoint file
98 * is one of the output file names of mdrun.
101 while (i < nfile && !(is_output(&fnm[i]) && strcmp(fnm_cp, fnm[i].filenames[0].c_str()) == 0))
106 return (i < nfile && gmx_fexist(fnm_cp));
109 /*! \brief Throw when mdrun -cpi fails because previous output files are missing.
111 * If we get here, the user requested restarting from a checkpoint file, that checkpoint
112 * file was found (so it is not the first part of a new run), but we are still missing
113 * some or all checkpoint files. In this case we issue a fatal error since there are
114 * so many special cases we cannot keep track of, and better safe than sorry. */
115 [[noreturn]] void throwBecauseOfMissingOutputFiles(const char* checkpointFilename,
116 ArrayRef<const gmx_file_position_t> outputfiles,
118 const t_filenm fnm[],
119 size_t numFilesMissing)
121 StringOutputStream stream;
122 TextWriter writer(&stream);
123 writer.writeStringFormatted(
124 "Some output files listed in the checkpoint file %s are not present or not named "
125 "as the output files by the current program:)",
127 auto settings = writer.wrapperSettings();
128 auto oldIndent = settings.indent(), newIndent = 2;
130 writer.writeLine("Expected output files that are present:");
131 settings.setIndent(newIndent);
132 for (const auto& outputfile : outputfiles)
134 if (exist_output_file(outputfile.filename, nfile, fnm))
136 writer.writeLine(outputfile.filename);
139 settings.setIndent(oldIndent);
140 writer.ensureEmptyLine();
142 writer.writeLine("Expected output files that are not present or named differently:");
143 settings.setIndent(newIndent);
144 for (const auto& outputfile : outputfiles)
146 if (!exist_output_file(outputfile.filename, nfile, fnm))
148 writer.writeLine(outputfile.filename);
151 settings.setIndent(oldIndent);
153 writer.writeLineFormatted(
154 R"(To keep your simulation files safe, this simulation will not restart. Either name your
155 output files exactly the same as the previous simulation part (e.g. with -deffnm), or
156 make sure all the output files are present (e.g. run from the same directory as the
157 previous simulation part), or instruct mdrun to write new output files with mdrun -noappend.
158 In the last case, you will not be able to use appending in future for this simulation.)",
161 GMX_THROW(InconsistentInputError(stream.toString()));
164 //! Return a string describing the precision of a build of GROMACS.
165 const char* precisionToString(bool isDoublePrecision)
167 return isDoublePrecision ? "double" : "mixed";
170 /*! \brief Describes how mdrun will (re)start and provides supporting
171 * functionality based on that data. */
172 class StartingBehaviorHandler
175 /*! \brief Throw unless all simulations in a multi-sim restart the same way.
177 * The restart could differ if checkpoint or other output files are
178 * not found in a consistent way across the set of multi-simulations,
179 * or are from different simulation parts.
181 * \param[in] ms Multi-sim handler.
183 * May only be called from the master rank of each simulation.
185 * \throws InconsistentInputError if either simulations restart
186 * differently, or from checkpoints from different simulation parts.
188 void ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms);
189 /*! \brief Return an optional value that describes the index
190 * of the next simulation part when not appending.
192 * \param[in] appendingBehavior Whether this simulation is appending
193 * (relevant only when restarting)
196 std::optional<int> makeIndexOfNextPart(AppendingBehavior appendingBehavior) const;
198 //! Describes how mdrun will (re)start
199 StartingBehavior startingBehavior = StartingBehavior::NewSimulation;
200 //! When restarting from a checkpoint, contains the contents of its header
201 std::optional<CheckpointHeaderContents> headerContents;
202 //! When restarting from a checkpoint, contains the names of expected output files
203 std::optional<std::vector<gmx_file_position_t>> outputFiles;
206 /*! \brief Choose the starting behaviour for this simulation
208 * This routine cannot print tons of data, since it is called before
209 * the log file is opened.
211 * Note that different simulations in a multi-simulation can return
212 * values that depend on whether the respective checkpoint files are
213 * found (and other files found, when appending), and so can differ
214 * between multi-simulations. It is the caller's responsibility to
215 * detect this and react accordingly. */
216 StartingBehaviorHandler chooseStartingBehavior(const AppendingBehavior appendingBehavior,
220 StartingBehaviorHandler handler;
221 if (!opt2bSet("-cpi", nfile, fnm))
223 // No need to tell the user anything
224 handler.startingBehavior = StartingBehavior::NewSimulation;
228 // A -cpi option was provided, do a restart if there is an input checkpoint file available
229 const char* checkpointFilename = opt2fn("-cpi", nfile, fnm);
230 if (!gmx_fexist(checkpointFilename))
232 // This is interpreted as the user intending a new
233 // simulation, so that scripts can call "gmx mdrun -cpi"
234 // for all simulation parts. Thus, appending cannot occur.
235 if (appendingBehavior == AppendingBehavior::Appending)
237 GMX_THROW(InconsistentInputError(
238 "Could not do a restart with appending because the checkpoint file "
239 "was not found. Either supply the name of the right checkpoint file "
240 "or do not use -append"));
242 // No need to tell the user that mdrun -cpi without a file means a new simulation
243 handler.startingBehavior = StartingBehavior::NewSimulation;
247 t_fileio* fp = gmx_fio_open(checkpointFilename, "r");
250 GMX_THROW(FileIOError(
251 formatString("Checkpoint file '%s' was found but could not be opened for "
252 "reading. Check the file permissions.",
253 checkpointFilename)));
256 std::vector<gmx_file_position_t> outputFiles;
257 CheckpointHeaderContents headerContents =
258 read_checkpoint_simulation_part_and_filenames(fp, &outputFiles);
260 GMX_RELEASE_ASSERT(!outputFiles.empty(),
261 "The checkpoint file or its reading is broken, as no output "
262 "file information is stored in it");
263 const char* logFilename = outputFiles[0].filename;
264 GMX_RELEASE_ASSERT(Path::extensionMatches(logFilename, ftp2ext(efLOG)),
265 formatString("The checkpoint file or its reading is broken, the first "
266 "output file '%s' must be a log file with extension '%s'",
271 if (appendingBehavior != AppendingBehavior::NoAppending)
273 // See whether appending can be done.
275 size_t numFilesMissing = std::count_if(
276 std::begin(outputFiles), std::end(outputFiles), [nfile, fnm](const auto& outputFile) {
277 return !exist_output_file(outputFile.filename, nfile, fnm);
279 if (numFilesMissing != 0)
281 // Appending is not possible, because not all previous
282 // output files are present. We don't automatically switch
283 // to numbered output files either, because that prevents
284 // the user from using appending in future. If they want
285 // to restart with missing files, they need to use
287 throwBecauseOfMissingOutputFiles(checkpointFilename, outputFiles, nfile, fnm, numFilesMissing);
290 for (const auto& outputFile : outputFiles)
292 if (outputFile.offset < 0)
294 // Appending of large files is not possible unless
295 // mdrun and the filesystem can do a correct job. We
296 // don't automatically switch to numbered output files
297 // either, because the user can benefit from
298 // understanding that their infrastructure is not very
299 // suitable for running a simulation producing lots of
301 auto message = formatString(
302 "The original mdrun wrote a file called '%s' which "
303 "is larger than 2 GB, but that mdrun or the filesystem "
304 "it ran on (e.g FAT32) did not support such large files. "
305 "This simulation cannot be restarted with appending. It will "
306 "be easier for you to use mdrun on a 64-bit filesystem, but "
307 "if you choose not to, then you must run mdrun with "
308 "-noappend once your output gets large enough.",
309 outputFile.filename);
310 GMX_THROW(InconsistentInputError(message));
314 const char* logFilename = outputFiles[0].filename;
315 // If the precision does not match, we cannot continue with
316 // appending, and will switch to not appending unless
317 // instructed otherwise.
318 if (headerContents.file_version >= 13 && headerContents.double_prec != GMX_DOUBLE)
320 if (appendingBehavior == AppendingBehavior::Appending)
322 GMX_THROW(InconsistentInputError(formatString(
323 "Cannot restart with appending because the previous simulation part used "
324 "%s precision which does not match the %s precision used by this build "
325 "of GROMACS. Either use matching precision or use mdrun -noappend.",
326 precisionToString(headerContents.double_prec),
327 precisionToString(GMX_DOUBLE))));
330 // If the previous log filename had a part number, then we
331 // cannot continue with appending, and will continue without
333 else if (hasSuffixFromNoAppend(logFilename))
335 if (appendingBehavior == AppendingBehavior::Appending)
337 GMX_THROW(InconsistentInputError(
338 "Cannot restart with appending because the previous simulation "
339 "part did not use appending. Either do not use mdrun -append, or "
340 "provide the correct checkpoint file."));
345 // Everything is perfect - we can do an appending restart.
346 handler = { StartingBehavior::RestartWithAppending, headerContents, outputFiles };
350 // No need to tell the user anything because the previous
351 // simulation part also didn't append and that can only happen
352 // when they ask for it.
355 GMX_RELEASE_ASSERT(appendingBehavior != AppendingBehavior::Appending,
356 "Logic error in appending");
357 handler = { StartingBehavior::RestartWithoutAppending, headerContents, outputFiles };
361 //! Check whether chksum_file output file has a checksum that matches \c outputfile from the checkpoint.
362 void checkOutputFile(t_fileio* fileToCheck, const gmx_file_position_t& outputfile)
364 /* compute md5 chksum */
365 std::array<unsigned char, 16> digest;
366 if (outputfile.checksumSize != -1)
368 if (gmx_fio_get_file_md5(fileToCheck, outputfile.offset, &digest)
369 != outputfile.checksumSize) /*at the end of the call the file position is at the end of the file*/
371 auto message = formatString(
372 "Can't read %d bytes of '%s' to compute checksum. The file "
373 "has been replaced or its contents have been modified. Cannot "
374 "do appending because of this condition.",
375 outputfile.checksumSize,
376 outputfile.filename);
377 GMX_THROW(InconsistentInputError(message));
381 /* compare md5 chksum */
382 if (outputfile.checksumSize != -1 && digest != outputfile.checksum)
386 fprintf(debug, "chksum for %s: ", outputfile.filename);
387 for (int j = 0; j < 16; j++)
389 fprintf(debug, "%02x", digest[j]);
391 fprintf(debug, "\n");
393 auto message = formatString(
394 "Checksum wrong for '%s'. The file has been replaced "
395 "or its contents have been modified. Cannot do appending "
396 "because of this condition.",
397 outputfile.filename);
398 GMX_THROW(InconsistentInputError(message));
402 /*! \brief If supported, obtain a write lock on the log file.
404 * This wil prevent e.g. other mdrun instances from changing it while
405 * we attempt to restart with appending. */
406 void lockLogFile(t_fileio* logfio, const char* logFilename)
408 /* Note that there are systems where the lock operation
409 * will succeed, but a second process can also lock the file.
410 * We should probably try to detect this.
412 #if defined __native_client__
415 #elif GMX_NATIVE_WINDOWS
416 if (_locking(fileno(gmx_fio_getfp(logfio)), _LK_NBLCK, LONG_MAX) == -1)
418 // don't initialize here: the struct order is OS dependent!
421 fl.l_whence = SEEK_SET;
426 if (fcntl(fileno(gmx_fio_getfp(logfio)), F_SETLK, &fl) == -1)
431 std::string message =
432 "File locking is not supported on this system. "
433 "Use mdrun -noappend to restart.";
434 GMX_THROW(FileIOError(message));
436 else if (errno == EACCES || errno == EAGAIN)
438 auto message = formatString(
439 "Failed to lock: %s. Already running "
442 GMX_THROW(FileIOError(message));
446 auto message = formatString("Failed to lock: %s. %s.", logFilename, std::strerror(errno));
447 GMX_THROW(FileIOError(message));
452 /*! \brief Prepare to append to output files.
454 * We use the file pointer positions of the output files stored in the
455 * checkpoint file and truncate the files such that any frames written
456 * after the checkpoint time are removed. All files are md5sum
457 * checked such that we can be sure that we do not truncate other
458 * (maybe important) files. The log file is locked so that we can
459 * avoid cases where another mdrun instance might still be writing to
461 void prepareForAppending(const ArrayRef<const gmx_file_position_t> outputFiles, t_fileio* logfio)
465 // Can't check or truncate output files in general
466 // TODO do we do this elsewhere for GMX_FAHCORE?
470 // Handle the log file separately - it comes first in the list
471 // because we have already opened the log file. This ensures that
472 // we retain a lock on the open file that is never lifted after
473 // the checksum is calculated.
474 const gmx_file_position_t& logOutputFile = outputFiles[0];
475 lockLogFile(logfio, logOutputFile.filename);
476 checkOutputFile(logfio, logOutputFile);
478 if (gmx_fio_seek(logfio, logOutputFile.offset) != 0)
480 auto message = formatString("Seek error! Failed to truncate log file: %s.", std::strerror(errno));
481 GMX_THROW(FileIOError(message));
484 // Now handle the remaining outputFiles
485 for (const auto& outputFile : outputFiles.subArray(1, outputFiles.size() - 1))
487 t_fileio* fileToCheck = gmx_fio_open(outputFile.filename, "r+");
488 checkOutputFile(fileToCheck, outputFile);
489 gmx_fio_close(fileToCheck);
491 if (GMX_NATIVE_WINDOWS)
493 // Can't truncate output files on this platform
497 if (gmx_truncate(outputFile.filename, outputFile.offset) != 0)
499 auto message = formatString(
500 "Truncation of file %s failed. Cannot do appending "
501 "because of this failure.",
502 outputFile.filename);
503 GMX_THROW(FileIOError(message));
508 void StartingBehaviorHandler::ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms)
512 // Trivially, the multi-sim behaviors match
516 auto startingBehaviors = gatherIntFromMultiSimulation(ms, static_cast<int>(startingBehavior));
517 bool identicalStartingBehaviors =
519 std::begin(startingBehaviors), std::end(startingBehaviors), std::not_equal_to<>())
520 == std::end(startingBehaviors));
522 const EnumerationArray<StartingBehavior, std::string> behaviorStrings = {
523 { "restart with appending", "restart without appending", "new simulation" }
525 if (!identicalStartingBehaviors)
527 std::string message = formatString(R"(
528 Multi-simulations must all start in the same way, either a new
529 simulation, a restart with appending, or a restart without appending.
530 However, the contents of the multi-simulation directories you specified
531 were inconsistent with each other. Either remove the checkpoint file
532 from each directory, or ensure each directory has a checkpoint file from
533 the same simulation part (and, if you want to append to output files,
534 ensure the old output files are present and named as they were when the
535 checkpoint file was written).
537 To help you identify which directories need attention, the %d
538 simulations wanted the following respective behaviors:
540 ms->numSimulations_);
541 for (index simIndex = 0; simIndex != ssize(startingBehaviors); ++simIndex)
543 auto behavior = static_cast<StartingBehavior>(startingBehaviors[simIndex]);
544 message += formatString(
545 " Simulation %6zd: %s\n", simIndex, behaviorStrings[behavior].c_str());
547 GMX_THROW(InconsistentInputError(message));
550 if (startingBehavior == StartingBehavior::NewSimulation)
552 // When not restarting, the behaviors are now known to match
556 // Multi-simulation restarts require that each checkpoint
557 // describes the same simulation part. If those don't match, then
558 // the simulation cannot proceed.
559 auto simulationParts = gatherIntFromMultiSimulation(ms, headerContents->simulation_part);
560 bool identicalSimulationParts =
562 std::begin(simulationParts), std::end(simulationParts), std::not_equal_to<>())
563 == std::end(simulationParts));
565 if (!identicalSimulationParts)
567 std::string message = formatString(R"(
568 Multi-simulations must all start in the same way, either a new
569 simulation, a restart with appending, or a restart without appending.
570 However, the checkpoint files you specified were from different
571 simulation parts. Either remove the checkpoint file from each directory,
572 or ensure each directory has a checkpoint file from the same simulation
573 part (and, if you want to append to output files, ensure the old output
574 files are present and named as they were when the checkpoint file was
577 To help you identify which directories need attention, the %d
578 simulation checkpoint files were from the following respective
581 ms->numSimulations_);
582 for (index partIndex = 0; partIndex != ssize(simulationParts); ++partIndex)
584 message += formatString(" Simulation %6zd: %d\n", partIndex, simulationParts[partIndex]);
586 GMX_THROW(InconsistentInputError(message));
590 std::optional<int> StartingBehaviorHandler::makeIndexOfNextPart(const AppendingBehavior appendingBehavior) const
592 std::optional<int> indexOfNextPart;
594 if (startingBehavior == StartingBehavior::RestartWithoutAppending)
596 indexOfNextPart = headerContents->simulation_part + 1;
598 else if (startingBehavior == StartingBehavior::NewSimulation
599 && appendingBehavior == AppendingBehavior::NoAppending)
604 return indexOfNextPart;
609 std::tuple<StartingBehavior, LogFilePtr> handleRestart(const bool isSimulationMaster,
610 MPI_Comm communicator,
611 const gmx_multisim_t* ms,
612 const AppendingBehavior appendingBehavior,
616 StartingBehaviorHandler handler;
617 LogFilePtr logFileGuard = nullptr;
619 // Make sure all ranks agree on whether the (multi-)simulation can
621 int numErrorsFound = 0;
622 std::exception_ptr exceptionPtr;
624 // Only the master rank of each simulation can do anything with
625 // output files, so it is the only one that needs to consider
626 // whether a restart might take place, and how to implement it.
627 if (isSimulationMaster)
631 handler = chooseStartingBehavior(appendingBehavior, nfile, fnm);
633 handler.ensureMultiSimBehaviorsMatch(ms);
635 // When not appending, prepare a suffix for the part number
636 std::optional<int> indexOfNextPart = handler.makeIndexOfNextPart(appendingBehavior);
638 // If a part suffix is used, change the file names accordingly.
641 std::string suffix = formatString(".part%04d", *indexOfNextPart);
642 add_suffix_to_output_names(fnm, nfile, suffix.c_str());
645 // Open the log file, now that it has the right name
646 logFileGuard = openLogFile(ftp2fn(efLOG, nfile, fnm),
647 handler.startingBehavior == StartingBehavior::RestartWithAppending);
649 // When appending, the other output files need special handling before opening
650 if (handler.startingBehavior == StartingBehavior::RestartWithAppending)
652 prepareForAppending(*handler.outputFiles, logFileGuard.get());
655 catch (const std::exception& /*ex*/)
657 exceptionPtr = std::current_exception();
661 // Since the master rank (perhaps of only one simulation) may have
662 // found an error condition, we now coordinate the behavior across
663 // all ranks. However, only the applicable ranks will throw a
664 // non-default exception.
666 // TODO Evolve some re-usable infrastructure for this, because it
667 // will be needed in many places while setting up simulations.
669 int reducedNumErrorsFound;
670 MPI_Allreduce(&numErrorsFound, &reducedNumErrorsFound, 1, MPI_INT, MPI_SUM, communicator);
671 numErrorsFound = reducedNumErrorsFound;
673 // There is nothing to do with no MPI or thread-MPI, as there is
674 // only one rank at this point.
675 GMX_RELEASE_ASSERT(communicator == MPI_COMM_NULL, "Must have null communicator at this point");
678 // Throw in a globally coordinated way, if needed
679 if (numErrorsFound > 0)
683 std::rethrow_exception(exceptionPtr);
687 GMX_THROW(ParallelConsistencyError("Another MPI rank encountered an exception"));
691 // Ensure all ranks agree on the starting behavior, which is easy
692 // because all simulations in a multi-simulation already agreed on
693 // the starting behavior. There is nothing to do with no
694 // MPI or thread-MPI.
696 MPI_Bcast(&handler.startingBehavior, 1, MPI_INT, 0, communicator);
699 return std::make_tuple(handler.startingBehavior, std::move(logFileGuard));