2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file declares functions for mdrun to call to manage the
38 * details of doing a restart (ie. reading checkpoints, appending
41 * \todo Clean up the error-prone logic here. Add doxygen.
43 * \author Berk Hess <hess@kth.se>
44 * \author Erik Lindahl <erik@kth.se>
45 * \author Mark Abraham <mark.j.abraham@gmail.com>
47 * \ingroup module_mdrunutility
52 #include "handlerestart.h"
59 #if GMX_NATIVE_WINDOWS
61 # include <sys/locking.h>
69 #include "gromacs/commandline/filenm.h"
70 #include "gromacs/compat/optional.h"
71 #include "gromacs/fileio/checkpoint.h"
72 #include "gromacs/fileio/gmxfio.h"
73 #include "gromacs/mdrunutility/multisim.h"
74 #include "gromacs/mdtypes/mdrunoptions.h"
75 #include "gromacs/utility/basedefinitions.h"
76 #include "gromacs/utility/enumerationhelpers.h"
77 #include "gromacs/utility/exceptions.h"
78 #include "gromacs/utility/fatalerror.h"
79 #include "gromacs/utility/path.h"
80 #include "gromacs/utility/smalloc.h"
81 #include "gromacs/utility/stringstream.h"
82 #include "gromacs/utility/textwriter.h"
89 /*! \brief Search for \p fnm_cp in fnm and return true iff found
91 * \todo This could be implemented sanely with a for loop. */
92 gmx_bool exist_output_file(const char* fnm_cp, int nfile, const t_filenm fnm[])
96 /* Check if the output file name stored in the checkpoint file
97 * is one of the output file names of mdrun.
100 while (i < nfile && !(is_output(&fnm[i]) && strcmp(fnm_cp, fnm[i].filenames[0].c_str()) == 0))
105 return (i < nfile && gmx_fexist(fnm_cp));
108 /*! \brief Throw when mdrun -cpi fails because previous output files are missing.
110 * If we get here, the user requested restarting from a checkpoint file, that checkpoint
111 * file was found (so it is not the first part of a new run), but we are still missing
112 * some or all checkpoint files. In this case we issue a fatal error since there are
113 * so many special cases we cannot keep track of, and better safe than sorry. */
114 [[noreturn]] void throwBecauseOfMissingOutputFiles(const char* checkpointFilename,
115 ArrayRef<const gmx_file_position_t> outputfiles,
117 const t_filenm fnm[],
118 size_t numFilesMissing)
120 StringOutputStream stream;
121 TextWriter writer(&stream);
122 writer.writeStringFormatted(
123 "Some output files listed in the checkpoint file %s are not present or not named "
124 "as the output files by the current program:)",
126 auto settings = writer.wrapperSettings();
127 auto oldIndent = settings.indent(), newIndent = 2;
129 writer.writeLine("Expected output files that are present:");
130 settings.setIndent(newIndent);
131 for (const auto& outputfile : outputfiles)
133 if (exist_output_file(outputfile.filename, nfile, fnm))
135 writer.writeLine(outputfile.filename);
138 settings.setIndent(oldIndent);
139 writer.ensureEmptyLine();
141 writer.writeLine("Expected output files that are not present or named differently:");
142 settings.setIndent(newIndent);
143 for (const auto& outputfile : outputfiles)
145 if (!exist_output_file(outputfile.filename, nfile, fnm))
147 writer.writeLine(outputfile.filename);
150 settings.setIndent(oldIndent);
152 writer.writeLineFormatted(
153 R"(To keep your simulation files safe, this simulation will not restart. Either name your
154 output files exactly the same as the previous simulation part (e.g. with -deffnm), or
155 make sure all the output files are present (e.g. run from the same directory as the
156 previous simulation part), or instruct mdrun to write new output files with mdrun -noappend.
157 In the last case, you will not be able to use appending in future for this simulation.)",
158 numFilesMissing, outputfiles.size());
159 GMX_THROW(InconsistentInputError(stream.toString()));
162 //! Return a string describing the precision of a build of GROMACS.
163 const char* precisionToString(bool isDoublePrecision)
165 return isDoublePrecision ? "double" : "mixed";
168 /*! \brief Describes how mdrun will (re)start and provides supporting
169 * functionality based on that data. */
170 class StartingBehaviorHandler
173 /*! \brief Throw unless all simulations in a multi-sim restart the same way.
175 * The restart could differ if checkpoint or other output files are
176 * not found in a consistent way across the set of multi-simulations,
177 * or are from different simulation parts.
179 * \param[in] ms Multi-sim handler.
181 * May only be called from the master rank of each simulation.
183 * \throws InconsistentInputError if either simulations restart
184 * differently, or from checkpoints from different simulation parts.
186 void ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms);
187 /*! \brief Return an optional value that describes the index
188 * of the next simulation part when not appending.
190 * \param[in] appendingBehavior Whether this simulation is appending
191 * (relevant only when restarting)
194 compat::optional<int> makeIndexOfNextPart(AppendingBehavior appendingBehavior) const;
196 //! Describes how mdrun will (re)start
197 StartingBehavior startingBehavior = StartingBehavior::NewSimulation;
198 //! When restarting from a checkpoint, contains the contents of its header
199 compat::optional<CheckpointHeaderContents> headerContents;
200 //! When restarting from a checkpoint, contains the names of expected output files
201 compat::optional<std::vector<gmx_file_position_t>> outputFiles;
204 /*! \brief Choose the starting behaviour for this simulation
206 * This routine cannot print tons of data, since it is called before
207 * the log file is opened.
209 * Note that different simulations in a multi-simulation can return
210 * values that depend on whether the respective checkpoint files are
211 * found (and other files found, when appending), and so can differ
212 * between multi-simulations. It is the caller's responsibility to
213 * detect this and react accordingly. */
214 StartingBehaviorHandler chooseStartingBehavior(const AppendingBehavior appendingBehavior,
218 StartingBehaviorHandler handler;
219 if (!opt2bSet("-cpi", nfile, fnm))
221 // No need to tell the user anything
222 handler.startingBehavior = StartingBehavior::NewSimulation;
226 // A -cpi option was provided, do a restart if there is an input checkpoint file available
227 const char* checkpointFilename = opt2fn("-cpi", nfile, fnm);
228 if (!gmx_fexist(checkpointFilename))
230 // This is interpreted as the user intending a new
231 // simulation, so that scripts can call "gmx mdrun -cpi"
232 // for all simulation parts. Thus, appending cannot occur.
233 if (appendingBehavior == AppendingBehavior::Appending)
235 GMX_THROW(InconsistentInputError(
236 "Could not do a restart with appending because the checkpoint file "
237 "was not found. Either supply the name of the right checkpoint file "
238 "or do not use -append"));
240 // No need to tell the user that mdrun -cpi without a file means a new simulation
241 handler.startingBehavior = StartingBehavior::NewSimulation;
245 t_fileio* fp = gmx_fio_open(checkpointFilename, "r");
248 GMX_THROW(FileIOError(
249 formatString("Checkpoint file '%s' was found but could not be opened for "
250 "reading. Check the file permissions.",
251 checkpointFilename)));
254 std::vector<gmx_file_position_t> outputFiles;
255 CheckpointHeaderContents headerContents =
256 read_checkpoint_simulation_part_and_filenames(fp, &outputFiles);
258 GMX_RELEASE_ASSERT(!outputFiles.empty(),
259 "The checkpoint file or its reading is broken, as no output "
260 "file information is stored in it");
261 const char* logFilename = outputFiles[0].filename;
262 GMX_RELEASE_ASSERT(Path::extensionMatches(logFilename, ftp2ext(efLOG)),
263 formatString("The checkpoint file or its reading is broken, the first "
264 "output file '%s' must be a log file with extension '%s'",
265 logFilename, ftp2ext(efLOG))
268 if (appendingBehavior != AppendingBehavior::NoAppending)
270 // See whether appending can be done.
272 size_t numFilesMissing = std::count_if(
273 std::begin(outputFiles), std::end(outputFiles), [nfile, fnm](const auto& outputFile) {
274 return !exist_output_file(outputFile.filename, nfile, fnm);
276 if (numFilesMissing != 0)
278 // Appending is not possible, because not all previous
279 // output files are present. We don't automatically switch
280 // to numbered output files either, because that prevents
281 // the user from using appending in future. If they want
282 // to restart with missing files, they need to use
284 throwBecauseOfMissingOutputFiles(checkpointFilename, outputFiles, nfile, fnm, numFilesMissing);
287 for (const auto& outputFile : outputFiles)
289 if (outputFile.offset < 0)
291 // Appending of large files is not possible unless
292 // mdrun and the filesystem can do a correct job. We
293 // don't automatically switch to numbered output files
294 // either, because the user can benefit from
295 // understanding that their infrastructure is not very
296 // suitable for running a simulation producing lots of
298 auto message = formatString(
299 "The original mdrun wrote a file called '%s' which "
300 "is larger than 2 GB, but that mdrun or the filesystem "
301 "it ran on (e.g FAT32) did not support such large files. "
302 "This simulation cannot be restarted with appending. It will "
303 "be easier for you to use mdrun on a 64-bit filesystem, but "
304 "if you choose not to, then you must run mdrun with "
305 "-noappend once your output gets large enough.",
306 outputFile.filename);
307 GMX_THROW(InconsistentInputError(message));
311 const char* logFilename = outputFiles[0].filename;
312 // If the precision does not match, we cannot continue with
313 // appending, and will switch to not appending unless
314 // instructed otherwise.
315 if (headerContents.file_version >= 13 && headerContents.double_prec != GMX_DOUBLE)
317 if (appendingBehavior == AppendingBehavior::Appending)
319 GMX_THROW(InconsistentInputError(formatString(
320 "Cannot restart with appending because the previous simulation part used "
321 "%s precision which does not match the %s precision used by this build "
322 "of GROMACS. Either use matching precision or use mdrun -noappend.",
323 precisionToString(headerContents.double_prec), precisionToString(GMX_DOUBLE))));
326 // If the previous log filename had a part number, then we
327 // cannot continue with appending, and will continue without
329 else if (hasSuffixFromNoAppend(logFilename))
331 if (appendingBehavior == AppendingBehavior::Appending)
333 GMX_THROW(InconsistentInputError(
334 "Cannot restart with appending because the previous simulation "
335 "part did not use appending. Either do not use mdrun -append, or "
336 "provide the correct checkpoint file."));
341 // Everything is perfect - we can do an appending restart.
342 handler = { StartingBehavior::RestartWithAppending, headerContents, outputFiles };
346 // No need to tell the user anything because the previous
347 // simulation part also didn't append and that can only happen
348 // when they ask for it.
351 GMX_RELEASE_ASSERT(appendingBehavior != AppendingBehavior::Appending,
352 "Logic error in appending");
353 handler = { StartingBehavior::RestartWithoutAppending, headerContents, outputFiles };
357 //! Check whether chksum_file output file has a checksum that matches \c outputfile from the checkpoint.
358 void checkOutputFile(t_fileio* fileToCheck, const gmx_file_position_t& outputfile)
360 /* compute md5 chksum */
361 std::array<unsigned char, 16> digest;
362 if (outputfile.checksumSize != -1)
364 if (gmx_fio_get_file_md5(fileToCheck, outputfile.offset, &digest)
365 != outputfile.checksumSize) /*at the end of the call the file position is at the end of the file*/
367 auto message = formatString(
368 "Can't read %d bytes of '%s' to compute checksum. The file "
369 "has been replaced or its contents have been modified. Cannot "
370 "do appending because of this condition.",
371 outputfile.checksumSize, outputfile.filename);
372 GMX_THROW(InconsistentInputError(message));
376 /* compare md5 chksum */
377 if (outputfile.checksumSize != -1 && digest != outputfile.checksum)
381 fprintf(debug, "chksum for %s: ", outputfile.filename);
382 for (int j = 0; j < 16; j++)
384 fprintf(debug, "%02x", digest[j]);
386 fprintf(debug, "\n");
388 auto message = formatString(
389 "Checksum wrong for '%s'. The file has been replaced "
390 "or its contents have been modified. Cannot do appending "
391 "because of this condition.",
392 outputfile.filename);
393 GMX_THROW(InconsistentInputError(message));
397 /*! \brief If supported, obtain a write lock on the log file.
399 * This wil prevent e.g. other mdrun instances from changing it while
400 * we attempt to restart with appending. */
401 void lockLogFile(t_fileio* logfio, const char* logFilename)
403 /* Note that there are systems where the lock operation
404 * will succeed, but a second process can also lock the file.
405 * We should probably try to detect this.
407 #if defined __native_client__
410 #elif GMX_NATIVE_WINDOWS
411 if (_locking(fileno(gmx_fio_getfp(logfio)), _LK_NBLCK, LONG_MAX) == -1)
413 // don't initialize here: the struct order is OS dependent!
416 fl.l_whence = SEEK_SET;
421 if (fcntl(fileno(gmx_fio_getfp(logfio)), F_SETLK, &fl) == -1)
426 std::string message =
427 "File locking is not supported on this system. "
428 "Use mdrun -noappend to restart.";
429 GMX_THROW(FileIOError(message));
431 else if (errno == EACCES || errno == EAGAIN)
433 auto message = formatString(
434 "Failed to lock: %s. Already running "
437 GMX_THROW(FileIOError(message));
441 auto message = formatString("Failed to lock: %s. %s.", logFilename, std::strerror(errno));
442 GMX_THROW(FileIOError(message));
447 /*! \brief Prepare to append to output files.
449 * We use the file pointer positions of the output files stored in the
450 * checkpoint file and truncate the files such that any frames written
451 * after the checkpoint time are removed. All files are md5sum
452 * checked such that we can be sure that we do not truncate other
453 * (maybe important) files. The log file is locked so that we can
454 * avoid cases where another mdrun instance might still be writing to
456 void prepareForAppending(const ArrayRef<const gmx_file_position_t> outputFiles, t_fileio* logfio)
460 // Can't check or truncate output files in general
461 // TODO do we do this elsewhere for GMX_FAHCORE?
465 // Handle the log file separately - it comes first in the list
466 // because we have already opened the log file. This ensures that
467 // we retain a lock on the open file that is never lifted after
468 // the checksum is calculated.
469 const gmx_file_position_t& logOutputFile = outputFiles[0];
470 lockLogFile(logfio, logOutputFile.filename);
471 checkOutputFile(logfio, logOutputFile);
473 if (gmx_fio_seek(logfio, logOutputFile.offset) != 0)
475 auto message = formatString("Seek error! Failed to truncate log file: %s.", std::strerror(errno));
476 GMX_THROW(FileIOError(message));
479 // Now handle the remaining outputFiles
480 for (const auto& outputFile : outputFiles.subArray(1, outputFiles.size() - 1))
482 t_fileio* fileToCheck = gmx_fio_open(outputFile.filename, "r+");
483 checkOutputFile(fileToCheck, outputFile);
484 gmx_fio_close(fileToCheck);
486 if (GMX_NATIVE_WINDOWS)
488 // Can't truncate output files on this platform
492 if (gmx_truncate(outputFile.filename, outputFile.offset) != 0)
494 auto message = formatString(
495 "Truncation of file %s failed. Cannot do appending "
496 "because of this failure.",
497 outputFile.filename);
498 GMX_THROW(FileIOError(message));
503 void StartingBehaviorHandler::ensureMultiSimBehaviorsMatch(const gmx_multisim_t* ms)
507 // Trivially, the multi-sim behaviors match
511 auto startingBehaviors = gatherIntFromMultiSimulation(ms, static_cast<int>(startingBehavior));
512 bool identicalStartingBehaviors =
513 (std::adjacent_find(std::begin(startingBehaviors), std::end(startingBehaviors),
514 std::not_equal_to<>())
515 == std::end(startingBehaviors));
517 const EnumerationArray<StartingBehavior, std::string> behaviorStrings = {
518 { "restart with appending", "restart without appending", "new simulation" }
520 if (!identicalStartingBehaviors)
522 std::string message = formatString(R"(
523 Multi-simulations must all start in the same way, either a new
524 simulation, a restart with appending, or a restart without appending.
525 However, the contents of the multi-simulation directories you specified
526 were inconsistent with each other. Either remove the checkpoint file
527 from each directory, or ensure each directory has a checkpoint file from
528 the same simulation part (and, if you want to append to output files,
529 ensure the old output files are present and named as they were when the
530 checkpoint file was written).
532 To help you identify which directories need attention, the %d
533 simulations wanted the following respective behaviors:
536 for (index simIndex = 0; simIndex != ssize(startingBehaviors); ++simIndex)
538 auto behavior = static_cast<StartingBehavior>(startingBehaviors[simIndex]);
539 message += formatString(" Simulation %6zd: %s\n", simIndex,
540 behaviorStrings[behavior].c_str());
542 GMX_THROW(InconsistentInputError(message));
545 if (startingBehavior == StartingBehavior::NewSimulation)
547 // When not restarting, the behaviors are now known to match
551 // Multi-simulation restarts require that each checkpoint
552 // describes the same simulation part. If those don't match, then
553 // the simulation cannot proceed.
554 auto simulationParts = gatherIntFromMultiSimulation(ms, headerContents->simulation_part);
555 bool identicalSimulationParts = (std::adjacent_find(std::begin(simulationParts),
556 std::end(simulationParts), std::not_equal_to<>())
557 == std::end(simulationParts));
559 if (!identicalSimulationParts)
561 std::string message = formatString(R"(
562 Multi-simulations must all start in the same way, either a new
563 simulation, a restart with appending, or a restart without appending.
564 However, the checkpoint files you specified were from different
565 simulation parts. Either remove the checkpoint file from each directory,
566 or ensure each directory has a checkpoint file from the same simulation
567 part (and, if you want to append to output files, ensure the old output
568 files are present and named as they were when the checkpoint file was
571 To help you identify which directories need attention, the %d
572 simulation checkpoint files were from the following respective
576 for (index partIndex = 0; partIndex != ssize(simulationParts); ++partIndex)
578 message += formatString(" Simulation %6zd: %d\n", partIndex, simulationParts[partIndex]);
580 GMX_THROW(InconsistentInputError(message));
584 compat::optional<int> StartingBehaviorHandler::makeIndexOfNextPart(const AppendingBehavior appendingBehavior) const
586 compat::optional<int> indexOfNextPart;
588 if (startingBehavior == StartingBehavior::RestartWithoutAppending)
590 indexOfNextPart = headerContents->simulation_part + 1;
592 else if (startingBehavior == StartingBehavior::NewSimulation
593 && appendingBehavior == AppendingBehavior::NoAppending)
598 return indexOfNextPart;
603 std::tuple<StartingBehavior, LogFilePtr> handleRestart(const bool isSimulationMaster,
604 MPI_Comm communicator,
605 const gmx_multisim_t* ms,
606 const AppendingBehavior appendingBehavior,
610 StartingBehaviorHandler handler;
611 LogFilePtr logFileGuard = nullptr;
613 // Make sure all ranks agree on whether the (multi-)simulation can
615 int numErrorsFound = 0;
616 std::exception_ptr exceptionPtr;
618 // Only the master rank of each simulation can do anything with
619 // output files, so it is the only one that needs to consider
620 // whether a restart might take place, and how to implement it.
621 if (isSimulationMaster)
625 handler = chooseStartingBehavior(appendingBehavior, nfile, fnm);
627 handler.ensureMultiSimBehaviorsMatch(ms);
629 // When not appending, prepare a suffix for the part number
630 compat::optional<int> indexOfNextPart = handler.makeIndexOfNextPart(appendingBehavior);
632 // If a part suffix is used, change the file names accordingly.
635 std::string suffix = formatString(".part%04d", *indexOfNextPart);
636 add_suffix_to_output_names(fnm, nfile, suffix.c_str());
639 // Open the log file, now that it has the right name
640 logFileGuard = openLogFile(ftp2fn(efLOG, nfile, fnm),
641 handler.startingBehavior == StartingBehavior::RestartWithAppending);
643 // When appending, the other output files need special handling before opening
644 if (handler.startingBehavior == StartingBehavior::RestartWithAppending)
646 prepareForAppending(*handler.outputFiles, logFileGuard.get());
649 catch (const std::exception& /*ex*/)
651 exceptionPtr = std::current_exception();
655 // Since the master rank (perhaps of only one simulation) may have
656 // found an error condition, we now coordinate the behavior across
657 // all ranks. However, only the applicable ranks will throw a
658 // non-default exception.
660 // TODO Evolve some re-usable infrastructure for this, because it
661 // will be needed in many places while setting up simulations.
663 int reducedNumErrorsFound;
664 MPI_Allreduce(&numErrorsFound, &reducedNumErrorsFound, 1, MPI_INT, MPI_SUM, communicator);
665 numErrorsFound = reducedNumErrorsFound;
667 // There is nothing to do with no MPI or thread-MPI, as there is
668 // only one rank at this point.
669 GMX_RELEASE_ASSERT(communicator == MPI_COMM_NULL, "Must have null communicator at this point");
672 // Throw in a globally coordinated way, if needed
673 if (numErrorsFound > 0)
677 std::rethrow_exception(exceptionPtr);
681 GMX_THROW(ParallelConsistencyError("Another MPI rank encountered an exception"));
685 // Ensure all ranks agree on the starting behavior, which is easy
686 // because all simulations in a multi-simulation already agreed on
687 // the starting behavior. There is nothing to do with no
688 // MPI or thread-MPI.
690 MPI_Bcast(&handler.startingBehavior, 1, MPI_INT, 0, communicator);
693 return std::make_tuple(handler.startingBehavior, std::move(logFileGuard));