2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
5 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
42 #include "gromacs/commandline/filenm.h"
43 #include "gromacs/domdec/collect.h"
44 #include "gromacs/domdec/domdec_struct.h"
45 #include "gromacs/fileio/checkpoint.h"
46 #include "gromacs/fileio/gmxfio.h"
47 #include "gromacs/fileio/tngio.h"
48 #include "gromacs/fileio/trrio.h"
49 #include "gromacs/fileio/xtcio.h"
50 #include "gromacs/math/vec.h"
51 #include "gromacs/mdlib/energyoutput.h"
52 #include "gromacs/mdrunutility/handlerestart.h"
53 #include "gromacs/mdrunutility/multisim.h"
54 #include "gromacs/mdtypes/awh_history.h"
55 #include "gromacs/mdtypes/commrec.h"
56 #include "gromacs/mdtypes/edsamhistory.h"
57 #include "gromacs/mdtypes/energyhistory.h"
58 #include "gromacs/mdtypes/imdoutputprovider.h"
59 #include "gromacs/mdtypes/inputrec.h"
60 #include "gromacs/mdtypes/md_enums.h"
61 #include "gromacs/mdtypes/mdrunoptions.h"
62 #include "gromacs/mdtypes/observableshistory.h"
63 #include "gromacs/mdtypes/state.h"
64 #include "gromacs/mdtypes/swaphistory.h"
65 #include "gromacs/timing/wallcycle.h"
66 #include "gromacs/topology/topology.h"
67 #include "gromacs/utility/baseversion.h"
68 #include "gromacs/utility/fatalerror.h"
69 #include "gromacs/utility/pleasecite.h"
70 #include "gromacs/utility/programcontext.h"
71 #include "gromacs/utility/smalloc.h"
72 #include "gromacs/utility/sysinfo.h"
78 gmx_tng_trajectory_t tng;
79 gmx_tng_trajectory_t tng_low_prec;
80 int x_compression_precision; /* only used by XTC output */
83 gmx_bool bKeepAndNumCPT;
84 IntegrationAlgorithm eIntegrator;
86 LambdaWeightCalculation elamstats;
90 int natoms_x_compressed;
91 const SimulationGroups* groups; /* for compressed position writing */
92 gmx_wallcycle* wcycle;
94 gmx::IMDOutputProvider* outputProvider;
95 const gmx::MDModulesNotifiers* mdModulesNotifiers;
96 bool simulationsShareState;
101 gmx_mdoutf_t init_mdoutf(FILE* fplog,
103 const t_filenm fnm[],
104 const gmx::MdrunOptions& mdrunOptions,
106 gmx::IMDOutputProvider* outputProvider,
107 const gmx::MDModulesNotifiers& mdModulesNotifiers,
108 const t_inputrec* ir,
109 const gmx_mtop_t& top_global,
110 const gmx_output_env_t* oenv,
111 gmx_wallcycle* wcycle,
112 const gmx::StartingBehavior startingBehavior,
113 bool simulationsShareState,
114 const gmx_multisim_t* ms)
117 const char * appendMode = "a+", *writeMode = "w+", *filemode;
118 gmx_bool bCiteTng = FALSE;
120 bool restartWithAppending = (startingBehavior == gmx::StartingBehavior::RestartWithAppending);
124 of->fp_trn = nullptr;
125 of->fp_ene = nullptr;
126 of->fp_xtc = nullptr;
128 of->tng_low_prec = nullptr;
129 of->fp_dhdl = nullptr;
131 of->eIntegrator = ir->eI;
132 of->bExpanded = ir->bExpanded;
133 of->elamstats = ir->expandedvals->elamstats;
134 of->simulation_part = ir->simulation_part;
135 of->x_compression_precision = static_cast<int>(ir->x_compression_precision);
137 of->f_global = nullptr;
138 of->outputProvider = outputProvider;
140 GMX_RELEASE_ASSERT(!simulationsShareState || ms != nullptr,
141 "Need valid multisim object when simulations share state");
142 of->simulationsShareState = simulationsShareState;
143 if (of->simulationsShareState)
145 of->mastersComm = ms->mastersComm_;
150 of->bKeepAndNumCPT = mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles;
152 filemode = restartWithAppending ? appendMode : writeMode;
154 if (EI_DYNAMICS(ir->eI) && ir->nstxout_compressed > 0)
156 const char* filename;
157 filename = ftp2fn(efCOMPRESSED, nfile, fnm);
158 switch (fn2ftp(filename))
160 case efXTC: of->fp_xtc = open_xtc(filename, filemode); break;
162 gmx_tng_open(filename, filemode[0], &of->tng_low_prec);
163 if (filemode[0] == 'w')
165 gmx_tng_prepare_low_prec_writing(of->tng_low_prec, &top_global, ir);
169 default: gmx_incons("Invalid reduced precision file format");
172 if ((EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
174 && !(EI_DYNAMICS(ir->eI) && ir->nstxout == 0 && ir->nstvout == 0 && ir->nstfout == 0)))
176 const char* filename;
177 filename = ftp2fn(efTRN, nfile, fnm);
178 switch (fn2ftp(filename))
182 /* If there is no uncompressed coordinate output and
183 there is compressed TNG output write forces
184 and/or velocities to the TNG file instead. */
185 if (ir->nstxout != 0 || ir->nstxout_compressed == 0 || !of->tng_low_prec)
187 of->fp_trn = gmx_trr_open(filename, filemode);
191 gmx_tng_open(filename, filemode[0], &of->tng);
192 if (filemode[0] == 'w')
194 gmx_tng_prepare_md_writing(of->tng, &top_global, ir);
198 default: gmx_incons("Invalid full precision file format");
201 if (EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
203 of->fp_ene = open_enx(ftp2fn(efEDR, nfile, fnm), filemode);
205 of->fn_cpt = opt2fn("-cpo", nfile, fnm);
207 if ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp) && ir->fepvals->nstdhdl > 0
208 && (ir->fepvals->separate_dhdl_file == SeparateDhdlFile::Yes) && EI_DYNAMICS(ir->eI))
210 if (restartWithAppending)
212 of->fp_dhdl = gmx_fio_fopen(opt2fn("-dhdl", nfile, fnm), filemode);
216 of->fp_dhdl = open_dhdl(opt2fn("-dhdl", nfile, fnm), ir, oenv);
220 outputProvider->initOutput(fplog, nfile, fnm, restartWithAppending, oenv);
221 of->mdModulesNotifiers = &mdModulesNotifiers;
223 /* Set up atom counts so they can be passed to actual
224 trajectory-writing routines later. Also, XTC writing needs
225 to know what (and how many) atoms might be in the XTC
226 groups, and how to look up later which ones they are. */
227 of->natoms_global = top_global.natoms;
228 of->groups = &top_global.groups;
229 of->natoms_x_compressed = 0;
230 for (i = 0; (i < top_global.natoms); i++)
232 if (getGroupType(*of->groups, SimulationAtomGroupType::CompressedPositionOutput, i) == 0)
234 of->natoms_x_compressed++;
238 if (ir->nstfout && haveDDAtomOrdering(*cr))
240 snew(of->f_global, top_global.natoms);
246 please_cite(fplog, "Lundborg2014");
252 ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of)
257 FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of)
262 gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of)
267 static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
269 if (applyMpiBarrierBeforeRename)
272 MPI_Barrier(mpiBarrierCommunicator);
274 GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
275 GMX_UNUSED_VALUE(mpiBarrierCommunicator);
279 /*! \brief Write a checkpoint to the filename
281 * Appends the _step<step>.cpt with bNumberAndKeep, otherwise moves
282 * the previous checkpoint filename with suffix _prev.cpt.
284 static void write_checkpoint(const char* fn,
285 gmx_bool bNumberAndKeep,
290 IntegrationAlgorithm eIntegrator,
293 LambdaWeightCalculation elamstats,
297 ObservablesHistory* observablesHistory,
298 const gmx::MDModulesNotifiers& mdModulesNotifiers,
299 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData,
300 bool applyMpiBarrierBeforeRename,
301 MPI_Comm mpiBarrierCommunicator)
304 char* fntemp; /* the temporary checkpoint file name */
306 char buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
309 if (haveDDAtomOrdering(*cr))
311 npmenodes = cr->npmenodes;
319 /* make the new temporary filename */
320 snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
321 std::strcpy(fntemp, fn);
322 fntemp[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
323 sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
324 std::strcat(fntemp, suffix);
325 std::strcat(fntemp, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
327 /* if we can't rename, we just overwrite the cpt file.
328 * dangerous if interrupted.
330 snew(fntemp, std::strlen(fn));
331 std::strcpy(fntemp, fn);
333 std::string timebuf = gmx_format_current_time();
337 fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
340 /* Get offsets for open files */
341 auto outputfiles = gmx_fio_get_output_file_positions();
343 fp = gmx_fio_open(fntemp, "w");
345 /* We can check many more things now (CPU, acceleration, etc), but
346 * it is highly unlikely to have two separate builds with exactly
347 * the same version, user, time, and build host!
350 int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
352 edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
353 int nED = (edsamhist ? edsamhist->nED : 0);
355 swaphistory_t* swaphist = observablesHistory->swapHistory.get();
356 SwapType eSwapCoords = (swaphist ? swaphist->eSwapCoords : SwapType::No);
358 CheckpointHeaderContents headerContents = { CheckPointVersion::UnknownVersion0,
376 state->nhchainlength,
387 std::strcpy(headerContents.version, gmx_version());
388 std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
389 std::strcpy(headerContents.ftime, timebuf.c_str());
390 if (haveDDAtomOrdering(*cr))
392 copy_ivec(domdecCells, headerContents.dd_nc);
395 write_checkpoint_data(fp,
403 modularSimulatorCheckpointData);
405 /* we really, REALLY, want to make sure to physically write the checkpoint,
406 and all the files it depends on, out to disk. Because we've
407 opened the checkpoint with gmx_fio_open(), it's in our list
409 ret = gmx_fio_all_output_fsync();
414 sprintf(buf, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
416 if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
422 gmx_warning("%s", buf);
426 if (gmx_fio_close(fp) != 0)
428 gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
431 /* we don't move the checkpoint if the user specified they didn't want it,
432 or if the fsyncs failed */
434 if (!bNumberAndKeep && !ret)
436 // Add a barrier before renaming to reduce chance to get out of sync (#2440)
437 // Note: Checkpoint might only exist on some ranks, so put barrier before if clause (#3919)
438 mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
441 /* Rename the previous checkpoint file */
442 std::strcpy(buf, fn);
443 buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
444 std::strcat(buf, "_prev");
445 std::strcat(buf, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
448 /* we copy here so that if something goes wrong between now and
449 * the rename below, there's always a state.cpt.
450 * If renames are atomic (such as in POSIX systems),
451 * this copying should be unneccesary.
453 gmx_file_copy(fn, buf, FALSE);
454 /* We don't really care if this fails:
455 * there's already a new checkpoint.
460 gmx_file_rename(fn, buf);
464 /* Rename the checkpoint file from the temporary to the final name */
465 mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
467 if (gmx_file_rename(fntemp, fn) != 0)
469 gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
472 #endif /* GMX_NO_RENAME */
477 /* Always FAH checkpoint immediately after a GROMACS checkpoint.
479 * Note that it is critical that we save a FAH checkpoint directly
480 * after writing a GROMACS checkpoint. If the program dies, either
481 * by the machine powering off suddenly or the process being,
482 * killed, FAH can recover files that have only appended data by
483 * truncating them to the last recorded length. The GROMACS
484 * checkpoint does not just append data, it is fully rewritten each
485 * time so a crash between moving the new Gromacs checkpoint file in
486 * to place and writing a FAH checkpoint is not recoverable. Thus
487 * the time between these operations must be kept as short as
491 #endif /* end GMX_FAHCORE block */
494 void mdoutf_write_checkpoint(gmx_mdoutf_t of,
499 t_state* state_global,
500 ObservablesHistory* observablesHistory,
501 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData)
504 fflush_tng(of->tng_low_prec);
505 /* Write the checkpoint file.
506 * When simulations share the state, an MPI barrier is applied before
507 * renaming old and new checkpoint files to minimize the risk of
508 * checkpoint files getting out of sync.
510 ivec one_ivec = { 1, 1, 1 };
511 write_checkpoint(of->fn_cpt,
515 haveDDAtomOrdering(*cr) ? cr->dd->numCells : one_ivec,
516 haveDDAtomOrdering(*cr) ? cr->dd->nnodes : cr->nnodes,
525 *(of->mdModulesNotifiers),
526 modularSimulatorCheckpointData,
527 of->simulationsShareState,
531 void mdoutf_write_to_trajectory_files(FILE* fplog,
538 t_state* state_local,
539 t_state* state_global,
540 ObservablesHistory* observablesHistory,
541 gmx::ArrayRef<const gmx::RVec> f_local,
542 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData)
544 const rvec* f_global;
546 if (haveDDAtomOrdering(*cr))
548 if (mdof_flags & MDOF_CPT)
550 dd_collect_state(cr->dd, state_local, state_global);
554 if (mdof_flags & (MDOF_X | MDOF_X_COMPRESSED))
556 auto globalXRef = MASTER(cr) ? state_global->x : gmx::ArrayRef<gmx::RVec>();
557 dd_collect_vec(cr->dd,
558 state_local->ddp_count,
559 state_local->ddp_count_cg_gl,
564 if (mdof_flags & MDOF_V)
566 auto globalVRef = MASTER(cr) ? state_global->v : gmx::ArrayRef<gmx::RVec>();
567 dd_collect_vec(cr->dd,
568 state_local->ddp_count,
569 state_local->ddp_count_cg_gl,
575 f_global = of->f_global;
576 if (mdof_flags & MDOF_F)
578 auto globalFRef = MASTER(cr) ? gmx::arrayRefFromArray(
579 reinterpret_cast<gmx::RVec*>(of->f_global), of->natoms_global)
580 : gmx::ArrayRef<gmx::RVec>();
581 dd_collect_vec(cr->dd,
582 state_local->ddp_count,
583 state_local->ddp_count_cg_gl,
591 /* We have the whole state locally: copy the local state pointer */
592 state_global = state_local;
594 f_global = as_rvec_array(f_local.data());
599 if (mdof_flags & MDOF_CPT)
601 mdoutf_write_checkpoint(
602 of, fplog, cr, step, t, state_global, observablesHistory, modularSimulatorCheckpointData);
605 if (mdof_flags & (MDOF_X | MDOF_V | MDOF_F))
607 const rvec* x = (mdof_flags & MDOF_X) ? state_global->x.rvec_array() : nullptr;
608 const rvec* v = (mdof_flags & MDOF_V) ? state_global->v.rvec_array() : nullptr;
609 const rvec* f = (mdof_flags & MDOF_F) ? f_global : nullptr;
613 gmx_trr_write_frame(of->fp_trn,
616 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
622 if (gmx_fio_flush(of->fp_trn) != 0)
624 gmx_file("Cannot write trajectory; maybe you are out of disk space?");
628 /* If a TNG file is open for uncompressed coordinate output also write
629 velocities and forces to it. */
632 gmx_fwrite_tng(of->tng,
636 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
643 /* If only a TNG file is open for compressed coordinate output (no uncompressed
644 coordinate output) also write forces and velocities to it. */
645 else if (of->tng_low_prec)
647 gmx_fwrite_tng(of->tng_low_prec,
651 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
659 if (mdof_flags & MDOF_X_COMPRESSED)
661 rvec* xxtc = nullptr;
663 if (of->natoms_x_compressed == of->natoms_global)
665 /* We are writing the positions of all of the atoms to
666 the compressed output */
667 xxtc = state_global->x.rvec_array();
671 /* We are writing the positions of only a subset of
672 the atoms to the compressed output, so we have to
673 make a copy of the subset of coordinates. */
676 snew(xxtc, of->natoms_x_compressed);
677 auto x = makeArrayRef(state_global->x);
678 for (i = 0, j = 0; (i < of->natoms_global); i++)
680 if (getGroupType(*of->groups, SimulationAtomGroupType::CompressedPositionOutput, i) == 0)
682 copy_rvec(x[i], xxtc[j++]);
686 if (write_xtc(of->fp_xtc, of->natoms_x_compressed, step, t, state_local->box, xxtc, of->x_compression_precision)
690 "XTC error. This indicates you are out of disk space, or a "
691 "simulation with major instabilities resulting in coordinates "
692 "that are NaN or too large to be represented in the XTC format.\n");
694 gmx_fwrite_tng(of->tng_low_prec,
698 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
700 of->natoms_x_compressed,
704 if (of->natoms_x_compressed != of->natoms_global)
709 if (mdof_flags & (MDOF_BOX | MDOF_LAMBDA) && !(mdof_flags & (MDOF_X | MDOF_V | MDOF_F)))
715 if (mdof_flags & MDOF_BOX)
717 box = state_local->box;
719 if (mdof_flags & MDOF_LAMBDA)
721 lambda = state_local->lambda[FreeEnergyPerturbationCouplingType::Fep];
723 gmx_fwrite_tng(of->tng, FALSE, step, t, lambda, box, natoms, nullptr, nullptr, nullptr);
726 if (mdof_flags & (MDOF_BOX_COMPRESSED | MDOF_LAMBDA_COMPRESSED)
727 && !(mdof_flags & (MDOF_X_COMPRESSED)))
729 if (of->tng_low_prec)
733 if (mdof_flags & MDOF_BOX_COMPRESSED)
735 box = state_local->box;
737 if (mdof_flags & MDOF_LAMBDA_COMPRESSED)
739 lambda = state_local->lambda[FreeEnergyPerturbationCouplingType::Fep];
741 gmx_fwrite_tng(of->tng_low_prec, FALSE, step, t, lambda, box, natoms, nullptr, nullptr, nullptr);
746 /* Write a FAH checkpoint after writing any other data. We may end up
747 checkpointing twice but it's fast so it's ok. */
748 if ((mdof_flags & ~MDOF_CPT))
756 void mdoutf_tng_close(gmx_mdoutf_t of)
758 if (of->tng || of->tng_low_prec)
760 wallcycle_start(of->wcycle, WallCycleCounter::Traj);
761 gmx_tng_close(&of->tng);
762 gmx_tng_close(&of->tng_low_prec);
763 wallcycle_stop(of->wcycle, WallCycleCounter::Traj);
767 void done_mdoutf(gmx_mdoutf_t of)
769 if (of->fp_ene != nullptr)
771 done_ener_file(of->fp_ene);
775 close_xtc(of->fp_xtc);
779 gmx_trr_close(of->fp_trn);
781 if (of->fp_dhdl != nullptr)
783 gmx_fio_fclose(of->fp_dhdl);
785 of->outputProvider->finishOutput();
786 if (of->f_global != nullptr)
791 gmx_tng_close(&of->tng);
792 gmx_tng_close(&of->tng_low_prec);
797 int mdoutf_get_tng_box_output_interval(gmx_mdoutf_t of)
801 return gmx_tng_get_box_output_interval(of->tng);
806 int mdoutf_get_tng_lambda_output_interval(gmx_mdoutf_t of)
810 return gmx_tng_get_lambda_output_interval(of->tng);
815 int mdoutf_get_tng_compressed_box_output_interval(gmx_mdoutf_t of)
817 if (of->tng_low_prec)
819 return gmx_tng_get_box_output_interval(of->tng_low_prec);
824 int mdoutf_get_tng_compressed_lambda_output_interval(gmx_mdoutf_t of)
826 if (of->tng_low_prec)
828 return gmx_tng_get_lambda_output_interval(of->tng_low_prec);