2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
5 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
42 #include "gromacs/commandline/filenm.h"
43 #include "gromacs/domdec/collect.h"
44 #include "gromacs/domdec/domdec_struct.h"
45 #include "gromacs/fileio/checkpoint.h"
46 #include "gromacs/fileio/gmxfio.h"
47 #include "gromacs/fileio/tngio.h"
48 #include "gromacs/fileio/trrio.h"
49 #include "gromacs/fileio/xtcio.h"
50 #include "gromacs/math/vec.h"
51 #include "gromacs/mdlib/energyoutput.h"
52 #include "gromacs/mdrunutility/handlerestart.h"
53 #include "gromacs/mdrunutility/multisim.h"
54 #include "gromacs/mdtypes/awh_history.h"
55 #include "gromacs/mdtypes/commrec.h"
56 #include "gromacs/mdtypes/edsamhistory.h"
57 #include "gromacs/mdtypes/energyhistory.h"
58 #include "gromacs/mdtypes/imdoutputprovider.h"
59 #include "gromacs/mdtypes/inputrec.h"
60 #include "gromacs/mdtypes/md_enums.h"
61 #include "gromacs/mdtypes/mdrunoptions.h"
62 #include "gromacs/mdtypes/observableshistory.h"
63 #include "gromacs/mdtypes/state.h"
64 #include "gromacs/mdtypes/swaphistory.h"
65 #include "gromacs/timing/wallcycle.h"
66 #include "gromacs/topology/topology.h"
67 #include "gromacs/utility/baseversion.h"
68 #include "gromacs/utility/fatalerror.h"
69 #include "gromacs/utility/pleasecite.h"
70 #include "gromacs/utility/programcontext.h"
71 #include "gromacs/utility/smalloc.h"
72 #include "gromacs/utility/sysinfo.h"
78 gmx_tng_trajectory_t tng;
79 gmx_tng_trajectory_t tng_low_prec;
80 int x_compression_precision; /* only used by XTC output */
83 gmx_bool bKeepAndNumCPT;
84 IntegrationAlgorithm eIntegrator;
86 LambdaWeightCalculation elamstats;
90 int natoms_x_compressed;
91 const SimulationGroups* groups; /* for compressed position writing */
92 gmx_wallcycle* wcycle;
94 gmx::IMDOutputProvider* outputProvider;
95 const gmx::MDModulesNotifiers* mdModulesNotifiers;
96 bool simulationsShareState;
101 gmx_mdoutf_t init_mdoutf(FILE* fplog,
103 const t_filenm fnm[],
104 const gmx::MdrunOptions& mdrunOptions,
106 gmx::IMDOutputProvider* outputProvider,
107 const gmx::MDModulesNotifiers& mdModulesNotifiers,
108 const t_inputrec* ir,
109 const gmx_mtop_t& top_global,
110 const gmx_output_env_t* oenv,
111 gmx_wallcycle* wcycle,
112 const gmx::StartingBehavior startingBehavior,
113 bool simulationsShareState,
114 const gmx_multisim_t* ms)
117 const char * appendMode = "a+", *writeMode = "w+", *filemode;
118 gmx_bool bCiteTng = FALSE;
120 bool restartWithAppending = (startingBehavior == gmx::StartingBehavior::RestartWithAppending);
124 of->fp_trn = nullptr;
125 of->fp_ene = nullptr;
126 of->fp_xtc = nullptr;
128 of->tng_low_prec = nullptr;
129 of->fp_dhdl = nullptr;
131 of->eIntegrator = ir->eI;
132 of->bExpanded = ir->bExpanded;
133 of->elamstats = ir->expandedvals->elamstats;
134 of->simulation_part = ir->simulation_part;
135 of->x_compression_precision = static_cast<int>(ir->x_compression_precision);
137 of->f_global = nullptr;
138 of->outputProvider = outputProvider;
140 GMX_RELEASE_ASSERT(!simulationsShareState || ms != nullptr,
141 "Need valid multisim object when simulations share state");
142 of->simulationsShareState = simulationsShareState;
143 if (of->simulationsShareState)
145 of->mastersComm = ms->mastersComm_;
150 of->bKeepAndNumCPT = mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles;
152 filemode = restartWithAppending ? appendMode : writeMode;
154 if (EI_DYNAMICS(ir->eI) && ir->nstxout_compressed > 0)
156 const char* filename;
157 filename = ftp2fn(efCOMPRESSED, nfile, fnm);
158 switch (fn2ftp(filename))
160 case efXTC: of->fp_xtc = open_xtc(filename, filemode); break;
162 gmx_tng_open(filename, filemode[0], &of->tng_low_prec);
163 if (filemode[0] == 'w')
165 gmx_tng_prepare_low_prec_writing(of->tng_low_prec, &top_global, ir);
169 default: gmx_incons("Invalid reduced precision file format");
172 if ((EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
174 && !(EI_DYNAMICS(ir->eI) && ir->nstxout == 0 && ir->nstvout == 0 && ir->nstfout == 0)))
176 const char* filename;
177 filename = ftp2fn(efTRN, nfile, fnm);
178 switch (fn2ftp(filename))
182 /* If there is no uncompressed coordinate output and
183 there is compressed TNG output write forces
184 and/or velocities to the TNG file instead. */
185 if (ir->nstxout != 0 || ir->nstxout_compressed == 0 || !of->tng_low_prec)
187 of->fp_trn = gmx_trr_open(filename, filemode);
191 gmx_tng_open(filename, filemode[0], &of->tng);
192 if (filemode[0] == 'w')
194 gmx_tng_prepare_md_writing(of->tng, &top_global, ir);
198 default: gmx_incons("Invalid full precision file format");
201 if (EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
203 of->fp_ene = open_enx(ftp2fn(efEDR, nfile, fnm), filemode);
205 of->fn_cpt = opt2fn("-cpo", nfile, fnm);
207 if ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp) && ir->fepvals->nstdhdl > 0
208 && (ir->fepvals->separate_dhdl_file == SeparateDhdlFile::Yes) && EI_DYNAMICS(ir->eI))
210 if (restartWithAppending)
212 of->fp_dhdl = gmx_fio_fopen(opt2fn("-dhdl", nfile, fnm), filemode);
216 of->fp_dhdl = open_dhdl(opt2fn("-dhdl", nfile, fnm), ir, oenv);
220 outputProvider->initOutput(fplog, nfile, fnm, restartWithAppending, oenv);
221 of->mdModulesNotifiers = &mdModulesNotifiers;
223 /* Set up atom counts so they can be passed to actual
224 trajectory-writing routines later. Also, XTC writing needs
225 to know what (and how many) atoms might be in the XTC
226 groups, and how to look up later which ones they are. */
227 of->natoms_global = top_global.natoms;
228 of->groups = &top_global.groups;
229 of->natoms_x_compressed = 0;
230 for (i = 0; (i < top_global.natoms); i++)
232 if (getGroupType(*of->groups, SimulationAtomGroupType::CompressedPositionOutput, i) == 0)
234 of->natoms_x_compressed++;
238 if (ir->nstfout && haveDDAtomOrdering(*cr))
240 snew(of->f_global, top_global.natoms);
246 please_cite(fplog, "Lundborg2014");
252 ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of)
257 FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of)
262 gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of)
267 static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
269 if (applyMpiBarrierBeforeRename)
272 MPI_Barrier(mpiBarrierCommunicator);
274 GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
275 GMX_UNUSED_VALUE(mpiBarrierCommunicator);
279 /*! \brief Write a checkpoint to the filename
281 * Appends the _step<step>.cpt with bNumberAndKeep, otherwise moves
282 * the previous checkpoint filename with suffix _prev.cpt.
284 static void write_checkpoint(const char* fn,
285 gmx_bool bNumberAndKeep,
290 IntegrationAlgorithm eIntegrator,
293 LambdaWeightCalculation elamstats,
297 ObservablesHistory* observablesHistory,
298 const gmx::MDModulesNotifiers& mdModulesNotifiers,
299 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData,
300 bool applyMpiBarrierBeforeRename,
301 MPI_Comm mpiBarrierCommunicator)
304 char* fntemp; /* the temporary checkpoint file name */
306 char buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
309 if (haveDDAtomOrdering(*cr))
311 npmenodes = cr->npmenodes;
319 /* make the new temporary filename */
320 snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
321 std::strcpy(fntemp, fn);
322 fntemp[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
323 sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
324 std::strcat(fntemp, suffix);
325 std::strcat(fntemp, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
327 /* if we can't rename, we just overwrite the cpt file.
328 * dangerous if interrupted.
330 snew(fntemp, std::strlen(fn));
331 std::strcpy(fntemp, fn);
333 std::string timebuf = gmx_format_current_time();
337 fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
340 /* Get offsets for open files */
341 auto outputfiles = gmx_fio_get_output_file_positions();
343 fp = gmx_fio_open(fntemp, "w");
345 /* We can check many more things now (CPU, acceleration, etc), but
346 * it is highly unlikely to have two separate builds with exactly
347 * the same version, user, time, and build host!
350 int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
352 edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
353 int nED = (edsamhist ? edsamhist->nED : 0);
355 swaphistory_t* swaphist = observablesHistory->swapHistory.get();
356 SwapType eSwapCoords = (swaphist ? swaphist->eSwapCoords : SwapType::No);
358 CheckpointHeaderContents headerContents = { CheckPointVersion::UnknownVersion0,
376 state->nhchainlength,
387 std::strcpy(headerContents.version, gmx_version());
388 std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
389 std::strcpy(headerContents.ftime, timebuf.c_str());
390 if (haveDDAtomOrdering(*cr))
392 copy_ivec(domdecCells, headerContents.dd_nc);
395 write_checkpoint_data(fp,
403 modularSimulatorCheckpointData);
405 /* we really, REALLY, want to make sure to physically write the checkpoint,
406 and all the files it depends on, out to disk. Because we've
407 opened the checkpoint with gmx_fio_open(), it's in our list
409 ret = gmx_fio_all_output_fsync();
414 sprintf(buf, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
416 if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
422 gmx_warning("%s", buf);
426 if (gmx_fio_close(fp) != 0)
428 gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
431 /* we don't move the checkpoint if the user specified they didn't want it,
432 or if the fsyncs failed */
434 if (!bNumberAndKeep && !ret)
436 // Add a barrier before renaming to reduce chance to get out of sync (#2440)
437 // Note: Checkpoint might only exist on some ranks, so put barrier before if clause (#3919)
438 mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
441 /* Rename the previous checkpoint file */
442 std::strcpy(buf, fn);
443 buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
444 std::strcat(buf, "_prev");
445 std::strcat(buf, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
448 /* we copy here so that if something goes wrong between now and
449 * the rename below, there's always a state.cpt.
450 * If renames are atomic (such as in POSIX systems),
451 * this copying should be unneccesary.
453 gmx_file_copy(fn, buf, FALSE);
454 /* We don't really care if this fails:
455 * there's already a new checkpoint.
460 gmx_file_rename(fn, buf);
464 /* Rename the checkpoint file from the temporary to the final name */
465 mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
469 gmx_file_rename(fntemp, fn);
471 catch (gmx::FileIOError const&)
473 // In this case we can be more helpful than the generic message from gmx_file_rename
474 GMX_THROW(gmx::FileIOError(
475 "Cannot rename checkpoint file; maybe you are out of disk space?"));
478 #endif /* GMX_NO_RENAME */
483 /* Always FAH checkpoint immediately after a GROMACS checkpoint.
485 * Note that it is critical that we save a FAH checkpoint directly
486 * after writing a GROMACS checkpoint. If the program dies, either
487 * by the machine powering off suddenly or the process being,
488 * killed, FAH can recover files that have only appended data by
489 * truncating them to the last recorded length. The GROMACS
490 * checkpoint does not just append data, it is fully rewritten each
491 * time so a crash between moving the new Gromacs checkpoint file in
492 * to place and writing a FAH checkpoint is not recoverable. Thus
493 * the time between these operations must be kept as short as
497 #endif /* end GMX_FAHCORE block */
500 void mdoutf_write_checkpoint(gmx_mdoutf_t of,
505 t_state* state_global,
506 ObservablesHistory* observablesHistory,
507 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData)
510 fflush_tng(of->tng_low_prec);
511 /* Write the checkpoint file.
512 * When simulations share the state, an MPI barrier is applied before
513 * renaming old and new checkpoint files to minimize the risk of
514 * checkpoint files getting out of sync.
516 ivec one_ivec = { 1, 1, 1 };
517 write_checkpoint(of->fn_cpt,
521 haveDDAtomOrdering(*cr) ? cr->dd->numCells : one_ivec,
522 haveDDAtomOrdering(*cr) ? cr->dd->nnodes : cr->nnodes,
531 *(of->mdModulesNotifiers),
532 modularSimulatorCheckpointData,
533 of->simulationsShareState,
537 void mdoutf_write_to_trajectory_files(FILE* fplog,
544 t_state* state_local,
545 t_state* state_global,
546 ObservablesHistory* observablesHistory,
547 gmx::ArrayRef<const gmx::RVec> f_local,
548 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData)
550 const rvec* f_global;
552 if (haveDDAtomOrdering(*cr))
554 if (mdof_flags & MDOF_CPT)
556 dd_collect_state(cr->dd, state_local, state_global);
560 if (mdof_flags & (MDOF_X | MDOF_X_COMPRESSED))
562 auto globalXRef = MASTER(cr) ? state_global->x : gmx::ArrayRef<gmx::RVec>();
563 dd_collect_vec(cr->dd,
564 state_local->ddp_count,
565 state_local->ddp_count_cg_gl,
570 if (mdof_flags & MDOF_V)
572 auto globalVRef = MASTER(cr) ? state_global->v : gmx::ArrayRef<gmx::RVec>();
573 dd_collect_vec(cr->dd,
574 state_local->ddp_count,
575 state_local->ddp_count_cg_gl,
581 f_global = of->f_global;
582 if (mdof_flags & MDOF_F)
584 auto globalFRef = MASTER(cr) ? gmx::arrayRefFromArray(
585 reinterpret_cast<gmx::RVec*>(of->f_global), of->natoms_global)
586 : gmx::ArrayRef<gmx::RVec>();
587 dd_collect_vec(cr->dd,
588 state_local->ddp_count,
589 state_local->ddp_count_cg_gl,
597 /* We have the whole state locally: copy the local state pointer */
598 state_global = state_local;
600 f_global = as_rvec_array(f_local.data());
605 if (mdof_flags & MDOF_CPT)
607 mdoutf_write_checkpoint(
608 of, fplog, cr, step, t, state_global, observablesHistory, modularSimulatorCheckpointData);
611 if (mdof_flags & (MDOF_X | MDOF_V | MDOF_F))
613 const rvec* x = (mdof_flags & MDOF_X) ? state_global->x.rvec_array() : nullptr;
614 const rvec* v = (mdof_flags & MDOF_V) ? state_global->v.rvec_array() : nullptr;
615 const rvec* f = (mdof_flags & MDOF_F) ? f_global : nullptr;
619 gmx_trr_write_frame(of->fp_trn,
622 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
628 if (gmx_fio_flush(of->fp_trn) != 0)
630 gmx_file("Cannot write trajectory; maybe you are out of disk space?");
634 /* If a TNG file is open for uncompressed coordinate output also write
635 velocities and forces to it. */
638 gmx_fwrite_tng(of->tng,
642 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
649 /* If only a TNG file is open for compressed coordinate output (no uncompressed
650 coordinate output) also write forces and velocities to it. */
651 else if (of->tng_low_prec)
653 gmx_fwrite_tng(of->tng_low_prec,
657 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
665 if (mdof_flags & MDOF_X_COMPRESSED)
667 rvec* xxtc = nullptr;
669 if (of->natoms_x_compressed == of->natoms_global)
671 /* We are writing the positions of all of the atoms to
672 the compressed output */
673 xxtc = state_global->x.rvec_array();
677 /* We are writing the positions of only a subset of
678 the atoms to the compressed output, so we have to
679 make a copy of the subset of coordinates. */
682 snew(xxtc, of->natoms_x_compressed);
683 auto x = makeArrayRef(state_global->x);
684 for (i = 0, j = 0; (i < of->natoms_global); i++)
686 if (getGroupType(*of->groups, SimulationAtomGroupType::CompressedPositionOutput, i) == 0)
688 copy_rvec(x[i], xxtc[j++]);
692 if (write_xtc(of->fp_xtc, of->natoms_x_compressed, step, t, state_local->box, xxtc, of->x_compression_precision)
696 "XTC error. This indicates you are out of disk space, or a "
697 "simulation with major instabilities resulting in coordinates "
698 "that are NaN or too large to be represented in the XTC format.\n");
700 gmx_fwrite_tng(of->tng_low_prec,
704 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
706 of->natoms_x_compressed,
710 if (of->natoms_x_compressed != of->natoms_global)
715 if (mdof_flags & (MDOF_BOX | MDOF_LAMBDA) && !(mdof_flags & (MDOF_X | MDOF_V | MDOF_F)))
721 if (mdof_flags & MDOF_BOX)
723 box = state_local->box;
725 if (mdof_flags & MDOF_LAMBDA)
727 lambda = state_local->lambda[FreeEnergyPerturbationCouplingType::Fep];
729 gmx_fwrite_tng(of->tng, FALSE, step, t, lambda, box, natoms, nullptr, nullptr, nullptr);
732 if (mdof_flags & (MDOF_BOX_COMPRESSED | MDOF_LAMBDA_COMPRESSED)
733 && !(mdof_flags & (MDOF_X_COMPRESSED)))
735 if (of->tng_low_prec)
739 if (mdof_flags & MDOF_BOX_COMPRESSED)
741 box = state_local->box;
743 if (mdof_flags & MDOF_LAMBDA_COMPRESSED)
745 lambda = state_local->lambda[FreeEnergyPerturbationCouplingType::Fep];
747 gmx_fwrite_tng(of->tng_low_prec, FALSE, step, t, lambda, box, natoms, nullptr, nullptr, nullptr);
752 /* Write a FAH checkpoint after writing any other data. We may end up
753 checkpointing twice but it's fast so it's ok. */
754 if ((mdof_flags & ~MDOF_CPT))
762 void mdoutf_tng_close(gmx_mdoutf_t of)
764 if (of->tng || of->tng_low_prec)
766 wallcycle_start(of->wcycle, WallCycleCounter::Traj);
767 gmx_tng_close(&of->tng);
768 gmx_tng_close(&of->tng_low_prec);
769 wallcycle_stop(of->wcycle, WallCycleCounter::Traj);
773 void done_mdoutf(gmx_mdoutf_t of)
775 if (of->fp_ene != nullptr)
777 done_ener_file(of->fp_ene);
781 close_xtc(of->fp_xtc);
785 gmx_trr_close(of->fp_trn);
787 if (of->fp_dhdl != nullptr)
789 gmx_fio_fclose(of->fp_dhdl);
791 of->outputProvider->finishOutput();
792 if (of->f_global != nullptr)
797 gmx_tng_close(&of->tng);
798 gmx_tng_close(&of->tng_low_prec);
803 int mdoutf_get_tng_box_output_interval(gmx_mdoutf_t of)
807 return gmx_tng_get_box_output_interval(of->tng);
812 int mdoutf_get_tng_lambda_output_interval(gmx_mdoutf_t of)
816 return gmx_tng_get_lambda_output_interval(of->tng);
821 int mdoutf_get_tng_compressed_box_output_interval(gmx_mdoutf_t of)
823 if (of->tng_low_prec)
825 return gmx_tng_get_box_output_interval(of->tng_low_prec);
830 int mdoutf_get_tng_compressed_lambda_output_interval(gmx_mdoutf_t of)
832 if (of->tng_low_prec)
834 return gmx_tng_get_lambda_output_interval(of->tng_low_prec);