2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2013,2014,2015,2016,2017 The GROMACS development team.
5 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
42 #include "gromacs/commandline/filenm.h"
43 #include "gromacs/domdec/collect.h"
44 #include "gromacs/domdec/domdec_struct.h"
45 #include "gromacs/fileio/checkpoint.h"
46 #include "gromacs/fileio/gmxfio.h"
47 #include "gromacs/fileio/tngio.h"
48 #include "gromacs/fileio/trrio.h"
49 #include "gromacs/fileio/xtcio.h"
50 #include "gromacs/fileio/xvgr.h"
51 #include "gromacs/math/vec.h"
52 #include "gromacs/mdlib/trajectory_writing.h"
53 #include "gromacs/mdrunutility/handlerestart.h"
54 #include "gromacs/mdrunutility/multisim.h"
55 #include "gromacs/mdtypes/awh_history.h"
56 #include "gromacs/mdtypes/commrec.h"
57 #include "gromacs/mdtypes/df_history.h"
58 #include "gromacs/mdtypes/edsamhistory.h"
59 #include "gromacs/mdtypes/energyhistory.h"
60 #include "gromacs/mdtypes/imdoutputprovider.h"
61 #include "gromacs/mdtypes/inputrec.h"
62 #include "gromacs/mdtypes/md_enums.h"
63 #include "gromacs/mdtypes/mdrunoptions.h"
64 #include "gromacs/mdtypes/observableshistory.h"
65 #include "gromacs/mdtypes/state.h"
66 #include "gromacs/mdtypes/swaphistory.h"
67 #include "gromacs/timing/wallcycle.h"
68 #include "gromacs/topology/topology.h"
69 #include "gromacs/utility/baseversion.h"
70 #include "gromacs/utility/fatalerror.h"
71 #include "gromacs/utility/pleasecite.h"
72 #include "gromacs/utility/programcontext.h"
73 #include "gromacs/utility/smalloc.h"
74 #include "gromacs/utility/sysinfo.h"
80 gmx_tng_trajectory_t tng;
81 gmx_tng_trajectory_t tng_low_prec;
82 int x_compression_precision; /* only used by XTC output */
85 gmx_bool bKeepAndNumCPT;
86 IntegrationAlgorithm eIntegrator;
88 LambdaWeightCalculation elamstats;
92 int natoms_x_compressed;
93 const SimulationGroups* groups; /* for compressed position writing */
94 gmx_wallcycle* wcycle;
96 gmx::IMDOutputProvider* outputProvider;
97 const gmx::MDModulesNotifiers* mdModulesNotifiers;
98 bool simulationsShareState;
103 gmx_mdoutf_t init_mdoutf(FILE* fplog,
105 const t_filenm fnm[],
106 const gmx::MdrunOptions& mdrunOptions,
108 gmx::IMDOutputProvider* outputProvider,
109 const gmx::MDModulesNotifiers& mdModulesNotifiers,
110 const t_inputrec* ir,
111 const gmx_mtop_t& top_global,
112 const gmx_output_env_t* oenv,
113 gmx_wallcycle* wcycle,
114 const gmx::StartingBehavior startingBehavior,
115 bool simulationsShareState,
116 const gmx_multisim_t* ms)
119 const char * appendMode = "a+", *writeMode = "w+", *filemode;
120 gmx_bool bCiteTng = FALSE;
122 bool restartWithAppending = (startingBehavior == gmx::StartingBehavior::RestartWithAppending);
126 of->fp_trn = nullptr;
127 of->fp_ene = nullptr;
128 of->fp_xtc = nullptr;
130 of->tng_low_prec = nullptr;
131 of->fp_dhdl = nullptr;
133 of->eIntegrator = ir->eI;
134 of->bExpanded = ir->bExpanded;
135 of->elamstats = ir->expandedvals->elamstats;
136 of->simulation_part = ir->simulation_part;
137 of->x_compression_precision = static_cast<int>(ir->x_compression_precision);
139 of->f_global = nullptr;
140 of->outputProvider = outputProvider;
142 GMX_RELEASE_ASSERT(!simulationsShareState || ms != nullptr,
143 "Need valid multisim object when simulations share state");
144 of->simulationsShareState = simulationsShareState;
145 if (of->simulationsShareState)
147 of->mastersComm = ms->mastersComm_;
152 of->bKeepAndNumCPT = mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles;
154 filemode = restartWithAppending ? appendMode : writeMode;
156 if (EI_DYNAMICS(ir->eI) && ir->nstxout_compressed > 0)
158 const char* filename;
159 filename = ftp2fn(efCOMPRESSED, nfile, fnm);
160 switch (fn2ftp(filename))
162 case efXTC: of->fp_xtc = open_xtc(filename, filemode); break;
164 gmx_tng_open(filename, filemode[0], &of->tng_low_prec);
165 if (filemode[0] == 'w')
167 gmx_tng_prepare_low_prec_writing(of->tng_low_prec, &top_global, ir);
171 default: gmx_incons("Invalid reduced precision file format");
174 if ((EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
176 && !(EI_DYNAMICS(ir->eI) && ir->nstxout == 0 && ir->nstvout == 0 && ir->nstfout == 0)))
178 const char* filename;
179 filename = ftp2fn(efTRN, nfile, fnm);
180 switch (fn2ftp(filename))
184 /* If there is no uncompressed coordinate output and
185 there is compressed TNG output write forces
186 and/or velocities to the TNG file instead. */
187 if (ir->nstxout != 0 || ir->nstxout_compressed == 0 || !of->tng_low_prec)
189 of->fp_trn = gmx_trr_open(filename, filemode);
193 gmx_tng_open(filename, filemode[0], &of->tng);
194 if (filemode[0] == 'w')
196 gmx_tng_prepare_md_writing(of->tng, &top_global, ir);
200 default: gmx_incons("Invalid full precision file format");
203 if (EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
205 of->fp_ene = open_enx(ftp2fn(efEDR, nfile, fnm), filemode);
207 of->fn_cpt = opt2fn("-cpo", nfile, fnm);
209 if ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp) && ir->fepvals->nstdhdl > 0
210 && (ir->fepvals->separate_dhdl_file == SeparateDhdlFile::Yes) && EI_DYNAMICS(ir->eI))
212 if (restartWithAppending)
214 of->fp_dhdl = gmx_fio_fopen(opt2fn("-dhdl", nfile, fnm), filemode);
218 of->fp_dhdl = open_dhdl(opt2fn("-dhdl", nfile, fnm), ir, oenv);
222 outputProvider->initOutput(fplog, nfile, fnm, restartWithAppending, oenv);
223 of->mdModulesNotifiers = &mdModulesNotifiers;
225 /* Set up atom counts so they can be passed to actual
226 trajectory-writing routines later. Also, XTC writing needs
227 to know what (and how many) atoms might be in the XTC
228 groups, and how to look up later which ones they are. */
229 of->natoms_global = top_global.natoms;
230 of->groups = &top_global.groups;
231 of->natoms_x_compressed = 0;
232 for (i = 0; (i < top_global.natoms); i++)
234 if (getGroupType(*of->groups, SimulationAtomGroupType::CompressedPositionOutput, i) == 0)
236 of->natoms_x_compressed++;
240 if (ir->nstfout && DOMAINDECOMP(cr))
242 snew(of->f_global, top_global.natoms);
248 please_cite(fplog, "Lundborg2014");
254 ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of)
259 FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of)
264 gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of)
269 static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
271 if (applyMpiBarrierBeforeRename)
274 MPI_Barrier(mpiBarrierCommunicator);
276 GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
277 GMX_UNUSED_VALUE(mpiBarrierCommunicator);
281 /*! \brief Write a checkpoint to the filename
283 * Appends the _step<step>.cpt with bNumberAndKeep, otherwise moves
284 * the previous checkpoint filename with suffix _prev.cpt.
286 static void write_checkpoint(const char* fn,
287 gmx_bool bNumberAndKeep,
292 IntegrationAlgorithm eIntegrator,
295 LambdaWeightCalculation elamstats,
299 ObservablesHistory* observablesHistory,
300 const gmx::MDModulesNotifiers& mdModulesNotifiers,
301 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData,
302 bool applyMpiBarrierBeforeRename,
303 MPI_Comm mpiBarrierCommunicator)
306 char* fntemp; /* the temporary checkpoint file name */
308 char buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
311 if (DOMAINDECOMP(cr))
313 npmenodes = cr->npmenodes;
321 /* make the new temporary filename */
322 snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
323 std::strcpy(fntemp, fn);
324 fntemp[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
325 sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
326 std::strcat(fntemp, suffix);
327 std::strcat(fntemp, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
329 /* if we can't rename, we just overwrite the cpt file.
330 * dangerous if interrupted.
332 snew(fntemp, std::strlen(fn));
333 std::strcpy(fntemp, fn);
335 std::string timebuf = gmx_format_current_time();
339 fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
342 /* Get offsets for open files */
343 auto outputfiles = gmx_fio_get_output_file_positions();
345 fp = gmx_fio_open(fntemp, "w");
347 /* We can check many more things now (CPU, acceleration, etc), but
348 * it is highly unlikely to have two separate builds with exactly
349 * the same version, user, time, and build host!
352 int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
354 edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
355 int nED = (edsamhist ? edsamhist->nED : 0);
357 swaphistory_t* swaphist = observablesHistory->swapHistory.get();
358 SwapType eSwapCoords = (swaphist ? swaphist->eSwapCoords : SwapType::No);
360 CheckpointHeaderContents headerContents = { 0,
378 state->nhchainlength,
389 std::strcpy(headerContents.version, gmx_version());
390 std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
391 std::strcpy(headerContents.ftime, timebuf.c_str());
392 if (DOMAINDECOMP(cr))
394 copy_ivec(domdecCells, headerContents.dd_nc);
397 write_checkpoint_data(fp,
405 modularSimulatorCheckpointData);
407 /* we really, REALLY, want to make sure to physically write the checkpoint,
408 and all the files it depends on, out to disk. Because we've
409 opened the checkpoint with gmx_fio_open(), it's in our list
411 ret = gmx_fio_all_output_fsync();
416 sprintf(buf, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
418 if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
424 gmx_warning("%s", buf);
428 if (gmx_fio_close(fp) != 0)
430 gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
433 /* we don't move the checkpoint if the user specified they didn't want it,
434 or if the fsyncs failed */
436 if (!bNumberAndKeep && !ret)
438 // Add a barrier before renaming to reduce chance to get out of sync (#2440)
439 // Note: Checkpoint might only exist on some ranks, so put barrier before if clause (#3919)
440 mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
443 /* Rename the previous checkpoint file */
444 std::strcpy(buf, fn);
445 buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
446 std::strcat(buf, "_prev");
447 std::strcat(buf, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
450 /* we copy here so that if something goes wrong between now and
451 * the rename below, there's always a state.cpt.
452 * If renames are atomic (such as in POSIX systems),
453 * this copying should be unneccesary.
455 gmx_file_copy(fn, buf, FALSE);
456 /* We don't really care if this fails:
457 * there's already a new checkpoint.
462 gmx_file_rename(fn, buf);
466 /* Rename the checkpoint file from the temporary to the final name */
467 mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
469 if (gmx_file_rename(fntemp, fn) != 0)
471 gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
474 #endif /* GMX_NO_RENAME */
479 /* Always FAH checkpoint immediately after a GROMACS checkpoint.
481 * Note that it is critical that we save a FAH checkpoint directly
482 * after writing a GROMACS checkpoint. If the program dies, either
483 * by the machine powering off suddenly or the process being,
484 * killed, FAH can recover files that have only appended data by
485 * truncating them to the last recorded length. The GROMACS
486 * checkpoint does not just append data, it is fully rewritten each
487 * time so a crash between moving the new Gromacs checkpoint file in
488 * to place and writing a FAH checkpoint is not recoverable. Thus
489 * the time between these operations must be kept as short as
493 #endif /* end GMX_FAHCORE block */
496 void mdoutf_write_checkpoint(gmx_mdoutf_t of,
501 t_state* state_global,
502 ObservablesHistory* observablesHistory,
503 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData)
506 fflush_tng(of->tng_low_prec);
507 /* Write the checkpoint file.
508 * When simulations share the state, an MPI barrier is applied before
509 * renaming old and new checkpoint files to minimize the risk of
510 * checkpoint files getting out of sync.
512 ivec one_ivec = { 1, 1, 1 };
513 write_checkpoint(of->fn_cpt,
517 DOMAINDECOMP(cr) ? cr->dd->numCells : one_ivec,
518 DOMAINDECOMP(cr) ? cr->dd->nnodes : cr->nnodes,
527 *(of->mdModulesNotifiers),
528 modularSimulatorCheckpointData,
529 of->simulationsShareState,
533 void mdoutf_write_to_trajectory_files(FILE* fplog,
540 t_state* state_local,
541 t_state* state_global,
542 ObservablesHistory* observablesHistory,
543 gmx::ArrayRef<const gmx::RVec> f_local,
544 gmx::WriteCheckpointDataHolder* modularSimulatorCheckpointData)
546 const rvec* f_global;
548 if (DOMAINDECOMP(cr))
550 if (mdof_flags & MDOF_CPT)
552 dd_collect_state(cr->dd, state_local, state_global);
556 if (mdof_flags & (MDOF_X | MDOF_X_COMPRESSED))
558 auto globalXRef = MASTER(cr) ? state_global->x : gmx::ArrayRef<gmx::RVec>();
559 dd_collect_vec(cr->dd,
560 state_local->ddp_count,
561 state_local->ddp_count_cg_gl,
566 if (mdof_flags & MDOF_V)
568 auto globalVRef = MASTER(cr) ? state_global->v : gmx::ArrayRef<gmx::RVec>();
569 dd_collect_vec(cr->dd,
570 state_local->ddp_count,
571 state_local->ddp_count_cg_gl,
577 f_global = of->f_global;
578 if (mdof_flags & MDOF_F)
580 auto globalFRef = MASTER(cr) ? gmx::arrayRefFromArray(
581 reinterpret_cast<gmx::RVec*>(of->f_global), of->natoms_global)
582 : gmx::ArrayRef<gmx::RVec>();
583 dd_collect_vec(cr->dd,
584 state_local->ddp_count,
585 state_local->ddp_count_cg_gl,
593 /* We have the whole state locally: copy the local state pointer */
594 state_global = state_local;
596 f_global = as_rvec_array(f_local.data());
601 if (mdof_flags & MDOF_CPT)
603 mdoutf_write_checkpoint(
604 of, fplog, cr, step, t, state_global, observablesHistory, modularSimulatorCheckpointData);
607 if (mdof_flags & (MDOF_X | MDOF_V | MDOF_F))
609 const rvec* x = (mdof_flags & MDOF_X) ? state_global->x.rvec_array() : nullptr;
610 const rvec* v = (mdof_flags & MDOF_V) ? state_global->v.rvec_array() : nullptr;
611 const rvec* f = (mdof_flags & MDOF_F) ? f_global : nullptr;
615 gmx_trr_write_frame(of->fp_trn,
618 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
624 if (gmx_fio_flush(of->fp_trn) != 0)
626 gmx_file("Cannot write trajectory; maybe you are out of disk space?");
630 /* If a TNG file is open for uncompressed coordinate output also write
631 velocities and forces to it. */
634 gmx_fwrite_tng(of->tng,
638 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
645 /* If only a TNG file is open for compressed coordinate output (no uncompressed
646 coordinate output) also write forces and velocities to it. */
647 else if (of->tng_low_prec)
649 gmx_fwrite_tng(of->tng_low_prec,
653 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
661 if (mdof_flags & MDOF_X_COMPRESSED)
663 rvec* xxtc = nullptr;
665 if (of->natoms_x_compressed == of->natoms_global)
667 /* We are writing the positions of all of the atoms to
668 the compressed output */
669 xxtc = state_global->x.rvec_array();
673 /* We are writing the positions of only a subset of
674 the atoms to the compressed output, so we have to
675 make a copy of the subset of coordinates. */
678 snew(xxtc, of->natoms_x_compressed);
679 auto x = makeArrayRef(state_global->x);
680 for (i = 0, j = 0; (i < of->natoms_global); i++)
682 if (getGroupType(*of->groups, SimulationAtomGroupType::CompressedPositionOutput, i) == 0)
684 copy_rvec(x[i], xxtc[j++]);
688 if (write_xtc(of->fp_xtc, of->natoms_x_compressed, step, t, state_local->box, xxtc, of->x_compression_precision)
692 "XTC error. This indicates you are out of disk space, or a "
693 "simulation with major instabilities resulting in coordinates "
694 "that are NaN or too large to be represented in the XTC format.\n");
696 gmx_fwrite_tng(of->tng_low_prec,
700 state_local->lambda[FreeEnergyPerturbationCouplingType::Fep],
702 of->natoms_x_compressed,
706 if (of->natoms_x_compressed != of->natoms_global)
711 if (mdof_flags & (MDOF_BOX | MDOF_LAMBDA) && !(mdof_flags & (MDOF_X | MDOF_V | MDOF_F)))
717 if (mdof_flags & MDOF_BOX)
719 box = state_local->box;
721 if (mdof_flags & MDOF_LAMBDA)
723 lambda = state_local->lambda[FreeEnergyPerturbationCouplingType::Fep];
725 gmx_fwrite_tng(of->tng, FALSE, step, t, lambda, box, natoms, nullptr, nullptr, nullptr);
728 if (mdof_flags & (MDOF_BOX_COMPRESSED | MDOF_LAMBDA_COMPRESSED)
729 && !(mdof_flags & (MDOF_X_COMPRESSED)))
731 if (of->tng_low_prec)
735 if (mdof_flags & MDOF_BOX_COMPRESSED)
737 box = state_local->box;
739 if (mdof_flags & MDOF_LAMBDA_COMPRESSED)
741 lambda = state_local->lambda[FreeEnergyPerturbationCouplingType::Fep];
743 gmx_fwrite_tng(of->tng_low_prec, FALSE, step, t, lambda, box, natoms, nullptr, nullptr, nullptr);
748 /* Write a FAH checkpoint after writing any other data. We may end up
749 checkpointing twice but it's fast so it's ok. */
750 if ((mdof_flags & ~MDOF_CPT))
758 void mdoutf_tng_close(gmx_mdoutf_t of)
760 if (of->tng || of->tng_low_prec)
762 wallcycle_start(of->wcycle, WallCycleCounter::Traj);
763 gmx_tng_close(&of->tng);
764 gmx_tng_close(&of->tng_low_prec);
765 wallcycle_stop(of->wcycle, WallCycleCounter::Traj);
769 void done_mdoutf(gmx_mdoutf_t of)
771 if (of->fp_ene != nullptr)
773 done_ener_file(of->fp_ene);
777 close_xtc(of->fp_xtc);
781 gmx_trr_close(of->fp_trn);
783 if (of->fp_dhdl != nullptr)
785 gmx_fio_fclose(of->fp_dhdl);
787 of->outputProvider->finishOutput();
788 if (of->f_global != nullptr)
793 gmx_tng_close(&of->tng);
794 gmx_tng_close(&of->tng_low_prec);
799 int mdoutf_get_tng_box_output_interval(gmx_mdoutf_t of)
803 return gmx_tng_get_box_output_interval(of->tng);
808 int mdoutf_get_tng_lambda_output_interval(gmx_mdoutf_t of)
812 return gmx_tng_get_lambda_output_interval(of->tng);
817 int mdoutf_get_tng_compressed_box_output_interval(gmx_mdoutf_t of)
819 if (of->tng_low_prec)
821 return gmx_tng_get_box_output_interval(of->tng_low_prec);
826 int mdoutf_get_tng_compressed_lambda_output_interval(gmx_mdoutf_t of)
828 if (of->tng_low_prec)
830 return gmx_tng_get_lambda_output_interval(of->tng_low_prec);