:issue:`3391`
+Fix checkpoint files getting out of sync with simulations sharing data
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+When simulations share data, e.g., replica exchange, AWH with bias sharing
+or NMR ensemble averaging, an MPI barrier is now applied before renaming
+the checkpoint files, to prevent the checkpoint files of the different
+simulations from getting out of sync. In very unlikely cases some checkpoint
+files might now have temporary names, but all content will be in sync.
+
+:issue:`2440`
+
Fixes for ``gmx`` tools
^^^^^^^^^^^^^^^^^^^^^^^
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2008,2009,2010,2011,2012 by the GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
return 0;
}
/*! \brief Optionally applies an MPI barrier before a checkpoint file is renamed.
 *
 * Does nothing when \p applyMpiBarrierBeforeRename is false. Otherwise, when
 * GMX_MPI is set, MPI_Barrier() is called on \p mpiBarrierCommunicator; when
 * GMX_MPI is not set, requesting a barrier is treated as a programming error
 * and a release assertion is triggered.
 *
 * \param[in] applyMpiBarrierBeforeRename  Whether a barrier should be applied
 * \param[in] mpiBarrierCommunicator       Communicator handed to MPI_Barrier()
 */
+static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
+{
+ if (applyMpiBarrierBeforeRename)
+ {
+#if GMX_MPI
+ MPI_Barrier(mpiBarrierCommunicator);
+#else
/* A barrier request cannot be honored without MPI. The communicator is not
 * otherwise referenced in this branch, hence the explicit unused-value marker.
 */
+ GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
+ GMX_UNUSED_VALUE(mpiBarrierCommunicator);
+#endif
+ }
+}
void write_checkpoint(const char* fn,
gmx_bool bNumberAndKeep,
double t,
t_state* state,
ObservablesHistory* observablesHistory,
- const gmx::MdModulesNotifier& mdModulesNotifier)
+ const gmx::MdModulesNotifier& mdModulesNotifier,
+ bool applyMpiBarrierBeforeRename,
+ MPI_Comm mpiBarrierCommunicator)
{
t_fileio* fp;
char* fntemp; /* the temporary checkpoint file name */
if (gmx_fexist(fn))
{
/* Rename the previous checkpoint file */
+ mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
+
std::strcpy(buf, fn);
buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
std::strcat(buf, "_prev");
gmx_file_rename(fn, buf);
}
}
+
+ /* Rename the checkpoint file from the temporary to the final name */
+ mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
+
if (gmx_file_rename(fntemp, fn) != 0)
{
gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/keyvaluetreebuilder.h"
class energyhistory_t;
double t,
t_state* state,
ObservablesHistory* observablesHistory,
- const gmx::MdModulesNotifier& notifier);
+ const gmx::MdModulesNotifier& notifier,
+ bool applyMpiBarrierBeforeRename,
+ MPI_Comm mpiBarrierCommunicator);
/* Loads a checkpoint from fn for run continuation.
* Generates a fatal error on system size mismatch.
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/trajectory_writing.h"
#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdrunutility/multisim.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/imdoutputprovider.h"
#include "gromacs/mdtypes/inputrec.h"
rvec* f_global;
gmx::IMDOutputProvider* outputProvider;
const gmx::MdModulesNotifier* mdModulesNotifier;
+ bool simulationsShareState;
+ MPI_Comm mpiCommMasters;
};
gmx_mtop_t* top_global,
const gmx_output_env_t* oenv,
gmx_wallcycle_t wcycle,
- const gmx::StartingBehavior startingBehavior)
+ const gmx::StartingBehavior startingBehavior,
+ bool simulationsShareState,
+ const gmx_multisim_t* ms)
{
gmx_mdoutf_t of;
const char * appendMode = "a+", *writeMode = "w+", *filemode;
of->f_global = nullptr;
of->outputProvider = outputProvider;
+ GMX_RELEASE_ASSERT(!simulationsShareState || ms != nullptr,
+ "Need valid multisim object when simulations share state");
+ of->simulationsShareState = simulationsShareState;
+ if (of->simulationsShareState)
+ {
+ of->mpiCommMasters = ms->mpi_comm_masters;
+ }
+
if (MASTER(cr))
{
of->bKeepAndNumCPT = mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles;
{
fflush_tng(of->tng);
fflush_tng(of->tng_low_prec);
+ /* Write the checkpoint file.
+ * When simulations share the state, an MPI barrier is applied before
+ * renaming old and new checkpoint files to minimize the risk of
+ * checkpoint files getting out of sync.
+ */
ivec one_ivec = { 1, 1, 1 };
write_checkpoint(of->fn_cpt, of->bKeepAndNumCPT, fplog, cr,
DOMAINDECOMP(cr) ? cr->dd->nc : one_ivec,
DOMAINDECOMP(cr) ? cr->dd->nnodes : cr->nnodes, of->eIntegrator,
of->simulation_part, of->bExpanded, of->elamstats, step, t,
- state_global, observablesHistory, *(of->mdModulesNotifier));
+ state_global, observablesHistory, *(of->mdModulesNotifier),
+ of->simulationsShareState, of->mpiCommMasters);
}
if (mdof_flags & (MDOF_X | MDOF_V | MDOF_F))
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
class energyhistory_t;
struct gmx_mtop_t;
+struct gmx_multisim_t;
struct gmx_output_env_t;
struct ObservablesHistory;
struct t_commrec;
gmx_mtop_t* mtop,
const gmx_output_env_t* oenv,
gmx_wallcycle_t wcycle,
- gmx::StartingBehavior startingBehavior);
+ gmx::StartingBehavior startingBehavior,
+ bool simulationsShareState,
+ const gmx_multisim_t* ms);
/*! \brief Getter for file pointer */
ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of);
initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
Update upd(ir, deform);
const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
+ const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
+
+ bool simulationsShareState = false;
+ int nstSignalComm = nstglobalcomm;
+ {
+ // TODO This implementation of ensemble orientation restraints is nasty because
+ // a user can't just do multi-sim with single-sim orientation restraints.
+ bool usingEnsembleRestraints =
+ (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
+ bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
+
+ // Replica exchange, ensemble restraints and AWH need all
+ // simulations to remain synchronized, so they need
+ // checkpoints and stop conditions to act on the same step, so
+ // the propagation of such signals must take place between
+ // simulations, not just within simulations.
+ // TODO: Make algorithm initializers set these flags.
+ simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
+
+ if (simulationsShareState)
+ {
+ // Inter-simulation signal communication does not need to happen
+ // often, so we use a minimum of 200 steps to reduce overhead.
+ const int c_minimumInterSimulationSignallingInterval = 200;
+ nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
+ * nstglobalcomm;
+ }
+ }
+
if (startingBehavior != StartingBehavior::RestartWithAppending)
{
pleaseCiteCouplingAlgorithms(fplog, *ir);
}
- gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
- mdModulesNotifier, ir, top_global, oenv, wcycle, startingBehavior);
+ gmx_mdoutf* outf =
+ init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
+ top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
startingBehavior != StartingBehavior::NewSimulation,
shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
- const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
if (useReplicaExchange && MASTER(cr))
{
repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
bExchanged = FALSE;
bNeedRepartition = FALSE;
- bool simulationsShareState = false;
- int nstSignalComm = nstglobalcomm;
- {
- // TODO This implementation of ensemble orientation restraints is nasty because
- // a user can't just do multi-sim with single-sim orientation restraints.
- bool usingEnsembleRestraints =
- (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
- bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
-
- // Replica exchange, ensemble restraints and AWH need all
- // simulations to remain synchronized, so they need
- // checkpoints and stop conditions to act on the same step, so
- // the propagation of such signals must take place between
- // simulations, not just within simulations.
- // TODO: Make algorithm initializers set these flags.
- simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
-
- if (simulationsShareState)
- {
- // Inter-simulation signal communication does not need to happen
- // often, so we use a minimum of 200 steps to reduce overhead.
- const int c_minimumInterSimulationSignallingInterval = 200;
- nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
- * nstglobalcomm;
- }
- }
-
auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
- gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier,
- ir, top_global, oenv, wcycle, StartingBehavior::NewSimulation);
+ const bool simulationsShareState = false;
+ gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
+ mdModulesNotifier, ir, top_global, oenv, wcycle,
+ StartingBehavior::NewSimulation, simulationsShareState, ms);
gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
mdoutf_get_fp_dhdl(outf), true, StartingBehavior::NewSimulation,
mdModulesNotifier);
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* Init em and store the local state in s_min */
init_em(fplog, mdlog, CG, cr, inputrec, imdSession, pull_work, state_global, top_global, s_min,
&top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, nullptr);
- gmx_mdoutf* outf =
- init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier,
- inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation);
+ const bool simulationsShareState = false;
+ gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
+ mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
+ StartingBehavior::NewSimulation, simulationsShareState, ms);
gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work, nullptr,
false, StartingBehavior::NewSimulation, mdModulesNotifier);
/* Init em */
init_em(fplog, mdlog, LBFGS, cr, inputrec, imdSession, pull_work, state_global, top_global,
&ems, &top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, nullptr);
- gmx_mdoutf* outf =
- init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier,
- inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation);
+ const bool simulationsShareState = false;
+ gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
+ mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
+ StartingBehavior::NewSimulation, simulationsShareState, ms);
gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work, nullptr,
false, StartingBehavior::NewSimulation, mdModulesNotifier);
/* Init em and store the local state in s_try */
init_em(fplog, mdlog, SD, cr, inputrec, imdSession, pull_work, state_global, top_global, s_try,
&top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, nullptr);
- gmx_mdoutf* outf =
- init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier,
- inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation);
+ const bool simulationsShareState = false;
+ gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
+ mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
+ StartingBehavior::NewSimulation, simulationsShareState, ms);
gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work, nullptr,
false, StartingBehavior::NewSimulation, mdModulesNotifier);
/* Init em and store the local state in state_minimum */
init_em(fplog, mdlog, NM, cr, inputrec, imdSession, pull_work, state_global, top_global,
&state_work, &top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, &shellfc);
- gmx_mdoutf* outf =
- init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier,
- inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation);
+ const bool simulationsShareState = false;
+ gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
+ mdModulesNotifier, inputrec, top_global, nullptr, wcycle,
+ StartingBehavior::NewSimulation, simulationsShareState, ms);
std::vector<int> atom_index = get_atom_index(top_global);
std::vector<gmx::RVec> fneg(atom_index.size(), { 0, 0, 0 });
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
- gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier,
- ir, top_global, oenv, wcycle, StartingBehavior::NewSimulation);
+ const bool simulationsShareState = false;
+ gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
+ mdModulesNotifier, ir, top_global, oenv, wcycle,
+ StartingBehavior::NewSimulation, simulationsShareState, ms);
gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
mdoutf_get_fp_dhdl(outf), true, StartingBehavior::NewSimulation,
mdModulesNotifier);
loggingSignallerBuilder.registerSignallerClient(compat::make_not_null(energySignaller.get()));
auto trajectoryElement = trajectoryElementBuilder.build(
fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, inputrec,
- top_global, oenv, wcycle, startingBehavior);
+ top_global, oenv, wcycle, startingBehavior, simulationsShareState);
loggingSignallerBuilder.registerSignallerClient(compat::make_not_null(trajectoryElement.get()));
// Add checkpoint helper here since we need a pointer to the trajectory element and
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
gmx_mtop_t* top_global,
const gmx_output_env_t* oenv,
gmx_wallcycle* wcycle,
- StartingBehavior startingBehavior) :
+ StartingBehavior startingBehavior,
+ const bool simulationsShareState) :
writeEnergyStep_(-1),
writeStateStep_(-1),
- outf_(init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, inputrec, top_global, oenv, wcycle, startingBehavior)),
+ outf_(init_mdoutf(fplog,
+ nfile,
+ fnm,
+ mdrunOptions,
+ cr,
+ outputProvider,
+ mdModulesNotifier,
+ inputrec,
+ top_global,
+ oenv,
+ wcycle,
+ startingBehavior,
+ simulationsShareState,
+ nullptr)),
nstxout_(inputrec->nstxout),
nstvout_(inputrec->nstvout),
nstfout_(inputrec->nstfout),
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
gmx_mtop_t* top_global,
const gmx_output_env_t* oenv,
gmx_wallcycle* wcycle,
- StartingBehavior startingBehavior);
+ StartingBehavior startingBehavior,
+ bool simulationsSharingState);
//! The next energy writing step
Step writeEnergyStep_;