From ade8e131253c001acf53dd5d22796828569a757f Mon Sep 17 00:00:00 2001 From: Magnus Lundborg Date: Wed, 19 Feb 2020 13:17:14 +0100 Subject: [PATCH] Fix out of sync checkpoint files in simulations sharing state When multidir simulations share the state, the checkpoint files of the different simulations should all be from the same step. To ensure this, MPI barriers have been added before renaming the checkpoint files from their temporary to their final names. So now the contents can never be out of sync. In the worst, and rather unlikely, case that something going wrong during renaming, some checkpoint files could have temporary and some final names. Refs #2440. Change-Id: I88088abb726a36dbf9a9db2fa2eb4a46c3bf2cd7 --- docs/release-notes/2020/2020.1.rst | 11 ++++ src/gromacs/fileio/checkpoint.cpp | 26 +++++++- src/gromacs/fileio/checkpoint.h | 8 ++- src/gromacs/mdlib/mdoutf.cpp | 25 +++++++- src/gromacs/mdlib/mdoutf.h | 7 ++- src/gromacs/mdrun/md.cpp | 62 ++++++++++--------- src/gromacs/mdrun/mimic.cpp | 8 ++- src/gromacs/mdrun/minimize.cpp | 30 +++++---- src/gromacs/mdrun/rerun.cpp | 8 ++- .../modularsimulator/modularsimulator.cpp | 2 +- .../modularsimulator/trajectoryelement.cpp | 20 +++++- .../modularsimulator/trajectoryelement.h | 5 +- 12 files changed, 148 insertions(+), 64 deletions(-) diff --git a/docs/release-notes/2020/2020.1.rst b/docs/release-notes/2020/2020.1.rst index 2c38caf0ec..3d28b5c48c 100644 --- a/docs/release-notes/2020/2020.1.rst +++ b/docs/release-notes/2020/2020.1.rst @@ -120,6 +120,17 @@ and turn negative, leading to wrong lookup and unphysical values. :issue:`3391` +Fix checkpoint files getting out of sync with simulations sharing data +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +When simulations share data, e.g., replica exchange, AWH with bias sharing +or NMR ensemble averaging, MPI barrier have now been added before renaming +the checkpointing files to avoid that checkpoints files from the simulations +can get out of sync. Now in very unlikely cases some checkpoint files might +have temporary names, but all content will be in sync. + +:issue:`2440` + Fixes for ``gmx`` tools ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/gromacs/fileio/checkpoint.cpp b/src/gromacs/fileio/checkpoint.cpp index b842bcc013..03b5ae8fb1 100644 --- a/src/gromacs/fileio/checkpoint.cpp +++ b/src/gromacs/fileio/checkpoint.cpp @@ -1,7 +1,9 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2008,2009,2010,2011,2012 by the GROMACS development team. + * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team. + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -2175,6 +2177,18 @@ static int do_cpt_files(XDR* xd, gmx_bool bRead, std::vectorf_global = nullptr; of->outputProvider = outputProvider; + GMX_RELEASE_ASSERT(!simulationsShareState || ms != nullptr, + "Need valid multisim object when simulations share state"); + of->simulationsShareState = simulationsShareState; + if (of->simulationsShareState) + { + of->mpiCommMasters = ms->mpi_comm_masters; + } + if (MASTER(cr)) { of->bKeepAndNumCPT = mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles; @@ -295,12 +308,18 @@ void mdoutf_write_to_trajectory_files(FILE* fplog, { fflush_tng(of->tng); fflush_tng(of->tng_low_prec); + /* Write the checkpoint file. + * When simulations share the state, an MPI barrier is applied before + * renaming old and new checkpoint files to minimize the risk of + * checkpoint files getting out of sync. + */ ivec one_ivec = { 1, 1, 1 }; write_checkpoint(of->fn_cpt, of->bKeepAndNumCPT, fplog, cr, DOMAINDECOMP(cr) ? cr->dd->nc : one_ivec, DOMAINDECOMP(cr) ? cr->dd->nnodes : cr->nnodes, of->eIntegrator, of->simulation_part, of->bExpanded, of->elamstats, step, t, - state_global, observablesHistory, *(of->mdModulesNotifier)); + state_global, observablesHistory, *(of->mdModulesNotifier), + of->simulationsShareState, of->mpiCommMasters); } if (mdof_flags & (MDOF_X | MDOF_V | MDOF_F)) diff --git a/src/gromacs/mdlib/mdoutf.h b/src/gromacs/mdlib/mdoutf.h index efca1c6dd3..1eca3f9a35 100644 --- a/src/gromacs/mdlib/mdoutf.h +++ b/src/gromacs/mdlib/mdoutf.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,6 +45,7 @@ class energyhistory_t; struct gmx_mtop_t; +struct gmx_multisim_t; struct gmx_output_env_t; struct ObservablesHistory; struct t_commrec; @@ -77,7 +78,9 @@ gmx_mdoutf_t init_mdoutf(FILE* fplog, gmx_mtop_t* mtop, const gmx_output_env_t* oenv, gmx_wallcycle_t wcycle, - gmx::StartingBehavior startingBehavior); + gmx::StartingBehavior startingBehavior, + bool simulationsShareState, + const gmx_multisim_t* ms); /*! \brief Getter for file pointer */ ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of); diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 4b5ad914a3..6f64d3205c 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -257,12 +257,42 @@ void gmx::LegacySimulator::do_md() initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0); Update upd(ir, deform); const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd); + const bool useReplicaExchange = (replExParams.exchangeInterval > 0); + + bool simulationsShareState = false; + int nstSignalComm = nstglobalcomm; + { + // TODO This implementation of ensemble orientation restraints is nasty because + // a user can't just do multi-sim with single-sim orientation restraints. + bool usingEnsembleRestraints = + (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0)); + bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr)); + + // Replica exchange, ensemble restraints and AWH need all + // simulations to remain synchronized, so they need + // checkpoints and stop conditions to act on the same step, so + // the propagation of such signals must take place between + // simulations, not just within simulations. + // TODO: Make algorithm initializers set these flags. + simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim; + + if (simulationsShareState) + { + // Inter-simulation signal communication does not need to happen + // often, so we use a minimum of 200 steps to reduce overhead. + const int c_minimumInterSimulationSignallingInterval = 200; + nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm) + * nstglobalcomm; + } + } + if (startingBehavior != StartingBehavior::RestartWithAppending) { pleaseCiteCouplingAlgorithms(fplog, *ir); } - gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, - mdModulesNotifier, ir, top_global, oenv, wcycle, startingBehavior); + gmx_mdoutf* outf = + init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir, + top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms); gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work, mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier); @@ -419,7 +449,6 @@ void gmx::LegacySimulator::do_md() startingBehavior != StartingBehavior::NewSimulation, shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work); - const bool useReplicaExchange = (replExParams.exchangeInterval > 0); if (useReplicaExchange && MASTER(cr)) { repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams); @@ -661,33 +690,6 @@ void gmx::LegacySimulator::do_md() bExchanged = FALSE; bNeedRepartition = FALSE; - bool simulationsShareState = false; - int nstSignalComm = nstglobalcomm; - { - // TODO This implementation of ensemble orientation restraints is nasty because - // a user can't just do multi-sim with single-sim orientation restraints. - bool usingEnsembleRestraints = - (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0)); - bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr)); - - // Replica exchange, ensemble restraints and AWH need all - // simulations to remain synchronized, so they need - // checkpoints and stop conditions to act on the same step, so - // the propagation of such signals must take place between - // simulations, not just within simulations. - // TODO: Make algorithm initializers set these flags. - simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim; - - if (simulationsShareState) - { - // Inter-simulation signal communication does not need to happen - // often, so we use a minimum of 200 steps to reduce overhead. - const int c_minimumInterSimulationSignallingInterval = 200; - nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm) - * nstglobalcomm; - } - } - auto stopHandler = stopHandlerBuilder->getStopHandlerMD( compat::not_null(&signals[eglsSTOPCOND]), simulationsShareState, MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm, diff --git a/src/gromacs/mdrun/mimic.cpp b/src/gromacs/mdrun/mimic.cpp index 65e08ed342..0894cfb761 100644 --- a/src/gromacs/mdrun/mimic.cpp +++ b/src/gromacs/mdrun/mimic.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -226,8 +226,10 @@ void gmx::LegacySimulator::do_mimic() initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0); - gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, - ir, top_global, oenv, wcycle, StartingBehavior::NewSimulation); + const bool simulationsShareState = false; + gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, + mdModulesNotifier, ir, top_global, oenv, wcycle, + StartingBehavior::NewSimulation, simulationsShareState, ms); gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work, mdoutf_get_fp_dhdl(outf), true, StartingBehavior::NewSimulation, mdModulesNotifier); diff --git a/src/gromacs/mdrun/minimize.cpp b/src/gromacs/mdrun/minimize.cpp index e20b7e88b0..d90af4f9d7 100644 --- a/src/gromacs/mdrun/minimize.cpp +++ b/src/gromacs/mdrun/minimize.cpp @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -1084,9 +1084,10 @@ void LegacySimulator::do_cg() /* Init em and store the local state in s_min */ init_em(fplog, mdlog, CG, cr, inputrec, imdSession, pull_work, state_global, top_global, s_min, &top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, nullptr); - gmx_mdoutf* outf = - init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, - inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation); + const bool simulationsShareState = false; + gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, + mdModulesNotifier, inputrec, top_global, nullptr, wcycle, + StartingBehavior::NewSimulation, simulationsShareState, ms); gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work, nullptr, false, StartingBehavior::NewSimulation, mdModulesNotifier); @@ -1697,9 +1698,10 @@ void LegacySimulator::do_lbfgs() /* Init em */ init_em(fplog, mdlog, LBFGS, cr, inputrec, imdSession, pull_work, state_global, top_global, &ems, &top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, nullptr); - gmx_mdoutf* outf = - init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, - inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation); + const bool simulationsShareState = false; + gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, + mdModulesNotifier, inputrec, top_global, nullptr, wcycle, + StartingBehavior::NewSimulation, simulationsShareState, ms); gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work, nullptr, false, StartingBehavior::NewSimulation, mdModulesNotifier); @@ -2377,9 +2379,10 @@ void LegacySimulator::do_steep() /* Init em and store the local state in s_try */ init_em(fplog, mdlog, SD, cr, inputrec, imdSession, pull_work, state_global, top_global, s_try, &top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, nullptr); - gmx_mdoutf* outf = - init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, - inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation); + const bool simulationsShareState = false; + gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, + mdModulesNotifier, inputrec, top_global, nullptr, wcycle, + StartingBehavior::NewSimulation, simulationsShareState, ms); gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, inputrec, pull_work, nullptr, false, StartingBehavior::NewSimulation, mdModulesNotifier); @@ -2622,9 +2625,10 @@ void LegacySimulator::do_nm() /* Init em and store the local state in state_minimum */ init_em(fplog, mdlog, NM, cr, inputrec, imdSession, pull_work, state_global, top_global, &state_work, &top, nrnb, fr, &graph, mdAtoms, &gstat, vsite, constr, &shellfc); - gmx_mdoutf* outf = - init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, - inputrec, top_global, nullptr, wcycle, StartingBehavior::NewSimulation); + const bool simulationsShareState = false; + gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, + mdModulesNotifier, inputrec, top_global, nullptr, wcycle, + StartingBehavior::NewSimulation, simulationsShareState, ms); std::vector atom_index = get_atom_index(top_global); std::vector fneg(atom_index.size(), { 0, 0, 0 }); diff --git a/src/gromacs/mdrun/rerun.cpp b/src/gromacs/mdrun/rerun.cpp index c75be98e38..39b76695e8 100644 --- a/src/gromacs/mdrun/rerun.cpp +++ b/src/gromacs/mdrun/rerun.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -302,8 +302,10 @@ void gmx::LegacySimulator::do_rerun() } initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0); - gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, - ir, top_global, oenv, wcycle, StartingBehavior::NewSimulation); + const bool simulationsShareState = false; + gmx_mdoutf* outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, + mdModulesNotifier, ir, top_global, oenv, wcycle, + StartingBehavior::NewSimulation, simulationsShareState, ms); gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work, mdoutf_get_fp_dhdl(outf), true, StartingBehavior::NewSimulation, mdModulesNotifier); diff --git a/src/gromacs/modularsimulator/modularsimulator.cpp b/src/gromacs/modularsimulator/modularsimulator.cpp index cfa8f8cf69..14a52c4d77 100644 --- a/src/gromacs/modularsimulator/modularsimulator.cpp +++ b/src/gromacs/modularsimulator/modularsimulator.cpp @@ -474,7 +474,7 @@ void ModularSimulator::constructElementsAndSignallers() loggingSignallerBuilder.registerSignallerClient(compat::make_not_null(energySignaller.get())); auto trajectoryElement = trajectoryElementBuilder.build( fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, inputrec, - top_global, oenv, wcycle, startingBehavior); + top_global, oenv, wcycle, startingBehavior, simulationsShareState); loggingSignallerBuilder.registerSignallerClient(compat::make_not_null(trajectoryElement.get())); // Add checkpoint helper here since we need a pointer to the trajectory element and diff --git a/src/gromacs/modularsimulator/trajectoryelement.cpp b/src/gromacs/modularsimulator/trajectoryelement.cpp index 60998cdf06..1195b294cb 100644 --- a/src/gromacs/modularsimulator/trajectoryelement.cpp +++ b/src/gromacs/modularsimulator/trajectoryelement.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -63,10 +63,24 @@ TrajectoryElement::TrajectoryElement(std::vector signa gmx_mtop_t* top_global, const gmx_output_env_t* oenv, gmx_wallcycle* wcycle, - StartingBehavior startingBehavior) : + StartingBehavior startingBehavior, + const bool simulationsShareState) : writeEnergyStep_(-1), writeStateStep_(-1), - outf_(init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, inputrec, top_global, oenv, wcycle, startingBehavior)), + outf_(init_mdoutf(fplog, + nfile, + fnm, + mdrunOptions, + cr, + outputProvider, + mdModulesNotifier, + inputrec, + top_global, + oenv, + wcycle, + startingBehavior, + simulationsShareState, + nullptr)), nstxout_(inputrec->nstxout), nstvout_(inputrec->nstvout), nstfout_(inputrec->nstfout), diff --git a/src/gromacs/modularsimulator/trajectoryelement.h b/src/gromacs/modularsimulator/trajectoryelement.h index d33656a85b..fd70602024 100644 --- a/src/gromacs/modularsimulator/trajectoryelement.h +++ b/src/gromacs/modularsimulator/trajectoryelement.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -159,7 +159,8 @@ private: gmx_mtop_t* top_global, const gmx_output_env_t* oenv, gmx_wallcycle* wcycle, - StartingBehavior startingBehavior); + StartingBehavior startingBehavior, + bool simulationsSharingState); //! The next energy writing step Step writeEnergyStep_; -- 2.22.0