#include "checkpoint.h"
-#include "config.h"
-
#include <cerrno>
#include <cstdlib>
#include <cstring>
return 0;
}
-static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
-{
- if (applyMpiBarrierBeforeRename)
- {
-#if GMX_MPI
- MPI_Barrier(mpiBarrierCommunicator);
-#else
- GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
- GMX_UNUSED_VALUE(mpiBarrierCommunicator);
-#endif
- }
-}
-
-void write_checkpoint(const char* fn,
- gmx_bool bNumberAndKeep,
- FILE* fplog,
- const t_commrec* cr,
- ivec domdecCells,
- int nppnodes,
- int eIntegrator,
- int simulation_part,
- gmx_bool bExpanded,
- int elamstats,
- int64_t step,
- double t,
- t_state* state,
- ObservablesHistory* observablesHistory,
- const gmx::MdModulesNotifier& mdModulesNotifier,
- bool applyMpiBarrierBeforeRename,
- MPI_Comm mpiBarrierCommunicator)
+void write_checkpoint_data(t_fileio* fp,
+ CheckpointHeaderContents headerContents,
+ gmx_bool bExpanded,
+ int elamstats,
+ t_state* state,
+ ObservablesHistory* observablesHistory,
+ const gmx::MdModulesNotifier& mdModulesNotifier,
+ std::vector<gmx_file_position_t>* outputfiles)
{
- t_fileio* fp;
- char* fntemp; /* the temporary checkpoint file name */
- int npmenodes;
- char buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
- t_fileio* ret;
-
- if (DOMAINDECOMP(cr))
- {
- npmenodes = cr->npmenodes;
- }
- else
- {
- npmenodes = 0;
- }
-
-#if !GMX_NO_RENAME
- /* make the new temporary filename */
- snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
- std::strcpy(fntemp, fn);
- fntemp[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
- sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
- std::strcat(fntemp, suffix);
- std::strcat(fntemp, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
-#else
- /* if we can't rename, we just overwrite the cpt file.
- * dangerous if interrupted.
- */
- snew(fntemp, std::strlen(fn));
- std::strcpy(fntemp, fn);
-#endif
- std::string timebuf = gmx_format_current_time();
-
- if (fplog)
- {
- fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
- }
-
- /* Get offsets for open files */
- auto outputfiles = gmx_fio_get_output_file_positions();
-
- fp = gmx_fio_open(fntemp, "w");
-
- int flags_eks;
+ headerContents.flags_eks = 0;
if (state->ekinstate.bUpToDate)
{
- flags_eks = ((1 << eeksEKIN_N) | (1 << eeksEKINH) | (1 << eeksEKINF) | (1 << eeksEKINO)
- | (1 << eeksEKINSCALEF) | (1 << eeksEKINSCALEH) | (1 << eeksVSCALE)
- | (1 << eeksDEKINDL) | (1 << eeksMVCOS));
- }
- else
- {
- flags_eks = 0;
+ headerContents.flags_eks = ((1 << eeksEKIN_N) | (1 << eeksEKINH) | (1 << eeksEKINF)
+ | (1 << eeksEKINO) | (1 << eeksEKINSCALEF) | (1 << eeksEKINSCALEH)
+ | (1 << eeksVSCALE) | (1 << eeksDEKINDL) | (1 << eeksMVCOS));
}
- energyhistory_t* enerhist = observablesHistory->energyHistory.get();
- int flags_enh = 0;
+ energyhistory_t* enerhist = observablesHistory->energyHistory.get();
+ headerContents.flags_enh = 0;
if (enerhist != nullptr && (enerhist->nsum > 0 || enerhist->nsum_sim > 0))
{
- flags_enh |= (1 << eenhENERGY_N) | (1 << eenhENERGY_NSTEPS) | (1 << eenhENERGY_NSTEPS_SIM);
+ headerContents.flags_enh |=
+ (1 << eenhENERGY_N) | (1 << eenhENERGY_NSTEPS) | (1 << eenhENERGY_NSTEPS_SIM);
if (enerhist->nsum > 0)
{
- flags_enh |= ((1 << eenhENERGY_AVER) | (1 << eenhENERGY_SUM) | (1 << eenhENERGY_NSUM));
+ headerContents.flags_enh |=
+ ((1 << eenhENERGY_AVER) | (1 << eenhENERGY_SUM) | (1 << eenhENERGY_NSUM));
}
if (enerhist->nsum_sim > 0)
{
- flags_enh |= ((1 << eenhENERGY_SUM_SIM) | (1 << eenhENERGY_NSUM_SIM));
+ headerContents.flags_enh |= ((1 << eenhENERGY_SUM_SIM) | (1 << eenhENERGY_NSUM_SIM));
}
if (enerhist->deltaHForeignLambdas != nullptr)
{
- flags_enh |= ((1 << eenhENERGY_DELTA_H_NN) | (1 << eenhENERGY_DELTA_H_LIST)
- | (1 << eenhENERGY_DELTA_H_STARTTIME) | (1 << eenhENERGY_DELTA_H_STARTLAMBDA));
+ headerContents.flags_enh |=
+ ((1 << eenhENERGY_DELTA_H_NN) | (1 << eenhENERGY_DELTA_H_LIST)
+ | (1 << eenhENERGY_DELTA_H_STARTTIME) | (1 << eenhENERGY_DELTA_H_STARTLAMBDA));
}
}
- PullHistory* pullHist = observablesHistory->pullHistory.get();
- int flagsPullHistory = 0;
+ PullHistory* pullHist = observablesHistory->pullHistory.get();
+ headerContents.flagsPullHistory = 0;
if (pullHist != nullptr && (pullHist->numValuesInXSum > 0 || pullHist->numValuesInFSum > 0))
{
- flagsPullHistory |= (1 << epullhPULL_NUMCOORDINATES);
- flagsPullHistory |= ((1 << epullhPULL_NUMGROUPS) | (1 << epullhPULL_NUMVALUESINXSUM)
- | (1 << epullhPULL_NUMVALUESINFSUM));
+ headerContents.flagsPullHistory |= (1 << epullhPULL_NUMCOORDINATES);
+ headerContents.flagsPullHistory |= ((1 << epullhPULL_NUMGROUPS) | (1 << epullhPULL_NUMVALUESINXSUM)
+ | (1 << epullhPULL_NUMVALUESINFSUM));
}
- int flags_dfh;
+ headerContents.flags_dfh = 0;
if (bExpanded)
{
- flags_dfh = ((1 << edfhBEQUIL) | (1 << edfhNATLAMBDA) | (1 << edfhSUMWEIGHTS)
- | (1 << edfhSUMDG) | (1 << edfhTIJ) | (1 << edfhTIJEMP));
+ headerContents.flags_dfh = ((1 << edfhBEQUIL) | (1 << edfhNATLAMBDA) | (1 << edfhSUMWEIGHTS)
+ | (1 << edfhSUMDG) | (1 << edfhTIJ) | (1 << edfhTIJEMP));
if (EWL(elamstats))
{
- flags_dfh |= ((1 << edfhWLDELTA) | (1 << edfhWLHISTO));
+ headerContents.flags_dfh |= ((1 << edfhWLDELTA) | (1 << edfhWLHISTO));
}
if ((elamstats == elamstatsMINVAR) || (elamstats == elamstatsBARKER)
|| (elamstats == elamstatsMETROPOLIS))
{
- flags_dfh |= ((1 << edfhACCUMP) | (1 << edfhACCUMM) | (1 << edfhACCUMP2)
- | (1 << edfhACCUMM2) | (1 << edfhSUMMINVAR) | (1 << edfhSUMVAR));
+ headerContents.flags_dfh |= ((1 << edfhACCUMP) | (1 << edfhACCUMM) | (1 << edfhACCUMP2)
+ | (1 << edfhACCUMM2) | (1 << edfhSUMMINVAR) | (1 << edfhSUMVAR));
}
}
- else
- {
- flags_dfh = 0;
- }
- int flags_awhh = 0;
+ headerContents.flags_awhh = 0;
if (state->awhHistory != nullptr && !state->awhHistory->bias.empty())
{
- flags_awhh |= ((1 << eawhhIN_INITIAL) | (1 << eawhhEQUILIBRATEHISTOGRAM) | (1 << eawhhHISTSIZE)
- | (1 << eawhhNPOINTS) | (1 << eawhhCOORDPOINT) | (1 << eawhhUMBRELLAGRIDPOINT)
- | (1 << eawhhUPDATELIST) | (1 << eawhhLOGSCALEDSAMPLEWEIGHT)
- | (1 << eawhhNUMUPDATES) | (1 << eawhhFORCECORRELATIONGRID));
- }
-
- /* We can check many more things now (CPU, acceleration, etc), but
- * it is highly unlikely to have two separate builds with exactly
- * the same version, user, time, and build host!
- */
-
- int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
-
- edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
- int nED = (edsamhist ? edsamhist->nED : 0);
-
- swaphistory_t* swaphist = observablesHistory->swapHistory.get();
- int eSwapCoords = (swaphist ? swaphist->eSwapCoords : eswapNO);
-
- CheckpointHeaderContents headerContents = { 0,
- { 0 },
- { 0 },
- { 0 },
- { 0 },
- GMX_DOUBLE,
- { 0 },
- { 0 },
- eIntegrator,
- simulation_part,
- step,
- t,
- nppnodes,
- { 0 },
- npmenodes,
- state->natoms,
- state->ngtc,
- state->nnhpres,
- state->nhchainlength,
- nlambda,
- state->flags,
- flags_eks,
- flags_enh,
- flagsPullHistory,
- flags_dfh,
- flags_awhh,
- nED,
- eSwapCoords };
- std::strcpy(headerContents.version, gmx_version());
- std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
- std::strcpy(headerContents.ftime, timebuf.c_str());
- if (DOMAINDECOMP(cr))
- {
- copy_ivec(domdecCells, headerContents.dd_nc);
+ headerContents.flags_awhh |=
+ ((1 << eawhhIN_INITIAL) | (1 << eawhhEQUILIBRATEHISTOGRAM) | (1 << eawhhHISTSIZE)
+ | (1 << eawhhNPOINTS) | (1 << eawhhCOORDPOINT) | (1 << eawhhUMBRELLAGRIDPOINT)
+ | (1 << eawhhUPDATELIST) | (1 << eawhhLOGSCALEDSAMPLEWEIGHT)
+ | (1 << eawhhNUMUPDATES) | (1 << eawhhFORCECORRELATIONGRID));
}
do_cpt_header(gmx_fio_getxdr(fp), FALSE, nullptr, &headerContents);
if ((do_cpt_state(gmx_fio_getxdr(fp), state->flags, state, nullptr) < 0)
- || (do_cpt_ekinstate(gmx_fio_getxdr(fp), flags_eks, &state->ekinstate, nullptr) < 0)
- || (do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, flags_enh, enerhist, nullptr) < 0)
- || (doCptPullHist(gmx_fio_getxdr(fp), FALSE, flagsPullHistory, pullHist, StatePart::pullHistory, nullptr)
+ || (do_cpt_ekinstate(gmx_fio_getxdr(fp), headerContents.flags_eks, &state->ekinstate, nullptr) < 0)
+ || (do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, headerContents.flags_enh, enerhist, nullptr) < 0)
+ || (doCptPullHist(gmx_fio_getxdr(fp), FALSE, headerContents.flagsPullHistory, pullHist,
+ StatePart::pullHistory, nullptr)
+ < 0)
+ || (do_cpt_df_hist(gmx_fio_getxdr(fp), headerContents.flags_dfh, headerContents.nlambda,
+ &state->dfhist, nullptr)
+ < 0)
+ || (do_cpt_EDstate(gmx_fio_getxdr(fp), FALSE, headerContents.nED,
+ observablesHistory->edsamHistory.get(), nullptr)
+ < 0)
+ || (do_cpt_awh(gmx_fio_getxdr(fp), FALSE, headerContents.flags_awhh, state->awhHistory.get(), nullptr) < 0)
+ || (do_cpt_swapstate(gmx_fio_getxdr(fp), FALSE, headerContents.eSwapCoords,
+ observablesHistory->swapHistory.get(), nullptr)
< 0)
- || (do_cpt_df_hist(gmx_fio_getxdr(fp), flags_dfh, nlambda, &state->dfhist, nullptr) < 0)
- || (do_cpt_EDstate(gmx_fio_getxdr(fp), FALSE, nED, edsamhist, nullptr) < 0)
- || (do_cpt_awh(gmx_fio_getxdr(fp), FALSE, flags_awhh, state->awhHistory.get(), nullptr) < 0)
- || (do_cpt_swapstate(gmx_fio_getxdr(fp), FALSE, eSwapCoords, swaphist, nullptr) < 0)
- || (do_cpt_files(gmx_fio_getxdr(fp), FALSE, &outputfiles, nullptr, headerContents.file_version) < 0))
+ || (do_cpt_files(gmx_fio_getxdr(fp), FALSE, outputfiles, nullptr, headerContents.file_version) < 0))
{
gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
}
}
do_cpt_footer(gmx_fio_getxdr(fp), headerContents.file_version);
-
- /* we really, REALLY, want to make sure to physically write the checkpoint,
- and all the files it depends on, out to disk. Because we've
- opened the checkpoint with gmx_fio_open(), it's in our list
- of open files. */
- ret = gmx_fio_all_output_fsync();
-
- if (ret)
- {
- char buf[STRLEN];
- sprintf(buf, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
-
- if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
- {
- gmx_file(buf);
- }
- else
- {
- gmx_warning("%s", buf);
- }
- }
-
- if (gmx_fio_close(fp) != 0)
- {
- gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
- }
-
- /* we don't move the checkpoint if the user specified they didn't want it,
- or if the fsyncs failed */
-#if !GMX_NO_RENAME
- if (!bNumberAndKeep && !ret)
- {
- if (gmx_fexist(fn))
- {
- /* Rename the previous checkpoint file */
- mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
-
- std::strcpy(buf, fn);
- buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
- std::strcat(buf, "_prev");
- std::strcat(buf, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
- if (!GMX_FAHCORE)
- {
- /* we copy here so that if something goes wrong between now and
- * the rename below, there's always a state.cpt.
- * If renames are atomic (such as in POSIX systems),
- * this copying should be unneccesary.
- */
- gmx_file_copy(fn, buf, FALSE);
- /* We don't really care if this fails:
- * there's already a new checkpoint.
- */
- }
- else
- {
- gmx_file_rename(fn, buf);
- }
- }
-
- /* Rename the checkpoint file from the temporary to the final name */
- mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
-
- if (gmx_file_rename(fntemp, fn) != 0)
- {
- gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
- }
- }
-#endif /* GMX_NO_RENAME */
-
- sfree(fntemp);
-
-#if GMX_FAHCORE
- /*code for alternate checkpointing scheme. moved from top of loop over
- steps */
- fcRequestCheckPoint();
- if (fcCheckPointParallel(cr->nodeid, NULL, 0) == 0)
- {
- gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", step);
- }
-#endif /* end GMX_FAHCORE block */
}
static void check_int(FILE* fplog, const char* type, int p, int f, gmx_bool* mm)
int eSwapCoords;
};
-/* Write a checkpoint to <fn>.cpt
- * Appends the _step<step>.cpt with bNumberAndKeep,
- * otherwise moves the previous <fn>.cpt to <fn>_prev.cpt
- */
-void write_checkpoint(const char* fn,
- gmx_bool bNumberAndKeep,
- FILE* fplog,
- const t_commrec* cr,
- ivec domdecCells,
- int nppnodes,
- int eIntegrator,
- int simulation_part,
- gmx_bool bExpanded,
- int elamstats,
- int64_t step,
- double t,
- t_state* state,
- ObservablesHistory* observablesHistory,
- const gmx::MdModulesNotifier& notifier,
- bool applyMpiBarrierBeforeRename,
- MPI_Comm mpiBarrierCommunicator);
+/*! \brief Low-level checkpoint writing function
+ *
+ * Fills in the history/state flag fields of \p headerContents from the
+ * current contents of \p state and \p observablesHistory, then writes the
+ * checkpoint header, the state and history sections, the output file
+ * positions and the footer to \p fp. If one of the section writers reports
+ * failure, the error is reported through gmx_file().
+ *
+ * Opening, syncing, closing and renaming of the checkpoint file are left
+ * to the caller.
+ *
+ * \param[in]     fp                  File handle the checkpoint is written to
+ * \param[in]     headerContents      Header values prepared by the caller; taken by
+ *                                    value, so the flag fields set here do not
+ *                                    propagate back to the caller
+ * \param[in]     bExpanded           Whether the expanded-ensemble history flags are set
+ * \param[in]     elamstats           Lambda statistics method, selects further
+ *                                    expanded-ensemble history flags
+ * \param[in]     state               State to write
+ * \param[in]     observablesHistory  Energy, pull, essential-dynamics and swap
+ *                                    histories to write
+ * \param[in]     notifier            MdModules notifier; its use is not visible in this
+ *                                    excerpt -- presumably it lets MD modules add their
+ *                                    own checkpoint data (confirm)
+ * \param[in,out] outputfiles         Positions of the open output files, passed on to
+ *                                    do_cpt_files()
+ */
+void write_checkpoint_data(t_fileio* fp,
+ CheckpointHeaderContents headerContents,
+ gmx_bool bExpanded,
+ int elamstats,
+ t_state* state,
+ ObservablesHistory* observablesHistory,
+ const gmx::MdModulesNotifier& notifier,
+ std::vector<gmx_file_position_t>* outputfiles);
/* Loads a checkpoint from fn for run continuation.
* Generates a fatal error on system size mismatch.
#include "gromacs/mdlib/tgroup.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdlib/vcm.h"
-#include "gromacs/mdrunutility/multisim.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/df_history.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/snprintf.h"
-// TODO move this to multi-sim module
-bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value)
-{
- bool allValuesAreEqual = true;
- int64_t* buf;
-
- GMX_RELEASE_ASSERT(ms, "Invalid use of multi-simulation pointer");
-
- snew(buf, ms->numSimulations_);
- /* send our value to all other master ranks, receive all of theirs */
- buf[ms->simulationIndex_] = value;
- gmx_sumli_sim(ms->numSimulations_, buf, ms);
-
- for (int s = 0; s < ms->numSimulations_; s++)
- {
- if (buf[s] != value)
- {
- allValuesAreEqual = false;
- break;
- }
- }
-
- sfree(buf);
-
- return allValuesAreEqual;
-}
-
-int multisim_min(const gmx_multisim_t* ms, int nmin, int n)
-{
- int* buf;
- gmx_bool bPos, bEqual;
- int s, d;
-
- snew(buf, ms->numSimulations_);
- buf[ms->simulationIndex_] = n;
- gmx_sumi_sim(ms->numSimulations_, buf, ms);
- bPos = TRUE;
- bEqual = TRUE;
- for (s = 0; s < ms->numSimulations_; s++)
- {
- bPos = bPos && (buf[s] > 0);
- bEqual = bEqual && (buf[s] == buf[0]);
- }
- if (bPos)
- {
- if (bEqual)
- {
- nmin = std::min(nmin, buf[0]);
- }
- else
- {
- /* Find the least common multiple */
- for (d = 2; d < nmin; d++)
- {
- s = 0;
- while (s < ms->numSimulations_ && d % buf[s] == 0)
- {
- s++;
- }
- if (s == ms->numSimulations_)
- {
- /* We found the LCM and it is less than nmin */
- nmin = d;
- break;
- }
- }
- }
- }
- sfree(buf);
-
- return nmin;
-}
-
static void calc_ke_part_normal(gmx::ArrayRef<const gmx::RVec> v,
const t_grpopts* opts,
const t_mdatoms* md,
struct gmx_ekindata_t;
struct gmx_enerdata_t;
struct gmx_global_stat;
-struct gmx_multisim_t;
struct gmx_signalling_t;
struct t_extmass;
struct t_forcerec;
* inputrec. */
int computeGlobalCommunicationPeriod(const gmx::MDLogger& mdlog, t_inputrec* ir, const t_commrec* cr);
-/*! \brief Return true if the \p value is equal across the set of multi-simulations
- *
- * \todo This duplicates some of check_multi_int. Consolidate. */
-bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value);
-
void rerun_parallel_comm(t_commrec* cr, t_trxframe* fr, gmx_bool* bLastStep);
//! \brief Allocate and initialize node-local state entries
gmx::ArrayRef<real> lambda,
int currentFEPState);
-int multisim_min(const gmx_multisim_t* ms, int nmin, int n);
-/* Set an appropriate value for n across the whole multi-simulation */
-
/* Compute global variables during integration
*
#include "mdoutf.h"
+#include "config.h"
+
#include "gromacs/commandline/filenm.h"
#include "gromacs/domdec/collect.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/mdlib/trajectory_writing.h"
#include "gromacs/mdrunutility/handlerestart.h"
#include "gromacs/mdrunutility/multisim.h"
+#include "gromacs/mdtypes/awh_history.h"
#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/edsamhistory.h"
+#include "gromacs/mdtypes/energyhistory.h"
#include "gromacs/mdtypes/imdoutputprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/mdrunoptions.h"
+#include "gromacs/mdtypes/observableshistory.h"
#include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/swaphistory.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/topology.h"
+#include "gromacs/utility/baseversion.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/sysinfo.h"
struct gmx_mdoutf
{
return of->wcycle;
}
+/*! \brief Optionally synchronize ranks before a checkpoint file is renamed
+ *
+ * Does nothing unless \p applyMpiBarrierBeforeRename is set. When it is set,
+ * an MPI build waits in MPI_Barrier() on \p mpiBarrierCommunicator, while a
+ * build without MPI treats the request as a programming error and asserts.
+ *
+ * \param[in] applyMpiBarrierBeforeRename  Whether a barrier was requested
+ * \param[in] mpiBarrierCommunicator       Communicator to synchronize on
+ */
+static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
+{
+    if (!applyMpiBarrierBeforeRename)
+    {
+        /* No barrier requested, nothing to do */
+        return;
+    }
+
+#if GMX_MPI
+    MPI_Barrier(mpiBarrierCommunicator);
+#else
+    GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
+    GMX_UNUSED_VALUE(mpiBarrierCommunicator);
+#endif
+}
+/*! \brief Write a checkpoint to the filename
+ *
+ * The checkpoint is first written to a temporary, step-numbered file
+ * (<base>_step<step><ext>), after which all open output files are
+ * fsynced. With \p bNumberAndKeep the step-numbered file is kept as is;
+ * otherwise an existing \p fn is preserved as <base>_prev<ext> and the
+ * temporary file is renamed to \p fn. When GMX_NO_RENAME is set, \p fn is
+ * overwritten directly and no renaming takes place.
+ *
+ * \param[in] fn                           Checkpoint file name, including extension
+ * \param[in] bNumberAndKeep               Keep the step-numbered file instead of renaming it
+ * \param[in] fplog                        Log file, may be nullptr
+ * \param[in] cr                           Communication record
+ * \param[in] domdecCells                  Domain decomposition cell counts, stored in the
+ *                                         header when domain decomposition is active
+ * \param[in] nppnodes                     Number of particle-particle ranks, stored in the header
+ * \param[in] eIntegrator                  Integrator, stored in the header
+ * \param[in] simulation_part              Simulation part number, stored in the header
+ * \param[in] bExpanded                    Passed on to write_checkpoint_data()
+ * \param[in] elamstats                    Passed on to write_checkpoint_data()
+ * \param[in] step                         Current step, stored in the header and used in
+ *                                         the temporary file name
+ * \param[in] t                            Current time, stored in the header
+ * \param[in] state                        State to write
+ * \param[in] observablesHistory           Observables histories to write
+ * \param[in] mdModulesNotifier            Passed on to write_checkpoint_data()
+ * \param[in] applyMpiBarrierBeforeRename  Whether to synchronize ranks before each rename
+ * \param[in] mpiBarrierCommunicator       Communicator used for that synchronization
+ */
+static void write_checkpoint(const char* fn,
+                             gmx_bool bNumberAndKeep,
+                             FILE* fplog,
+                             const t_commrec* cr,
+                             ivec domdecCells,
+                             int nppnodes,
+                             int eIntegrator,
+                             int simulation_part,
+                             gmx_bool bExpanded,
+                             int elamstats,
+                             int64_t step,
+                             double t,
+                             t_state* state,
+                             ObservablesHistory* observablesHistory,
+                             const gmx::MdModulesNotifier& mdModulesNotifier,
+                             bool applyMpiBarrierBeforeRename,
+                             MPI_Comm mpiBarrierCommunicator)
+{
+    t_fileio* fp;
+    char* fntemp; /* the temporary checkpoint file name */
+    int npmenodes;
+    char buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
+    t_fileio* ret;
+
+    if (DOMAINDECOMP(cr))
+    {
+        npmenodes = cr->npmenodes;
+    }
+    else
+    {
+        npmenodes = 0;
+    }
+
+#if !GMX_NO_RENAME
+    /* Offset in fn at which the extension, together with the one
+     * character that precedes it, starts.
+     */
+    const std::size_t extensionOffset = std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1;
+
+    /* make the new temporary filename */
+    snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
+    std::strcpy(fntemp, fn);
+    fntemp[extensionOffset] = '\0';
+    sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
+    std::strcat(fntemp, suffix);
+    std::strcat(fntemp, fn + extensionOffset);
+#else
+    /* if we can't rename, we just overwrite the cpt file.
+     * dangerous if interrupted.
+     */
+    /* One extra element for the terminating '\0' written by strcpy */
+    snew(fntemp, std::strlen(fn) + 1);
+    std::strcpy(fntemp, fn);
+#endif
+    std::string timebuf = gmx_format_current_time();
+
+    if (fplog)
+    {
+        fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
+    }
+
+    /* Get offsets for open files */
+    auto outputfiles = gmx_fio_get_output_file_positions();
+
+    fp = gmx_fio_open(fntemp, "w");
+
+    /* We can check many more things now (CPU, acceleration, etc), but
+     * it is highly unlikely to have two separate builds with exactly
+     * the same version, user, time, and build host!
+     */
+
+    int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
+
+    edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
+    int nED = (edsamhist ? edsamhist->nED : 0);
+
+    swaphistory_t* swaphist = observablesHistory->swapHistory.get();
+    int eSwapCoords = (swaphist ? swaphist->eSwapCoords : eswapNO);
+
+    /* The five zeros after state->flags are the history flag fields,
+     * which write_checkpoint_data() fills in.
+     */
+    CheckpointHeaderContents headerContents = { 0,
+                                                { 0 },
+                                                { 0 },
+                                                { 0 },
+                                                { 0 },
+                                                GMX_DOUBLE,
+                                                { 0 },
+                                                { 0 },
+                                                eIntegrator,
+                                                simulation_part,
+                                                step,
+                                                t,
+                                                nppnodes,
+                                                { 0 },
+                                                npmenodes,
+                                                state->natoms,
+                                                state->ngtc,
+                                                state->nnhpres,
+                                                state->nhchainlength,
+                                                nlambda,
+                                                state->flags,
+                                                0,
+                                                0,
+                                                0,
+                                                0,
+                                                0,
+                                                nED,
+                                                eSwapCoords };
+    std::strcpy(headerContents.version, gmx_version());
+    std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
+    std::strcpy(headerContents.ftime, timebuf.c_str());
+    if (DOMAINDECOMP(cr))
+    {
+        copy_ivec(domdecCells, headerContents.dd_nc);
+    }
+
+    write_checkpoint_data(fp, headerContents, bExpanded, elamstats, state, observablesHistory,
+                          mdModulesNotifier, &outputfiles);
+
+    /* we really, REALLY, want to make sure to physically write the checkpoint,
+       and all the files it depends on, out to disk. Because we've
+       opened the checkpoint with gmx_fio_open(), it's in our list
+       of open files.  */
+    ret = gmx_fio_all_output_fsync();
+
+    if (ret)
+    {
+        char fsyncMessage[STRLEN];
+        sprintf(fsyncMessage, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
+
+        if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
+        {
+            gmx_file(fsyncMessage);
+        }
+        else
+        {
+            gmx_warning("%s", fsyncMessage);
+        }
+    }
+
+    if (gmx_fio_close(fp) != 0)
+    {
+        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
+    }
+
+    /* we don't move the checkpoint if the user specified they didn't want it,
+       or if the fsyncs failed */
+#if !GMX_NO_RENAME
+    if (!bNumberAndKeep && !ret)
+    {
+        if (gmx_fexist(fn))
+        {
+            /* Rename the previous checkpoint file */
+            mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
+
+            /* Build <base>_prev<ext> in a std::string so that a long
+             * file name cannot overflow a fixed-size buffer.
+             */
+            std::string previousName(fn, extensionOffset);
+            previousName += "_prev";
+            previousName += fn + extensionOffset;
+            if (!GMX_FAHCORE)
+            {
+                /* we copy here so that if something goes wrong between now and
+                 * the rename below, there's always a state.cpt.
+                 * If renames are atomic (such as in POSIX systems),
+                 * this copying should be unnecessary.
+                 */
+                gmx_file_copy(fn, previousName.c_str(), FALSE);
+                /* We don't really care if this fails:
+                 * there's already a new checkpoint.
+                 */
+            }
+            else
+            {
+                gmx_file_rename(fn, previousName.c_str());
+            }
+        }
+
+        /* Rename the checkpoint file from the temporary to the final name */
+        mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
+
+        if (gmx_file_rename(fntemp, fn) != 0)
+        {
+            gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
+        }
+    }
+#endif /* GMX_NO_RENAME */
+
+    sfree(fntemp);
+
+#if GMX_FAHCORE
+    /*code for alternate checkpointing scheme.  moved from top of loop over
+       steps */
+    fcRequestCheckPoint();
+    if (fcCheckPointParallel(cr->nodeid, NULL, 0) == 0)
+    {
+        /* step is 64-bit, so format it through gmx_step_str() rather than %d */
+        gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %s\n", gmx_step_str(step, sbuf));
+    }
+#endif /* end GMX_FAHCORE block */
+}
+
void mdoutf_write_to_trajectory_files(FILE* fplog,
const t_commrec* cr,
gmx_mdoutf_t of,
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/*! \libinternal \file
+/*! \internal \file
* \brief Implements the VirtualSitesHandler class and vsite standalone functions
*
* \author Berk Hess <hess@kth.se>
* \ingroup module_mdlib
- * \inlibraryapi
*/
#include "gmxpre.h"
step = ir->init_step;
step_rel = 0;
- // TODO extract this to new multi-simulation module
if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
{
- if (!multisim_int_all_are_equal(ms, ir->nsteps))
- {
- GMX_LOG(mdlog.warning)
- .appendText(
- "Note: The number of steps is not consistent across multi "
- "simulations,\n"
- "but we are proceeding anyway!");
- }
- if (!multisim_int_all_are_equal(ms, ir->init_step))
- {
- if (simulationsShareState)
- {
- if (MASTER(cr))
- {
- gmx_fatal(FARGS,
- "The initial step is not consistent across multi simulations which "
- "share the state");
- }
- gmx_barrier(cr->mpi_comm_mygroup);
- }
- else
- {
- GMX_LOG(mdlog.warning)
- .appendText(
- "Note: The initial step is not consistent across multi "
- "simulations,\n"
- "but we are proceeding anyway!");
- }
- }
+ logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
}
/* and stop now if we should */
#include "config.h"
+#include "gromacs/gmxlib/network.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/futil.h"
#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
#include "gromacs/utility/smalloc.h"
std::unique_ptr<gmx_multisim_t> buildMultiSimulation(MPI_Comm worldComm,
{
return (isMaster && isMasterSim(ms));
}
+
+/*! \brief Return whether \p value is the same in all simulations of a multi-simulation
+ *
+ * Each simulation places \p value in its own slot of a buffer with one
+ * entry per simulation. The buffer is combined across the simulations with
+ * gmx_sumli_sim() and every entry is then compared with the local \p value.
+ *
+ * \param[in] ms     Multi-simulation object, must not be nullptr (asserted)
+ * \param[in] value  The local value to compare
+ * \returns true when every entry of the combined buffer equals \p value
+ */
+static bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value)
+{
+    GMX_RELEASE_ASSERT(ms, "Invalid use of multi-simulation pointer");
+
+    int64_t* valuePerSimulation;
+    snew(valuePerSimulation, ms->numSimulations_);
+    /* send our value to all other master ranks, receive all of theirs
+     * (presumably relies on snew() zero-filling the other slots -- confirm)
+     */
+    valuePerSimulation[ms->simulationIndex_] = value;
+    gmx_sumli_sim(ms->numSimulations_, valuePerSimulation, ms);
+
+    /* Count the leading entries that match; stops at the first mismatch */
+    int numMatching = 0;
+    while (numMatching < ms->numSimulations_ && valuePerSimulation[numMatching] == value)
+    {
+        numMatching++;
+    }
+    const bool allValuesAreEqual = (numMatching == ms->numSimulations_);
+
+    sfree(valuePerSimulation);
+
+    return allValuesAreEqual;
+}
+
+/* Compares the number of steps and the initial step across the
+ * simulations of a multi-simulation. A mismatch in the number of steps,
+ * or in the initial step when the simulations do not share state, only
+ * produces a warning in the log. A mismatch in the initial step when the
+ * simulations share state is a fatal error.
+ *
+ * NOTE(review): numSteps and initialStep are int, while
+ * multisim_int_all_are_equal() compares int64_t values. If the caller
+ * passes 64-bit step counts they are narrowed on entry -- confirm and
+ * consider widening these parameters together with the declaration.
+ */
+void logInitialMultisimStatus(const gmx_multisim_t* ms,
+                              const t_commrec* cr,
+                              const gmx::MDLogger& mdlog,
+                              const bool simulationsShareState,
+                              const int numSteps,
+                              const int initialStep)
+{
+    const bool numStepsAreConsistent = multisim_int_all_are_equal(ms, numSteps);
+    if (!numStepsAreConsistent)
+    {
+        GMX_LOG(mdlog.warning)
+                .appendText(
+                        "Note: The number of steps is not consistent across multi "
+                        "simulations,\n"
+                        "but we are proceeding anyway!");
+    }
+
+    if (multisim_int_all_are_equal(ms, initialStep))
+    {
+        /* Initial steps agree, nothing more to report */
+        return;
+    }
+
+    if (!simulationsShareState)
+    {
+        GMX_LOG(mdlog.warning)
+                .appendText(
+                        "Note: The initial step is not consistent across multi "
+                        "simulations,\n"
+                        "but we are proceeding anyway!");
+        return;
+    }
+
+    /* Simulations that share state cannot start from different steps */
+    if (MASTER(cr))
+    {
+        gmx_fatal(FARGS,
+                  "The initial step is not consistent across multi simulations which "
+                  "share the state");
+    }
+    gmx_barrier(cr->mpi_comm_mygroup);
+}
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/gmxmpi.h"
+namespace gmx
+{
+class MDLogger;
+}
+
struct gmx_multisim_t;
+struct t_commrec;
/*! \libinternal
* \brief Builder function for gmx_multisim_t
* This rank prints the remaining run time etc. */
bool isMasterSimMasterRank(const gmx_multisim_t* ms, bool isMaster);
+/*! \brief Log the initial state of the multi-sim
+ *
+ * The simulations may be at different steps, etc., so we
+ * report that. An inconsistent initial step is a fatal error
+ * when the simulations share state.
+ *
+ * NOTE(review): \p numSteps and \p initialStep are int; if callers
+ * pass 64-bit step counts they are narrowed here -- confirm.
+ *
+ * \param[in] ms The multi-sim object
+ * \param[in] cr The commrec object
+ * \param[in] mdlog Logger
+ * \param[in] simulationsShareState Whether the simulations share state
+ * \param[in] numSteps The number of steps in this simulation
+ * \param[in] initialStep The initial step for this simulation
+ */
+void logInitialMultisimStatus(const gmx_multisim_t* ms,
+ const t_commrec* cr,
+ const gmx::MDLogger& mdlog,
+ bool simulationsShareState,
+ int numSteps,
+ int initialStep);
+
#endif