Move code to prepare for multisim class
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 31 Jul 2020 12:57:18 +0000 (12:57 +0000)
committer M. Eric Irrgang <mei2n@virginia.edu>
Fri, 31 Jul 2020 12:57:18 +0000 (12:57 +0000)
Some write_checkpoint code coordinates the behavior of multiple
modules, so it is too high-level to live in a low-level module like
fileio. mdoutf.cpp is the caller of write_checkpoint, so it is a
decent place to put that code. Other parts of write_checkpoint are
quite low-level and share implementation details with the
checkpoint-reading code, so they should stay in checkpoint.cpp.

Moved misplaced multisim code from md.cpp and md_support.cpp to
the multisim.cpp file, resolving some TODOs.

Fixed some wrong Doxygen declarations for vsite code.

src/gromacs/fileio/checkpoint.cpp
src/gromacs/fileio/checkpoint.h
src/gromacs/mdlib/md_support.cpp
src/gromacs/mdlib/md_support.h
src/gromacs/mdlib/mdoutf.cpp
src/gromacs/mdlib/vsite.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/mdrunutility/multisim.cpp
src/gromacs/mdrunutility/multisim.h

index f50b8182e0731a8fa330ccef9f2a7d160d565fcd..ef764c64d69d1a8b32e3778a1956b950010c747e 100644 (file)
@@ -41,8 +41,6 @@
 
 #include "checkpoint.h"
 
-#include "config.h"
-
 #include <cerrno>
 #include <cstdlib>
 #include <cstring>
@@ -2172,211 +2170,101 @@ static int do_cpt_files(XDR* xd, gmx_bool bRead, std::vector<gmx_file_position_t
     return 0;
 }
 
-static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
-{
-    if (applyMpiBarrierBeforeRename)
-    {
-#if GMX_MPI
-        MPI_Barrier(mpiBarrierCommunicator);
-#else
-        GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
-        GMX_UNUSED_VALUE(mpiBarrierCommunicator);
-#endif
-    }
-}
-
-void write_checkpoint(const char*                   fn,
-                      gmx_bool                      bNumberAndKeep,
-                      FILE*                         fplog,
-                      const t_commrec*              cr,
-                      ivec                          domdecCells,
-                      int                           nppnodes,
-                      int                           eIntegrator,
-                      int                           simulation_part,
-                      gmx_bool                      bExpanded,
-                      int                           elamstats,
-                      int64_t                       step,
-                      double                        t,
-                      t_state*                      state,
-                      ObservablesHistory*           observablesHistory,
-                      const gmx::MdModulesNotifier& mdModulesNotifier,
-                      bool                          applyMpiBarrierBeforeRename,
-                      MPI_Comm                      mpiBarrierCommunicator)
+void write_checkpoint_data(t_fileio*                         fp,
+                           CheckpointHeaderContents          headerContents,
+                           gmx_bool                          bExpanded,
+                           int                               elamstats,
+                           t_state*                          state,
+                           ObservablesHistory*               observablesHistory,
+                           const gmx::MdModulesNotifier&     mdModulesNotifier,
+                           std::vector<gmx_file_position_t>* outputfiles)
 {
-    t_fileio* fp;
-    char*     fntemp; /* the temporary checkpoint file name */
-    int       npmenodes;
-    char      buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
-    t_fileio* ret;
-
-    if (DOMAINDECOMP(cr))
-    {
-        npmenodes = cr->npmenodes;
-    }
-    else
-    {
-        npmenodes = 0;
-    }
-
-#if !GMX_NO_RENAME
-    /* make the new temporary filename */
-    snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
-    std::strcpy(fntemp, fn);
-    fntemp[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
-    sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
-    std::strcat(fntemp, suffix);
-    std::strcat(fntemp, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
-#else
-    /* if we can't rename, we just overwrite the cpt file.
-     * dangerous if interrupted.
-     */
-    snew(fntemp, std::strlen(fn));
-    std::strcpy(fntemp, fn);
-#endif
-    std::string timebuf = gmx_format_current_time();
-
-    if (fplog)
-    {
-        fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
-    }
-
-    /* Get offsets for open files */
-    auto outputfiles = gmx_fio_get_output_file_positions();
-
-    fp = gmx_fio_open(fntemp, "w");
-
-    int flags_eks;
+    headerContents.flags_eks = 0;
     if (state->ekinstate.bUpToDate)
     {
-        flags_eks = ((1 << eeksEKIN_N) | (1 << eeksEKINH) | (1 << eeksEKINF) | (1 << eeksEKINO)
-                     | (1 << eeksEKINSCALEF) | (1 << eeksEKINSCALEH) | (1 << eeksVSCALE)
-                     | (1 << eeksDEKINDL) | (1 << eeksMVCOS));
-    }
-    else
-    {
-        flags_eks = 0;
+        headerContents.flags_eks = ((1 << eeksEKIN_N) | (1 << eeksEKINH) | (1 << eeksEKINF)
+                                    | (1 << eeksEKINO) | (1 << eeksEKINSCALEF) | (1 << eeksEKINSCALEH)
+                                    | (1 << eeksVSCALE) | (1 << eeksDEKINDL) | (1 << eeksMVCOS));
     }
 
-    energyhistory_t* enerhist  = observablesHistory->energyHistory.get();
-    int              flags_enh = 0;
+    energyhistory_t* enerhist = observablesHistory->energyHistory.get();
+    headerContents.flags_enh  = 0;
     if (enerhist != nullptr && (enerhist->nsum > 0 || enerhist->nsum_sim > 0))
     {
-        flags_enh |= (1 << eenhENERGY_N) | (1 << eenhENERGY_NSTEPS) | (1 << eenhENERGY_NSTEPS_SIM);
+        headerContents.flags_enh |=
+                (1 << eenhENERGY_N) | (1 << eenhENERGY_NSTEPS) | (1 << eenhENERGY_NSTEPS_SIM);
         if (enerhist->nsum > 0)
         {
-            flags_enh |= ((1 << eenhENERGY_AVER) | (1 << eenhENERGY_SUM) | (1 << eenhENERGY_NSUM));
+            headerContents.flags_enh |=
+                    ((1 << eenhENERGY_AVER) | (1 << eenhENERGY_SUM) | (1 << eenhENERGY_NSUM));
         }
         if (enerhist->nsum_sim > 0)
         {
-            flags_enh |= ((1 << eenhENERGY_SUM_SIM) | (1 << eenhENERGY_NSUM_SIM));
+            headerContents.flags_enh |= ((1 << eenhENERGY_SUM_SIM) | (1 << eenhENERGY_NSUM_SIM));
         }
         if (enerhist->deltaHForeignLambdas != nullptr)
         {
-            flags_enh |= ((1 << eenhENERGY_DELTA_H_NN) | (1 << eenhENERGY_DELTA_H_LIST)
-                          | (1 << eenhENERGY_DELTA_H_STARTTIME) | (1 << eenhENERGY_DELTA_H_STARTLAMBDA));
+            headerContents.flags_enh |=
+                    ((1 << eenhENERGY_DELTA_H_NN) | (1 << eenhENERGY_DELTA_H_LIST)
+                     | (1 << eenhENERGY_DELTA_H_STARTTIME) | (1 << eenhENERGY_DELTA_H_STARTLAMBDA));
         }
     }
 
-    PullHistory* pullHist         = observablesHistory->pullHistory.get();
-    int          flagsPullHistory = 0;
+    PullHistory* pullHist           = observablesHistory->pullHistory.get();
+    headerContents.flagsPullHistory = 0;
     if (pullHist != nullptr && (pullHist->numValuesInXSum > 0 || pullHist->numValuesInFSum > 0))
     {
-        flagsPullHistory |= (1 << epullhPULL_NUMCOORDINATES);
-        flagsPullHistory |= ((1 << epullhPULL_NUMGROUPS) | (1 << epullhPULL_NUMVALUESINXSUM)
-                             | (1 << epullhPULL_NUMVALUESINFSUM));
+        headerContents.flagsPullHistory |= (1 << epullhPULL_NUMCOORDINATES);
+        headerContents.flagsPullHistory |= ((1 << epullhPULL_NUMGROUPS) | (1 << epullhPULL_NUMVALUESINXSUM)
+                                            | (1 << epullhPULL_NUMVALUESINFSUM));
     }
 
-    int flags_dfh;
+    headerContents.flags_dfh = 0;
     if (bExpanded)
     {
-        flags_dfh = ((1 << edfhBEQUIL) | (1 << edfhNATLAMBDA) | (1 << edfhSUMWEIGHTS)
-                     | (1 << edfhSUMDG) | (1 << edfhTIJ) | (1 << edfhTIJEMP));
+        headerContents.flags_dfh = ((1 << edfhBEQUIL) | (1 << edfhNATLAMBDA) | (1 << edfhSUMWEIGHTS)
+                                    | (1 << edfhSUMDG) | (1 << edfhTIJ) | (1 << edfhTIJEMP));
         if (EWL(elamstats))
         {
-            flags_dfh |= ((1 << edfhWLDELTA) | (1 << edfhWLHISTO));
+            headerContents.flags_dfh |= ((1 << edfhWLDELTA) | (1 << edfhWLHISTO));
         }
         if ((elamstats == elamstatsMINVAR) || (elamstats == elamstatsBARKER)
             || (elamstats == elamstatsMETROPOLIS))
         {
-            flags_dfh |= ((1 << edfhACCUMP) | (1 << edfhACCUMM) | (1 << edfhACCUMP2)
-                          | (1 << edfhACCUMM2) | (1 << edfhSUMMINVAR) | (1 << edfhSUMVAR));
+            headerContents.flags_dfh |= ((1 << edfhACCUMP) | (1 << edfhACCUMM) | (1 << edfhACCUMP2)
+                                         | (1 << edfhACCUMM2) | (1 << edfhSUMMINVAR) | (1 << edfhSUMVAR));
         }
     }
-    else
-    {
-        flags_dfh = 0;
-    }
 
-    int flags_awhh = 0;
+    headerContents.flags_awhh = 0;
     if (state->awhHistory != nullptr && !state->awhHistory->bias.empty())
     {
-        flags_awhh |= ((1 << eawhhIN_INITIAL) | (1 << eawhhEQUILIBRATEHISTOGRAM) | (1 << eawhhHISTSIZE)
-                       | (1 << eawhhNPOINTS) | (1 << eawhhCOORDPOINT) | (1 << eawhhUMBRELLAGRIDPOINT)
-                       | (1 << eawhhUPDATELIST) | (1 << eawhhLOGSCALEDSAMPLEWEIGHT)
-                       | (1 << eawhhNUMUPDATES) | (1 << eawhhFORCECORRELATIONGRID));
-    }
-
-    /* We can check many more things now (CPU, acceleration, etc), but
-     * it is highly unlikely to have two separate builds with exactly
-     * the same version, user, time, and build host!
-     */
-
-    int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
-
-    edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
-    int             nED       = (edsamhist ? edsamhist->nED : 0);
-
-    swaphistory_t* swaphist    = observablesHistory->swapHistory.get();
-    int            eSwapCoords = (swaphist ? swaphist->eSwapCoords : eswapNO);
-
-    CheckpointHeaderContents headerContents = { 0,
-                                                { 0 },
-                                                { 0 },
-                                                { 0 },
-                                                { 0 },
-                                                GMX_DOUBLE,
-                                                { 0 },
-                                                { 0 },
-                                                eIntegrator,
-                                                simulation_part,
-                                                step,
-                                                t,
-                                                nppnodes,
-                                                { 0 },
-                                                npmenodes,
-                                                state->natoms,
-                                                state->ngtc,
-                                                state->nnhpres,
-                                                state->nhchainlength,
-                                                nlambda,
-                                                state->flags,
-                                                flags_eks,
-                                                flags_enh,
-                                                flagsPullHistory,
-                                                flags_dfh,
-                                                flags_awhh,
-                                                nED,
-                                                eSwapCoords };
-    std::strcpy(headerContents.version, gmx_version());
-    std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
-    std::strcpy(headerContents.ftime, timebuf.c_str());
-    if (DOMAINDECOMP(cr))
-    {
-        copy_ivec(domdecCells, headerContents.dd_nc);
+        headerContents.flags_awhh |=
+                ((1 << eawhhIN_INITIAL) | (1 << eawhhEQUILIBRATEHISTOGRAM) | (1 << eawhhHISTSIZE)
+                 | (1 << eawhhNPOINTS) | (1 << eawhhCOORDPOINT) | (1 << eawhhUMBRELLAGRIDPOINT)
+                 | (1 << eawhhUPDATELIST) | (1 << eawhhLOGSCALEDSAMPLEWEIGHT)
+                 | (1 << eawhhNUMUPDATES) | (1 << eawhhFORCECORRELATIONGRID));
     }
 
     do_cpt_header(gmx_fio_getxdr(fp), FALSE, nullptr, &headerContents);
 
     if ((do_cpt_state(gmx_fio_getxdr(fp), state->flags, state, nullptr) < 0)
-        || (do_cpt_ekinstate(gmx_fio_getxdr(fp), flags_eks, &state->ekinstate, nullptr) < 0)
-        || (do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, flags_enh, enerhist, nullptr) < 0)
-        || (doCptPullHist(gmx_fio_getxdr(fp), FALSE, flagsPullHistory, pullHist, StatePart::pullHistory, nullptr)
+        || (do_cpt_ekinstate(gmx_fio_getxdr(fp), headerContents.flags_eks, &state->ekinstate, nullptr) < 0)
+        || (do_cpt_enerhist(gmx_fio_getxdr(fp), FALSE, headerContents.flags_enh, enerhist, nullptr) < 0)
+        || (doCptPullHist(gmx_fio_getxdr(fp), FALSE, headerContents.flagsPullHistory, pullHist,
+                          StatePart::pullHistory, nullptr)
+            < 0)
+        || (do_cpt_df_hist(gmx_fio_getxdr(fp), headerContents.flags_dfh, headerContents.nlambda,
+                           &state->dfhist, nullptr)
+            < 0)
+        || (do_cpt_EDstate(gmx_fio_getxdr(fp), FALSE, headerContents.nED,
+                           observablesHistory->edsamHistory.get(), nullptr)
+            < 0)
+        || (do_cpt_awh(gmx_fio_getxdr(fp), FALSE, headerContents.flags_awhh, state->awhHistory.get(), nullptr) < 0)
+        || (do_cpt_swapstate(gmx_fio_getxdr(fp), FALSE, headerContents.eSwapCoords,
+                             observablesHistory->swapHistory.get(), nullptr)
             < 0)
-        || (do_cpt_df_hist(gmx_fio_getxdr(fp), flags_dfh, nlambda, &state->dfhist, nullptr) < 0)
-        || (do_cpt_EDstate(gmx_fio_getxdr(fp), FALSE, nED, edsamhist, nullptr) < 0)
-        || (do_cpt_awh(gmx_fio_getxdr(fp), FALSE, flags_awhh, state->awhHistory.get(), nullptr) < 0)
-        || (do_cpt_swapstate(gmx_fio_getxdr(fp), FALSE, eSwapCoords, swaphist, nullptr) < 0)
-        || (do_cpt_files(gmx_fio_getxdr(fp), FALSE, &outputfiles, nullptr, headerContents.file_version) < 0))
+        || (do_cpt_files(gmx_fio_getxdr(fp), FALSE, outputfiles, nullptr, headerContents.file_version) < 0))
     {
         gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
     }
@@ -2393,86 +2281,6 @@ void write_checkpoint(const char*                   fn,
     }
 
     do_cpt_footer(gmx_fio_getxdr(fp), headerContents.file_version);
-
-    /* we really, REALLY, want to make sure to physically write the checkpoint,
-       and all the files it depends on, out to disk. Because we've
-       opened the checkpoint with gmx_fio_open(), it's in our list
-       of open files.  */
-    ret = gmx_fio_all_output_fsync();
-
-    if (ret)
-    {
-        char buf[STRLEN];
-        sprintf(buf, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
-
-        if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
-        {
-            gmx_file(buf);
-        }
-        else
-        {
-            gmx_warning("%s", buf);
-        }
-    }
-
-    if (gmx_fio_close(fp) != 0)
-    {
-        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
-    }
-
-    /* we don't move the checkpoint if the user specified they didn't want it,
-       or if the fsyncs failed */
-#if !GMX_NO_RENAME
-    if (!bNumberAndKeep && !ret)
-    {
-        if (gmx_fexist(fn))
-        {
-            /* Rename the previous checkpoint file */
-            mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
-
-            std::strcpy(buf, fn);
-            buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
-            std::strcat(buf, "_prev");
-            std::strcat(buf, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
-            if (!GMX_FAHCORE)
-            {
-                /* we copy here so that if something goes wrong between now and
-                 * the rename below, there's always a state.cpt.
-                 * If renames are atomic (such as in POSIX systems),
-                 * this copying should be unneccesary.
-                 */
-                gmx_file_copy(fn, buf, FALSE);
-                /* We don't really care if this fails:
-                 * there's already a new checkpoint.
-                 */
-            }
-            else
-            {
-                gmx_file_rename(fn, buf);
-            }
-        }
-
-        /* Rename the checkpoint file from the temporary to the final name */
-        mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
-
-        if (gmx_file_rename(fntemp, fn) != 0)
-        {
-            gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
-        }
-    }
-#endif /* GMX_NO_RENAME */
-
-    sfree(fntemp);
-
-#if GMX_FAHCORE
-    /*code for alternate checkpointing scheme.  moved from top of loop over
-       steps */
-    fcRequestCheckPoint();
-    if (fcCheckPointParallel(cr->nodeid, NULL, 0) == 0)
-    {
-        gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", step);
-    }
-#endif /* end GMX_FAHCORE block */
 }
 
 static void check_int(FILE* fplog, const char* type, int p, int f, gmx_bool* mm)
index 3d8f94dae62733691123cdc6696cfac0c0e87f8d..e9c13d18ac5a0be6221855e0ee4daeefde7dae50 100644 (file)
@@ -179,27 +179,15 @@ struct CheckpointHeaderContents
     int eSwapCoords;
 };
 
-/* Write a checkpoint to <fn>.cpt
- * Appends the _step<step>.cpt with bNumberAndKeep,
- * otherwise moves the previous <fn>.cpt to <fn>_prev.cpt
- */
-void write_checkpoint(const char*                   fn,
-                      gmx_bool                      bNumberAndKeep,
-                      FILE*                         fplog,
-                      const t_commrec*              cr,
-                      ivec                          domdecCells,
-                      int                           nppnodes,
-                      int                           eIntegrator,
-                      int                           simulation_part,
-                      gmx_bool                      bExpanded,
-                      int                           elamstats,
-                      int64_t                       step,
-                      double                        t,
-                      t_state*                      state,
-                      ObservablesHistory*           observablesHistory,
-                      const gmx::MdModulesNotifier& notifier,
-                      bool                          applyMpiBarrierBeforeRename,
-                      MPI_Comm                      mpiBarrierCommunicator);
+/*! \brief Low-level checkpoint writing function */
+void write_checkpoint_data(t_fileio*                         fp,
+                           CheckpointHeaderContents          headerContents,
+                           gmx_bool                          bExpanded,
+                           int                               elamstats,
+                           t_state*                          state,
+                           ObservablesHistory*               observablesHistory,
+                           const gmx::MdModulesNotifier&     notifier,
+                           std::vector<gmx_file_position_t>* outputfiles);
 
 /* Loads a checkpoint from fn for run continuation.
  * Generates a fatal error on system size mismatch.
index d6b794743569fc12ea6d93decb724fb275e35fa8..04fff5332ca384c47fa725d6822e38a162d28f58 100644 (file)
@@ -56,7 +56,6 @@
 #include "gromacs/mdlib/tgroup.h"
 #include "gromacs/mdlib/update.h"
 #include "gromacs/mdlib/vcm.h"
-#include "gromacs/mdrunutility/multisim.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/df_history.h"
 #include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
 
-// TODO move this to multi-sim module
-bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value)
-{
-    bool     allValuesAreEqual = true;
-    int64_t* buf;
-
-    GMX_RELEASE_ASSERT(ms, "Invalid use of multi-simulation pointer");
-
-    snew(buf, ms->numSimulations_);
-    /* send our value to all other master ranks, receive all of theirs */
-    buf[ms->simulationIndex_] = value;
-    gmx_sumli_sim(ms->numSimulations_, buf, ms);
-
-    for (int s = 0; s < ms->numSimulations_; s++)
-    {
-        if (buf[s] != value)
-        {
-            allValuesAreEqual = false;
-            break;
-        }
-    }
-
-    sfree(buf);
-
-    return allValuesAreEqual;
-}
-
-int multisim_min(const gmx_multisim_t* ms, int nmin, int n)
-{
-    int*     buf;
-    gmx_bool bPos, bEqual;
-    int      s, d;
-
-    snew(buf, ms->numSimulations_);
-    buf[ms->simulationIndex_] = n;
-    gmx_sumi_sim(ms->numSimulations_, buf, ms);
-    bPos   = TRUE;
-    bEqual = TRUE;
-    for (s = 0; s < ms->numSimulations_; s++)
-    {
-        bPos   = bPos && (buf[s] > 0);
-        bEqual = bEqual && (buf[s] == buf[0]);
-    }
-    if (bPos)
-    {
-        if (bEqual)
-        {
-            nmin = std::min(nmin, buf[0]);
-        }
-        else
-        {
-            /* Find the least common multiple */
-            for (d = 2; d < nmin; d++)
-            {
-                s = 0;
-                while (s < ms->numSimulations_ && d % buf[s] == 0)
-                {
-                    s++;
-                }
-                if (s == ms->numSimulations_)
-                {
-                    /* We found the LCM and it is less than nmin */
-                    nmin = d;
-                    break;
-                }
-            }
-        }
-    }
-    sfree(buf);
-
-    return nmin;
-}
-
 static void calc_ke_part_normal(gmx::ArrayRef<const gmx::RVec> v,
                                 const t_grpopts*               opts,
                                 const t_mdatoms*               md,
index 87368d5e6c916cfb660d4a368cd7e2bc713ed831..de69061929b2c4642e4e9b35c73f976675f9816f 100644 (file)
@@ -44,7 +44,6 @@
 struct gmx_ekindata_t;
 struct gmx_enerdata_t;
 struct gmx_global_stat;
-struct gmx_multisim_t;
 struct gmx_signalling_t;
 struct t_extmass;
 struct t_forcerec;
@@ -97,11 +96,6 @@ class SimulationSignaller;
  * inputrec. */
 int computeGlobalCommunicationPeriod(const gmx::MDLogger& mdlog, t_inputrec* ir, const t_commrec* cr);
 
-/*! \brief Return true if the \p value is equal across the set of multi-simulations
- *
- * \todo This duplicates some of check_multi_int. Consolidate. */
-bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value);
-
 void rerun_parallel_comm(t_commrec* cr, t_trxframe* fr, gmx_bool* bLastStep);
 
 //! \brief Allocate and initialize node-local state entries
@@ -121,9 +115,6 @@ void setCurrentLambdasLocal(int64_t             step,
                             gmx::ArrayRef<real> lambda,
                             int                 currentFEPState);
 
-int multisim_min(const gmx_multisim_t* ms, int nmin, int n);
-/* Set an appropriate value for n across the whole multi-simulation */
-
 
 /* Compute global variables during integration
  *
index 15c6517fc48ca9fdeec11ee5d563d6d2f70550e6..795aae408ba957e2a0f9ffa064e25cc457c07289 100644 (file)
@@ -37,6 +37,8 @@
 
 #include "mdoutf.h"
 
+#include "config.h"
+
 #include "gromacs/commandline/filenm.h"
 #include "gromacs/domdec/collect.h"
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/mdlib/trajectory_writing.h"
 #include "gromacs/mdrunutility/handlerestart.h"
 #include "gromacs/mdrunutility/multisim.h"
+#include "gromacs/mdtypes/awh_history.h"
 #include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/df_history.h"
+#include "gromacs/mdtypes/edsamhistory.h"
+#include "gromacs/mdtypes/energyhistory.h"
 #include "gromacs/mdtypes/imdoutputprovider.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/mdtypes/mdrunoptions.h"
+#include "gromacs/mdtypes/observableshistory.h"
 #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/swaphistory.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/topology/topology.h"
+#include "gromacs/utility/baseversion.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/pleasecite.h"
+#include "gromacs/utility/programcontext.h"
 #include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/sysinfo.h"
 
 struct gmx_mdoutf
 {
@@ -255,6 +266,216 @@ gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of)
     return of->wcycle;
 }
 
+static void mpiBarrierBeforeRename(const bool applyMpiBarrierBeforeRename, MPI_Comm mpiBarrierCommunicator)
+{
+    if (applyMpiBarrierBeforeRename)
+    {
+#if GMX_MPI
+        MPI_Barrier(mpiBarrierCommunicator);
+#else
+        GMX_RELEASE_ASSERT(false, "Should not request a barrier without MPI");
+        GMX_UNUSED_VALUE(mpiBarrierCommunicator);
+#endif
+    }
+}
+/*! \brief Write a checkpoint to the filename
+ *
+ * Appends the _step<step>.cpt with bNumberAndKeep, otherwise moves
+ * the previous checkpoint filename with suffix _prev.cpt.
+ */
+static void write_checkpoint(const char*                   fn,
+                             gmx_bool                      bNumberAndKeep,
+                             FILE*                         fplog,
+                             const t_commrec*              cr,
+                             ivec                          domdecCells,
+                             int                           nppnodes,
+                             int                           eIntegrator,
+                             int                           simulation_part,
+                             gmx_bool                      bExpanded,
+                             int                           elamstats,
+                             int64_t                       step,
+                             double                        t,
+                             t_state*                      state,
+                             ObservablesHistory*           observablesHistory,
+                             const gmx::MdModulesNotifier& mdModulesNotifier,
+                             bool                          applyMpiBarrierBeforeRename,
+                             MPI_Comm                      mpiBarrierCommunicator)
+{
+    t_fileio* fp;
+    char*     fntemp; /* the temporary checkpoint file name */
+    int       npmenodes;
+    char      buf[1024], suffix[5 + STEPSTRSIZE], sbuf[STEPSTRSIZE];
+    t_fileio* ret;
+
+    if (DOMAINDECOMP(cr))
+    {
+        npmenodes = cr->npmenodes;
+    }
+    else
+    {
+        npmenodes = 0;
+    }
+
+#if !GMX_NO_RENAME
+    /* make the new temporary filename */
+    snew(fntemp, std::strlen(fn) + 5 + STEPSTRSIZE);
+    std::strcpy(fntemp, fn);
+    fntemp[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
+    sprintf(suffix, "_%s%s", "step", gmx_step_str(step, sbuf));
+    std::strcat(fntemp, suffix);
+    std::strcat(fntemp, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
+#else
+    /* if we can't rename, we just overwrite the cpt file.
+     * dangerous if interrupted.
+     */
+    snew(fntemp, std::strlen(fn));
+    std::strcpy(fntemp, fn);
+#endif
+    std::string timebuf = gmx_format_current_time();
+
+    if (fplog)
+    {
+        fprintf(fplog, "Writing checkpoint, step %s at %s\n\n", gmx_step_str(step, buf), timebuf.c_str());
+    }
+
+    /* Get offsets for open files */
+    auto outputfiles = gmx_fio_get_output_file_positions();
+
+    fp = gmx_fio_open(fntemp, "w");
+
+    /* We can check many more things now (CPU, acceleration, etc), but
+     * it is highly unlikely to have two separate builds with exactly
+     * the same version, user, time, and build host!
+     */
+
+    int nlambda = (state->dfhist ? state->dfhist->nlambda : 0);
+
+    edsamhistory_t* edsamhist = observablesHistory->edsamHistory.get();
+    int             nED       = (edsamhist ? edsamhist->nED : 0);
+
+    swaphistory_t* swaphist    = observablesHistory->swapHistory.get();
+    int            eSwapCoords = (swaphist ? swaphist->eSwapCoords : eswapNO);
+
+    CheckpointHeaderContents headerContents = { 0,
+                                                { 0 },
+                                                { 0 },
+                                                { 0 },
+                                                { 0 },
+                                                GMX_DOUBLE,
+                                                { 0 },
+                                                { 0 },
+                                                eIntegrator,
+                                                simulation_part,
+                                                step,
+                                                t,
+                                                nppnodes,
+                                                { 0 },
+                                                npmenodes,
+                                                state->natoms,
+                                                state->ngtc,
+                                                state->nnhpres,
+                                                state->nhchainlength,
+                                                nlambda,
+                                                state->flags,
+                                                0,
+                                                0,
+                                                0,
+                                                0,
+                                                0,
+                                                nED,
+                                                eSwapCoords };
+    std::strcpy(headerContents.version, gmx_version());
+    std::strcpy(headerContents.fprog, gmx::getProgramContext().fullBinaryPath());
+    std::strcpy(headerContents.ftime, timebuf.c_str());
+    if (DOMAINDECOMP(cr))
+    {
+        copy_ivec(domdecCells, headerContents.dd_nc);
+    }
+
+    write_checkpoint_data(fp, headerContents, bExpanded, elamstats, state, observablesHistory,
+                          mdModulesNotifier, &outputfiles);
+
+    /* we really, REALLY, want to make sure to physically write the checkpoint,
+       and all the files it depends on, out to disk. Because we've
+       opened the checkpoint with gmx_fio_open(), it's in our list
+       of open files.  */
+    ret = gmx_fio_all_output_fsync();
+
+    if (ret)
+    {
+        char buf[STRLEN];
+        sprintf(buf, "Cannot fsync '%s'; maybe you are out of disk space?", gmx_fio_getname(ret));
+
+        if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV) == nullptr)
+        {
+            gmx_file(buf);
+        }
+        else
+        {
+            gmx_warning("%s", buf);
+        }
+    }
+
+    if (gmx_fio_close(fp) != 0)
+    {
+        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
+    }
+
+    /* we don't move the checkpoint if the user specified they didn't want it,
+       or if the fsyncs failed */
+#if !GMX_NO_RENAME
+    if (!bNumberAndKeep && !ret)
+    {
+        if (gmx_fexist(fn))
+        {
+            /* Rename the previous checkpoint file */
+            mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
+
+            std::strcpy(buf, fn);
+            buf[std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1] = '\0';
+            std::strcat(buf, "_prev");
+            std::strcat(buf, fn + std::strlen(fn) - std::strlen(ftp2ext(fn2ftp(fn))) - 1);
+            if (!GMX_FAHCORE)
+            {
+                /* we copy here so that if something goes wrong between now and
+                 * the rename below, there's always a state.cpt.
+                 * If renames are atomic (such as in POSIX systems),
+                 * this copying should be unneccesary.
+                 */
+                gmx_file_copy(fn, buf, FALSE);
+                /* We don't really care if this fails:
+                 * there's already a new checkpoint.
+                 */
+            }
+            else
+            {
+                gmx_file_rename(fn, buf);
+            }
+        }
+
+        /* Rename the checkpoint file from the temporary to the final name */
+        mpiBarrierBeforeRename(applyMpiBarrierBeforeRename, mpiBarrierCommunicator);
+
+        if (gmx_file_rename(fntemp, fn) != 0)
+        {
+            gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
+        }
+    }
+#endif /* GMX_NO_RENAME */
+
+    sfree(fntemp);
+
+#if GMX_FAHCORE
+    /*code for alternate checkpointing scheme.  moved from top of loop over
+       steps */
+    fcRequestCheckPoint();
+    if (fcCheckPointParallel(cr->nodeid, NULL, 0) == 0)
+    {
+        gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", step);
+    }
+#endif /* end GMX_FAHCORE block */
+}
+
 void mdoutf_write_to_trajectory_files(FILE*                    fplog,
                                       const t_commrec*         cr,
                                       gmx_mdoutf_t             of,
index aab8dffd747d508f0d7af2505b03f5d1f5d158ae..879a28a692300812386962321a3c4598c9034de9 100644 (file)
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-/*! \libinternal \file
+/*! \internal \file
  * \brief Implements the VirtualSitesHandler class and vsite standalone functions
  *
  * \author Berk Hess <hess@kth.se>
  * \ingroup module_mdlib
- * \inlibraryapi
  */
 
 #include "gmxpre.h"
index 3f563e201633143afcf33ee34406687caa0848bc..4c1f341f4e37497a9dd24120a2f2e8c4b12310e9 100644 (file)
@@ -720,38 +720,9 @@ void gmx::LegacySimulator::do_md()
     step     = ir->init_step;
     step_rel = 0;
 
-    // TODO extract this to new multi-simulation module
     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
     {
-        if (!multisim_int_all_are_equal(ms, ir->nsteps))
-        {
-            GMX_LOG(mdlog.warning)
-                    .appendText(
-                            "Note: The number of steps is not consistent across multi "
-                            "simulations,\n"
-                            "but we are proceeding anyway!");
-        }
-        if (!multisim_int_all_are_equal(ms, ir->init_step))
-        {
-            if (simulationsShareState)
-            {
-                if (MASTER(cr))
-                {
-                    gmx_fatal(FARGS,
-                              "The initial step is not consistent across multi simulations which "
-                              "share the state");
-                }
-                gmx_barrier(cr->mpi_comm_mygroup);
-            }
-            else
-            {
-                GMX_LOG(mdlog.warning)
-                        .appendText(
-                                "Note: The initial step is not consistent across multi "
-                                "simulations,\n"
-                                "but we are proceeding anyway!");
-            }
-        }
+        logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
     }
 
     /* and stop now if we should */
index 12b1f9dece013f01146f801d94fe6c4d7f11a18f..70aba55b948f4cfa45462e09ab322756c5385a27 100644 (file)
 
 #include "config.h"
 
+#include "gromacs/gmxlib/network.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/futil.h"
 #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
 #include "gromacs/utility/smalloc.h"
 
 std::unique_ptr<gmx_multisim_t> buildMultiSimulation(MPI_Comm                         worldComm,
@@ -424,3 +426,67 @@ bool isMasterSimMasterRank(const gmx_multisim_t* ms, const bool isMaster)
 {
     return (isMaster && isMasterSim(ms));
 }
+
+/*! \brief Check whether \p value agrees with the values gathered from all simulations
+ *
+ * A buffer with one entry per simulation is allocated, the local \p value is
+ * stored in the slot of this simulation, and gmx_sumli_sim() is used to
+ * exchange the contents between the simulations. Afterwards every entry is
+ * compared with the local \p value.
+ *
+ * NOTE(review): this presumably relies on snew() handing out zero-initialized
+ * storage and on gmx_sumli_sim() performing an element-wise sum, so that each
+ * slot ends up holding the value of exactly one simulation - confirm.
+ *
+ * \param[in] ms     The multi-simulation object, asserted to be non-null
+ * \param[in] value  The local value to compare against all others
+ * \returns   true when every entry of the exchanged buffer equals \p value
+ */
+static bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value)
+{
+    GMX_RELEASE_ASSERT(ms, "Invalid use of multi-simulation pointer");
+
+    const int numSims = ms->numSimulations_;
+    int64_t*  valuePerSim;
+    snew(valuePerSim, numSims);
+
+    /* send our value to all other master ranks, receive all of theirs */
+    valuePerSim[ms->simulationIndex_] = value;
+    gmx_sumli_sim(numSims, valuePerSim, ms);
+
+    /* Advance past all matching entries; reaching the end means no mismatch */
+    int sim = 0;
+    while (sim < numSims && valuePerSim[sim] == value)
+    {
+        sim++;
+    }
+    const bool everyValueMatches = (sim == numSims);
+
+    sfree(valuePerSim);
+
+    return everyValueMatches;
+}
+
+/* Reports inconsistencies in the number of steps and the initial step
+ * between the simulations of a multi-simulation.
+ *
+ * A differing number of steps only produces a warning. A differing initial
+ * step produces a warning as well, unless the simulations share state, in
+ * which case the master rank issues a fatal error.
+ *
+ * NOTE(review): the comparison helper takes int64_t while the step counts
+ * arrive here as int - confirm that callers never pass values that do not
+ * fit in an int.
+ */
+void logInitialMultisimStatus(const gmx_multisim_t* ms,
+                              const t_commrec*      cr,
+                              const gmx::MDLogger&  mdlog,
+                              const bool            simulationsShareState,
+                              const int             numSteps,
+                              const int             initialStep)
+{
+    const bool numStepsAgree = multisim_int_all_are_equal(ms, numSteps);
+    if (!numStepsAgree)
+    {
+        GMX_LOG(mdlog.warning)
+                .appendText(
+                        "Note: The number of steps is not consistent across multi "
+                        "simulations,\n"
+                        "but we are proceeding anyway!");
+    }
+
+    const bool initialStepsAgree = multisim_int_all_are_equal(ms, initialStep);
+    if (initialStepsAgree)
+    {
+        /* Nothing further to report */
+        return;
+    }
+
+    if (!simulationsShareState)
+    {
+        /* Independent simulations may legitimately start at different steps */
+        GMX_LOG(mdlog.warning)
+                .appendText(
+                        "Note: The initial step is not consistent across multi "
+                        "simulations,\n"
+                        "but we are proceeding anyway!");
+        return;
+    }
+
+    /* Simulations that share state cannot continue from different steps */
+    if (MASTER(cr))
+    {
+        gmx_fatal(FARGS,
+                  "The initial step is not consistent across multi simulations which "
+                  "share the state");
+    }
+    gmx_barrier(cr->mpi_comm_mygroup);
+}
index f616d527d13eafa44935e48dd89537cb6a02a6d4..33222ca9b7cd3aedfd0c16bb0e0bdb3453950928 100644 (file)
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/gmxmpi.h"
 
+namespace gmx
+{
+class MDLogger;
+}
+
 struct gmx_multisim_t;
+struct t_commrec;
 
 /*! \libinternal
  * \brief Builder function for gmx_multisim_t
@@ -176,4 +182,23 @@ bool isMasterSim(const gmx_multisim_t* ms);
  * This rank prints the remaining run time etc. */
 bool isMasterSimMasterRank(const gmx_multisim_t* ms, bool isMaster);
 
+/*! \brief Log the initial state of the multi-sim
+ *
+ * The simulations may be at different steps, etc so we
+ * report that. A warning is logged when the number of steps
+ * differs between the simulations. When the initial step
+ * differs, a warning is logged unless the simulations share
+ * state, in which case a fatal error is issued on the master rank.
+ *
+ * \param[in]  ms                     The multi-sim object
+ * \param[in]  cr                     The commrec object
+ * \param[in]  mdlog                  Logger
+ * \param[in]  simulationsShareState  Whether the simulations share state
+ * \param[in]  numSteps               The number of steps in this simulation
+ * \param[in]  initialStep            The initial step for this simulation
+ */
+void logInitialMultisimStatus(const gmx_multisim_t* ms,
+                              const t_commrec*      cr,
+                              const gmx::MDLogger&  mdlog,
+                              bool                  simulationsShareState,
+                              int                   numSteps,
+                              int                   initialStep);
+
 #endif