Merge release-2019 into master
[alexxy/gromacs.git] src/gromacs/mdrun/runner.cpp
index 370cff95f94c657027b79afd16368f194ac41918..79ebf9fd33fc50042233446d27e68de5c780b7f6 100644
 #include <cstring>
 
 #include <algorithm>
+#include <memory>
 
 #include "gromacs/commandline/filenm.h"
-#include "gromacs/compat/make_unique.h"
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/domdec/localatomsetmanager.h"
-#include "gromacs/ewald/ewald-utils.h"
+#include "gromacs/domdec/partition.h"
+#include "gromacs/ewald/ewald_utils.h"
 #include "gromacs/ewald/pme.h"
-#include "gromacs/ewald/pme-gpu-program.h"
+#include "gromacs/ewald/pme_gpu_program.h"
 #include "gromacs/fileio/checkpoint.h"
 #include "gromacs/fileio/gmxfio.h"
 #include "gromacs/fileio/oenv.h"
 #include "gromacs/hardware/cpuinfo.h"
 #include "gromacs/hardware/detecthardware.h"
 #include "gromacs/hardware/printhardware.h"
-#include "gromacs/listed-forces/disre.h"
-#include "gromacs/listed-forces/gpubonded.h"
-#include "gromacs/listed-forces/orires.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/listed_forces/disre.h"
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/orires.h"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/utilities.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/boxdeformation.h"
+#include "gromacs/mdlib/broadcaststructs.h"
 #include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/dispersioncorrection.h"
+#include "gromacs/mdlib/enerdata_utils.h"
+#include "gromacs/mdlib/force.h"
 #include "gromacs/mdlib/forcerec.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
 #include "gromacs/mdlib/makeconstraints.h"
 #include "gromacs/mdlib/md_support.h"
 #include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/mdrun.h"
 #include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
-#include "gromacs/mdlib/nbnxn_search.h"
-#include "gromacs/mdlib/nbnxn_tuning.h"
 #include "gromacs/mdlib/ppforceworkload.h"
 #include "gromacs/mdlib/qmmm.h"
 #include "gromacs/mdlib/sighandler.h"
-#include "gromacs/mdlib/sim_util.h"
 #include "gromacs/mdlib/stophandler.h"
-#include "gromacs/mdrun/legacymdrunoptions.h"
-#include "gromacs/mdrun/logging.h"
-#include "gromacs/mdrun/multisim.h"
+#include "gromacs/mdrun/mdmodules.h"
 #include "gromacs/mdrun/simulationcontext.h"
-#include "gromacs/mdrunutility/mdmodules.h"
+#include "gromacs/mdrunutility/handlerestart.h"
+#include "gromacs/mdrunutility/logging.h"
+#include "gromacs/mdrunutility/multisim.h"
+#include "gromacs/mdrunutility/printtime.h"
 #include "gromacs/mdrunutility/threadaffinity.h"
 #include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/fcdata.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdrunoptions.h"
 #include "gromacs/mdtypes/observableshistory.h"
 #include "gromacs/mdtypes/state.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/pairlist_tuning.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/output.h"
 #include "gromacs/pulling/pull.h"
 #include "gromacs/taskassignment/resourcedivision.h"
 #include "gromacs/taskassignment/taskassignment.h"
 #include "gromacs/taskassignment/usergpuids.h"
+#include "gromacs/timing/gpu_timing.h"
 #include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/wallcyclereporting.h"
 #include "gromacs/topology/mtop_util.h"
 #include "gromacs/trajectory/trajectoryframe.h"
 #include "gromacs/utility/basenetwork.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/stringutil.h"
 
-#include "integrator.h"
+#include "legacysimulator.h"
 #include "replicaexchange.h"
 
 #if GMX_FAHCORE
 namespace gmx
 {
 
+/*! \brief Log if development feature flags are encountered
+ *
+ * The use of dev features indicated by environment variables is logged
+ * in order to ensure that runs with such features enabled can be identified
+ * from their log and standard output.
+ *
+ * \param[in]  mdlog        Logger object.
+ */
+static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
+{
+    const bool enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+    const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
+
+    if (enableGpuBufOps)
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
+    }
+
+    if (useGpuUpdateConstrain)
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
+    }
+}
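
The same getenv-plus-GMX_LOG pattern extends to further flags. A minimal sketch, assuming a placeholder variable name (GMX_HYPOTHETICAL_FEATURE is used only for illustration and is not an existing GROMACS variable):

static void reportHypotheticalFeature(const gmx::MDLogger &mdlog)
{
    // Placeholder environment variable; mirrors the reporting pattern used above.
    if (getenv("GMX_HYPOTHETICAL_FEATURE") != nullptr)
    {
        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                "NOTE: This run uses the 'hypothetical' feature, enabled by the GMX_HYPOTHETICAL_FEATURE environment variable.");
    }
}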
+
 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
  *
  * Used to ensure that the master thread does not modify mdrunner during copy
@@ -163,13 +197,13 @@ static void threadMpiMdrunnerAccessBarrier()
 
 Mdrunner Mdrunner::cloneOnSpawnedThread() const
 {
-    auto newRunner = Mdrunner();
+    auto newRunner = Mdrunner(std::make_unique<MDModules>());
 
     // All runners in the same process share a restraint manager resource because it is
     // part of the interface to the client code, which is associated only with the
     // original thread. Handles to the same resources can be obtained by copy.
     {
-        newRunner.restraintManager_ = compat::make_unique<RestraintManager>(*restraintManager_);
+        newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
     }
 
     // Copy original cr pointer before master thread can pass the thread barrier
@@ -192,7 +226,8 @@ Mdrunner Mdrunner::cloneOnSpawnedThread() const
     newRunner.replExParams        = replExParams;
     newRunner.pforce              = pforce;
     newRunner.ms                  = ms;
-    newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+    newRunner.startingBehavior    = startingBehavior;
+    newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
 
     threadMpiMdrunnerAccessBarrier();
 
@@ -239,7 +274,7 @@ t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const
 
 #if GMX_THREAD_MPI
     /* now spawn new threads that start mdrunner_start_fn(), while
-       the main thread returns, we set thread affinity later */
+       the main thread returns. Thread affinity is handled later. */
     if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE,
                      mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS)
     {
@@ -280,8 +315,8 @@ static void prepare_verlet_scheme(FILE                           *fplog,
         ListSetupType      listType  = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
         VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
 
-        real               rlist_new;
-        calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new);
+        const real         rlist_new =
+            calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
 
         if (rlist_new != ir->rlist)
         {
@@ -417,10 +452,144 @@ static TaskTarget findTaskTarget(const char *optionString)
     return returnValue;
 }
 
+//! Finish run, aggregate data to print performance info.
+static void finish_run(FILE *fplog,
+                       const gmx::MDLogger &mdlog,
+                       const t_commrec *cr,
+                       const t_inputrec *inputrec,
+                       t_nrnb nrnb[], gmx_wallcycle_t wcycle,
+                       gmx_walltime_accounting_t walltime_accounting,
+                       nonbonded_verlet_t *nbv,
+                       const gmx_pme_t *pme,
+                       gmx_bool bWriteStat)
+{
+    double  delta_t  = 0;
+    double  nbfs     = 0, mflop = 0;
+    double  elapsed_time,
+            elapsed_time_over_all_ranks,
+            elapsed_time_over_all_threads,
+            elapsed_time_over_all_threads_over_all_ranks;
+    /* Control whether it is valid to print a report. Only the
+       simulation master may print, but it should not do so if the run
+       terminated e.g. before a scheduled reset step. This is
+       complicated by the fact that PME ranks are unaware of the
+       reason why they were sent a pmerecvqxFINISH. To avoid
+       communication deadlocks, we always do the communication for the
+       report, even if we've decided not to write the report, because
+       how long it takes to finish the run is not important when we've
+       decided not to report on the simulation performance.
+
+       Further, we only report performance for dynamical integrators,
+       because those are the only ones for which we plan to
+       consider doing any optimizations. */
+    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
+
+    if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
+    {
+        GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
+        printReport = false;
+    }
+
+    t_nrnb                  *nrnb_tot;
+    std::unique_ptr<t_nrnb>  nrnbTotalStorage;
+    if (cr->nnodes > 1)
+    {
+        nrnbTotalStorage = std::make_unique<t_nrnb>();
+        nrnb_tot         = nrnbTotalStorage.get();
+#if GMX_MPI
+        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
+                      cr->mpi_comm_mysim);
+#endif
+    }
+    else
+    {
+        nrnb_tot = nrnb;
+    }
+
+    elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
+    elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
+    if (cr->nnodes > 1)
+    {
+#if GMX_MPI
+        /* reduce elapsed_time over all MPI ranks in the current simulation */
+        MPI_Allreduce(&elapsed_time,
+                      &elapsed_time_over_all_ranks,
+                      1, MPI_DOUBLE, MPI_SUM,
+                      cr->mpi_comm_mysim);
+        elapsed_time_over_all_ranks /= cr->nnodes;
+        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
+         * current simulation. */
+        MPI_Allreduce(&elapsed_time_over_all_threads,
+                      &elapsed_time_over_all_threads_over_all_ranks,
+                      1, MPI_DOUBLE, MPI_SUM,
+                      cr->mpi_comm_mysim);
+#endif
+    }
+    else
+    {
+        elapsed_time_over_all_ranks                  = elapsed_time;
+        elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
+    }
+
+    if (printReport)
+    {
+        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
+    }
+
+    if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
+    {
+        print_dd_statistics(cr, inputrec, fplog);
+    }
+
+    /* TODO Move the responsibility for any scaling by thread counts
+     * to the code that handled the thread region, so that there's a
+     * mechanism to keep cycle counting working during the transition
+     * to task parallelism. */
+    int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+    int nthreads_pme = gmx_omp_nthreads_get(emntPME);
+    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
+    auto cycle_sum(wallcycle_sum(cr, wcycle));
+
+    if (printReport)
+    {
+        auto                    nbnxn_gpu_timings = (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
+        gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
+
+        if (pme_gpu_task_enabled(pme))
+        {
+            pme_gpu_get_timings(pme, &pme_gpu_timings);
+        }
+        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
+                        elapsed_time_over_all_ranks,
+                        wcycle, cycle_sum,
+                        nbnxn_gpu_timings,
+                        &pme_gpu_timings);
+
+        if (EI_DYNAMICS(inputrec->eI))
+        {
+            delta_t = inputrec->delta_t;
+        }
+
+        if (fplog)
+        {
+            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
+                       elapsed_time_over_all_ranks,
+                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
+                       delta_t, nbfs, mflop);
+        }
+        if (bWriteStat)
+        {
+            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
+                       elapsed_time_over_all_ranks,
+                       walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
+                       delta_t, nbfs, mflop);
+        }
+    }
+}
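
The report logic above relies on every rank entering the reductions even when the report is suppressed. A minimal, self-contained sketch of that "always communicate, conditionally print" pattern in plain MPI (the names comm, isMaster, printReport and localTime are placeholders, not GROMACS API):

#include <cstdio>
#include <mpi.h>

static void reportAverageTime(MPI_Comm comm, bool isMaster, bool printReport,
                              double localTime)
{
    // All ranks reduce, so no rank is left blocked in MPI_Allreduce when
    // the master decides not to print.
    double summedTime = 0;
    int    numRanks   = 1;
    MPI_Comm_size(comm, &numRanks);
    MPI_Allreduce(&localTime, &summedTime, 1, MPI_DOUBLE, MPI_SUM, comm);
    if (isMaster && printReport)
    {
        std::printf("Average wall time: %g s\n", summedTime / numRanks);
    }
}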
+
 int Mdrunner::mdrunner()
 {
     matrix                    box;
-    t_nrnb                   *nrnb;
     t_forcerec               *fr               = nullptr;
     t_fcdata                 *fcd              = nullptr;
     real                      ewaldcoeff_q     = 0;
@@ -428,15 +597,11 @@ int Mdrunner::mdrunner()
     int                       nChargePerturbed = -1, nTypePerturbed = 0;
     gmx_wallcycle_t           wcycle;
     gmx_walltime_accounting_t walltime_accounting = nullptr;
-    int                       rc;
-    int64_t                   reset_counters;
-    int                       nthreads_pme = 1;
-    gmx_membed_t *            membed       = nullptr;
-    gmx_hw_info_t            *hwinfo       = nullptr;
+    gmx_membed_t *            membed              = nullptr;
+    gmx_hw_info_t            *hwinfo              = nullptr;
 
     /* CAUTION: threads may be started later on in this function, so
        cr doesn't reflect the final parallel state right now */
-    std::unique_ptr<gmx::MDModules> mdModules(new gmx::MDModules);
     t_inputrec                      inputrecInstance;
     t_inputrec                     *inputrec = &inputrecInstance;
     gmx_mtop_t                      mtop;
@@ -447,30 +612,11 @@ int Mdrunner::mdrunner()
     // Handle task-assignment related user options.
     EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ?
                                                EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
-    std::vector<int>    gpuIdsAvailable;
-    try
-    {
-        gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable);
-        // TODO We could put the GPU IDs into a std::map to find
-        // duplicates, but for the small numbers of IDs involved, this
-        // code is simple and fast.
-        for (size_t i = 0; i != gpuIdsAvailable.size(); ++i)
-        {
-            for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j)
-            {
-                if (gpuIdsAvailable[i] == gpuIdsAvailable[j])
-                {
-                    GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str())));
-                }
-            }
-        }
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 
     std::vector<int> userGpuTaskAssignment;
     try
     {
-        userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment);
+        userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     auto       nonbondedTarget = findTaskTarget(nbpu_opt);
@@ -487,126 +633,45 @@ int Mdrunner::mdrunner()
     // to check that the old log file matches what the checkpoint file
     // expects. Otherwise, we should start to write log output now if
     // there is a file ready for it.
-    if (logFileHandle != nullptr && !mdrunOptions.continuationOptions.appendFiles)
+    if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
     {
         fplog = gmx_fio_getfp(logFileHandle);
     }
     gmx::LoggerOwner logOwner(buildLogger(fplog, cr));
     gmx::MDLogger    mdlog(logOwner.logger());
 
-    // TODO The thread-MPI master rank makes a working
-    // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
-    // after the threads have been launched. This works because no use
-    // is made of that communicator until after the execution paths
-    // have rejoined. But it is likely that we can improve the way
-    // this is expressed, e.g. by expressly running detection only the
-    // master rank for thread-MPI, rather than relying on the mutex
-    // and reference count.
+    // report any development features that may be enabled by environment variables
+    reportDevelopmentFeatures(mdlog);
+
+    // With thread-MPI, the communicator changes after threads are
+    // launched, so this is rebuilt for the master rank at that
+    // time. The non-master ranks are fine to keep the one made here.
     PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
     hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
 
-    gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo);
+    gmx_print_detected_hardware(fplog, isMasterSimMasterRank(ms, MASTER(cr)), mdlog, hwinfo);
 
-    std::vector<int> gpuIdsToUse;
-    auto             compatibleGpus = getCompatibleGpus(hwinfo->gpu_info);
-    if (gpuIdsAvailable.empty())
-    {
-        gpuIdsToUse = compatibleGpus;
-    }
-    else
-    {
-        for (const auto &availableGpuId : gpuIdsAvailable)
-        {
-            bool availableGpuIsCompatible = false;
-            for (const auto &compatibleGpuId : compatibleGpus)
-            {
-                if (availableGpuId == compatibleGpuId)
-                {
-                    availableGpuIsCompatible = true;
-                    break;
-                }
-            }
-            if (!availableGpuIsCompatible)
-            {
-                gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId);
-            }
-            gpuIdsToUse.push_back(availableGpuId);
-        }
-    }
+    std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
 
-    if (fplog != nullptr)
-    {
-        /* Print references after all software/hardware printing */
-        please_cite(fplog, "Abraham2015");
-        please_cite(fplog, "Pall2015");
-        please_cite(fplog, "Pronk2013");
-        please_cite(fplog, "Hess2008b");
-        please_cite(fplog, "Spoel2005a");
-        please_cite(fplog, "Lindahl2001a");
-        please_cite(fplog, "Berendsen95a");
-        writeSourceDoi(fplog);
-    }
+    // Print citation requests after all software/hardware printing
+    pleaseCiteGromacs(fplog);
 
     std::unique_ptr<t_state> globalState;
 
     if (SIMMASTER(cr))
     {
         /* Only the master rank has the global state */
-        globalState = compat::make_unique<t_state>();
+        globalState = std::make_unique<t_state>();
 
         /* Read (nearly) all data required for the simulation */
         read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), inputrec, globalState.get(), &mtop);
-
-        /* In rerun, set velocities to zero if present */
-        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
-        {
-            // rerun does not use velocities
-            GMX_LOG(mdlog.info).asParagraph().appendText(
-                    "Rerun trajectory contains velocities. Rerun does only evaluate "
-                    "potential energy and forces. The velocities will be ignored.");
-            for (int i = 0; i < globalState->natoms; i++)
-            {
-                clear_rvec(globalState->v[i]);
-            }
-            globalState->flags &= ~(1 << estV);
-        }
-
-        if (inputrec->cutoff_scheme != ecutsVERLET)
-        {
-            if (nstlist_cmdline > 0)
-            {
-                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
-            }
-
-            if (!compatibleGpus.empty())
-            {
-                GMX_LOG(mdlog.warning).asParagraph().appendText(
-                        "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
-                        "      To use a GPU, set the mdp option: cutoff-scheme = Verlet");
-            }
-        }
     }
 
     /* Check and update the hardware options for internal consistency */
-    check_and_update_hw_opt_1(mdlog, &hw_opt, cr, domdecOptions.numPmeRanks);
-
-    /* Early check for externally set process affinity. */
-    gmx_check_thread_affinity_set(mdlog, cr,
-                                  &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+    checkAndUpdateHardwareOptions(mdlog, &hw_opt, SIMMASTER(cr), domdecOptions.numPmeRanks);
 
     if (GMX_THREAD_MPI && SIMMASTER(cr))
     {
-        if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0)
-        {
-            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
-        }
-
-        /* Since the master knows the cut-off scheme, update hw_opt for this.
-         * This is done later for normal MPI and also once more with tMPI
-         * for all tMPI ranks.
-         */
-        check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
-
         bool useGpuForNonbonded = false;
         bool useGpuForPme       = false;
         try
@@ -618,7 +683,6 @@ int Mdrunner::mdrunner()
             useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi
                     (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
                     canUseGpuForNonbonded,
-                    inputrec->cutoff_scheme == ecutsVERLET,
                     gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI),
                     hw_opt.nthreads_tmpi);
             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
@@ -683,12 +747,10 @@ int Mdrunner::mdrunner()
         // handle. If unsuitable, we will notice that during task
         // assignment.
         bool gpusWereDetected      = hwinfo->ngpu_compatible_tot > 0;
-        bool usingVerletScheme     = inputrec->cutoff_scheme == ecutsVERLET;
         auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
         useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
                                                                 emulateGpuNonbonded,
                                                                 canUseGpuForNonbonded,
-                                                                usingVerletScheme,
                                                                 gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI),
                                                                 gpusWereDetected);
         useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
@@ -697,7 +759,7 @@ int Mdrunner::mdrunner()
                                                     gpusWereDetected);
         auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
         useGpuForBonded =
-            decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme, usingVerletScheme,
+            decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
                                             bondedTarget, canUseGpuForBonded,
                                             EVDW_PME(inputrec->vdwtype),
                                             EEL_PME_EWALD(inputrec->coulombtype),
@@ -722,17 +784,17 @@ int Mdrunner::mdrunner()
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
     // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
-    // factory functions from the SimulationContext on which to call mdModules->add().
+    // factory functions from the SimulationContext on which to call mdModules_->add().
     // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
     for (auto && restraint : restraintManager_->getRestraints())
     {
         auto module = RestraintMDModule::create(restraint,
                                                 restraint->sites());
-        mdModules->add(std::move(module));
+        mdModules_->add(std::move(module));
     }
 
     // TODO: Error handling
-    mdModules->assignOptionsToModules(*inputrec->params, nullptr);
+    mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
 
     if (fplog != nullptr)
     {
@@ -742,6 +804,20 @@ int Mdrunner::mdrunner()
 
     if (SIMMASTER(cr))
     {
+        /* In rerun, set velocities to zero if present */
+        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
+        {
+            // rerun does not use velocities
+            GMX_LOG(mdlog.info).asParagraph().appendText(
+                    "Rerun trajectory contains velocities. Rerun only evaluates "
+                    "potential energy and forces. The velocities will be ignored.");
+            for (int i = 0; i < globalState->natoms; i++)
+            {
+                clear_rvec(globalState->v[i]);
+            }
+            globalState->flags &= ~(1 << estV);
+        }
+
         /* now make sure the state is initialized and propagated */
         set_state_entries(globalState.get(), inputrec);
     }
@@ -753,7 +829,7 @@ int Mdrunner::mdrunner()
     {
         if (!MASTER(cr))
         {
-            globalState = compat::make_unique<t_state>();
+            globalState = std::make_unique<t_state>();
         }
         broadcastStateWithoutDynamics(cr, globalState.get());
     }
@@ -784,11 +860,6 @@ int Mdrunner::mdrunner()
         gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
     }
 
-    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
-    {
-        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
-    }
-
     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
     {
         if (domdecOptions.numPmeRanks > 0)
@@ -846,15 +917,11 @@ int Mdrunner::mdrunner()
 
     ObservablesHistory   observablesHistory = {};
 
-    ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
-
-    if (continuationOptions.startedFromCheckpoint)
+    if (startingBehavior != StartingBehavior::NewSimulation)
     {
         /* Check if checkpoint file exists before doing continuation.
          * This way we can use identical input options for the first and subsequent runs...
          */
-        gmx_bool bReadEkin;
-
         if (mdrunOptions.numStepsCommandline > -2)
         {
             /* Temporarily set the number of steps to unlimited to avoid
@@ -868,17 +935,10 @@ int Mdrunner::mdrunner()
                         logFileHandle,
                         cr, domdecOptions.numCells,
                         inputrec, globalState.get(),
-                        &bReadEkin, &observablesHistory,
-                        continuationOptions.appendFiles,
-                        continuationOptions.appendFilesOptionSet,
+                        &observablesHistory,
                         mdrunOptions.reproducible);
 
-        if (bReadEkin)
-        {
-            continuationOptions.haveReadEkin = true;
-        }
-
-        if (continuationOptions.appendFiles && logFileHandle)
+        if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
         {
             // Now we can start normal logging to the truncated log file.
             fplog    = gmx_fio_getfp(logFileHandle);
@@ -907,12 +967,13 @@ int Mdrunner::mdrunner()
         gmx_bcast(sizeof(box), box, cr);
     }
 
-    /* Update rlist and nstlist. */
-    if (inputrec->cutoff_scheme == ecutsVERLET)
+    if (inputrec->cutoff_scheme != ecutsVERLET)
     {
-        prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
-                              useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
+        gmx_fatal(FARGS, "This group-scheme .tpr file can no longer be run by mdrun. Please update to the Verlet scheme, or use an earlier version of GROMACS if necessary.");
     }
+    /* Update rlist and nstlist. */
+    prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
+                          useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
 
     LocalAtomSetManager atomSets;
 
@@ -966,9 +1027,13 @@ int Mdrunner::mdrunner()
     fflush(stderr);
 #endif
 
-    /* Check and update hw_opt for the cut-off scheme */
-    check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
-
+    // mdrun -pin auto honors any affinity setting that already
+    // exists. It is nice to provide feedback about whether that
+    // existing affinity setting came from OpenMP or something else,
+    // so we run this check both before and after we initialize the
+    // OpenMP support.
+    gmx_check_thread_affinity_set(mdlog,
+                                  &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
     /* Check and update the number of OpenMP threads requested */
     checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
                                             pmeRunMode, mtop);
@@ -978,17 +1043,16 @@ int Mdrunner::mdrunner()
                           physicalNodeComm.size_,
                           hw_opt.nthreads_omp,
                           hw_opt.nthreads_omp_pme,
-                          !thisRankHasDuty(cr, DUTY_PP),
-                          inputrec->cutoff_scheme == ecutsVERLET);
+                          !thisRankHasDuty(cr, DUTY_PP));
 
-    // Enable FP exception detection for the Verlet scheme, but not in
+    // Enable FP exception detection, but not in
     // Release mode and not for compilers with known buggy FP
     // exception support (clang with any optimization) or suspected
     // buggy FP exception support (gcc 7.* with optimization).
 #if !defined NDEBUG && \
     !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
     && defined __OPTIMIZE__)
-    const bool bEnableFPE = inputrec->cutoff_scheme == ecutsVERLET;
+    const bool bEnableFPE = true;
 #else
     const bool bEnableFPE = false;
 #endif
@@ -1038,11 +1102,11 @@ int Mdrunner::mdrunner()
             }
             else if (nonbondedTarget == TaskTarget::Gpu)
             {
-                gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected.");
+                gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because no GPU is detected.");
             }
             else if (bondedTarget == TaskTarget::Gpu)
             {
-                gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because there is none detected.");
+                gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because no GPU is detected.");
             }
         }
     }
@@ -1057,7 +1121,7 @@ int Mdrunner::mdrunner()
             }
             else if (pmeTarget == TaskTarget::Gpu)
             {
-                gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected.");
+                gmx_fatal(FARGS, "Cannot run PME on a GPU because no GPU is detected.");
             }
         }
     }
@@ -1148,34 +1212,23 @@ int Mdrunner::mdrunner()
         }
     }
 
-    /* getting number of PP/PME threads
+    /* Get the number of PP/PME threads on this MPI / tMPI rank.
        PME: env variable should be read only on one node to make sure it is
        identical everywhere;
      */
-    nthreads_pme = gmx_omp_nthreads_get(emntPME);
-
-    int numThreadsOnThisRank;
-    /* threads on this MPI process or TMPI thread */
-    if (thisRankHasDuty(cr, DUTY_PP))
-    {
-        numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded);
-    }
-    else
-    {
-        numThreadsOnThisRank = nthreads_pme;
-    }
-
+    const int numThreadsOnThisRank =
+        thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded) : gmx_omp_nthreads_get(emntPME);
     checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
                                   *hwinfo->hardwareTopology,
                                   physicalNodeComm, mdlog);
 
-    if (hw_opt.thread_affinity != threadaffOFF)
+    if (hw_opt.threadAffinity != ThreadAffinity::Off)
     {
         /* Before setting affinity, check whether the affinity has changed
          * - which indicates that probably the OpenMP library has changed it
          * since we first checked.
          */
-        gmx_check_thread_affinity_set(mdlog, cr,
+        gmx_check_thread_affinity_set(mdlog,
                                       &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
 
         int numThreadsOnThisNode, intraNodeThreadOffset;
@@ -1199,7 +1252,7 @@ int Mdrunner::mdrunner()
     {
         /* Master synchronizes its value of reset_counters with all nodes
          * including PME only nodes */
-        reset_counters = wcycle_get_reset_counters(wcycle);
+        int64_t reset_counters = wcycle_get_reset_counters(wcycle);
         gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
         wcycle_set_reset_counters(wcycle, reset_counters);
     }
@@ -1222,12 +1275,12 @@ int Mdrunner::mdrunner()
     std::unique_ptr<MDAtoms>     mdAtoms;
     std::unique_ptr<gmx_vsite_t> vsite;
 
-    snew(nrnb, 1);
+    t_nrnb nrnb;
     if (thisRankHasDuty(cr, DUTY_PP))
     {
         /* Initiate forcerecord */
-        fr                 = mk_forcerec();
-        fr->forceProviders = mdModules->initForceProviders();
+        fr                 = new t_forcerec;
+        fr->forceProviders = mdModules_->initForceProviders();
         init_forcerec(fplog, mdlog, fr, fcd,
                       inputrec, &mtop, cr, box,
                       opt2fn("-table", filenames.size(), filenames.data()),
@@ -1236,7 +1289,8 @@ int Mdrunner::mdrunner()
                       *hwinfo, nonbondedDeviceInfo,
                       useGpuForBonded,
                       FALSE,
-                      pforce);
+                      pforce,
+                      wcycle);
 
         /* Initialize the mdAtoms structure.
          * mdAtoms is not filled with atom data,
@@ -1326,10 +1380,10 @@ int Mdrunner::mdrunner()
                 pmedata = gmx_pme_init(cr,
                                        getNumPmeDomains(cr->dd),
                                        inputrec,
-                                       mtop.natoms, nChargePerturbed != 0, nTypePerturbed != 0,
+                                       nChargePerturbed != 0, nTypePerturbed != 0,
                                        mdrunOptions.reproducible,
                                        ewaldcoeff_q, ewaldcoeff_lj,
-                                       nthreads_pme,
+                                       gmx_omp_nthreads_get(emntPME),
                                        pmeRunMode, nullptr,
                                        pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
             }
@@ -1348,6 +1402,7 @@ int Mdrunner::mdrunner()
         signal_handler_install();
     }
 
+    pull_t *pull_work = nullptr;
     if (thisRankHasDuty(cr, DUTY_PP))
     {
         /* Assumes uniform use of the number of OpenMP threads */
@@ -1356,18 +1411,18 @@ int Mdrunner::mdrunner()
         if (inputrec->bPull)
         {
             /* Initialize pull code */
-            inputrec->pull_work =
+            pull_work =
                 init_pull(fplog, inputrec->pull, inputrec,
                           &mtop, cr, &atomSets, inputrec->fepvals->init_lambda);
             if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
             {
-                initPullHistory(inputrec->pull_work, &observablesHistory);
+                initPullHistory(pull_work, &observablesHistory);
             }
             if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
             {
-                init_pull_output_files(inputrec->pull_work,
+                init_pull_output_files(pull_work,
                                        filenames.size(), filenames.data(), oenv,
-                                       continuationOptions);
+                                       startingBehavior);
             }
         }
 
@@ -1384,15 +1439,19 @@ int Mdrunner::mdrunner()
                                         globalState.get(),
                                         &mtop,
                                         oenv,
-                                        mdrunOptions);
+                                        mdrunOptions,
+                                        startingBehavior);
         }
 
+        t_swap *swap = nullptr;
         if (inputrec->eSwapCoords != eswapNO)
         {
             /* Initialize ion swapping code */
-            init_swapcoords(fplog, inputrec, opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
-                            &mtop, globalState.get(), &observablesHistory,
-                            cr, &atomSets, oenv, mdrunOptions);
+            swap = init_swapcoords(fplog, inputrec,
+                                   opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
+                                   &mtop, globalState.get(), &observablesHistory,
+                                   cr, &atomSets, oenv, mdrunOptions,
+                                   startingBehavior);
         }
 
         /* Let makeConstraints know whether we have essential dynamics constraints.
@@ -1400,9 +1459,18 @@ int Mdrunner::mdrunner()
          */
         bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr
                                     || observablesHistory.edsamHistory);
-        auto constr              = makeConstraints(mtop, *inputrec, doEssentialDynamics,
+        auto constr              = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics,
                                                    fplog, *mdAtoms->mdatoms(),
-                                                   cr, ms, nrnb, wcycle, fr->bMolPBC);
+                                                   cr, ms, &nrnb, wcycle, fr->bMolPBC);
+
+        /* Energy terms and groups */
+        gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(), inputrec->fepvals->n_lambda);
+
+        /* Set up interactive MD (IMD) */
+        auto imdSession = makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
+                                         MASTER(cr) ? globalState->x.rvec_array() : nullptr,
+                                         filenames.size(), filenames.data(), oenv, mdrunOptions.imdOptions,
+                                         startingBehavior);
 
         if (DOMAINDECOMP(cr))
         {
@@ -1422,41 +1490,43 @@ int Mdrunner::mdrunner()
         // understood.
         PpForceWorkload ppForceWorkload;
 
-        GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to integrator.");
+        GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
         /* Now do whatever the user wants us to do (how flexible...) */
-        Integrator integrator {
+        LegacySimulator simulator {
             fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
             oenv,
             mdrunOptions,
+            startingBehavior,
             vsite.get(), constr.get(),
             enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
             deform.get(),
-            mdModules->outputProvider(),
-            inputrec, &mtop,
+            mdModules_->outputProvider(),
+            inputrec, imdSession.get(), pull_work, swap, &mtop,
             fcd,
             globalState.get(),
             &observablesHistory,
-            mdAtoms.get(), nrnb, wcycle, fr,
+            mdAtoms.get(), &nrnb, wcycle, fr,
+            &enerd,
             &ppForceWorkload,
             replExParams,
             membed,
             walltime_accounting,
             std::move(stopHandlerBuilder_)
         };
-        integrator.run(inputrec->eI, doRerun);
+        simulator.run(inputrec->eI, doRerun);
 
         if (inputrec->bPull)
         {
-            finish_pull(inputrec->pull_work);
+            finish_pull(pull_work);
         }
-
+        finish_swapcoords(swap);
     }
     else
     {
         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
         /* do PME only */
         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
-        gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
+        gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
     }
 
     wallcycle_stop(wcycle, ewcRUN);
@@ -1465,12 +1535,15 @@ int Mdrunner::mdrunner()
      * if rerunMD, don't write last frame again
      */
     finish_run(fplog, mdlog, cr,
-               inputrec, nrnb, wcycle, walltime_accounting,
-               fr ? fr->nbv : nullptr,
+               inputrec, &nrnb, wcycle, walltime_accounting,
+               fr ? fr->nbv.get() : nullptr,
                pmedata,
                EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
 
-    // Free PME data
+    // Clean up the cycle counter
+    wallcycle_destroy(wcycle);
+
+    // Free PME data
     if (pmedata)
     {
         gmx_pme_destroy(pmedata);
@@ -1483,13 +1556,13 @@ int Mdrunner::mdrunner()
     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
     mdAtoms.reset(nullptr);
     globalState.reset(nullptr);
-    mdModules.reset(nullptr);   // destruct force providers here as they might also use the GPU
+    mdModules_.reset(nullptr);   // destruct force providers here as they might also use the GPU
 
     /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
-    free_gpu_resources(fr, physicalNodeComm);
+    free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
     free_gpu(nonbondedDeviceInfo);
     free_gpu(pmeDeviceInfo);
-    done_forcerec(fr, mtop.molblock.size(), mtop.groups.grps[egcENER].nr);
+    done_forcerec(fr, mtop.molblock.size());
     sfree(fcd);
 
     if (doMembed)
@@ -1497,12 +1570,9 @@ int Mdrunner::mdrunner()
         free_membed(membed);
     }
 
-    gmx_hardware_info_free();
-
     /* Does what it says */
     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
     walltime_accounting_destroy(walltime_accounting);
-    sfree(nrnb);
 
     // Ensure log file content is written
     if (logFileHandle)
@@ -1517,7 +1587,7 @@ int Mdrunner::mdrunner()
         gmx_fedisableexcept();
     }
 
-    rc = static_cast<int>(gmx_get_stop_condition());
+    auto rc = static_cast<int>(gmx_get_stop_condition());
 
 #if GMX_THREAD_MPI
     /* we need to join all threads. The sub-threads join when they
@@ -1525,11 +1595,11 @@ int Mdrunner::mdrunner()
        wait for that. */
     if (PAR(cr) && MASTER(cr))
     {
-        done_commrec(cr);
         tMPI_Finalize();
     }
+    // TODO: free commrec in MPI simulations
+    done_commrec(cr);
 #endif
-
     return rc;
 }
 
@@ -1561,6 +1631,11 @@ void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller,
                                  std::move(name));
 }
 
+Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules)
+    : mdModules_(std::move(mdModules))
+{
+}
+
 Mdrunner::Mdrunner(Mdrunner &&) noexcept = default;
 
 //NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
@@ -1570,11 +1645,13 @@ class Mdrunner::BuilderImplementation
 {
     public:
         BuilderImplementation() = delete;
-        explicit BuilderImplementation(SimulationContext* context);
+        BuilderImplementation(std::unique_ptr<MDModules> mdModules,
+                              SimulationContext        * context);
         ~BuilderImplementation();
 
         BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options,
-                                                    real                forceWarningThreshold);
+                                                    real                forceWarningThreshold,
+                                                    StartingBehavior    startingBehavior);
 
         void addDomdec(const DomdecOptions &options);
 
@@ -1603,6 +1680,7 @@ class Mdrunner::BuilderImplementation
         Mdrunner build();
 
     private:
+
         // Default parameters copied from runner.h
         // \todo Clarify source(s) of default parameters.
 
@@ -1626,6 +1704,12 @@ class Mdrunner::BuilderImplementation
         //! Print a warning if any force is larger than this (in kJ/mol nm).
         real forceWarningThreshold_ = -1;
 
+        //! Whether the simulation will start afresh, or restart with/without appending.
+        StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
+
+        //! The modules that comprise the functionality of mdrun.
+        std::unique_ptr<MDModules> mdModules_;
+
         /*! \brief  Non-owning pointer to SimulationContext (owned and managed by client)
          *
          * \internal
@@ -1665,7 +1749,9 @@ class Mdrunner::BuilderImplementation
         std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
 };
 
-Mdrunner::BuilderImplementation::BuilderImplementation(SimulationContext* context) :
+Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
+                                                       SimulationContext        * context) :
+    mdModules_(std::move(mdModules)),
     context_(context)
 {
     GMX_ASSERT(context_, "Bug found. It should not be possible to construct builder without a valid context.");
@@ -1674,11 +1760,13 @@ Mdrunner::BuilderImplementation::BuilderImplementation(SimulationContext* contex
 Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
 
 Mdrunner::BuilderImplementation &
-Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions &options,
-                                                      real                forceWarningThreshold)
+Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions    &options,
+                                                      const real             forceWarningThreshold,
+                                                      const StartingBehavior startingBehavior)
 {
     mdrunOptions_          = options;
     forceWarningThreshold_ = forceWarningThreshold;
+    startingBehavior_      = startingBehavior;
     return *this;
 }
 
@@ -1699,17 +1787,19 @@ void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangePa
 
 void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim)
 {
-    multisim_ = compat::make_unique<gmx_multisim_t*>(multisim);
+    multisim_ = std::make_unique<gmx_multisim_t*>(multisim);
 }
 
 Mdrunner Mdrunner::BuilderImplementation::build()
 {
-    auto newRunner = Mdrunner();
+    auto newRunner = Mdrunner(std::move(mdModules_));
 
     GMX_ASSERT(context_, "Bug found. It should not be possible to call build() without a valid context.");
 
-    newRunner.mdrunOptions    = mdrunOptions_;
-    newRunner.domdecOptions   = domdecOptions_;
+    newRunner.mdrunOptions          = mdrunOptions_;
+    newRunner.pforce                = forceWarningThreshold_;
+    newRunner.startingBehavior      = startingBehavior_;
+    newRunner.domdecOptions         = domdecOptions_;
 
     // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
     newRunner.hw_opt          = hardwareOptions_;
@@ -1776,7 +1866,7 @@ Mdrunner Mdrunner::BuilderImplementation::build()
         GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
     }
 
-    newRunner.restraintManager_ = compat::make_unique<gmx::RestraintManager>();
+    newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
 
     if (stopHandlerBuilder_)
     {
@@ -1784,7 +1874,7 @@ Mdrunner Mdrunner::BuilderImplementation::build()
     }
     else
     {
-        newRunner.stopHandlerBuilder_ = compat::make_unique<StopHandlerBuilder>();
+        newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
     }
 
     return newRunner;
@@ -1832,17 +1922,19 @@ void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<Stop
     stopHandlerBuilder_ = std::move(builder);
 }
 
-MdrunnerBuilder::MdrunnerBuilder(compat::not_null<SimulationContext*> context) :
-    impl_ {gmx::compat::make_unique<Mdrunner::BuilderImplementation>(context)}
+MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
+                                 compat::not_null<SimulationContext*> context) :
+    impl_ {std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context)}
 {
 }
 
 MdrunnerBuilder::~MdrunnerBuilder() = default;
 
-MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions &options,
-                                                      real                forceWarningThreshold)
+MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions    &options,
+                                                      real                   forceWarningThreshold,
+                                                      const StartingBehavior startingBehavior)
 {
-    impl_->setExtraMdrunOptions(options, forceWarningThreshold);
+    impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
     return *this;
 }
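
A minimal sketch of how client code might drive the extended builder interface, assuming an existing SimulationContext named simulationContext and MdrunOptions named options, that compat::make_not_null is available from the compat pointers header, that the public builder forwards build() to the implementation shown above, and that the remaining required add*() calls (e.g. addBondedTaskAssignment(), per the check in build()) are also made:

// Sketch only: simulationContext and options are assumed to exist.
auto builder = MdrunnerBuilder(std::make_unique<MDModules>(),
                               compat::make_not_null(&simulationContext));
builder.addSimulationMethod(options, /* forceWarningThreshold = */ -1,
                            StartingBehavior::NewSimulation);
// ... remaining required add*() calls, e.g. addBondedTaskAssignment() ...
Mdrunner runner = builder.build();
runner.mdrunner();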