Disable PME Mixed mode with FEP

[alexxy/gromacs.git] / src / gromacs / mdrun / runner.cpp
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 903d64f4322aca6115b5f8e84bfa2e09ec2b12ec..423a90905cbb3eb86779f38271287c9ee0df3912 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -73,11 +73,11 @@
  #include "gromacs/fileio/tpxio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/device_context.h"
  #include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/hardware/cpuinfo.h"
  #include "gromacs/hardware/detecthardware.h"
  #include "gromacs/hardware/device_management.h"
+#include "gromacs/hardware/hardwaretopology.h"
  #include "gromacs/hardware/printhardware.h"
  #include "gromacs/imd/imd.h"
  #include "gromacs/listed_forces/disre.h"
@@ -95,6 +95,7 @@
  #include "gromacs/mdlib/force.h"
  #include "gromacs/mdlib/forcerec.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/gpuforcereduction.h"
  #include "gromacs/mdlib/makeconstraints.h"
  #include "gromacs/mdlib/md_support.h"
  #include "gromacs/mdlib/mdatoms.h"
@@ -105,11 +106,14 @@
  #include "gromacs/mdlib/vsite.h"
  #include "gromacs/mdrun/mdmodules.h"
  #include "gromacs/mdrun/simulationcontext.h"
+#include "gromacs/mdrun/simulationinput.h"
+#include "gromacs/mdrun/simulationinputhandle.h"
  #include "gromacs/mdrunutility/handlerestart.h"
  #include "gromacs/mdrunutility/logging.h"
  #include "gromacs/mdrunutility/multisim.h"
  #include "gromacs/mdrunutility/printtime.h"
  #include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/checkpointdata.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/enerdata.h"
  #include "gromacs/mdtypes/fcdata.h"
@@ -168,10 +172,6 @@
  #include "replicaexchange.h"
  #include "simulatorbuilder.h"
  
-#if GMX_FAHCORE
-#    include "corewrap.h"
-#endif
-
  namespace gmx
  {
  
@@ -206,6 +206,7 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
      devFlags.enableGpuBufferOps =
              GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
      devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
+    devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
      devFlags.enableGpuPmePPComm =
              GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
  
@@ -266,6 +267,15 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
      {
          if (pmeRunMode == PmeRunMode::GPU)
          {
+            if (!devFlags.enableGpuBufferOps)
+            {
+                GMX_LOG(mdlog.warning)
+                        .asParagraph()
+                        .appendTextFormatted(
+                                "Enabling GPU buffer operations required by GMX_GPU_PME_PP_COMMS "
+                                "(equivalent with GMX_USE_GPU_BUFFER_OPS=1).");
+                devFlags.enableGpuBufferOps = true;
+            }
              GMX_LOG(mdlog.warning)
                      .asParagraph()
                      .appendTextFormatted(
@@ -326,6 +336,7 @@ Mdrunner Mdrunner::cloneOnSpawnedThread() const
      newRunner.hw_opt    = hw_opt;
      newRunner.filenames = filenames;
  
+    newRunner.hwinfo_         = hwinfo_;
      newRunner.oenv            = oenv;
      newRunner.mdrunOptions    = mdrunOptions;
      newRunner.domdecOptions   = domdecOptions;
@@ -339,10 +350,12 @@ Mdrunner Mdrunner::cloneOnSpawnedThread() const
      newRunner.pforce          = pforce;
      // Give the spawned thread the newly created valid communicator
      // for the simulation.
-    newRunner.communicator        = MPI_COMM_WORLD;
-    newRunner.ms                  = ms;
-    newRunner.startingBehavior    = startingBehavior;
-    newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+    newRunner.libraryWorldCommunicator = MPI_COMM_WORLD;
+    newRunner.simulationCommunicator   = MPI_COMM_WORLD;
+    newRunner.ms                       = ms;
+    newRunner.startingBehavior         = startingBehavior;
+    newRunner.stopHandlerBuilder_      = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+    newRunner.inputHolder_             = inputHolder_;
  
      threadMpiMdrunnerAccessBarrier();
  
@@ -384,7 +397,8 @@ void Mdrunner::spawnThreads(int numThreadsToLaunch)
  
      // Give the master thread the newly created valid communicator for
      // the simulation.
-    communicator = MPI_COMM_WORLD;
+    libraryWorldCommunicator = MPI_COMM_WORLD;
+    simulationCommunicator   = MPI_COMM_WORLD;
      threadMpiMdrunnerAccessBarrier();
  #else
      GMX_UNUSED_VALUE(numThreadsToLaunch);
@@ -712,7 +726,6 @@ int Mdrunner::mdrunner()
      gmx_wallcycle_t           wcycle;
      gmx_walltime_accounting_t walltime_accounting = nullptr;
      MembedHolder              membedHolder(filenames.size(), filenames.data());
-    gmx_hw_info_t*            hwinfo = nullptr;
  
      /* CAUTION: threads may be started later on in this function, so
         cr doesn't reflect the final parallel state right now */
@@ -747,24 +760,14 @@ int Mdrunner::mdrunner()
      {
          fplog = gmx_fio_getfp(logFileHandle);
      }
-    const bool       isSimulationMasterRank = findIsSimulationMasterRank(ms, communicator);
+    const bool isSimulationMasterRank = findIsSimulationMasterRank(ms, simulationCommunicator);
      gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
      gmx::MDLogger    mdlog(logOwner.logger());
  
-    // TODO The thread-MPI master rank makes a working
-    // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
-    // after the threads have been launched. This works because no use
-    // is made of that communicator until after the execution paths
-    // have rejoined. But it is likely that we can improve the way
-    // this is expressed, e.g. by expressly running detection only the
-    // master rank for thread-MPI, rather than relying on the mutex
-    // and reference count.
-    PhysicalNodeCommunicator physicalNodeComm(communicator, gmx_physicalnode_id_hash());
-    hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
-
-    gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo);
+    gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
  
-    std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
+    std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo_->deviceInfoList, hw_opt.gpuIdsAvailable);
+    const int        numDevicesToUse = gmx::ssize(gpuIdsToUse);
  
      // Print citation requests after all software/hardware printing
      pleaseCiteGromacs(fplog);
@@ -786,8 +789,8 @@ int Mdrunner::mdrunner()
          /* Read (nearly) all data required for the simulation
           * and keep the partly serialized tpr contents to send to other ranks later
           */
-        *partialDeserializedTpr = read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()),
-                                                 inputrec.get(), globalState.get(), &mtop);
+        applyGlobalSimulationState(*inputHolder_.get(), partialDeserializedTpr.get(),
+                                   globalState.get(), inputrec.get(), &mtop);
      }
  
      /* Check and update the hardware options for internal consistency */
@@ -807,13 +810,13 @@ int Mdrunner::mdrunner()
              // the number of GPUs to choose the number of ranks.
              auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
              useGpuForNonbonded         = decideWhetherToUseGpusForNonbondedWithThreadMpi(
-                    nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
+                    nonbondedTarget, numDevicesToUse, userGpuTaskAssignment, emulateGpuNonbonded,
                      canUseGpuForNonbonded,
                      gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
                      hw_opt.nthreads_tmpi);
              useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
-                    useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
-                    *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+                    useGpuForNonbonded, pmeTarget, pmeFftTarget, numDevicesToUse, userGpuTaskAssignment,
+                    *hwinfo_, *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
          }
          GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  
@@ -823,22 +826,29 @@ int Mdrunner::mdrunner()
           * prevent any possible subsequent checks from working
           * correctly. */
          hw_opt.nthreads_tmpi =
-                get_nthreads_mpi(hwinfo, &hw_opt, gpuIdsToUse, useGpuForNonbonded, useGpuForPme,
+                get_nthreads_mpi(hwinfo_, &hw_opt, numDevicesToUse, useGpuForNonbonded, useGpuForPme,
                                   inputrec.get(), &mtop, mdlog, membedHolder.doMembed());
  
          // Now start the threads for thread MPI.
          spawnThreads(hw_opt.nthreads_tmpi);
          // The spawned threads enter mdrunner() and execution of
          // master and spawned threads joins at the end of this block.
-        physicalNodeComm = PhysicalNodeCommunicator(communicator, gmx_physicalnode_id_hash());
      }
  
-    GMX_RELEASE_ASSERT(ms || communicator == MPI_COMM_WORLD,
-                       "Must have valid world communicator unless running a multi-simulation");
-    CommrecHandle crHandle = init_commrec(communicator);
+    GMX_RELEASE_ASSERT(ms || simulationCommunicator != MPI_COMM_NULL,
+                       "Must have valid communicator unless running a multi-simulation");
+    CommrecHandle crHandle = init_commrec(simulationCommunicator);
      t_commrec*    cr       = crHandle.get();
      GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
  
+    PhysicalNodeCommunicator physicalNodeComm(libraryWorldCommunicator, gmx_physicalnode_id_hash());
+
+    // If we detected the topology on this system, double-check that it makes sense
+    if (hwinfo_->hardwareTopology->isThisSystem())
+    {
+        hardwareTopologyDoubleCheckDetection(mdlog, *hwinfo_->hardwareTopology);
+    }
+
      if (PAR(cr))
      {
          /* now broadcast everything to the non-master nodes/threads: */
@@ -869,7 +879,7 @@ int Mdrunner::mdrunner()
      bool useGpuForPme       = false;
      bool useGpuForBonded    = false;
      bool useGpuForUpdate    = false;
-    bool gpusWereDetected   = hwinfo->ngpu_compatible_tot > 0;
+    bool gpusWereDetected   = hwinfo_->ngpu_compatible_tot > 0;
      try
      {
          // It's possible that there are different numbers of GPUs on
@@ -881,14 +891,11 @@ int Mdrunner::mdrunner()
                  nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
                  gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
          useGpuForPme = decideWhetherToUseGpusForPme(
-                useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec,
-                cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected);
-        auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr)
-                                  && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
-        useGpuForBonded = decideWhetherToUseGpusForBonded(
-                useGpuForNonbonded, useGpuForPme, bondedTarget, canUseGpuForBonded,
-                EVDW_PME(inputrec->vdwtype), EEL_PME_EWALD(inputrec->coulombtype),
-                domdecOptions.numPmeRanks, gpusWereDetected);
+                useGpuForNonbonded, pmeTarget, pmeFftTarget, userGpuTaskAssignment, *hwinfo_,
+                *inputrec, cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected);
+        useGpuForBonded = decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
+                                                          bondedTarget, *inputrec, mtop,
+                                                          domdecOptions.numPmeRanks, gpusWereDetected);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  
@@ -1028,13 +1035,6 @@ int Mdrunner::mdrunner()
          }
      }
  
-#if GMX_FAHCORE
-    if (MASTER(cr))
-    {
-        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
-    }
-#endif
-
      /* NMR restraints must be initialized before load_checkpoint,
       * since with time averaging the history is added to t_state.
       * For proper consistency check we therefore need to extend
@@ -1059,8 +1059,24 @@ int Mdrunner::mdrunner()
              globalState != nullptr ? globalState->box : box, MASTER(cr) ? DDRole::Master : DDRole::Agent,
              PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mygroup, *inputrec);
  
+#if GMX_FAHCORE
+    /* We have to remember the generation's first step before reading checkpoint.
+       This way, we can report to the F@H core both the generation's first step
+       and the restored first step, thus making it able to distinguish between
+       an interruption/resume and start of the n-th generation simulation.
+       Having this information, the F@H core can correctly calculate and report
+       the progress.
+     */
+    int gen_first_step = 0;
+    if (MASTER(cr))
+    {
+        gen_first_step = inputrec->init_step;
+    }
+#endif
+
      ObservablesHistory observablesHistory = {};
  
+    auto modularSimulatorCheckpointData = std::make_unique<ReadCheckpointDataHolder>();
      if (startingBehavior != StartingBehavior::NewSimulation)
      {
          /* Check if checkpoint file exists before doing continuation.
@@ -1075,9 +1091,21 @@ int Mdrunner::mdrunner()
              inputrec->nsteps = -1;
          }
  
-        load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
-                        logFileHandle, cr, domdecOptions.numCells, inputrec.get(), globalState.get(),
-                        &observablesHistory, mdrunOptions.reproducible, mdModules_->notifier());
+        // Finish applying initial simulation state information from external sources on all ranks.
+        // Reconcile checkpoint file data with Mdrunner state established up to this point.
+        applyLocalState(*inputHolder_.get(), logFileHandle, cr, domdecOptions.numCells,
+                        inputrec.get(), globalState.get(), &observablesHistory,
+                        mdrunOptions.reproducible, mdModules_->notifier(),
+                        modularSimulatorCheckpointData.get(), useModularSimulator);
+        // TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
+        //  invariants
+        //  on all code paths.
+        // Write checkpoint or provide hook to update SimulationInput.
+        // If there was a checkpoint file, SimulationInput contains more information
+        // than if there wasn't. At this point, we have synchronized the in-memory
+        // state with the filesystem state only for restarted simulations. We should
+        // be calling applyLocalState unconditionally and expect that the completeness
+        // of SimulationInput is not dependent on its creation method.
  
          if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
          {
@@ -1089,6 +1117,13 @@ int Mdrunner::mdrunner()
          }
      }
  
+#if GMX_FAHCORE
+    if (MASTER(cr))
+    {
+        fcRegisterSteps(inputrec->nsteps + inputrec->init_step, gen_first_step);
+    }
+#endif
+
      if (mdrunOptions.numStepsCommandline > -2)
      {
          GMX_LOG(mdlog.info)
@@ -1125,9 +1160,8 @@ int Mdrunner::mdrunner()
       */
      prepare_verlet_scheme(fplog, cr, inputrec.get(), nstlist_cmdline, &mtop, box,
                            useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
-                          *hwinfo->cpuInfo);
+                          *hwinfo_->cpuInfo);
  
-    const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded);
      // This builder is necessary while we have multi-part construction
      // of DD. Before DD is constructed, we use the existence of
      // the builder object to indicate that further construction of DD
@@ -1136,7 +1170,7 @@ int Mdrunner::mdrunner()
      if (useDomainDecomposition)
      {
          ddBuilder = std::make_unique<DomainDecompositionBuilder>(
-                mdlog, cr, domdecOptions, mdrunOptions, prefer1DAnd1PulseDD, mtop, *inputrec, box,
+                mdlog, cr, domdecOptions, mdrunOptions, mtop, *inputrec, box,
                  positionsFromStatePointer(globalState.get()));
      }
      else
@@ -1156,7 +1190,7 @@ int Mdrunner::mdrunner()
  
      // Produce the task assignment for this rank - done after DD is constructed
      GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
-            gpuIdsToUse, userGpuTaskAssignment, *hwinfo, communicator, physicalNodeComm,
+            gpuIdsToUse, userGpuTaskAssignment, *hwinfo_, simulationCommunicator, physicalNodeComm,
              nonbondedTarget, pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded,
              useGpuForPme, thisRankHasDuty(cr, DUTY_PP),
              // TODO cr->duty & DUTY_PME should imply that a PME
@@ -1198,6 +1232,19 @@ int Mdrunner::mdrunner()
          ddBuilder.reset(nullptr);
          // Note that local state still does not exist yet.
      }
+    // Ensure that all atoms within the same update group are in the
+    // same periodic image. Otherwise, a simulation that did not use
+    // update groups (e.g. a single-rank simulation) cannot always be
+    // correctly restarted in a way that does use update groups
+    // (e.g. a multi-rank simulation).
+    if (isSimulationMasterRank)
+    {
+        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
+        if (useUpdateGroups)
+        {
+            putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
+        }
+    }
  
      // The GPU update is decided here because we need to know whether the constraints or
      // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
@@ -1206,18 +1253,41 @@ int Mdrunner::mdrunner()
      try
      {
          const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
+        const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
  
          useGpuForUpdate = decideWhetherToUseGpuForUpdate(
                  useDomainDecomposition, useUpdateGroups, pmeRunMode, domdecOptions.numPmeRanks > 0,
                  useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
                  doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                replExParams.exchangeInterval > 0, doRerun, devFlags, mdlog);
+                replExParams.exchangeInterval > 0, haveFrozenAtoms, doRerun, devFlags, mdlog);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  
      const bool printHostName = (cr->nnodes > 1);
      gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
  
+    const bool disableNonbondedCalculation = (getenv("GMX_NO_NONBONDED") != nullptr);
+    if (disableNonbondedCalculation)
+    {
+        /* turn off non-bonded calculations */
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendText(
+                        "Found environment variable GMX_NO_NONBONDED.\n"
+                        "Disabling nonbonded calculations.");
+    }
+
+    MdrunScheduleWorkload runScheduleWork;
+
+    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
+            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
+            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
+
+    // Also populates the simulation constant workload description.
+    runScheduleWork.simulationWork = createSimulationWorkload(
+            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
+            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
+
      std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
  
      if (deviceInfo != nullptr)
@@ -1227,15 +1297,14 @@ int Mdrunner::mdrunner()
              dd_setup_dlb_resource_sharing(cr, deviceId);
          }
          deviceStreamManager = std::make_unique<DeviceStreamManager>(
-                *deviceInfo, useGpuForPme, useGpuForNonbonded, havePPDomainDecomposition(cr),
-                useGpuForUpdate, useTiming);
+                *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useTiming);
      }
  
      // If the user chose a task assignment, give them some hints
      // where appropriate.
      if (!userGpuTaskAssignment.empty())
      {
-        gpuTaskAssignments.logPerformanceHints(mdlog, ssize(gpuIdsToUse));
+        gpuTaskAssignments.logPerformanceHints(mdlog, numDevicesToUse);
      }
  
      if (PAR(cr))
@@ -1272,12 +1341,12 @@ int Mdrunner::mdrunner()
      // that existing affinity setting was from OpenMP or something
      // else, so we run this code both before and after we initialize
      // the OpenMP support.
-    gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+    gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, FALSE);
      /* Check and update the number of OpenMP threads requested */
-    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
+    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo_, cr, ms, physicalNodeComm.size_,
                                              pmeRunMode, mtop, *inputrec);
  
-    gmx_omp_nthreads_init(mdlog, cr, hwinfo->nthreads_hw_avail, physicalNodeComm.size_,
+    gmx_omp_nthreads_init(mdlog, cr, hwinfo_->nthreads_hw_avail, physicalNodeComm.size_,
                            hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
  
      // Enable FP exception detection, but not in
@@ -1298,7 +1367,7 @@ int Mdrunner::mdrunner()
      }
  
      /* Now that we know the setup is consistent, check for efficiency */
-    check_resource_division_efficiency(hwinfo, gpuTaskAssignments.thisRankHasAnyGpuTask(),
+    check_resource_division_efficiency(hwinfo_, gpuTaskAssignments.thisRankHasAnyGpuTask(),
                                         mdrunOptions.ntompOptionIsSet, cr, mdlog);
  
      /* getting number of PP/PME threads on this MPI / tMPI rank.
@@ -1307,14 +1376,15 @@ int Mdrunner::mdrunner()
       */
      const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
                                                                    : gmx_omp_nthreads_get(emntPME);
-    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo->hardwareTopology,
+    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology,
                                    physicalNodeComm, mdlog);
  
      // Enable Peer access between GPUs where available
      // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
      // any of the GPU communication features are active.
      if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
-        && (devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm))
+        && (runScheduleWork.simulationWork.useGpuHaloExchange
+            || runScheduleWork.simulationWork.useGpuPmePpCommunication))
      {
          setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
      }
@@ -1325,14 +1395,14 @@ int Mdrunner::mdrunner()
           * - which indicates that probably the OpenMP library has changed it
           * since we first checked).
           */
-        gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+        gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, TRUE);
  
          int numThreadsOnThisNode, intraNodeThreadOffset;
          analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
                                   &intraNodeThreadOffset);
  
          /* Set the CPU affinity */
-        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, numThreadsOnThisRank,
+        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo_->hardwareTopology, numThreadsOnThisRank,
                                  numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
      }
  
@@ -1379,14 +1449,14 @@ int Mdrunner::mdrunner()
                        opt2fn("-tablep", filenames.size(), filenames.data()),
                        opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
          // Dirty hack, for fixing disres and orires should be made mdmodules
-        fr->listedForces->fcdata().disres = disresdata;
-        fr->listedForces->fcdata().orires = oriresdata;
+        fr->fcdata->disres = disresdata;
+        fr->fcdata->orires = oriresdata;
  
          // Save a handle to device stream manager to use elsewhere in the code
          // TODO: Forcerec is not a correct place to store it.
          fr->deviceStreamManager = deviceStreamManager.get();
  
-        if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
+        if (runScheduleWork.simulationWork.useGpuPmePpCommunication && !thisRankHasDuty(cr, DUTY_PME))
          {
              GMX_RELEASE_ASSERT(
                      deviceStreamManager != nullptr,
@@ -1401,10 +1471,11 @@ int Mdrunner::mdrunner()
                      deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
          }
  
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo, useGpuForNonbonded,
+        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo_,
+                                        runScheduleWork.simulationWork.useGpuNonbonded,
                                          deviceStreamManager.get(), &mtop, box, wcycle);
          // TODO: Move the logic below to a GPU bonded builder
-        if (useGpuForBonded)
+        if (runScheduleWork.simulationWork.useGpuBonded)
          {
              GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
                                 "GPU device stream manager should be valid in order to use GPU "
@@ -1519,17 +1590,22 @@ int Mdrunner::mdrunner()
              try
              {
                  // TODO: This should be in the builder.
-                GMX_RELEASE_ASSERT(!useGpuForPme || (deviceStreamManager != nullptr),
+                GMX_RELEASE_ASSERT(!runScheduleWork.simulationWork.useGpuPme
+                                           || (deviceStreamManager != nullptr),
                                     "Device stream manager should be valid in order to use GPU "
                                     "version of PME.");
                  GMX_RELEASE_ASSERT(
-                        !useGpuForPme || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+                        !runScheduleWork.simulationWork.useGpuPme
+                                || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
                          "GPU PME stream should be valid in order to use GPU version of PME.");
  
-                const DeviceContext* deviceContext =
-                        useGpuForPme ? &deviceStreamManager->context() : nullptr;
+                const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
+                                                             ? &deviceStreamManager->context()
+                                                             : nullptr;
                  const DeviceStream* pmeStream =
-                        useGpuForPme ? &deviceStreamManager->stream(DeviceStreamType::Pme) : nullptr;
+                        runScheduleWork.simulationWork.useGpuPme
+                                ? &deviceStreamManager->stream(DeviceStreamType::Pme)
+                                : nullptr;
  
                  pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec.get(),
                                         nChargePerturbed != 0, nTypePerturbed != 0,
@@ -1561,7 +1637,7 @@ int Mdrunner::mdrunner()
          if (inputrec->bPull)
          {
              /* Initialize pull code */
-            pull_work = init_pull(fplog, inputrec->pull, inputrec.get(), &mtop, cr, &atomSets,
+            pull_work = init_pull(fplog, inputrec->pull.get(), inputrec.get(), &mtop, cr, &atomSets,
                                    inputrec->fepvals->init_lambda);
              if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
              {
@@ -1600,6 +1676,11 @@ int Mdrunner::mdrunner()
          gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
                               inputrec->fepvals->n_lambda);
  
+        // cos acceleration is only supported by md, but older tpr
+        // files might still combine it with other integrators
+        GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
+                           "cos_acceleration is only supported by integrator=md");
+
          /* Kinetic energy data */
          gmx_ekindata_t ekind;
          init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
@@ -1620,19 +1701,19 @@ int Mdrunner::mdrunner()
                              domdecOptions.checkBondedInteractions, fr->cginfo_mb);
          }
  
-        // TODO This is not the right place to manage the lifetime of
-        // this data structure, but currently it's the easiest way to
-        // make it work.
-        MdrunScheduleWorkload runScheduleWork;
-        // Also populates the simulation constant workload description.
-        runScheduleWork.simulationWork =
-                createSimulationWorkload(*inputrec, useGpuForNonbonded, pmeRunMode, useGpuForBonded,
-                                         useGpuForUpdate, devFlags.enableGpuBufferOps,
-                                         devFlags.enableGpuHaloExchange, devFlags.enableGpuPmePPComm);
+        if (runScheduleWork.simulationWork.useGpuBufferOps)
+        {
+            fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
+                    deviceStreamManager->context(),
+                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
+            fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
+                    deviceStreamManager->context(),
+                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
+        }
  
          std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
          if (gpusWereDetected
-            && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
+            && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
                  || runScheduleWork.simulationWork.useGpuBufferOps))
          {
              GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
@@ -1670,6 +1751,7 @@ int Mdrunner::mdrunner()
          simulatorBuilder.add(IonSwapping(swap));
          simulatorBuilder.add(TopologyData(&mtop, mdAtoms.get()));
          simulatorBuilder.add(BoxDeformationHandle(deform.get()));
+        simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
  
          // build and run simulator object based on user-input
          auto simulator = simulatorBuilder.build(useModularSimulator);
@@ -1716,7 +1798,7 @@ int Mdrunner::mdrunner()
      }
  
      // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
-    // before we destroy the GPU context(s) in free_gpu().
+    // before we destroy the GPU context(s)
      // Pinned buffers are associated with contexts in CUDA.
      // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
      mdAtoms.reset(nullptr);
@@ -1730,18 +1812,17 @@ int Mdrunner::mdrunner()
      sfree(disresdata);
      sfree(oriresdata);
  
-    if (hwinfo->gpu_info.n_dev > 0)
+    if (!hwinfo_->deviceInfoList.empty())
      {
          /* stop the GPU profiler (only CUDA) */
          stopGpuProfiler();
      }
  
      /* With tMPI we need to wait for all ranks to finish deallocation before
-     * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
+     * destroying the CUDA context as some tMPI ranks may be sharing
       * GPU and context.
       *
-     * This is not a concern in OpenCL where we use one context per rank which
-     * is freed in nbnxn_gpu_free().
+     * This is not a concern in OpenCL where we use one context per rank.
       *
       * Note: it is safe to not call the barrier on the ranks which do not use GPU,
       * but it is easier and more futureproof to call it on the whole node.
@@ -1756,8 +1837,7 @@ int Mdrunner::mdrunner()
      {
          physicalNodeComm.barrier();
      }
-
-    free_gpu(deviceInfo);
+    releaseDevice(deviceInfo);
  
      /* Does what it says */
      print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
@@ -1821,7 +1901,8 @@ Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules) : mdModules_(std::move(
  
  Mdrunner::Mdrunner(Mdrunner&&) noexcept = default;
  
-Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept = default;
+//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265 in CentOS 7
+Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
  
  class Mdrunner::BuilderImplementation
  {
@@ -1834,8 +1915,12 @@ public:
                                                  real                forceWarningThreshold,
                                                  StartingBehavior    startingBehavior);
  
+    void addHardwareDetectionResult(const gmx_hw_info_t* hwinfo);
+
      void addDomdec(const DomdecOptions& options);
  
+    void addInput(SimulationInputHandle inputHolder);
+
      void addVerletList(int nstlist);
  
      void addReplicaExchange(const ReplicaExchangeParameters& params);
@@ -1879,11 +1964,14 @@ private:
      //! Command-line override for the duration of a neighbor list with the Verlet scheme.
      int nstlist_ = 0;
  
+    //! World communicator, used for hardware detection and task assignment
+    MPI_Comm libraryWorldCommunicator_ = MPI_COMM_NULL;
+
      //! Multisim communicator handle.
      gmx_multisim_t* multiSimulation_;
  
      //! mdrun communicator
-    MPI_Comm communicator_ = MPI_COMM_NULL;
+    MPI_Comm simulationCommunicator_ = MPI_COMM_NULL;
  
      //! Print a warning if any force is larger than this (in kJ/mol nm).
      real forceWarningThreshold_ = -1;
@@ -1894,6 +1982,9 @@ private:
      //! The modules that comprise the functionality of mdrun.
      std::unique_ptr<MDModules> mdModules_;
  
+    //! Detected hardware.
+    const gmx_hw_info_t* hwinfo_ = nullptr;
+
      //! \brief Parallelism information.
      gmx_hw_opt_t hardwareOptions_;
  
@@ -1920,14 +2011,24 @@ private:
       * \brief Builder for simulation stop signal handler.
       */
      std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
+
+    /*!
+     * \brief Sources for initial simulation state.
+     *
+     * See issue #3652 for near-term refinements to the SimulationInput interface.
+     *
+     * See issue #3379 for broader discussion on API aspects of simulation inputs and outputs.
+     */
+    SimulationInputHandle inputHolder_;
  };
  
  Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
                                                         compat::not_null<SimulationContext*> context) :
      mdModules_(std::move(mdModules))
  {
-    communicator_    = context->communicator_;
-    multiSimulation_ = context->multiSimulation_.get();
+    libraryWorldCommunicator_ = context->libraryWorldCommunicator_;
+    simulationCommunicator_   = context->simulationCommunicator_;
+    multiSimulation_          = context->multiSimulation_.get();
  }
  
  Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
@@ -1977,11 +2078,32 @@ Mdrunner Mdrunner::BuilderImplementation::build()
  
      newRunner.filenames = filenames_;
  
-    newRunner.communicator = communicator_;
+    newRunner.libraryWorldCommunicator = libraryWorldCommunicator_;
+
+    newRunner.simulationCommunicator = simulationCommunicator_;
  
      // nullptr is a valid value for the multisim handle
      newRunner.ms = multiSimulation_;
  
+    if (hwinfo_)
+    {
+        newRunner.hwinfo_ = hwinfo_;
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError(
+                "MdrunnerBuilder::addHardwareDetectionResult() is required before build()"));
+    }
+
+    if (inputHolder_)
+    {
+        newRunner.inputHolder_ = std::move(inputHolder_);
+    }
+    else
+    {
+        GMX_THROW(gmx::APIError("MdrunnerBuilder::addInput() is required before build()."));
+    }
+
      // \todo Clarify ownership and lifetime management for gmx_output_env_t
      // \todo Update sanity checking when output environment has clearly specified invariants.
      // Initialization and default values for oenv are not well specified in the current version.
@@ -2051,6 +2173,11 @@ Mdrunner Mdrunner::BuilderImplementation::build()
      return newRunner;
  }
  
+void Mdrunner::BuilderImplementation::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
+{
+    hwinfo_ = hwinfo;
+}
+
  void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
  {
      nbpu_opt_ = nbpu_opt;
@@ -2097,6 +2224,11 @@ void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<Stop
      stopHandlerBuilder_ = std::move(builder);
  }
  
+void Mdrunner::BuilderImplementation::addInput(SimulationInputHandle inputHolder)
+{
+    inputHolder_ = std::move(inputHolder);
+}
+
  MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
                                   compat::not_null<SimulationContext*> context) :
      impl_{ std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context) }
@@ -2105,6 +2237,12 @@ MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
  
  MdrunnerBuilder::~MdrunnerBuilder() = default;
  
+MdrunnerBuilder& MdrunnerBuilder::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
+{
+    impl_->addHardwareDetectionResult(hwinfo);
+    return *this;
+}
+
  MdrunnerBuilder& MdrunnerBuilder::addSimulationMethod(const MdrunOptions&    options,
                                                        real                   forceWarningThreshold,
                                                        const StartingBehavior startingBehavior)
@@ -2201,6 +2339,12 @@ MdrunnerBuilder& MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHand
      return *this;
  }
  
+MdrunnerBuilder& MdrunnerBuilder::addInput(SimulationInputHandle input)
+{
+    impl_->addInput(std::move(input));
+    return *this;
+}
+
  MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder&&) noexcept = default;
  
  MdrunnerBuilder& MdrunnerBuilder::operator=(MdrunnerBuilder&&) noexcept = default;