#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/localatomsetmanager.h"
+#include "gromacs/domdec/makebondedlinks.h"
#include "gromacs/domdec/partition.h"
+#include "gromacs/domdec/reversetopology.h"
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_gpu_program.h"
#include "gromacs/hardware/printhardware.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/listed_forces_gpu.h"
#include "gromacs/listed_forces/listed_forces.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/mdrunoptions.h"
#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/observablesreducer.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/utility/keyvaluetree.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/loggerbuilder.h"
-#include "gromacs/utility/mdmodulenotification.h"
+#include "gromacs/utility/mdmodulesnotifiers.h"
#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/utility/mpiinfo.h"
#include "isimulator.h"
#include "membedholder.h"
{
DevelopmentFeatureFlags devFlags;
- // Some builds of GCC 5 give false positive warnings that these
- // getenv results are ignored when clearly they are used.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-result"
-
devFlags.enableGpuBufferOps =
GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
- devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
+ devFlags.enableGpuHaloExchange = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
- devFlags.enableGpuPmePPComm =
- GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+ devFlags.enableGpuPmePPComm = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+
+    // The direct GPU communication path is used with CUDA-aware MPI;
+    // make sure the underlying MPI implementation is CUDA-aware.
+ if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
+ {
+ const bool haveDetectedCudaAwareMpi =
+ (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
+ const bool forceCudaAwareMpi = (getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
+
+ if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi)
+ {
+            // CUDA-aware support was not detected in the MPI library, but the user has forced its use
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "This run has forced use of 'CUDA-aware MPI'. "
+ "But, GROMACS cannot determine if underlying MPI "
+ "is CUDA-aware. GROMACS recommends use of latest openMPI version "
+ "for CUDA-aware support. "
+ "If you observe failures at runtime, try unsetting "
+ "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
+ }
-#pragma GCC diagnostic pop
+ if (haveDetectedCudaAwareMpi || forceCudaAwareMpi)
+ {
+ devFlags.usingCudaAwareMpi = true;
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP "
+ "communications' feature.");
+ }
+ else
+ {
+ if (devFlags.enableGpuHaloExchange)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
+ "halo exchange' feature will not be enabled as GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuHaloExchange = false;
+ }
+ if (devFlags.enableGpuPmePPComm)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
+ "'GPU PME-PP communications' feature will not be enabled as "
+ "GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuPmePPComm = false;
+ }
+
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GROMACS recommends use of latest OpenMPI version for CUDA-aware "
+ "support. "
+ "If you are certain about CUDA-aware support in your MPI library, "
+ "you can force it's use by setting environment variable "
+ " GMX_FORCE_CUDA_AWARE_MPI.");
+ }
+ }
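// Editorial sketch, not part of this patch: with Open MPI, the detection used
// above (checkMpiCudaAwareSupport()) can rest on the <mpi-ext.h> extension
// header, which provides MPIX_Query_cuda_support(). The helper name below is
// hypothetical; MPI libraries without this extension offer no portable way to
// detect CUDA-awareness, which is why the GMX_FORCE_CUDA_AWARE_MPI override
// exists at all.
#if 0
#    include <mpi.h>
#    if defined(OPEN_MPI)
#        include <mpi-ext.h> // defines MPIX_CUDA_AWARE_SUPPORT in CUDA-enabled builds
#    endif

static bool mpiLibraryReportsCudaAwareness()
{
#    if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    // Open MPI can answer at runtime whether device pointers may be passed to MPI calls.
    return MPIX_Query_cuda_support() == 1;
#    else
    return false; // cannot tell; the user may still force use via the environment
#    endif
}
#endif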
if (devFlags.enableGpuBufferOps)
{
{
try
{
- auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
+ const auto* masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
/* copy the arg list to make sure that it's thread-local. This
doesn't copy pointed-to items, of course; fnm, cr and fplog
are reset in the call below, all others should be const. */
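// Editorial sketch of the enclosing thread-entry trampoline: with thread-MPI,
// each spawned rank receives the master Mdrunner through a void* and clones it
// thread-locally before running. Simplified sketch (exception handling and the
// exact signatures are assumptions):
#if 0
static void mdrunner_start_fn(const void* arg)
{
    const auto* masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
    // The clone deep-copies the filename list; cr and fplog are reset for the
    // new rank inside mdrunner() itself.
    gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
    mdrunner.mdrunner();
}
#endif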
t_commrec* cr,
t_inputrec* ir,
int nstlist_cmdline,
- const gmx_mtop_t* mtop,
+ const gmx_mtop_t& mtop,
const matrix box,
bool makeGpuPairList,
const gmx::CpuInfo& cpuinfo)
{
// We checked the cut-offs in grompp, but double-check here.
// We have PME+LJcutoff kernels for rcoulomb>rvdw.
- if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == eelCUT)
+ if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == VanDerWaalsType::Cut)
{
GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
"With Verlet lists and PME we should have rcoulomb>=rvdw");
VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
const real rlist_new =
- calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
+ calcVerletBufferSize(mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
if (rlist_new != ir->rlist)
{
if (EI_DYNAMICS(ir->eI))
{
/* Set or try nstlist values */
- increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
+ increaseNstlist(fplog, cr, ir, nstlist_cmdline, &mtop, box, makeGpuPairList, cpuinfo);
}
}
static void finish_run(FILE* fplog,
const gmx::MDLogger& mdlog,
const t_commrec* cr,
- const t_inputrec* inputrec,
+ const t_inputrec& inputrec,
t_nrnb nrnb[],
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t* nbv,
const gmx_pme_t* pme,
Further, we only report performance for dynamical integrators,
because those are the only ones for which we plan to
consider doing any optimizations. */
- bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
+ bool printReport = EI_DYNAMICS(inputrec.eI) && SIMMASTER(cr);
if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
{
nrnbTotalStorage = std::make_unique<t_nrnb>();
nrnb_tot = nrnbTotalStorage.get();
#if GMX_MPI
- MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+ MPI_Allreduce(nrnb->n.data(), nrnb_tot->n.data(), eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
#endif
}
else
print_flop(fplog, nrnb_tot, &nbfs, &mflop);
}
- if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
+ if (thisRankHasDuty(cr, DUTY_PP) && haveDDAtomOrdering(*cr))
{
print_dd_statistics(cr, inputrec, fplog);
}
* to the code that handled the thread region, so that there's a
* mechanism to keep cycle counting working during the transition
* to task parallelism. */
- int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
- int nthreads_pme = gmx_omp_nthreads_get(emntPME);
+ int nthreads_pp = gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded);
+ int nthreads_pme = gmx_omp_nthreads_get(ModuleMultiThread::Pme);
wallcycle_scale_by_num_threads(
wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
auto cycle_sum(wallcycle_sum(cr, wcycle));
if (printReport)
{
- auto nbnxn_gpu_timings =
+ auto* nbnxn_gpu_timings =
(nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
nbnxn_gpu_timings,
&pme_gpu_timings);
- if (EI_DYNAMICS(inputrec->eI))
+ if (EI_DYNAMICS(inputrec.eI))
{
- delta_t = inputrec->delta_t;
+ delta_t = inputrec.delta_t;
}
if (fplog)
int Mdrunner::mdrunner()
{
- matrix box;
- t_forcerec* fr = nullptr;
- real ewaldcoeff_q = 0;
- real ewaldcoeff_lj = 0;
- int nChargePerturbed = -1, nTypePerturbed = 0;
- gmx_wallcycle_t wcycle;
- gmx_walltime_accounting_t walltime_accounting = nullptr;
- MembedHolder membedHolder(filenames.size(), filenames.data());
+ matrix box;
+ std::unique_ptr<t_forcerec> fr;
+ real ewaldcoeff_q = 0;
+ real ewaldcoeff_lj = 0;
+ int nChargePerturbed = -1, nTypePerturbed = 0;
+ gmx_walltime_accounting_t walltime_accounting = nullptr;
+ MembedHolder membedHolder(filenames.size(), filenames.data());
/* CAUTION: threads may be started later on in this function, so
cr doesn't reflect the final parallel state right now */
gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
- std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo_->deviceInfoList, hw_opt.gpuIdsAvailable);
- const int numDevicesToUse = gmx::ssize(gpuIdsToUse);
+ std::vector<int> availableDevices =
+ makeListOfAvailableDevices(hwinfo_->deviceInfoList, hw_opt.devicesSelectedByUser);
+ const int numAvailableDevices = gmx::ssize(availableDevices);
// Print citation requests after all software/hardware printing
pleaseCiteGromacs(fplog);
auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi(
nonbondedTarget,
- numDevicesToUse,
+ numAvailableDevices > 0,
userGpuTaskAssignment,
emulateGpuNonbonded,
canUseGpuForNonbonded,
hw_opt.nthreads_tmpi);
useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(useGpuForNonbonded,
pmeTarget,
- numDevicesToUse,
+ numAvailableDevices,
userGpuTaskAssignment,
*hwinfo_,
*inputrec,
* correctly. */
hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo_,
&hw_opt,
- numDevicesToUse,
+ numAvailableDevices,
useGpuForNonbonded,
useGpuForPme,
inputrec.get(),
- &mtop,
+ mtop,
mdlog,
membedHolder.doMembed());
// master and spawned threads joins at the end of this block.
}
- GMX_RELEASE_ASSERT(ms || simulationCommunicator != MPI_COMM_NULL,
+ GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
"Must have valid communicator unless running a multi-simulation");
CommrecHandle crHandle = init_commrec(simulationCommunicator);
t_commrec* cr = crHandle.get();
"Linear acceleration has been removed in GROMACS 2022, and was broken for many years "
"before that. Use GROMACS 4.5 or earlier if you need this feature.");
+ // Now we decide whether to use the domain decomposition machinery.
+ // Note that this does not necessarily imply actually using multiple domains.
// Now the number of ranks is known to all ranks, and each knows
// the inputrec read by the master rank. The ranks can now all run
// the task-deciding functions and will agree on the result
// without needing to communicate.
- const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
+ // The LBFGS minimizer, test-particle insertion, normal modes and shell dynamics don't support DD
+ const bool useDomainDecomposition =
+ !(inputrec->eI == IntegrationAlgorithm::LBFGS || EI_TPI(inputrec->eI)
+ || inputrec->eI == IntegrationAlgorithm::NM
+ || gmx_mtop_particletype_count(mtop)[ParticleType::Shell] > 0);
// Note that these variables describe only their own node.
//
doEssentialDynamics,
membedHolder.doMembed());
+ ObservablesReducerBuilder observablesReducerBuilder;
+
// Build restraints.
// TODO: hide restraint implementation details from Mdrunner.
// There is nothing unique about restraints at this point as far as the
// TODO: Error handling
mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
- // now that the MdModules know their options, they know which callbacks to sign up to
+ // now that the MDModules know their options, they know which callbacks to sign up to
mdModules_->subscribeToSimulationSetupNotifications();
- const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
+ const auto& setupNotifier = mdModules_->notifiers().simulationSetupNotifier_;
+
+ // Notify MdModules of existing logger
+ setupNotifier.notify(mdlog);
+ // Notify MdModules of internal parameters, saved into KVT
if (inputrec->internalParameters != nullptr)
{
- mdModulesNotifier.notify(*inputrec->internalParameters);
+ setupNotifier.notify(*inputrec->internalParameters);
+ }
+
+ // Let MdModules know the .tpr filename
+ {
+ gmx::MdRunInputFilename mdRunInputFilename = { ftp2fn(efTPR, filenames.size(), filenames.data()) };
+ setupNotifier.notify(mdRunInputFilename);
}
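// Editorial sketch, all names hypothetical: the setupNotifier.notify() calls
// above follow a type-keyed observer pattern, with one subscriber list per
// notification type and dispatch selected by the argument's static type. A
// minimal stand-alone version:
#if 0
#    include <functional>
#    include <tuple>
#    include <vector>

template<typename... NotificationTypes>
class SimpleNotifier
{
public:
    // Register a callback for notifications of type T.
    template<typename T>
    void subscribe(std::function<void(const T&)> callback)
    {
        std::get<std::vector<std::function<void(const T&)>>>(subscribers_).push_back(std::move(callback));
    }
    // Dispatch an event to every callback registered for its type.
    template<typename T>
    void notify(const T& event) const
    {
        for (const auto& callback : std::get<std::vector<std::function<void(const T&)>>>(subscribers_))
        {
            callback(event);
        }
    }

private:
    std::tuple<std::vector<std::function<void(const NotificationTypes&)>>...> subscribers_;
};

// Usage:
//   SimpleNotifier<gmx::MdRunInputFilename> notifier;
//   notifier.subscribe<gmx::MdRunInputFilename>([](const auto& f) { /* record f */ });
//   notifier.notify(gmx::MdRunInputFilename{ "topol.tpr" });
#endif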
if (fplog != nullptr)
if (SIMMASTER(cr))
{
/* In rerun, set velocities to zero if present */
- if (doRerun && ((globalState->flags & (1 << estV)) != 0))
+ if (doRerun && ((globalState->flags & enumValueToBitMask(StateEntry::V)) != 0))
{
// rerun does not use velocities
GMX_LOG(mdlog.info)
{
clear_rvec(globalState->v[i]);
}
- globalState->flags &= ~(1 << estV);
+ globalState->flags &= ~enumValueToBitMask(StateEntry::V);
}
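// Editorial sketch: enumValueToBitMask() replaces the old "1 << estV" integer
// idiom with a type-safe mapping from a scoped-enum entry to its bit. A
// plausible definition (an assumption for illustration; the real helper lives
// in a GROMACS header, and GROMACS scoped enums end with a Count entry):
#if 0
template<typename Enum>
constexpr int enumValueToBitMask(Enum entry)
{
    static_assert(static_cast<int>(Enum::Count) < 32, "all entries must fit in an int bit mask");
    return 1 << static_cast<int>(entry);
}
#endif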
/* now make sure the state is initialized and propagated */
/* NM and TPI parallelize over force/energy calculations, not atoms,
* so we need to initialize and broadcast the global state.
*/
- if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
+ if (inputrec->eI == IntegrationAlgorithm::NM || inputrec->eI == IntegrationAlgorithm::TPI)
{
if (!MASTER(cr))
{
globalState = std::make_unique<t_state>();
}
broadcastStateWithoutDynamics(
- cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr), globalState.get());
+ cr->mpiDefaultCommunicator, haveDDAtomOrdering(*cr), PAR(cr), globalState.get());
}
/* A parallel command line option consistency check that we can
#endif
}
- if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || IntegrationAlgorithm::NM == inputrec->eI))
{
gmx_fatal(FARGS,
"The .mdp file specified an energy mininization or normal mode algorithm, and "
"these are not compatible with mdrun -rerun");
}
- /* Object for collecting reasons for not using PME-only ranks */
- SeparatePmeRanksPermitted separatePmeRanksPermitted;
-
- /* Permit MDModules to notify whether they want to use PME-only ranks */
- mdModulesNotifier.notify(&separatePmeRanksPermitted);
-
- /* If simulation is not using PME then disable PME-only ranks */
- if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
- {
- separatePmeRanksPermitted.disablePmeRanks(
- "PME-only ranks are requested, but the system does not use PME "
- "for electrostatics or LJ");
- }
-
- /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
- * improve performance with many threads per GPU, since our OpenMP
- * scaling is bad, but it's difficult to automate the setup.
- */
- if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
- {
- separatePmeRanksPermitted.disablePmeRanks(
- "PME-only CPU ranks are not automatically used when "
- "non-bonded interactions are computed on GPUs");
- }
-
- /* If GPU is used for PME then only 1 PME rank is permitted */
- if (useGpuForPme && (domdecOptions.numPmeRanks < 0 || domdecOptions.numPmeRanks > 1))
- {
- separatePmeRanksPermitted.disablePmeRanks(
- "PME GPU decomposition is not supported. Only one separate PME-only GPU rank "
- "can be used.");
- }
-
- /* Disable PME-only ranks if some parts of the code requested so and it's up to GROMACS to decide */
- if (!separatePmeRanksPermitted.permitSeparatePmeRanks() && domdecOptions.numPmeRanks < 0)
- {
- domdecOptions.numPmeRanks = 0;
- GMX_LOG(mdlog.info)
- .asParagraph()
- .appendText("Simulation will not use PME-only ranks because: "
- + separatePmeRanksPermitted.reasonsWhyDisabled());
- }
-
- /* If some parts of the code could not use PME-only ranks and
- * user explicitly used mdrun -npme option then throw an error */
- if (!separatePmeRanksPermitted.permitSeparatePmeRanks() && domdecOptions.numPmeRanks > 0)
- {
- gmx_fatal_collective(FARGS,
- cr->mpiDefaultCommunicator,
- MASTER(cr),
- "Requested -npme %d option is not viable because: %s",
- domdecOptions.numPmeRanks,
- separatePmeRanksPermitted.reasonsWhyDisabled().c_str());
- }
-
/* NMR restraints must be initialized before load_checkpoint,
* since with time averaging the history is added to t_state.
* For proper consistency check we therefore need to extend
t_disresdata* disresdata;
snew(disresdata, 1);
init_disres(fplog,
- &mtop,
+ mtop,
inputrec.get(),
DisResRunMode::MDRun,
MASTER(cr) ? DDRole::Master : DDRole::Agent,
globalState.get(),
replExParams.exchangeInterval > 0);
- t_oriresdata* oriresdata;
- snew(oriresdata, 1);
- init_orires(fplog, &mtop, inputrec.get(), cr, ms, globalState.get(), oriresdata);
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0 && isSimulationMasterRank)
+ {
+ extendStateWithOriresHistory(mtop, *inputrec, globalState.get());
+ }
auto deform = prepareBoxDeformation(globalState != nullptr ? globalState->box : box,
MASTER(cr) ? DDRole::Master : DDRole::Agent,
globalState.get(),
&observablesHistory,
mdrunOptions.reproducible,
- mdModules_->notifier(),
+ mdModules_->notifiers(),
modularSimulatorCheckpointData.get(),
useModularSimulator);
// TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
gmx_bcast(sizeof(box), box, cr->mpiDefaultCommunicator);
}
- if (inputrec->cutoff_scheme != ecutsVERLET)
+ if (inputrec->cutoff_scheme != CutoffScheme::Verlet)
{
gmx_fatal(FARGS,
"This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
cr,
inputrec.get(),
nstlist_cmdline,
- &mtop,
+ mtop,
box,
useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
*hwinfo_->cpuInfo);
+ // We need to decide on update groups early, as this affects
+ // inter-domain communication distances.
+ auto updateGroupingsPerMoleculeType = makeUpdateGroupingsPerMoleculeType(mtop);
+ const real maxUpdateGroupRadius = computeMaxUpdateGroupRadius(
+ mtop, updateGroupingsPerMoleculeType, maxReferenceTemperature(*inputrec));
+ const real cutoffMargin = std::sqrt(max_cutoff2(inputrec->pbcType, box)) - inputrec->rlist;
+ UpdateGroups updateGroups = makeUpdateGroups(mdlog,
+ std::move(updateGroupingsPerMoleculeType),
+ maxUpdateGroupRadius,
+ useDomainDecomposition,
+ systemHasConstraintsOrVsites(mtop),
+ cutoffMargin);
+
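// Editorial note on the margin computed above: max_cutoff2() returns the
// square of the largest cutoff the periodic box supports, so
//     cutoffMargin = sqrt(max_cutoff2(pbcType, box)) - rlist
// is the slack between that box limit and the pair-list cutoff. Update groups
// are only worthwhile when the maximum update-group radius fits within this
// slack (one of the conditions makeUpdateGroups() weighs).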
+ try
+ {
+ const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
+
+ useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
+ updateGroups.useUpdateGroups(),
+ pmeRunMode,
+ domdecOptions.numPmeRanks > 0,
+ useGpuForNonbonded,
+ updateTarget,
+ gpusWereDetected,
+ *inputrec,
+ mtop,
+ doEssentialDynamics,
+ gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+ haveFrozenAtoms,
+ doRerun,
+ devFlags,
+ mdlog);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+
+ bool useGpuDirectHalo = false;
+
+ if (useGpuForNonbonded)
+ {
+ // cr->npmenodes is not yet initialized.
+ // domdecOptions.numPmeRanks == -1 results in 0 separate PME ranks when useGpuForNonbonded is true.
+        // TODO: remove this assumption once auto mode supports separate PME ranks.
+ const int numPmeRanks = domdecOptions.numPmeRanks > 0 ? domdecOptions.numPmeRanks : 0;
+ bool havePPDomainDecomposition = (cr->sizeOfDefaultCommunicator - numPmeRanks) > 1;
+ useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
+ havePPDomainDecomposition,
+ useGpuForNonbonded,
+ useModularSimulator,
+ doRerun,
+ EI_ENERGY_MINIMIZATION(inputrec->eI));
+ }
+
// This builder is necessary while we have multi-part construction
// of DD. Before DD is constructed, we use the existence of
// the builder object to indicate that further construction of DD
std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
if (useDomainDecomposition)
{
- ddBuilder = std::make_unique<DomainDecompositionBuilder>(
+        // P2P GPU communication + GPU update leads to a case in which we enqueue
+        // asynchronous work for multiple timesteps; DLB needs to be disabled in that case.
+ const bool directGpuCommUsedWithGpuUpdate = GMX_THREAD_MPI && useGpuDirectHalo && useGpuForUpdate;
+ ddBuilder = std::make_unique<DomainDecompositionBuilder>(
mdlog,
cr,
domdecOptions,
mdrunOptions,
mtop,
*inputrec,
+ mdModules_->notifiers(),
box,
- positionsFromStatePointer(globalState.get()));
+ updateGroups.updateGroupingPerMoleculeType(),
+ updateGroups.useUpdateGroups(),
+ updateGroups.maxUpdateGroupRadius(),
+ positionsFromStatePointer(globalState.get()),
+ useGpuForNonbonded,
+ useGpuForPme,
+ directGpuCommUsedWithGpuUpdate);
}
else
{
// Produce the task assignment for this rank - done after DD is constructed
GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
- gpuIdsToUse,
+ availableDevices,
userGpuTaskAssignment,
*hwinfo_,
simulationCommunicator,
// requires it (e.g. pull, CompEl, density fitting), so that we
// don't update the local atom sets unilaterally every step.
LocalAtomSetManager atomSets;
+
+ // Local state and topology are declared (and perhaps constructed)
+ // now, because DD needs them for the LocalTopologyChecker, but
+ // they do not contain valid data until after the first DD
+ // partition.
+ std::unique_ptr<t_state> localStateInstance;
+ t_state* localState;
+ gmx_localtop_t localTopology(mtop.ffparams);
+
if (ddBuilder)
{
+ localStateInstance = std::make_unique<t_state>();
+ localState = localStateInstance.get();
// TODO Pass the GPU streams to ddBuilder to use in buffer
// transfers (e.g. halo exchange)
- cr->dd = ddBuilder->build(&atomSets);
+ cr->dd = ddBuilder->build(&atomSets, localTopology, *localState, &observablesReducerBuilder);
// The builder's job is done, so destruct it
ddBuilder.reset(nullptr);
// Note that local state still does not exist yet.
}
+ else
+ {
+ // Without DD, the local state is merely an alias to the global state,
+ // so we don't need to allocate anything.
+ localState = globalState.get();
+ }
- // The GPU update is decided here because we need to know whether the constraints or
- // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
- // defined). This is only known after DD is initialized, hence decision on using GPU
- // update is done so late.
- try
+ // Ensure that all atoms within the same update group are in the
+ // same periodic image. Otherwise, a simulation that did not use
+ // update groups (e.g. a single-rank simulation) cannot always be
+ // correctly restarted in a way that does use update groups
+ // (e.g. a multi-rank simulation).
+ if (isSimulationMasterRank)
{
const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-
- useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
- useUpdateGroups,
- pmeRunMode,
- domdecOptions.numPmeRanks > 0,
- useGpuForNonbonded,
- updateTarget,
- gpusWereDetected,
- *inputrec,
- mtop,
- doEssentialDynamics,
- gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
- replExParams.exchangeInterval > 0,
- doRerun,
- devFlags,
- mdlog);
+ if (useUpdateGroups)
+ {
+ putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
+ }
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
const bool printHostName = (cr->nnodes > 1);
gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
MdrunScheduleWorkload runScheduleWork;
- bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
- havePPDomainDecomposition(cr),
- useGpuForNonbonded,
- useModularSimulator,
- doRerun,
- EI_ENERGY_MINIMIZATION(inputrec->eI));
-
// Also populates the simulation constant workload description.
+ // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
+ // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,
+ // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype).
+ const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME));
runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
disableNonbondedCalculation,
devFlags,
+ havePPDomainDecomposition(cr),
+ haveSeparatePmeRank,
useGpuForNonbonded,
pmeRunMode,
useGpuForBonded,
if (deviceInfo != nullptr)
{
- if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+ if (runScheduleWork.simulationWork.havePpDomainDecomposition && thisRankHasDuty(cr, DUTY_PP))
{
dd_setup_dlb_resource_sharing(cr, deviceId);
}
// where appropriate.
if (!userGpuTaskAssignment.empty())
{
- gpuTaskAssignments.logPerformanceHints(mdlog, numDevicesToUse);
+ gpuTaskAssignments.logPerformanceHints(mdlog, numAvailableDevices);
}
if (PAR(cr))
PME: env variable should be read only on one node to make sure it is
identical everywhere;
*/
- const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
- : gmx_omp_nthreads_get(emntPME);
+ const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP)
+ ? gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded)
+ : gmx_omp_nthreads_get(ModuleMultiThread::Pme);
checkHardwareOversubscription(
numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology, physicalNodeComm, mdlog);
// Enable Peer access between GPUs where available
// Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
// any of the GPU communication features are active.
- if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
+ if (haveDDAtomOrdering(*cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
&& (runScheduleWork.simulationWork.useGpuHaloExchange
|| runScheduleWork.simulationWork.useGpuPmePpCommunication))
{
- setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
+ setupGpuDevicePeerAccess(gpuTaskAssignments.deviceIdsAssigned(), mdlog);
}
if (hw_opt.threadAffinity != ThreadAffinity::Off)
"The -resetstep functionality is deprecated, and may be removed in a "
"future version.");
}
- wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+ std::unique_ptr<gmx_wallcycle> wcycle =
+ wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
if (PAR(cr))
{
/* Master synchronizes its value of reset_counters with all nodes
* including PME only nodes */
- int64_t reset_counters = wcycle_get_reset_counters(wcycle);
+ int64_t reset_counters = wcycle_get_reset_counters(wcycle.get());
gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
- wcycle_set_reset_counters(wcycle, reset_counters);
+ wcycle_set_reset_counters(wcycle.get(), reset_counters);
}
// Membrane embedding must be initialized before we call init_forcerec()
const bool thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
std::unique_ptr<MDAtoms> mdAtoms;
std::unique_ptr<VirtualSitesHandler> vsite;
- std::unique_ptr<GpuBonded> gpuBonded;
t_nrnb nrnb;
if (thisRankHasDuty(cr, DUTY_PP))
{
- mdModulesNotifier.notify(*cr);
- mdModulesNotifier.notify(&atomSets);
- mdModulesNotifier.notify(mtop);
- mdModulesNotifier.notify(inputrec->pbcType);
- mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
+ setupNotifier.notify(*cr);
+ setupNotifier.notify(&atomSets);
+ setupNotifier.notify(mtop);
+ setupNotifier.notify(inputrec->pbcType);
+ setupNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
/* Initiate forcerecord */
- fr = new t_forcerec;
+ fr = std::make_unique<t_forcerec>();
fr->forceProviders = mdModules_->initForceProviders();
init_forcerec(fplog,
mdlog,
- fr,
- inputrec.get(),
- &mtop,
+ runScheduleWork.simulationWork,
+ fr.get(),
+ *inputrec,
+ mtop,
cr,
box,
opt2fn("-table", filenames.size(), filenames.data()),
pforce);
        // Dirty hack: to fix this properly, disres and orires should be made MDModules
fr->fcdata->disres = disresdata;
- fr->fcdata->orires = oriresdata;
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ {
+ fr->fcdata->orires = std::make_unique<t_oriresdata>(
+ fplog, mtop, *inputrec, ms, globalState.get(), &atomSets);
+ }
// Save a handle to device stream manager to use elsewhere in the code
// TODO: Forcerec is not a correct place to store it.
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
cr->mpi_comm_mysim,
cr->dd->pme_nodeid,
+ &cr->dd->pmeForceReceiveBuffer,
deviceStreamManager->context(),
deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
}
fr->nbv = Nbnxm::init_nb_verlet(mdlog,
- inputrec.get(),
- fr,
+ *inputrec,
+ *fr,
cr,
*hwinfo_,
runScheduleWork.simulationWork.useGpuNonbonded,
deviceStreamManager.get(),
- &mtop,
+ mtop,
box,
- wcycle);
+ wcycle.get());
// TODO: Move the logic below to a GPU bonded builder
if (runScheduleWork.simulationWork.useGpuBonded)
{
GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
"GPU device stream manager should be valid in order to use GPU "
"version of bonded forces.");
- gpuBonded = std::make_unique<GpuBonded>(
+ fr->listedForcesGpu = std::make_unique<ListedForcesGpu>(
mtop.ffparams,
fr->ic->epsfac * fr->fudgeQQ,
deviceStreamManager->context(),
deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
- wcycle);
- fr->gpuBonded = gpuBonded.get();
+ wcycle.get());
}
+ fr->longRangeNonbondeds = std::make_unique<CpuPpLongRangeNonbondeds>(fr->n_tpi,
+ fr->ic->ewaldcoeff_q,
+ fr->ic->epsilon_r,
+ fr->qsum,
+ fr->ic->eeltype,
+ fr->ic->vdwtype,
+ *inputrec,
+ &nrnb,
+ wcycle.get(),
+ fplog);
/* Initialize the mdAtoms structure.
* mdAtoms is not filled with atom data,
}
/* Initialize the virtual site communication */
- vsite = makeVirtualSitesHandler(mtop, cr, fr->pbcType);
+ vsite = makeVirtualSitesHandler(
+ mtop, cr, fr->pbcType, updateGroups.updateGroupingPerMoleculeType());
calc_shifts(box, fr->shift_vec);
constructVirtualSitesGlobal(mtop, globalState->x);
}
}
+ // Make the DD reverse topology, now that any vsites that are present are available
+ if (haveDDAtomOrdering(*cr))
+ {
+ dd_make_reverse_top(fplog, cr->dd, mtop, vsite.get(), *inputrec, domdecOptions.ddBondedChecking);
+ }
if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
{
const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
? &deviceStreamManager->context()
: nullptr;
- const DeviceStream* pmeStream =
+ const DeviceStream* pmeStream =
runScheduleWork.simulationWork.useGpuPme
- ? &deviceStreamManager->stream(DeviceStreamType::Pme)
- : nullptr;
+ ? &deviceStreamManager->stream(DeviceStreamType::Pme)
+ : nullptr;
pmedata = gmx_pme_init(cr,
getNumPmeDomains(cr->dd),
mdrunOptions.reproducible,
ewaldcoeff_q,
ewaldcoeff_lj,
- gmx_omp_nthreads_get(emntPME),
+ gmx_omp_nthreads_get(ModuleMultiThread::Pme),
pmeRunMode,
nullptr,
deviceContext,
if (thisRankHasDuty(cr, DUTY_PP))
{
/* Assumes uniform use of the number of OpenMP threads */
- walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Default));
if (inputrec->bPull)
{
pull_work = init_pull(fplog,
inputrec->pull.get(),
inputrec.get(),
- &mtop,
+ mtop,
cr,
&atomSets,
inputrec->fepvals->init_lambda);
cr,
&atomSets,
globalState.get(),
- &mtop,
+ mtop,
oenv,
mdrunOptions,
startingBehavior);
}
t_swap* swap = nullptr;
- if (inputrec->eSwapCoords != eswapNO)
+ if (inputrec->eSwapCoords != SwapType::No)
{
/* Initialize ion swapping code */
swap = init_swapcoords(fplog,
inputrec.get(),
opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
- &mtop,
+ mtop,
globalState.get(),
&observablesHistory,
cr,
}
/* Let makeConstraints know whether we have essential dynamics constraints. */
- auto constr = makeConstraints(
- mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle, fr->bMolPBC);
+ auto constr = makeConstraints(mtop,
+ *inputrec,
+ pull_work,
+ doEssentialDynamics,
+ fplog,
+ cr,
+ updateGroups.useUpdateGroups(),
+ ms,
+ &nrnb,
+ wcycle.get(),
+ fr->bMolPBC,
+ &observablesReducerBuilder);
/* Energy terms and groups */
gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
// cos acceleration is only supported by md, but older tpr
// files might still combine it with other integrators
- GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
+ GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == IntegrationAlgorithm::MD,
"cos_acceleration is only supported by integrator=md");
/* Kinetic energy data */
- gmx_ekindata_t ekind;
- init_ekindata(fplog, &(inputrec->opts), &ekind, inputrec->cos_accel);
+ gmx_ekindata_t ekind(inputrec->opts.ngtc,
+ inputrec->cos_accel,
+ gmx_omp_nthreads_get(ModuleMultiThread::Update));
/* Set up interactive MD (IMD) */
auto imdSession = makeImdSession(inputrec.get(),
cr,
- wcycle,
+ wcycle.get(),
&enerd,
ms,
- &mtop,
+ mtop,
mdlog,
- MASTER(cr) ? globalState->x.rvec_array() : nullptr,
+ MASTER(cr) ? globalState->x : gmx::ArrayRef<gmx::RVec>(),
filenames.size(),
filenames.data(),
oenv,
mdrunOptions.imdOptions,
startingBehavior);
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
- /* This call is not included in init_domain_decomposition mainly
- * because fr->cginfo_mb is set later.
+ /* This call is not included in init_domain_decomposition
+ * because fr->atomInfoForEachMoleculeBlock is set later.
*/
- dd_init_bondeds(fplog,
- cr->dd,
- mtop,
- vsite.get(),
- inputrec.get(),
- domdecOptions.checkBondedInteractions,
- fr->cginfo_mb);
+ makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock);
}
if (runScheduleWork.simulationWork.useGpuBufferOps)
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal),
- wcycle);
+ wcycle.get());
fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal),
- wcycle);
+ wcycle.get());
}
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
&& ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
|| runScheduleWork.simulationWork.useGpuBufferOps))
{
- GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
- ? GpuApiCallBehavior::Async
- : GpuApiCallBehavior::Sync;
+ GpuApiCallBehavior transferKind =
+ (inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator)
+ ? GpuApiCallBehavior::Async
+ : GpuApiCallBehavior::Sync;
GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
"GPU device stream manager should be initialized to use GPU.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
+ *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get());
fr->stateGpu = stateGpu.get();
}
GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
SimulatorBuilder simulatorBuilder;
- simulatorBuilder.add(SimulatorStateData(globalState.get(), &observablesHistory, &enerd, &ekind));
+ simulatorBuilder.add(SimulatorStateData(
+ globalState.get(), localState, &observablesHistory, &enerd, &ekind));
simulatorBuilder.add(std::move(membedHolder));
simulatorBuilder.add(std::move(stopHandlerBuilder_));
simulatorBuilder.add(SimulatorConfig(mdrunOptions, startingBehavior, &runScheduleWork));
- simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
- simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
+ simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv, &observablesReducerBuilder));
+ simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
simulatorBuilder.add(ConstraintsParam(
constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
// TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
simulatorBuilder.add(LegacyInput(
- static_cast<int>(filenames.size()), filenames.data(), inputrec.get(), fr));
+ static_cast<int>(filenames.size()), filenames.data(), inputrec.get(), fr.get()));
simulatorBuilder.add(ReplicaExchangeParameters(replExParams));
simulatorBuilder.add(InteractiveMD(imdSession.get()));
- simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifier()));
+ simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifiers()));
simulatorBuilder.add(CenterOfMassPulling(pull_work));
// Todo move to an MDModule
simulatorBuilder.add(IonSwapping(swap));
- simulatorBuilder.add(TopologyData(&mtop, mdAtoms.get()));
+ simulatorBuilder.add(TopologyData(mtop, &localTopology, mdAtoms.get()));
simulatorBuilder.add(BoxDeformationHandle(deform.get()));
simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
{
GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
/* do PME only */
- walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Pme));
gmx_pmeonly(pmedata,
cr,
&nrnb,
- wcycle,
+ wcycle.get(),
walltime_accounting,
inputrec.get(),
pmeRunMode,
+ runScheduleWork.simulationWork.useGpuPmePpCommunication,
deviceStreamManager.get());
}
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
/* Finish up, write some stuff
* if rerunMD, don't write last frame again
finish_run(fplog,
mdlog,
cr,
- inputrec.get(),
+ *inputrec,
&nrnb,
- wcycle,
+ wcycle.get(),
walltime_accounting,
fr ? fr->nbv.get() : nullptr,
pmedata,
EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
- // clean up cycle counter
- wallcycle_destroy(wcycle);
deviceStreamManager.reset(nullptr);
// Free PME data
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
+ localStateInstance.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
- gpuBonded.reset(nullptr);
- /* Free pinned buffers in *fr */
- delete fr;
- fr = nullptr;
+    fr.reset(nullptr); // destruct forcerec and free its pinned buffers before the GPU context is released
// TODO convert to C++ so we can get rid of these frees
sfree(disresdata);
- sfree(oriresdata);
if (!hwinfo_->deviceInfoList.empty())
{
{
physicalNodeComm.barrier();
}
- releaseDevice(deviceInfo);
+
+ if (!devFlags.usingCudaAwareMpi)
+ {
+            // Don't reset the GPU when using CUDA-aware MPI: UCX creates CUDA
+            // buffers that are cleaned up as part of MPI_Finalize(), and
+            // resetting the device before MPI_Finalize() results in crashes
+            // inside UCX.
+ releaseDevice(deviceInfo);
+ }
/* Does what it says */
print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());