Merge remote-tracking branch 'origin/release-2021' into merge-2021-into-master
author Andrey Alekseenko <al42and@gmail.com>
Thu, 19 Aug 2021 09:51:13 +0000 (12:51 +0300)
committer Andrey Alekseenko <al42and@gmail.com>
Thu, 19 Aug 2021 10:10:25 +0000 (13:10 +0300)
Resolved conflicts:
 - cmake/gmxVersionInfo.cmake
 - docs/CMakeLists.txt
 - python_packaging/src/gmxapi/export_system.cpp
 - python_packaging/src/setup.py
 - src/gromacs/domdec/domdec.cpp
 - src/gromacs/gmxana/gmx_chi.cpp

12 files changed:
cmake/gmxVersionInfo.cmake
docs/CMakeLists.txt
docs/release-notes/index.rst
python_packaging/src/CMakeLists.txt
python_packaging/src/gmxapi/export_system.cpp
python_packaging/src/setup.py
src/gromacs/domdec/domdec.cpp
src/gromacs/domdec/domdec.h
src/gromacs/domdec/domdec_internal.h
src/gromacs/gmxana/anadih.cpp
src/gromacs/gmxana/gmx_chi.cpp
src/gromacs/mdrun/runner.cpp

diff --combined cmake/gmxVersionInfo.cmake
index 185f9f0e501690faebe458641fe7e700b5ad5fa0,86abb3bcf3dd90f9213f06ecb68eacd3d01db7df..c4485a330f4951445b192353fb0abcb689a7b7a2
@@@ -60,7 -60,6 +60,7 @@@
  #         GROMACS     2019   4
  #         GROMACS     2020   5
  #         GROMACS     2021   6
 +#         GROMACS     2022   7
  #   LIBRARY_SOVERSION_MINOR so minor version for the built libraries.
  #       Should be increased for each release that changes only the implementation.
  #       In GROMACS, the typical policy is to increase it for each patch version
  
  # The GROMACS convention is that these are the version number of the next
  # release that is going to be made from this branch.
 -set(GMX_VERSION_MAJOR 2021)
 -set(GMX_VERSION_PATCH 4)
 +set(GMX_VERSION_MAJOR 2022)
 +set(GMX_VERSION_PATCH 0)
  # The suffix, on the other hand, is used mainly for betas and release
  # candidates, where it signifies the most recent such release from
  # this branch; it will be empty before the first such release, as well
@@@ -215,7 -214,7 +215,7 @@@ set(GMX_VERSION_SUFFIX ""
  # here. The important thing is to minimize the chance of third-party
  # code being able to dynamically link with a version of libgromacs
  # that might not work.
 -set(LIBRARY_SOVERSION_MAJOR 6)
 +set(LIBRARY_SOVERSION_MAJOR 7)
  set(LIBRARY_SOVERSION_MINOR 0)
  set(LIBRARY_VERSION ${LIBRARY_SOVERSION_MAJOR}.${LIBRARY_SOVERSION_MINOR}.0)
  
@@@ -257,13 -256,13 +257,13 @@@ if (NOT SOURCE_IS_SOURCE_DISTRIBUTION A
  endif()
  
  set(REGRESSIONTEST_VERSION "${GMX_VERSION_STRING}")
 -set(REGRESSIONTEST_BRANCH "release-2021") 
 +set(REGRESSIONTEST_BRANCH "master")
  # Run the regressiontests packaging job with the correct package
  # version string, and the release box checked, in order to have it
  # build the regressiontests tarball with all the right naming. The
  # naming affects the md5sum that has to go here, and if it isn't right
  # the release workflow will report a failure.
- set(REGRESSIONTEST_MD5SUM "62606f5f6ea37f6114b6e9cf58218f13" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
+ set(REGRESSIONTEST_MD5SUM "93956ea42c4d16fdd541518c05972989" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
  
  math(EXPR GMX_VERSION_NUMERIC
       "${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
diff --combined docs/CMakeLists.txt
index 468c8ed31a43d6c7bcc1a145c76d69f5a9b37591,c43d24b279bee367e8b0a56f446a0a80b3e6093d..6eefe6ed5f2788a59b00f363389633587d196dec
@@@ -338,7 -338,7 +338,7 @@@ if (SPHINX_FOUND
          dev-manual/doxygen.rst
          dev-manual/error-handling.rst
          dev-manual/formatting.rst
 -        dev-manual/gitlab.rst
 +        dev-manual/gitlab-ci.rst
          dev-manual/gmxtree.rst
          dev-manual/includestyle.rst
          dev-manual/index.rst
          how-to/visualize.rst
          install-guide/index.rst
          release-notes/index.rst
 +        release-notes/2022/major/highlights.rst
 +        release-notes/2022/major/features.rst
 +        release-notes/2022/major/performance.rst
 +        release-notes/2022/major/tools.rst
 +        release-notes/2022/major/bugs-fixed.rst
 +        release-notes/2022/major/removed-functionality.rst
 +        release-notes/2022/major/deprecated-functionality.rst
 +        release-notes/2022/major/portability.rst
 +        release-notes/2022/major/miscellaneous.rst
 +        release-notes/2022/major/api.rst
+         release-notes/2021/2021.4.rst
          release-notes/2021/2021.3.rst
          release-notes/2021/2021.2.rst
          release-notes/2021/2021.1.rst
diff --combined docs/release-notes/index.rst
index 0702e8693404a67b8392a99a84ca667a50957771,a9f0bf93e07db767cde10cbed5ca996256b036a4..1b364f73e1cdff792a12c3f495509fa5f686ba42
@@@ -8,19 -8,19 +8,19 @@@ releases of |Gromacs|. Major releases c
  functionality supported, whereas patch releases contain only fixes for
  issues identified in the corresponding major releases.
  
 -Two versions of |Gromacs| are under active maintenance, the 2021
 -series and the 2020 series. In the latter, only highly conservative
 +Two versions of |Gromacs| are under active maintenance, the 2022
 +series and the 2021 series. In the latter, only highly conservative
  fixes will be made, and only to address issues that affect scientific
  correctness. Naturally, some of those releases will be made after the
 -year 2020 ends, but we keep 2019 in the name so users understand how
 +year 2021 ends, but we keep 2021 in the name so users understand how
  up to date their version is. Such fixes will also be incorporated into
 -the 2021 release series, as appropriate. Around the time the 2022
 -release is made, the 2020 series will no longer be maintained.
 +the 2022 release series, as appropriate. Around the time the 2023
 +release is made, the 2021 series will no longer be maintained.
  
  Where issue numbers are reported in these release notes, more details
  can be found on the `issue tracker`_ at that issue number.
  
 -|Gromacs| 2021 series
 +|Gromacs| 2022 series
  ---------------------
  
  .. todolist::
@@@ -31,32 -31,12 +31,33 @@@ Patch release
  .. toctree::
     :maxdepth: 1
  
+    2021/2021.4
     2021/2021.3
     2021/2021.2
     2021/2021.1
  
  
 +Major release
 +^^^^^^^^^^^^^
 +
 +.. toctree::
 +   :maxdepth: 1
 +
 +   2022/major/highlights
 +   2022/major/features
 +   2022/major/performance
 +   2022/major/api
 +   2022/major/tools
 +   2022/major/bugs-fixed
 +   2022/major/deprecated-functionality
 +   2022/major/removed-functionality
 +   2022/major/portability
 +   2022/major/miscellaneous
 +
 +
 +|Gromacs| 2021 series
 +---------------------
 +
  Major release
  ^^^^^^^^^^^^^
  
     2021/major/portability
     2021/major/miscellaneous
  
 +
 +Older (unmaintained) |Gromacs| series
 +-------------------------------------------------------
 +
  |Gromacs| 2020 series
  ---------------------
  
@@@ -142,6 -118,9 +143,6 @@@ Major releas
     2019/major/portability
     2019/major/miscellaneous
  
 -Older (unmaintained) |Gromacs| series
 --------------------------------------------------------
 -
  |Gromacs| 2018 series
  ---------------------
  
diff --combined python_packaging/src/CMakeLists.txt
index 4fb372dc063c7196e27d5fe368e6e16dfee8d9f4,3cc138593877da16b9f4c492a63a02937f81e640..1e8de44d91aa33430df4e52d76d176de38a63826
@@@ -38,7 -38,7 +38,7 @@@
  # configure and run CMake. CMake could be invoked directly by the user or a
  # parent package, but the Python distribution would not be packaged automatically.
  # Reference https://gitlab.com/gromacs/gromacs/-/issues/2896 for additional discussion.
 -cmake_minimum_required(VERSION 3.13.0)
 +cmake_minimum_required(VERSION 3.16.3)
  
  # This needs to be set before project() in order to pick up toolchain files
  #list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../cmake)
@@@ -55,7 -55,7 +55,7 @@@ set(CMAKE_OSX_ARCHITECTURES x86_64 CACH
  # Note that this is the gmxapi._gmxapi Python bindings package version,
  # not the C++ API version. It is not essential that it match the pure Python
  # package version, but is likely to do so.
 -project(gmxapi VERSION 0.2.1)
 +project(gmxapi VERSION 0.3.0)
  
  # Check if Python package is being built directly or via add_subdirectory
  set(GMXAPI_MASTER_PROJECT OFF)
@@@ -76,9 -76,13 +76,13 @@@ if(POLICY CMP0074) #3.1
  endif()
  
  if(GMXAPI_MASTER_PROJECT)
-     find_package(gmxapi 0.2.1 REQUIRED
+     find_package(gmxapi 0.2 REQUIRED
                   HINTS "$ENV{GROMACS_DIR}"
                   )
+     if (gmxapi_VERSION VERSION_LESS 0.2.1)
+         message(WARNING "Your GROMACS installation does not support custom MD plugins. "
+                 "If you need this feature, please install GROMACS 2021.3 or higher.")
+     endif ()
  endif()
  if(gmxapi_FOUND)
      set(_suffix "")
@@@ -132,6 -136,12 +136,12 @@@ pybind11_add_module(_gmxap
                      ${GMXAPI_PYTHON_EXTENSION_SOURCES}
                      )
  
+ if (gmxapi_VERSION VERSION_GREATER_EQUAL 0.2.1)
+     target_sources(_gmxapi PRIVATE gmxapi/launch_021.cpp)
+ else()
+     target_sources(_gmxapi PRIVATE gmxapi/launch_020.cpp)
+ endif()
  target_include_directories(_gmxapi PRIVATE
                             ${CMAKE_CURRENT_SOURCE_DIR}/gmxapi
                             ${CMAKE_CURRENT_BINARY_DIR}/gmxapi
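
The two hunks above introduce a compatibility gate on the detected gmxapi library version: any 0.2-series library is accepted, a warning is emitted below 0.2.1 (no custom MD plugin support), and the matching launch source is compiled. A condensed sketch of that pattern, assuming the _gmxapi target has already been created by pybind11_add_module() as in the diff (not a literal copy of the file):

    # Sketch of the version gate used above.
    find_package(gmxapi 0.2 REQUIRED HINTS "$ENV{GROMACS_DIR}")
    if(gmxapi_VERSION VERSION_LESS 0.2.1)
        message(WARNING "Custom MD plugins need GROMACS 2021.3 or newer.")
    endif()
    if(gmxapi_VERSION VERSION_GREATER_EQUAL 0.2.1)
        target_sources(_gmxapi PRIVATE gmxapi/launch_021.cpp)  # plugin-capable code path
    else()
        target_sources(_gmxapi PRIVATE gmxapi/launch_020.cpp)  # fallback for older libraries
    endif()
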
diff --combined python_packaging/src/gmxapi/export_system.cpp
index b504ccb9e74599b62bc3af6d1c91e722ecd7b33f,273ad5087afe05000f2157bb144404c78b1958ce..39eeb91d78c3e0b5c0818aeb2e3a394b896290c0
@@@ -44,6 -44,7 +44,7 @@@
  #include "gmxapi/session.h"
  #include "gmxapi/status.h"
  #include "gmxapi/system.h"
+ #include "gmxapi/version.h"
  
  #include "pycontext.h"
  #include "pysystem.h"
@@@ -75,24 -76,15 +76,17 @@@ void export_system(py::module& m
      // required to maintain and to pass to the API.
      py::class_<::gmxapi::Session, std::shared_ptr<::gmxapi::Session>> session(m, "MDSession");
      session.def("run", &::gmxapi::Session::run, "Run the simulation workflow");
 -    session.def("close", &::gmxapi::Session::close,
 +    session.def("close",
 +                &::gmxapi::Session::close,
                  "Shut down the execution environment and close the session.");
  
      // Export system container class
      py::class_<System, std::shared_ptr<System>> system(m, "MDSystem");
-     system.def(
-             "launch",
-             [](System* system, std::shared_ptr<PyContext> context) {
-                 auto work       = gmxapi::getWork(*system->get());
-                 auto newSession = context->launch(*work);
-                 return newSession;
-             },
-             "Launch the configured workflow in the provided context.");
+     system.def("launch", &launch, "Launch the configured workflow in the provided context.");
  
      // Module-level function
 -    m.def("from_tpr", &gmxpy::from_tpr,
 +    m.def("from_tpr",
 +          &gmxpy::from_tpr,
            "Return a system container initialized from the given input record.");
  }
  
diff --combined python_packaging/src/setup.py
index 27f3408f4cb31e52e04e1b2486fd1b639160fe4c,b5826a16566451fe46c5c182a5eaaa3ae7a3ea46..025aea1c68199c4bef79eda3cd299083d90e225a
@@@ -87,7 -87,7 +87,7 @@@ exist in the same location (with differ
  used when guessing a toolchain, because setup.py does not know which corresponds
  to the gmxapi support library.
  
--If specifying GMXTOOLCHAINDIR and gmxapi_DIR, the tool chain directory must be 
++If specifying GMXTOOLCHAINDIR and gmxapi_DIR, the tool chain directory must be
  located within a subdirectory of gmxapi_DIR.
  
  Refer to project web site for complete documentation.
@@@ -157,12 -157,23 +157,23 @@@ cmake_platform_hints = '-DCMAKE_TOOLCHA
  cmake_gmxapi_hint = '-Dgmxapi_ROOT={}'.format(gmxapi_DIR)
  cmake_args = [cmake_platform_hints, cmake_gmxapi_hint]
  
+ long_description = """gmxapi provides Python access to GROMACS molecular simulation tools.
+ Operations can be connected flexibly to allow high performance simulation and
+ analysis with complex control and data flows. Users can define new operations
+ in C++ or Python with the same tool kit used to implement this package.
+ This Python package requires a compatible GROMACS installation with the API
+ libraries and headers.
+ See http://gmxapi.org/ for details on installation and usage.
+ """
  setup(
      name='gmxapi',
  
      # TODO: single-source version information (currently repeated in gmxapi/version.py and CMakeLists.txt)
 -    version='0.2.2',
 -    python_requires='>=3.6',
 +    version='0.3.0a3',
 +    python_requires='>=3.7',
      install_requires=['networkx>=2.0',
                        'numpy>=1'],
  
      author='M. Eric Irrgang',
      author_email='info@gmxapi.org',
      description='gmxapi Python interface for GROMACS',
+     long_description=long_description,
      license='LGPL',
      url='http://gmxapi.org/',
  
diff --combined src/gromacs/domdec/domdec.cpp
index 3324ca2feae691fa90c8fd2a96446acf9645ea3a,d75345d5927642325fd1ad0414ba74aaad046e64..3627ea02d6dba4cf196572d380554b653764b384
  
  #include "gromacs/domdec/builder.h"
  #include "gromacs/domdec/collect.h"
 +#include "gromacs/domdec/computemultibodycutoffs.h"
  #include "gromacs/domdec/dlb.h"
  #include "gromacs/domdec/dlbtiming.h"
  #include "gromacs/domdec/domdec_network.h"
  #include "gromacs/domdec/ga2la.h"
  #include "gromacs/domdec/gpuhaloexchange.h"
 +#include "gromacs/domdec/localtopologychecker.h"
  #include "gromacs/domdec/options.h"
  #include "gromacs/domdec/partition.h"
 +#include "gromacs/ewald/pme.h"
 +#include "gromacs/domdec/reversetopology.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
  #include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/mdlib/constr.h"
  #include "gromacs/mdlib/constraintrange.h"
  #include "gromacs/mdlib/updategroups.h"
 +#include "gromacs/mdlib/vcm.h"
  #include "gromacs/mdlib/vsite.h"
 +#include "gromacs/mdrun/mdmodules.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/forceoutput.h"
 +#include "gromacs/mdtypes/forcerec.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/mdrunoptions.h"
  #include "gromacs/mdtypes/state.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/basenetwork.h"
  #include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/enumerationhelpers.h"
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/logger.h"
 +#include "gromacs/utility/mdmodulesnotifiers.h"
  #include "gromacs/utility/real.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/strconvert.h"
  #include "utility.h"
  
  // TODO remove this when moving domdec into gmx namespace
 +using gmx::ArrayRef;
  using gmx::DdRankOrder;
  using gmx::DlbOption;
  using gmx::DomdecOptions;
 +using gmx::RangePartitioning;
  
 -static const char* edlbs_names[int(DlbState::nr)] = { "off", "off", "auto", "locked", "on", "on" };
 -
 -/* The size per atom group of the cggl_flag buffer in gmx_domdec_comm_t */
 -#define DD_CGIBS 2
 -
 -/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 -#define DD_FLAG_NRCG 65535
 -#define DD_FLAG_FW(d) (1 << (16 + (d)*2))
 -#define DD_FLAG_BW(d) (1 << (16 + (d)*2 + 1))
 +static const char* enumValueToString(DlbState enumValue)
 +{
 +    static constexpr gmx::EnumerationArray<DlbState, const char*> dlbStateNames = {
 +        "off", "off", "auto", "locked", "on", "on"
 +    };
 +    return dlbStateNames[enumValue];
 +}
  
  /* The DD zone order */
  static const ivec dd_zo[DD_MAXZONE] = { { 0, 0, 0 }, { 1, 0, 0 }, { 1, 1, 0 }, { 0, 1, 0 },
@@@ -187,7 -178,7 +187,7 @@@ static int ddcoord2ddnodeid(gmx_domdec_
  
  int ddglatnr(const gmx_domdec_t* dd, int i)
  {
 -    int atnr;
 +    int atnr = 0;
  
      if (dd == nullptr)
      {
          {
              gmx_fatal(FARGS,
                        "glatnr called with %d, which is larger than the local number of atoms (%d)",
 -                      i, dd->comm->atomRanges.numAtomsTotal());
 +                      i,
 +                      dd->comm->atomRanges.numAtomsTotal());
          }
          atnr = dd->globalAtomIndices[i] + 1;
      }
      return atnr;
  }
  
 -gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t& dd)
 +void dd_store_state(const gmx_domdec_t& dd, t_state* state)
  {
 -    GMX_RELEASE_ASSERT(dd.comm, "Need a valid dd.comm");
 -    return dd.comm->systemInfo.updateGroupingPerMoleculetype;
 -}
 -
 -void dd_store_state(gmx_domdec_t* dd, t_state* state)
 -{
 -    int i;
 -
 -    if (state->ddp_count != dd->ddp_count)
 +    if (state->ddp_count != dd.ddp_count)
      {
          gmx_incons("The MD state does not match the domain decomposition state");
      }
  
 -    state->cg_gl.resize(dd->ncg_home);
 -    for (i = 0; i < dd->ncg_home; i++)
 +    state->cg_gl.resize(dd.numHomeAtoms);
 +    for (int i = 0; i < dd.numHomeAtoms; i++)
      {
 -        state->cg_gl[i] = dd->globalAtomGroupIndices[i];
 +        state->cg_gl[i] = dd.globalAtomGroupIndices[i];
      }
  
 -    state->ddp_count_cg_gl = dd->ddp_count;
 +    state->ddp_count_cg_gl = dd.ddp_count;
  }
  
  gmx_domdec_zones_t* domdec_zones(gmx_domdec_t* dd)
@@@ -239,45 -237,49 +239,45 @@@ int dd_numHomeAtoms(const gmx_domdec_t
      return dd.comm->atomRanges.numHomeAtoms();
  }
  
 -int dd_natoms_mdatoms(const gmx_domdec_t* dd)
 +int dd_natoms_mdatoms(const gmx_domdec_t& dd)
  {
      /* We currently set mdatoms entries for all atoms:
       * local + non-local + communicated for vsite + constraints
       */
  
 -    return dd->comm->atomRanges.numAtomsTotal();
 +    return dd.comm->atomRanges.numAtomsTotal();
  }
  
 -int dd_natoms_vsite(const gmx_domdec_t* dd)
 +int dd_natoms_vsite(const gmx_domdec_t& dd)
  {
 -    return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 +    return dd.comm->atomRanges.end(DDAtomRanges::Type::Vsites);
  }
  
 -void dd_get_constraint_range(const gmx_domdec_t* dd, int* at_start, int* at_end)
 +void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
  {
 -    *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 -    *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 +    *at_start = dd.comm->atomRanges.start(DDAtomRanges::Type::Constraints);
 +    *at_end   = dd.comm->atomRanges.end(DDAtomRanges::Type::Constraints);
  }
  
  void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
  {
 -    wallcycle_start(wcycle, ewcMOVEX);
 +    wallcycle_start(wcycle, WallCycleCounter::MoveX);
  
 -    int                    nzone, nat_tot;
 -    gmx_domdec_comm_t*     comm;
 -    gmx_domdec_comm_dim_t* cd;
 -    rvec                   shift = { 0, 0, 0 };
 -    gmx_bool               bPBC, bScrew;
 +    rvec shift = { 0, 0, 0 };
  
 -    comm = dd->comm;
 +    gmx_domdec_comm_t* comm = dd->comm;
  
 -    nzone   = 1;
 -    nat_tot = comm->atomRanges.numHomeAtoms();
 +    int nzone   = 1;
 +    int nat_tot = comm->atomRanges.numHomeAtoms();
      for (int d = 0; d < dd->ndim; d++)
      {
 -        bPBC   = (dd->ci[dd->dim[d]] == 0);
 -        bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
 +        const bool bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        const bool bScrew = (bPBC && dd->unitCellInfo.haveScrewPBC && dd->dim[d] == XX);
          if (bPBC)
          {
              copy_rvec(box[dd->dim[d]], shift);
          }
 -        cd = &comm->cd[d];
 +        gmx_domdec_comm_dim_t* cd = &comm->cd[d];
          for (const gmx_domdec_ind_t& ind : cd->ind)
          {
              DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
          nzone += nzone;
      }
  
 -    wallcycle_stop(wcycle, ewcMOVEX);
 +    wallcycle_stop(wcycle, WallCycleCounter::MoveX);
  }
  
  void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
  {
 -    wallcycle_start(wcycle, ewcMOVEF);
 +    wallcycle_start(wcycle, WallCycleCounter::MoveF);
  
      gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
      gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
          /* Determine which shift vector we need */
          ivec vis        = { 0, 0, 0 };
          vis[dd->dim[d]] = 1;
 -        const int is    = IVEC2IS(vis);
 +        const int is    = gmx::ivecToShiftIndex(vis);
  
          /* Loop over the pulses */
          const gmx_domdec_comm_dim_t& cd = comm.cd[d];
          }
          nzone /= 2;
      }
 -    wallcycle_stop(wcycle, ewcMOVEF);
 +    wallcycle_stop(wcycle, WallCycleCounter::MoveF);
  }
  
  /* Convenience function for extracting a real buffer from an rvec buffer
@@@ -476,13 -478,17 +476,13 @@@ static gmx::ArrayRef<real> realArrayRef
  
  void dd_atom_spread_real(gmx_domdec_t* dd, real v[])
  {
 -    int                    nzone, nat_tot;
 -    gmx_domdec_comm_t*     comm;
 -    gmx_domdec_comm_dim_t* cd;
 -
 -    comm = dd->comm;
 +    gmx_domdec_comm_t* comm = dd->comm;
  
 -    nzone   = 1;
 -    nat_tot = comm->atomRanges.numHomeAtoms();
 +    int nzone   = 1;
 +    int nat_tot = comm->atomRanges.numHomeAtoms();
      for (int d = 0; d < dd->ndim; d++)
      {
 -        cd = &comm->cd[d];
 +        gmx_domdec_comm_dim_t* cd = &comm->cd[d];
          for (const gmx_domdec_ind_t& ind : cd->ind)
          {
              /* Note: We provision for RVec instead of real, so a factor of 3
  
  void dd_atom_sum_real(gmx_domdec_t* dd, real v[])
  {
 -    int                    nzone, nat_tot;
 -    gmx_domdec_comm_t*     comm;
 -    gmx_domdec_comm_dim_t* cd;
 -
 -    comm = dd->comm;
 +    gmx_domdec_comm_t* comm = dd->comm;
  
 -    nzone   = comm->zones.n / 2;
 -    nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
 +    int nzone   = comm->zones.n / 2;
 +    int nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
      for (int d = dd->ndim - 1; d >= 0; d--)
      {
 -        cd = &comm->cd[d];
 +        gmx_domdec_comm_dim_t* cd = &comm->cd[d];
          for (int p = cd->numPulses() - 1; p >= 0; p--)
          {
              const gmx_domdec_ind_t& ind = cd->ind[p];
@@@ -619,7 -629,9 +619,7 @@@ real dd_cutoff_multibody(const gmx_domd
  
  real dd_cutoff_twobody(const gmx_domdec_t* dd)
  {
 -    real r_mb;
 -
 -    r_mb = dd_cutoff_multibody(dd);
 +    const real r_mb = dd_cutoff_multibody(dd);
  
      return std::max(dd->comm->systemInfo.cutoff, r_mb);
  }
@@@ -673,14 -685,17 +673,14 @@@ static std::vector<int> dd_interleaved_
      return pmeRanks;
  }
  
 -static int gmx_ddcoord2pmeindex(const t_commrec* cr, int x, int y, int z)
 +static int gmx_ddcoord2pmeindex(const gmx_domdec_t& dd, int x, int y, int z)
  {
 -    gmx_domdec_t* dd;
 -    ivec          coords;
 -    int           slab;
 +    ivec coords;
  
 -    dd         = cr->dd;
 -    coords[XX] = x;
 -    coords[YY] = y;
 -    coords[ZZ] = z;
 -    slab       = ddindex2pmeindex(dd->comm->ddRankSetup, dd_index(dd->numCells, coords));
 +    coords[XX]     = x;
 +    coords[YY]     = y;
 +    coords[ZZ]     = z;
 +    const int slab = ddindex2pmeindex(dd.comm->ddRankSetup, dd_index(dd.numCells, coords));
  
      return slab;
  }
@@@ -706,9 -721,10 +706,10 @@@ static int ddcoord2simnodeid(const t_co
          }
          else
          {
-             if (cr->dd->comm->ddRankSetup.usePmeOnlyRanks)
+             const DDRankSetup& rankSetup = cr->dd->comm->ddRankSetup;
+             if (rankSetup.rankOrder != DdRankOrder::pp_pme && rankSetup.usePmeOnlyRanks)
              {
 -                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(*cr->dd, x, y, z);
              }
              else
              {
@@@ -820,7 -836,7 +821,7 @@@ std::vector<int> get_pme_ddranks(const 
                  else
                  {
                      /* The slab corresponds to the nodeid in the PME group */
 -                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 +                    if (gmx_ddcoord2pmeindex(*cr->dd, x, y, z) == pmenodeid)
                      {
                          ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
                      }
  
  static gmx_bool receive_vir_ener(const gmx_domdec_t* dd, gmx::ArrayRef<const int> pmeRanks, const t_commrec* cr)
  {
 -    gmx_bool bReceive = TRUE;
 +    bool bReceive = true;
  
      const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
      if (ddRankSetup.usePmeOnlyRanks)
              coords[cartSetup.cartpmedim]++;
              if (coords[cartSetup.cartpmedim] < dd->numCells[cartSetup.cartpmedim])
              {
 -                int rank;
 +                int rank = 0;
                  MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                  if (dd_simnode2pmenode(ddRankSetup, cartSetup, pmeRanks, cr, rank) == pmenode)
                  {
  
  static void set_slb_pme_dim_f(gmx_domdec_t* dd, int dim, real** dim_f)
  {
 -    gmx_domdec_comm_t* comm;
 -    int                i;
 -
 -    comm = dd->comm;
 +    gmx_domdec_comm_t* comm = dd->comm;
  
      snew(*dim_f, dd->numCells[dim] + 1);
      (*dim_f)[0] = 0;
 -    for (i = 1; i < dd->numCells[dim]; i++)
 +    for (int i = 1; i < dd->numCells[dim]; i++)
      {
          if (comm->slb_frac[dim])
          {
@@@ -936,8 -955,16 +937,8 @@@ static void init_ddpme(gmx_domdec_t* dd
           */
          if (dimind == 0 || xyz[XX] == dd->ci[XX])
          {
 -            const int pmeindex = ddindex2pmeindex(ddRankSetup, i);
 -            int       slab;
 -            if (dimind == 0)
 -            {
 -                slab = pmeindex / nso;
 -            }
 -            else
 -            {
 -                slab = pmeindex % ddpme->nslab;
 -            }
 +            const int pmeindex  = ddindex2pmeindex(ddRankSetup, i);
 +            const int slab      = (dimind == 0) ? (pmeindex / nso) : (pmeindex % ddpme->nslab);
              ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
              ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
          }
      set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
  }
  
 -int dd_pme_maxshift_x(const gmx_domdec_t* dd)
 +int dd_pme_maxshift_x(const gmx_domdec_t& dd)
  {
 -    const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 +    const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
  
      if (ddRankSetup.ddpme[0].dim == XX)
      {
      }
  }
  
 -int dd_pme_maxshift_y(const gmx_domdec_t* dd)
 +int dd_pme_maxshift_y(const gmx_domdec_t& dd)
  {
 -    const DDRankSetup& ddRankSetup = dd->comm->ddRankSetup;
 +    const DDRankSetup& ddRankSetup = dd.comm->ddRankSetup;
  
      if (ddRankSetup.ddpme[0].dim == YY)
      {
      }
  }
  
 -bool ddHaveSplitConstraints(const gmx_domdec_t& dd)
 -{
 -    return dd.comm->systemInfo.haveSplitConstraints;
 -}
 -
  bool ddUsesUpdateGroups(const gmx_domdec_t& dd)
  {
      return dd.comm->systemInfo.useUpdateGroups;
@@@ -1006,16 -1038,17 +1007,16 @@@ void dd_cycles_add(const gmx_domdec_t* 
  #if GMX_MPI
  static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc)
  {
 -    MPI_Comm c_row;
 -    int      dim, i, rank;
 +    MPI_Comm c_row = MPI_COMM_NULL;
      ivec     loc_c;
 -    gmx_bool bPartOfGroup = FALSE;
 +    bool     bPartOfGroup = false;
  
 -    dim = dd->dim[dim_ind];
 +    const int dim = dd->dim[dim_ind];
      copy_ivec(loc, loc_c);
 -    for (i = 0; i < dd->numCells[dim]; i++)
 +    for (int i = 0; i < dd->numCells[dim]; i++)
      {
 -        loc_c[dim] = i;
 -        rank       = dd_index(dd->numCells, loc_c);
 +        loc_c[dim]     = i;
 +        const int rank = dd_index(dd->numCells, loc_c);
          if (rank == dd->rank)
          {
              /* This process is part of the group */
  void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id)
  {
  #if GMX_MPI
 -    int           physicalnode_id_hash;
 -    gmx_domdec_t* dd;
 -    MPI_Comm      mpi_comm_pp_physicalnode;
 +    MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL;
  
      if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0)
      {
          return;
      }
  
 -    physicalnode_id_hash = gmx_physicalnode_id_hash();
 +    const int physicalnode_id_hash = gmx_physicalnode_id_hash();
  
 -    dd = cr->dd;
 +    gmx_domdec_t* dd = cr->dd;
  
      if (debug)
      {
          fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
 -        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank,
 -                physicalnode_id_hash, gpu_id);
 +        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n", dd->rank, physicalnode_id_hash, gpu_id);
      }
      /* Split the PP communicator over the physical nodes */
      /* TODO: See if we should store this (before), as it's also used for
  static void make_load_communicators(gmx_domdec_t gmx_unused* dd)
  {
  #if GMX_MPI
 -    int  dim0, dim1, i, j;
      ivec loc;
  
      if (debug)
      make_load_communicator(dd, 0, loc);
      if (dd->ndim > 1)
      {
 -        dim0 = dd->dim[0];
 -        for (i = 0; i < dd->numCells[dim0]; i++)
 +        const int dim0 = dd->dim[0];
 +        for (int i = 0; i < dd->numCells[dim0]; i++)
          {
              loc[dim0] = i;
              make_load_communicator(dd, 1, loc);
      }
      if (dd->ndim > 2)
      {
 -        dim0 = dd->dim[0];
 -        for (i = 0; i < dd->numCells[dim0]; i++)
 +        const int dim0 = dd->dim[0];
 +        for (int i = 0; i < dd->numCells[dim0]; i++)
          {
 -            loc[dim0] = i;
 -            dim1      = dd->dim[1];
 -            for (j = 0; j < dd->numCells[dim1]; j++)
 +            loc[dim0]      = i;
 +            const int dim1 = dd->dim[1];
 +            for (int j = 0; j < dd->numCells[dim1]; j++)
              {
                  loc[dim1] = j;
                  make_load_communicator(dd, 2, loc);
  /*! \brief Sets up the relation between neighboring domains and zones */
  static void setup_neighbor_relations(gmx_domdec_t* dd)
  {
 -    int                 d, dim, m;
 -    ivec                tmp, s;
 -    gmx_domdec_zones_t* zones;
 +    ivec tmp, s;
      GMX_ASSERT((dd->ndim >= 0) && (dd->ndim <= DIM), "Must have valid number of dimensions for DD");
  
 -    for (d = 0; d < dd->ndim; d++)
 +    for (int d = 0; d < dd->ndim; d++)
      {
 -        dim = dd->dim[d];
 +        const int dim = dd->dim[d];
          copy_ivec(dd->ci, tmp);
          tmp[dim]           = (tmp[dim] + 1) % dd->numCells[dim];
          dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
          dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
          if (debug)
          {
 -            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n", dd->rank, dim,
 -                    dd->neighbor[d][0], dd->neighbor[d][1]);
 +            fprintf(debug,
 +                    "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank,
 +                    dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
          }
      }
  
      int nizone = (1 << std::max(dd->ndim - 1, 0));
      assert(nizone >= 1 && nizone <= DD_MAXIZONE);
  
 -    zones = &dd->comm->zones;
 +    gmx_domdec_zones_t* zones = &dd->comm->zones;
  
      for (int i = 0; i < nzone; i++)
      {
 -        m = 0;
 +        int m = 0;
          clear_ivec(zones->shift[i]);
 -        for (d = 0; d < dd->ndim; d++)
 +        for (int d = 0; d < dd->ndim; d++)
          {
              zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
          }
      zones->n = nzone;
      for (int i = 0; i < nzone; i++)
      {
 -        for (d = 0; d < DIM; d++)
 +        for (int d = 0; d < DIM; d++)
          {
              s[d] = dd->ci[d] - zones->shift[i][d];
              if (s[d] < 0)
           */
          iZone.jZoneRange = gmx::Range<int>(std::min(ddNonbondedZonePairRanges[iZoneIndex][1], nzone),
                                             std::min(ddNonbondedZonePairRanges[iZoneIndex][2], nzone));
 -        for (dim = 0; dim < DIM; dim++)
 +        for (int dim = 0; dim < DIM; dim++)
          {
              if (dd->numCells[dim] == 1)
              {
  static void make_pp_communicator(const gmx::MDLogger& mdlog,
                                   gmx_domdec_t*        dd,
                                   t_commrec gmx_unused* cr,
 -                                 bool gmx_unused reorder)
 +                                 bool gmx_unused       reorder)
  {
  #if GMX_MPI
      gmx_domdec_comm_t*  comm      = dd->comm;
          /* Set up cartesian communication for the particle-particle part */
          GMX_LOG(mdlog.info)
                  .appendTextFormatted("Will use a Cartesian communicator: %d x %d x %d",
 -                                     dd->numCells[XX], dd->numCells[YY], dd->numCells[ZZ]);
 +                                     dd->numCells[XX],
 +                                     dd->numCells[YY],
 +                                     dd->numCells[ZZ]);
  
          ivec periods;
          for (int i = 0; i < DIM; i++)
          {
              periods[i] = TRUE;
          }
 -        MPI_Comm comm_cart;
 -        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder),
 -                        &comm_cart);
 +        MPI_Comm comm_cart = MPI_COMM_NULL;
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->numCells, periods, static_cast<int>(reorder), &comm_cart);
          /* We overwrite the old communicator with the new cartesian one */
          cr->mpi_comm_mygroup = comm_cart;
      }
          /* Get the rank of the DD master,
           * above we made sure that the master node is a PP node.
           */
 -        int rank;
 -        if (MASTER(cr))
 -        {
 -            rank = dd->rank;
 -        }
 -        else
 -        {
 -            rank = 0;
 -        }
 +        int rank = MASTER(cr) ? dd->rank : 0;
          MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
      }
      else if (cartSetup.bCartesianPP)
              buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
          }
          /* Communicate the ddindex to simulation nodeid index */
 -        MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM,
 -                      cr->mpi_comm_mysim);
 +        MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
  
          /* Determine the master coordinates and rank.
           * The DD master should be the same node as the master of this sim.
  #endif
  
      GMX_LOG(mdlog.info)
 -            .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n", dd->rank,
 -                                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +            .appendTextFormatted("Domain decomposition rank %d, coordinates %d %d %d\n",
 +                                 dd->rank,
 +                                 dd->ci[XX],
 +                                 dd->ci[YY],
 +                                 dd->ci[ZZ]);
      if (debug)
      {
 -        fprintf(debug, "Domain decomposition rank %d, coordinates %d %d %d\n\n", dd->rank,
 -                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        fprintf(debug,
 +                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
 +                dd->rank,
 +                dd->ci[XX],
 +                dd->ci[YY],
 +                dd->ci[ZZ]);
      }
  }
  
@@@ -1402,7 -1438,8 +1403,7 @@@ static void receive_ddindex2simnodeid(g
              buf[dd_index(dd->numCells, dd->ci)] = cr->sim_nodeid;
          }
          /* Communicate the ddindex to simulation nodeid index */
 -        MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM,
 -                      cr->mpi_comm_mysim);
 +        MPI_Allreduce(buf.data(), cartSetup.ddindex2simnodeid.data(), dd->nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
      }
  #else
      GMX_UNUSED_VALUE(dd);
  
  static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog,
                                               t_commrec*           cr,
 -                                             bool gmx_unused    reorder,
 -                                             const DDRankSetup& ddRankSetup,
 -                                             ivec               ddCellIndex,
 -                                             std::vector<int>*  pmeRanks)
 +                                             const DdRankOrder    ddRankOrder,
 +                                             bool gmx_unused      reorder,
 +                                             const DDRankSetup&   ddRankSetup,
 +                                             ivec                 ddCellIndex,
 +                                             std::vector<int>*    pmeRanks)
  {
      CartesianRankSetup cartSetup;
  
 -    const DdRankOrder ddRankOrder = ddRankSetup.rankOrder;
 -
      cartSetup.bCartesianPP     = (ddRankOrder == DdRankOrder::cartesian);
      cartSetup.bCartesianPP_PME = false;
  
                      .appendTextFormatted(
                              "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or "
                              "nx*nz (%d*%d)",
 -                            ddRankSetup.numRanksDoingPme, numDDCells[XX], numDDCells[YY],
 -                            numDDCells[XX], numDDCells[ZZ]);
 +                            ddRankSetup.numRanksDoingPme,
 +                            numDDCells[XX],
 +                            numDDCells[YY],
 +                            numDDCells[XX],
 +                            numDDCells[ZZ]);
              GMX_LOG(mdlog.info)
                      .appendText("Will not use a Cartesian communicator for PP <-> PME\n");
          }
      if (cartSetup.bCartesianPP_PME)
      {
  #if GMX_MPI
 -        int  rank;
 +        int  rank = 0;
          ivec periods;
  
          GMX_LOG(mdlog.info)
                  .appendTextFormatted(
                          "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d",
 -                        cartSetup.ntot[XX], cartSetup.ntot[YY], cartSetup.ntot[ZZ]);
 +                        cartSetup.ntot[XX],
 +                        cartSetup.ntot[YY],
 +                        cartSetup.ntot[ZZ]);
  
          for (int i = 0; i < DIM; i++)
          {
              periods[i] = TRUE;
          }
 -        MPI_Comm comm_cart;
 -        MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder),
 -                        &comm_cart);
 +        MPI_Comm comm_cart = MPI_COMM_NULL;
 +        MPI_Cart_create(cr->mpi_comm_mysim, DIM, cartSetup.ntot, periods, static_cast<int>(reorder), &comm_cart);
          MPI_Comm_rank(comm_cart, &rank);
          if (MASTER(cr) && rank != 0)
          {
          MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, ddCellIndex);
  
          GMX_LOG(mdlog.info)
 -                .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n", cr->sim_nodeid,
 -                                     ddCellIndex[XX], ddCellIndex[YY], ddCellIndex[ZZ]);
 +                .appendTextFormatted("Cartesian rank %d, coordinates %d %d %d\n",
 +                                     cr->sim_nodeid,
 +                                     ddCellIndex[XX],
 +                                     ddCellIndex[YY],
 +                                     ddCellIndex[ZZ]);
  
          if (ddCellIndex[cartSetup.cartpmedim] < numDDCells[cartSetup.cartpmedim])
          {
          }
  
          /* Split the sim communicator into PP and PME only nodes */
 -        MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr),
 -                       dd_index(cartSetup.ntot, ddCellIndex), &cr->mpi_comm_mygroup);
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       getThisRankDuties(cr),
 +                       dd_index(cartSetup.ntot, ddCellIndex),
 +                       &cr->mpi_comm_mygroup);
  #else
          GMX_UNUSED_VALUE(ddCellIndex);
  #endif
   */
  static CartesianRankSetup makeGroupCommunicators(const gmx::MDLogger& mdlog,
                                                   const DDSettings&    ddSettings,
 +                                                 const DdRankOrder    ddRankOrder,
                                                   const DDRankSetup&   ddRankSetup,
                                                   t_commrec*           cr,
                                                   ivec                 ddCellIndex,
      if (ddRankSetup.usePmeOnlyRanks)
      {
          /* Split the communicator into a PP and PME part */
 -        cartSetup = split_communicator(mdlog, cr, ddSettings.useCartesianReorder, ddRankSetup,
 -                                       ddCellIndex, pmeRanks);
 +        cartSetup = split_communicator(
 +                mdlog, cr, ddRankOrder, ddSettings.useCartesianReorder, ddRankSetup, ddCellIndex, pmeRanks);
      }
      else
      {
@@@ -1648,9 -1676,7 +1649,9 @@@ static void setupGroupCommunication(con
          dd->pme_receive_vir_ener = receive_vir_ener(dd, pmeRanks, cr);
          if (debug)
          {
 -            fprintf(debug, "My pme_nodeid %d receive ener %s\n", dd->pme_nodeid,
 +            fprintf(debug,
 +                    "My pme_nodeid %d receive ener %s\n",
 +                    dd->pme_nodeid,
                      gmx::boolToString(dd->pme_receive_vir_ener));
          }
      }
  
  static real* get_slb_frac(const gmx::MDLogger& mdlog, const char* dir, int nc, const char* size_string)
  {
 -    real * slb_frac, tot;
 -    int    i, n;
 -    double dbl;
 -
 -    slb_frac = nullptr;
 +    real* slb_frac = nullptr;
      if (nc > 1 && size_string != nullptr)
      {
          GMX_LOG(mdlog.info).appendTextFormatted("Using static load balancing for the %s direction", dir);
          snew(slb_frac, nc);
 -        tot = 0;
 -        for (i = 0; i < nc; i++)
 +        real tot = 0;
 +        for (int i = 0; i < nc; i++)
          {
 -            dbl = 0;
 +            double dbl = 0;
 +            int    n   = 0;
              sscanf(size_string, "%20lf%n", &dbl, &n);
              if (dbl == 0)
              {
                  gmx_fatal(FARGS,
                            "Incorrect or not enough DD cell size entries for direction %s: '%s'",
 -                          dir, size_string);
 +                          dir,
 +                          size_string);
              }
              slb_frac[i] = dbl;
              size_string += n;
          }
          /* Normalize */
          std::string relativeCellSizes = "Relative cell sizes:";
 -        for (i = 0; i < nc; i++)
 +        for (int i = 0; i < nc; i++)
          {
              slb_frac[i] /= tot;
              relativeCellSizes += gmx::formatString(" %5.3f", slb_frac[i]);
      return slb_frac;
  }
  
 -static int multi_body_bondeds_count(const gmx_mtop_t* mtop)
 +static int multi_body_bondeds_count(const gmx_mtop_t& mtop)
  {
 -    int                  n     = 0;
 -    gmx_mtop_ilistloop_t iloop = gmx_mtop_ilistloop_init(mtop);
 -    int                  nmol;
 -    while (const InteractionLists* ilists = gmx_mtop_ilistloop_next(iloop, &nmol))
 +    int n = 0;
 +    for (const auto ilists : IListRange(mtop))
      {
 -        for (auto& ilist : extractILists(*ilists, IF_BOND))
 +        for (auto& ilist : extractILists(ilists.list(), IF_BOND))
          {
              if (NRAL(ilist.functionType) > 2)
              {
 -                n += nmol * (ilist.iatoms.size() / ilistStride(ilist));
 +                n += ilists.nmol() * (ilist.iatoms.size() / ilistStride(ilist));
              }
          }
      }
  
  static int dd_getenv(const gmx::MDLogger& mdlog, const char* env_var, int def)
  {
 -    char* val;
 -    int   nst;
 -
 -    nst = def;
 -    val = getenv(env_var);
 +    int   nst = def;
 +    char* val = getenv(env_var);
      if (val)
      {
          if (sscanf(val, "%20d", &nst) <= 0)
      return nst;
  }
  
 -static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec* ir, const gmx::MDLogger& mdlog)
 +static void check_dd_restrictions(const gmx_domdec_t* dd, const t_inputrec& inputrec, const gmx::MDLogger& mdlog)
  {
 -    if (ir->pbcType == PbcType::Screw
 +    if (inputrec.pbcType == PbcType::Screw
          && (dd->numCells[XX] == 1 || dd->numCells[YY] > 1 || dd->numCells[ZZ] > 1))
      {
 -        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction",
 -                  c_pbcTypeNames[ir->pbcType].c_str());
 +        gmx_fatal(FARGS,
 +                  "With pbc=%s can only do domain decomposition in the x-direction",
 +                  c_pbcTypeNames[inputrec.pbcType].c_str());
      }
  
 -    if (ir->nstlist == 0)
 +    if (inputrec.nstlist == 0)
      {
          gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
      }
  
 -    if (ir->comm_mode == ecmANGULAR && ir->pbcType != PbcType::No)
 +    if (inputrec.comm_mode == ComRemovalAlgorithm::Angular && inputrec.pbcType != PbcType::No)
      {
          GMX_LOG(mdlog.warning)
                  .appendText(
@@@ -1806,14 -1838,14 +1807,14 @@@ static DlbState forceDlbOffOrBail(DlbSt
   * \param [in] dlbOption   Enum value for the DLB option.
   * \param [in] bRecordLoad True if the load balancer is recording load information.
   * \param [in] mdrunOptions  Options for mdrun.
 - * \param [in] ir          Pointer mdrun to input parameters.
 + * \param [in] inputrec    Reference to mdrun input parameters.
   * \returns                DLB initial/startup state.
   */
  static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
                                           DlbOption                dlbOption,
                                           gmx_bool                 bRecordLoad,
                                           const gmx::MdrunOptions& mdrunOptions,
 -                                         const t_inputrec*        ir)
 +                                         const t_inputrec&        inputrec)
  {
      DlbState dlbState = DlbState::offCanTurnOn;
  
      }
  
      /* Unsupported integrators */
 -    if (!EI_DYNAMICS(ir->eI))
 +    if (!EI_DYNAMICS(inputrec.eI))
      {
 -        auto reasonStr = gmx::formatString(
 -                "it is only supported with dynamics, not with integrator '%s'.", EI(ir->eI));
 +        auto reasonStr =
 +                gmx::formatString("it is only supported with dynamics, not with integrator '%s'.",
 +                                  enumValueToString(inputrec.eI));
          return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
      }
  
                                    "ensured.",
                          mdlog);
              default:
 -                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice",
 +                gmx_fatal(FARGS,
 +                          "Death horror: undefined case (%d) for load balancing choice",
                            static_cast<int>(dlbState));
          }
      }
@@@ -1907,16 -1937,40 +1908,16 @@@ static gmx_domdec_comm_t* init_dd_comm(
      return comm;
  }
  
 -/* Returns whether mtop contains constraints and/or vsites */
 -static bool systemHasConstraintsOrVsites(const gmx_mtop_t& mtop)
 +static void setupUpdateGroups(const gmx::MDLogger&              mdlog,
 +                              const gmx_mtop_t&                 mtop,
 +                              ArrayRef<const RangePartitioning> updateGroupingsPerMoleculeType,
 +                              const bool                        useUpdateGroups,
 +                              const real                        maxUpdateGroupRadius,
 +                              DDSystemInfo*                     systemInfo)
  {
 -    auto ilistLoop = gmx_mtop_ilistloop_init(mtop);
 -    int  nmol;
 -    while (const InteractionLists* ilists = gmx_mtop_ilistloop_next(ilistLoop, &nmol))
 -    {
 -        if (!extractILists(*ilists, IF_CONSTRAINT | IF_VSITE).empty())
 -        {
 -            return true;
 -        }
 -    }
 -
 -    return false;
 -}
 -
 -static void setupUpdateGroups(const gmx::MDLogger& mdlog,
 -                              const gmx_mtop_t&    mtop,
 -                              const t_inputrec&    inputrec,
 -                              const real           cutoffMargin,
 -                              DDSystemInfo*        systemInfo)
 -{
 -    /* When we have constraints and/or vsites, it is beneficial to use
 -     * update groups (when possible) to allow independent update of groups.
 -     */
 -    if (!systemHasConstraintsOrVsites(mtop))
 -    {
 -        /* No constraints or vsites, atoms can be updated independently */
 -        return;
 -    }
 -
 -    systemInfo->updateGroupingPerMoleculetype = gmx::makeUpdateGroups(mtop);
 -    systemInfo->useUpdateGroups               = (!systemInfo->updateGroupingPerMoleculetype.empty()
 -                                   && getenv("GMX_NO_UPDATEGROUPS") == nullptr);
 +    systemInfo->updateGroupingsPerMoleculeType = updateGroupingsPerMoleculeType;
 +    systemInfo->useUpdateGroups                = useUpdateGroups;
 +    systemInfo->maxUpdateGroupRadius           = maxUpdateGroupRadius;
  
      if (systemInfo->useUpdateGroups)
      {
          for (const auto& molblock : mtop.molblock)
          {
              numUpdateGroups += molblock.nmol
 -                               * systemInfo->updateGroupingPerMoleculetype[molblock.type].numBlocks();
 +                               * systemInfo->updateGroupingsPerMoleculeType[molblock.type].numBlocks();
          }
  
 -        systemInfo->maxUpdateGroupRadius = computeMaxUpdateGroupRadius(
 -                mtop, systemInfo->updateGroupingPerMoleculetype, maxReferenceTemperature(inputrec));
 -
 -        /* To use update groups, the large domain-to-domain cutoff distance
 -         * should be compatible with the box size.
 -         */
 -        systemInfo->useUpdateGroups = (atomToAtomIntoDomainToDomainCutoff(*systemInfo, 0) < cutoffMargin);
 -
 -        if (systemInfo->useUpdateGroups)
 -        {
 -            GMX_LOG(mdlog.info)
 -                    .appendTextFormatted(
 -                            "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
 -                            "nm\n",
 -                            numUpdateGroups, mtop.natoms / static_cast<double>(numUpdateGroups),
 -                            systemInfo->maxUpdateGroupRadius);
 -        }
 -        else
 -        {
 -            GMX_LOG(mdlog.info)
 -                    .appendTextFormatted(
 -                            "The combination of rlist and box size prohibits the use of update "
 -                            "groups\n");
 -            systemInfo->updateGroupingPerMoleculetype.clear();
 -        }
 +        GMX_LOG(mdlog.info)
 +                .appendTextFormatted(
 +                        "Using update groups, nr %d, average size %.1f atoms, max. radius %.3f "
 +                        "nm\n",
 +                        numUpdateGroups,
 +                        mtop.natoms / static_cast<double>(numUpdateGroups),
 +                        systemInfo->maxUpdateGroupRadius);
      }
  }
  
@@@ -1948,15 -2020,15 +1949,15 @@@ UnitCellInfo::UnitCellInfo(const t_inpu
  /* Returns whether molecules are always whole, i.e. not broken by PBC */
  static bool moleculesAreAlwaysWhole(const gmx_mtop_t&                           mtop,
                                      const bool                                  useUpdateGroups,
 -                                    gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingPerMoleculetype)
 +                                    gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType)
  {
      if (useUpdateGroups)
      {
 -        GMX_RELEASE_ASSERT(updateGroupingPerMoleculetype.size() == mtop.moltype.size(),
 +        GMX_RELEASE_ASSERT(updateGroupingsPerMoleculeType.size() == mtop.moltype.size(),
                             "Need one grouping per moltype");
          for (size_t mol = 0; mol < mtop.moltype.size(); mol++)
          {
 -            if (updateGroupingPerMoleculetype[mol].numBlocks() > 1)
 +            if (updateGroupingsPerMoleculeType[mol].numBlocks() > 1)
              {
                  return false;
              }
  }
  
  /*! \brief Generate the simulation system information */
 -static DDSystemInfo getSystemInfo(const gmx::MDLogger&           mdlog,
 -                                  DDRole                         ddRole,
 -                                  MPI_Comm                       communicator,
 -                                  const DomdecOptions&           options,
 -                                  const gmx_mtop_t&              mtop,
 -                                  const t_inputrec&              ir,
 -                                  const matrix                   box,
 -                                  gmx::ArrayRef<const gmx::RVec> xGlobal)
 +static DDSystemInfo getSystemInfo(const gmx::MDLogger&              mdlog,
 +                                  DDRole                            ddRole,
 +                                  MPI_Comm                          communicator,
 +                                  const DomdecOptions&              options,
 +                                  const gmx_mtop_t&                 mtop,
 +                                  const t_inputrec&                 ir,
 +                                  const matrix                      box,
 +                                  ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
 +                                  const bool                        useUpdateGroups,
 +                                  const real                        maxUpdateGroupRadius,
 +                                  gmx::ArrayRef<const gmx::RVec>    xGlobal)
  {
      const real tenPercentMargin = 1.1;
  
      DDSystemInfo systemInfo;
  
 -    /* We need to decide on update groups early, as this affects communication distances */
 -    systemInfo.useUpdateGroups = false;
 -    if (ir.cutoff_scheme == ecutsVERLET)
 -    {
 -        real cutoffMargin = std::sqrt(max_cutoff2(ir.pbcType, box)) - ir.rlist;
 -        setupUpdateGroups(mdlog, mtop, ir, cutoffMargin, &systemInfo);
 -    }
 +    setupUpdateGroups(
 +            mdlog, mtop, updateGroupingPerMoleculeType, useUpdateGroups, maxUpdateGroupRadius, &systemInfo);
  
      systemInfo.moleculesAreAlwaysWhole = moleculesAreAlwaysWhole(
 -            mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingPerMoleculetype);
 +            mtop, systemInfo.useUpdateGroups, systemInfo.updateGroupingsPerMoleculeType);
      systemInfo.haveInterDomainBondeds =
              (!systemInfo.moleculesAreAlwaysWhole || mtop.bIntermolecularInteractions);
      systemInfo.haveInterDomainMultiBodyBondeds =
 -            (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(&mtop) > 0);
 +            (systemInfo.haveInterDomainBondeds && multi_body_bondeds_count(mtop) > 0);
  
      if (systemInfo.useUpdateGroups)
      {
 -        systemInfo.haveSplitConstraints = false;
 -        systemInfo.haveSplitSettles     = false;
 +        systemInfo.mayHaveSplitConstraints = false;
 +        systemInfo.mayHaveSplitSettles     = false;
      }
      else
      {
 -        systemInfo.haveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
 -                                           || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
 -        systemInfo.haveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
 +        systemInfo.mayHaveSplitConstraints = (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0
 +                                              || gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0);
 +        systemInfo.mayHaveSplitSettles     = (gmx_mtop_ftype_count(mtop, F_SETTLE) > 0);
      }
  
      if (ir.rlist == 0)
       */
      constexpr real c_chanceThatAtomMovesBeyondDomain = 1e-12;
      const real     limitForAtomDisplacement          = minCellSizeForAtomDisplacement(
 -            mtop, ir, systemInfo.updateGroupingPerMoleculetype, c_chanceThatAtomMovesBeyondDomain);
 +            mtop, ir, systemInfo.updateGroupingsPerMoleculeType, c_chanceThatAtomMovesBeyondDomain);
      GMX_LOG(mdlog.info).appendTextFormatted("Minimum cell size due to atom displacement: %.3f nm", limitForAtomDisplacement);
  
      systemInfo.cellsizeLimit = std::max(systemInfo.cellsizeLimit, limitForAtomDisplacement);
          }
          else
          {
 -            real r_2b, r_mb;
 +            real r_2b = 0;
 +            real r_mb = 0;
  
              if (ddRole == DDRole::Master)
              {
 -                dd_bonded_cg_distance(mdlog, &mtop, &ir, xGlobal, box,
 -                                      options.checkBondedInteractions, &r_2b, &r_mb);
 +                dd_bonded_cg_distance(mdlog, mtop, ir, xGlobal, box, options.ddBondedChecking, &r_2b, &r_mb);
              }
              gmx_bcast(sizeof(r_2b), &r_2b, communicator);
              gmx_bcast(sizeof(r_mb), &r_mb, communicator);
      }
  
      systemInfo.constraintCommunicationRange = 0;
 -    if (systemInfo.haveSplitConstraints && options.constraintCommunicationRange <= 0)
 +    if (systemInfo.mayHaveSplitConstraints && options.constraintCommunicationRange <= 0)
      {
          /* There is a cell size limit due to the constraints (P-LINCS) */
          systemInfo.constraintCommunicationRange = gmx::constr_r_max(mdlog, &mtop, &ir);
@@@ -2179,24 -2253,19 +2180,24 @@@ static void checkDDGridSetup(const DDGr
  {
      if (options.numCells[XX] <= 0 && (ddGridSetup.numDomains[XX] == 0))
      {
 -        char     buf[STRLEN];
 -        gmx_bool bC = (systemInfo.haveSplitConstraints
 -                       && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
 -        sprintf(buf, "Change the number of ranks or mdrun option %s%s%s", !bC ? "-rdd" : "-rcon",
 -                ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
 -                bC ? " or your LINCS settings" : "");
 -
 -        gmx_fatal_collective(FARGS, communicator, ddRole == DDRole::Master,
 +        const bool  bC = (systemInfo.mayHaveSplitConstraints
 +                         && systemInfo.constraintCommunicationRange > systemInfo.minCutoffForMultiBody);
 +        std::string message =
 +                gmx::formatString("Change the number of ranks or mdrun option %s%s%s",
 +                                  !bC ? "-rdd" : "-rcon",
 +                                  ddSettings.initialDlbState != DlbState::offUser ? " or -dds" : "",
 +                                  bC ? " or your LINCS settings" : "");
 +
 +        gmx_fatal_collective(FARGS,
 +                             communicator,
 +                             ddRole == DDRole::Master,
                               "There is no domain decomposition for %d ranks that is compatible "
                               "with the given box and a minimum cell size of %g nm\n"
                               "%s\n"
                               "Look in the log file for details on the domain decomposition",
 -                             numNodes - ddGridSetup.numPmeOnlyRanks, cellsizeLimit, buf);
 +                             numNodes - ddGridSetup.numPmeOnlyRanks,
 +                             cellsizeLimit,
 +                             message.c_str());
      }
  
      const real acs = average_cellsize_min(ddbox, ddGridSetup.numDomains);
          else
          {
              gmx_fatal_collective(
 -                    FARGS, communicator, ddRole == DDRole::Master,
 +                    FARGS,
 +                    communicator,
 +                    ddRole == DDRole::Master,
                      "The initial cell size (%f) is smaller than the cell size limit (%f), change "
                      "options -dd, -rdd or -rcon, see the log file for details",
 -                    acs, cellsizeLimit);
 +                    acs,
 +                    cellsizeLimit);
          }
      }
  
              ddGridSetup.numDomains[XX] * ddGridSetup.numDomains[YY] * ddGridSetup.numDomains[ZZ];
      if (numNodes - numPPRanks != ddGridSetup.numPmeOnlyRanks)
      {
 -        gmx_fatal_collective(FARGS, communicator, ddRole == DDRole::Master,
 +        gmx_fatal_collective(FARGS,
 +                             communicator,
 +                             ddRole == DDRole::Master,
                               "The size of the domain decomposition grid (%d) does not match the "
                               "number of PP ranks (%d). The total number of ranks is %d",
 -                             numPPRanks, numNodes - ddGridSetup.numPmeOnlyRanks, numNodes);
 +                             numPPRanks,
 +                             numNodes - ddGridSetup.numPmeOnlyRanks,
 +                             numNodes);
      }
      if (ddGridSetup.numPmeOnlyRanks > numPPRanks)
      {
 -        gmx_fatal_collective(FARGS, communicator, ddRole == DDRole::Master,
 +        gmx_fatal_collective(FARGS,
 +                             communicator,
 +                             ddRole == DDRole::Master,
                               "The number of separate PME ranks (%d) is larger than the number of "
                               "PP ranks (%d), this is not supported.",
 -                             ddGridSetup.numPmeOnlyRanks, numPPRanks);
 +                             ddGridSetup.numPmeOnlyRanks,
 +                             numPPRanks);
      }
  }
  
  /*! \brief Set up the rank setup, i.e. the division over PP and separate PME ranks, for the DD grid */
  static DDRankSetup getDDRankSetup(const gmx::MDLogger& mdlog,
                                    int                  numNodes,
+                                   const DdRankOrder    rankOrder,
                                    const DDGridSetup&   ddGridSetup,
                                    const t_inputrec&    ir)
  {
      GMX_LOG(mdlog.info)
              .appendTextFormatted("Domain decomposition grid %d x %d x %d, separate PME ranks %d",
 -                                 ddGridSetup.numDomains[XX], ddGridSetup.numDomains[YY],
 -                                 ddGridSetup.numDomains[ZZ], ddGridSetup.numPmeOnlyRanks);
 +                                 ddGridSetup.numDomains[XX],
 +                                 ddGridSetup.numDomains[YY],
 +                                 ddGridSetup.numDomains[ZZ],
 +                                 ddGridSetup.numPmeOnlyRanks);
  
      DDRankSetup ddRankSetup;
  
+     ddRankSetup.rankOrder = rankOrder;
      ddRankSetup.numPPRanks = numNodes - ddGridSetup.numPmeOnlyRanks;
      copy_ivec(ddGridSetup.numDomains, ddRankSetup.numPPCells);
  
          }
          GMX_LOG(mdlog.info)
                  .appendTextFormatted("PME domain decomposition: %d x %d x %d",
 -                                     ddRankSetup.npmenodes_x, ddRankSetup.npmenodes_y, 1);
 +                                     ddRankSetup.npmenodes_x,
 +                                     ddRankSetup.npmenodes_y,
 +                                     1);
      }
      else
      {
@@@ -2337,8 -2395,8 +2341,8 @@@ static void set_dd_limits(const gmx::MD
                            const DDSystemInfo&  systemInfo,
                            const DDGridSetup&   ddGridSetup,
                            const int            numPPRanks,
 -                          const gmx_mtop_t*    mtop,
 -                          const t_inputrec*    ir,
 +                          const gmx_mtop_t&    mtop,
 +                          const t_inputrec&    ir,
                            const gmx_ddbox_t&   ddbox)
  {
      gmx_domdec_comm_t* comm = dd->comm;
           *       but that is not yet available here. But this anyhow only
           *       affects performance up to the second dd_partition_system call.
           */
 -        const int homeAtomCountEstimate = mtop->natoms / numPPRanks;
 +        const int homeAtomCountEstimate = mtop.natoms / numPPRanks;
          comm->updateGroupsCog           = std::make_unique<gmx::UpdateGroupsCog>(
 -                *mtop, systemInfo.updateGroupingPerMoleculetype, maxReferenceTemperature(*ir),
 -                homeAtomCountEstimate);
 +                mtop, systemInfo.updateGroupingsPerMoleculeType, maxReferenceTemperature(ir), homeAtomCountEstimate);
      }
  
      /* Set the DD setup given by ddGridSetup */
          fprintf(debug,
                  "Bonded atom communication beyond the cut-off: %s\n"
                  "cellsize limit %f\n",
 -                gmx::boolToString(systemInfo.filterBondedCommunication), comm->cellsize_limit);
 +                gmx::boolToString(systemInfo.filterBondedCommunication),
 +                comm->cellsize_limit);
      }
  
      if (ddRole == DDRole::Master)
      }
  }
  
 -void dd_init_bondeds(FILE*                           fplog,
 -                     gmx_domdec_t*                   dd,
 -                     const gmx_mtop_t&               mtop,
 -                     const gmx::VirtualSitesHandler* vsite,
 -                     const t_inputrec*               ir,
 -                     gmx_bool                        bBCheck,
 -                     gmx::ArrayRef<cginfo_mb_t>      cginfo_mb)
 -{
 -    gmx_domdec_comm_t* comm;
 -
 -    dd_make_reverse_top(fplog, dd, &mtop, vsite, ir, bBCheck);
 -
 -    comm = dd->comm;
 -
 -    if (comm->systemInfo.filterBondedCommunication)
 -    {
 -        /* Communicate atoms beyond the cut-off for bonded interactions */
 -        comm->bondedLinks = makeBondedLinks(mtop, cginfo_mb);
 -    }
 -    else
 -    {
 -        /* Only communicate atoms based on cut-off */
 -        comm->bondedLinks = nullptr;
 -    }
 -}
 -
  static void writeSettings(gmx::TextWriter*   log,
                            gmx_domdec_t*      dd,
 -                          const gmx_mtop_t*  mtop,
 -                          const t_inputrec*  ir,
 +                          const gmx_mtop_t&  mtop,
 +                          const t_inputrec&  ir,
                            gmx_bool           bDynLoadBal,
                            real               dlb_scale,
                            const gmx_ddbox_t* ddbox)
  {
 -    gmx_domdec_comm_t* comm;
 -    int                d;
 -    ivec               np;
 -    real               limit, shrink;
 -
 -    comm = dd->comm;
 +    gmx_domdec_comm_t* comm = dd->comm;
  
      if (bDynLoadBal)
      {
          log->writeString("The maximum number of communication pulses is:");
 -        for (d = 0; d < dd->ndim; d++)
 +        for (int d = 0; d < dd->ndim; d++)
          {
              log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
          }
                                  comm->cellsize_limit);
          log->writeLineFormatted("The requested allowed shrink of DD cells (option -dds) is: %.2f", dlb_scale);
          log->writeString("The allowed shrink of domain decomposition cells is:");
 -        for (d = 0; d < DIM; d++)
 +        for (int d = 0; d < DIM; d++)
          {
              if (dd->numCells[d] > 1)
              {
 -                if (d >= ddbox->npbcdim && dd->numCells[d] == 2)
 -                {
 -                    shrink = 0;
 -                }
 -                else
 -                {
 -                    shrink = comm->cellsize_min_dlb[d]
 -                             / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
 -                }
 +                const real shrink =
 +                        (d >= ddbox->npbcdim && dd->numCells[d] == 2)
 +                                ? 0
 +                                : comm->cellsize_min_dlb[d]
 +                                          / (ddbox->box_size[d] * ddbox->skew_fac[d] / dd->numCells[d]);
                  log->writeStringFormatted(" %c %.2f", dim2char(d), shrink);
              }
          }
      }
      else
      {
 +        ivec np;
          set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
          log->writeString("The initial number of communication pulses is:");
 -        for (d = 0; d < dd->ndim; d++)
 +        for (int d = 0; d < dd->ndim; d++)
          {
              log->writeStringFormatted(" %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
          }
          log->ensureLineBreak();
          log->writeString("The initial domain decomposition cell size is:");
 -        for (d = 0; d < DIM; d++)
 +        for (int d = 0; d < DIM; d++)
          {
              if (dd->numCells[d] > 1)
              {
      }
  
      const bool haveInterDomainVsites =
 -            (countInterUpdategroupVsites(*mtop, comm->systemInfo.updateGroupingPerMoleculetype) != 0);
 +            (countInterUpdategroupVsites(mtop, comm->systemInfo.updateGroupingsPerMoleculeType) != 0);
  
      if (comm->systemInfo.haveInterDomainBondeds || haveInterDomainVsites
 -        || comm->systemInfo.haveSplitConstraints || comm->systemInfo.haveSplitSettles)
 +        || comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
      {
          std::string decompUnits;
          if (comm->systemInfo.useUpdateGroups)
  
          log->writeLineFormatted("The maximum allowed distance for %s involved in interactions is:",
                                  decompUnits.c_str());
 -        log->writeLineFormatted("%40s  %-7s %6.3f nm", "non-bonded interactions", "",
 -                                comm->systemInfo.cutoff);
 +        log->writeLineFormatted(
 +                "%40s  %-7s %6.3f nm", "non-bonded interactions", "", comm->systemInfo.cutoff);
  
 +        real limit = 0;
          if (bDynLoadBal)
          {
              limit = dd->comm->cellsize_limit;
                          "deformation)");
              }
              limit = dd->comm->cellsize_min[XX];
 -            for (d = 1; d < DIM; d++)
 +            for (int d = 1; d < DIM; d++)
              {
                  limit = std::min(limit, dd->comm->cellsize_min[d]);
              }
  
          if (comm->systemInfo.haveInterDomainBondeds)
          {
 -            log->writeLineFormatted("%40s  %-7s %6.3f nm", "two-body bonded interactions", "(-rdd)",
 +            log->writeLineFormatted("%40s  %-7s %6.3f nm",
 +                                    "two-body bonded interactions",
 +                                    "(-rdd)",
                                      std::max(comm->systemInfo.cutoff, comm->cutoff_mbody));
 -            log->writeLineFormatted("%40s  %-7s %6.3f nm", "multi-body bonded interactions",
 +            log->writeLineFormatted("%40s  %-7s %6.3f nm",
 +                                    "multi-body bonded interactions",
                                      "(-rdd)",
                                      (comm->systemInfo.filterBondedCommunication || isDlbOn(dd->comm))
                                              ? comm->cutoff_mbody
          {
              log->writeLineFormatted("%40s  %-7s %6.3f nm", "virtual site constructions", "(-rcon)", limit);
          }
 -        if (comm->systemInfo.haveSplitConstraints || comm->systemInfo.haveSplitSettles)
 +        if (comm->systemInfo.mayHaveSplitConstraints || comm->systemInfo.mayHaveSplitSettles)
          {
              std::string separation =
 -                    gmx::formatString("atoms separated by up to %d constraints", 1 + ir->nProjOrder);
 +                    gmx::formatString("atoms separated by up to %d constraints", 1 + ir.nProjOrder);
              log->writeLineFormatted("%40s  %-7s %6.3f nm\n", separation.c_str(), "(-rcon)", limit);
          }
          log->ensureLineBreak();
  
  static void logSettings(const gmx::MDLogger& mdlog,
                          gmx_domdec_t*        dd,
 -                        const gmx_mtop_t*    mtop,
 -                        const t_inputrec*    ir,
 +                        const gmx_mtop_t&    mtop,
 +                        const t_inputrec&    ir,
                          real                 dlb_scale,
                          const gmx_ddbox_t*   ddbox)
  {
  static void set_cell_limits_dlb(const gmx::MDLogger& mdlog,
                                  gmx_domdec_t*        dd,
                                  real                 dlb_scale,
 -                                const t_inputrec*    ir,
 +                                const t_inputrec&    inputrec,
                                  const gmx_ddbox_t*   ddbox)
  {
 -    gmx_domdec_comm_t* comm;
 -    int                d, dim, npulse, npulse_d_max, npulse_d;
 -    gmx_bool           bNoCutOff;
 +    int npulse       = 0;
 +    int npulse_d_max = 0;
 +    int npulse_d     = 0;
  
 -    comm = dd->comm;
 +    gmx_domdec_comm_t* comm = dd->comm;
  
 -    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +    bool bNoCutOff = (inputrec.rvdw == 0 || inputrec.rcoulomb == 0);
  
      /* Determine the maximum number of comm. pulses in one dimension */
  
      {
          /* See if we can do with less pulses, based on dlb_scale */
          npulse_d_max = 0;
 -        for (d = 0; d < dd->ndim; d++)
 +        for (int d = 0; d < dd->ndim; d++)
          {
 -            dim      = dd->dim[d];
 +            int dim  = dd->dim[d];
              npulse_d = static_cast<int>(
                      1
                      + dd->numCells[dim] * comm->systemInfo.cutoff
      }
  
      /* This env var can override npulse */
 -    d = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
 -    if (d > 0)
 +    const int ddPulseEnv = dd_getenv(mdlog, "GMX_DD_NPULSE", 0);
 +    if (ddPulseEnv > 0)
      {
 -        npulse = d;
 +        npulse = ddPulseEnv;
      }
  
      comm->maxpulse       = 1;
 -    comm->bVacDLBNoLimit = (ir->pbcType == PbcType::No);
 -    for (d = 0; d < dd->ndim; d++)
 +    comm->bVacDLBNoLimit = (inputrec.pbcType == PbcType::No);
 +    for (int d = 0; d < dd->ndim; d++)
      {
          comm->cd[d].np_dlb = std::min(npulse, dd->numCells[dd->dim[d]] - 1);
          comm->maxpulse     = std::max(comm->maxpulse, comm->cd[d].np_dlb);
      }
      comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
      /* Set the minimum cell size for each DD dimension */
 -    for (d = 0; d < dd->ndim; d++)
 +    for (int d = 0; d < dd->ndim; d++)
      {
          if (comm->bVacDLBNoLimit || comm->cd[d].np_dlb * comm->cellsize_limit >= comm->systemInfo.cutoff)
          {
@@@ -2701,29 -2789,29 +2705,29 @@@ bool dd_moleculesAreAlwaysWhole(const g
      return dd.comm->systemInfo.moleculesAreAlwaysWhole;
  }
  
 -gmx_bool dd_bonded_molpbc(const gmx_domdec_t* dd, PbcType pbcType)
 +bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType)
  {
      /* If each molecule is a single charge group
       * or we use domain decomposition for each periodic dimension,
       * we do not need to take pbc into account for the bonded interactions.
       */
 -    return (pbcType != PbcType::No && dd->comm->systemInfo.haveInterDomainBondeds
 -            && !(dd->numCells[XX] > 1 && dd->numCells[YY] > 1
 -                 && (dd->numCells[ZZ] > 1 || pbcType == PbcType::XY)));
 +    return (pbcType != PbcType::No && dd.comm->systemInfo.haveInterDomainBondeds
 +            && !(dd.numCells[XX] > 1 && dd.numCells[YY] > 1
 +                 && (dd.numCells[ZZ] > 1 || pbcType == PbcType::XY)));
  }
  
  /*! \brief Sets grid size limits and PP-PME setup, prints settings to log */
  static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
                                    gmx_domdec_t*        dd,
                                    real                 dlb_scale,
 -                                  const gmx_mtop_t*    mtop,
 -                                  const t_inputrec*    ir,
 +                                  const gmx_mtop_t&    mtop,
 +                                  const t_inputrec&    inputrec,
                                    const gmx_ddbox_t*   ddbox)
  {
      gmx_domdec_comm_t* comm        = dd->comm;
      DDRankSetup&       ddRankSetup = comm->ddRankSetup;
  
 -    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
 +    if (EEL_PME(inputrec.coulombtype) || EVDW_PME(inputrec.vdwtype))
      {
          init_ddpme(dd, &ddRankSetup.ddpme[0], 0);
          if (ddRankSetup.npmedecompdim >= 2)
          ddRankSetup.numRanksDoingPme = 0;
          if (dd->pme_nodeid >= 0)
          {
 -            gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
 +            gmx_fatal_collective(FARGS,
 +                                 dd->mpi_comm_all,
 +                                 DDMASTER(dd),
                                   "Can not have separate PME ranks without PME electrostatics");
          }
      }
      }
      if (!isDlbDisabled(comm))
      {
 -        set_cell_limits_dlb(mdlog, dd, dlb_scale, ir, ddbox);
 +        set_cell_limits_dlb(mdlog, dd, dlb_scale, inputrec, ddbox);
      }
  
 -    logSettings(mdlog, dd, mtop, ir, dlb_scale, ddbox);
 +    logSettings(mdlog, dd, mtop, inputrec, dlb_scale, ddbox);
  
 -    real vol_frac;
 -    if (ir->pbcType == PbcType::No)
 -    {
 -        vol_frac = 1 - 1 / static_cast<double>(dd->nnodes);
 -    }
 -    else
 -    {
 -        vol_frac = (1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
 -                   / static_cast<double>(dd->nnodes);
 -    }
 +    const real vol_frac = (inputrec.pbcType == PbcType::No)
 +                                  ? (1 - 1 / static_cast<double>(dd->nnodes))
 +                                  : ((1 + comm_box_frac(dd->numCells, comm->systemInfo.cutoff, *ddbox))
 +                                     / static_cast<double>(dd->nnodes));
      if (debug)
      {
          fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
      }
 -    int natoms_tot = mtop->natoms;
 +    int natoms_tot = mtop.natoms;
  
      dd->ga2la = new gmx_ga2la_t(natoms_tot, static_cast<int>(vol_frac * natoms_tot));
  }
@@@ -2804,19 -2896,17 +2808,19 @@@ static DDSettings getDDSettings(const g
          ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
      }
  
 -    ddSettings.initialDlbState = determineInitialDlbState(mdlog, options.dlbOption,
 -                                                          ddSettings.recordLoad, mdrunOptions, &ir);
 +    ddSettings.initialDlbState =
 +            determineInitialDlbState(mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir);
      GMX_LOG(mdlog.info)
              .appendTextFormatted("Dynamic load balancing: %s",
 -                                 edlbs_names[static_cast<int>(ddSettings.initialDlbState)]);
 +                                 enumValueToString(ddSettings.initialDlbState));
  
      return ddSettings;
  }
  
  gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
  
 +gmx_domdec_t::~gmx_domdec_t() = default;
 +
  namespace gmx
  {
  
@@@ -2827,20 -2917,14 +2831,20 @@@ class DomainDecompositionBuilder::Imp
  {
  public:
      //! Constructor
 -    Impl(const MDLogger&      mdlog,
 -         t_commrec*           cr,
 -         const DomdecOptions& options,
 -         const MdrunOptions&  mdrunOptions,
 -         const gmx_mtop_t&    mtop,
 -         const t_inputrec&    ir,
 -         const matrix         box,
 -         ArrayRef<const RVec> xGlobal);
 +    Impl(const MDLogger&                   mdlog,
 +         t_commrec*                        cr,
 +         const DomdecOptions&              options,
 +         const MdrunOptions&               mdrunOptions,
 +         const gmx_mtop_t&                 mtop,
 +         const t_inputrec&                 ir,
 +         const MDModulesNotifiers&         notifiers,
 +         const matrix                      box,
 +         ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
 +         bool                              useUpdateGroups,
 +         real                              maxUpdateGroupRadius,
 +         ArrayRef<const RVec>              xGlobal,
 +         bool                              useGpuForNonbonded,
 +         bool                              useGpuForPme);
  
      //! Build the resulting DD manager
      gmx_domdec_t* build(LocalAtomSetManager* atomSets);
      const gmx_mtop_t& mtop_;
      //! User input values from the tpr file
      const t_inputrec& ir_;
 +    //! MdModules object
 +    const MDModulesNotifiers& notifiers_;
      //! }
  
      //! Internal objects used in constructing DD
      //! }
  };
  
 -DomainDecompositionBuilder::Impl::Impl(const MDLogger&      mdlog,
 -                                       t_commrec*           cr,
 -                                       const DomdecOptions& options,
 -                                       const MdrunOptions&  mdrunOptions,
 -                                       const gmx_mtop_t&    mtop,
 -                                       const t_inputrec&    ir,
 -                                       const matrix         box,
 -                                       ArrayRef<const RVec> xGlobal) :
 -    mdlog_(mdlog),
 -    cr_(cr),
 -    options_(options),
 -    mtop_(mtop),
 -    ir_(ir)
 +DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
 +                                       t_commrec*                        cr,
 +                                       const DomdecOptions&              options,
 +                                       const MdrunOptions&               mdrunOptions,
 +                                       const gmx_mtop_t&                 mtop,
 +                                       const t_inputrec&                 ir,
 +                                       const MDModulesNotifiers&         notifiers,
 +                                       const matrix                      box,
 +                                       ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
 +                                       const bool                        useUpdateGroups,
 +                                       const real                        maxUpdateGroupRadius,
 +                                       ArrayRef<const RVec>              xGlobal,
 +                                       bool                              useGpuForNonbonded,
 +                                       bool                              useGpuForPme) :
 +    mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
  {
      GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
  
          srand(1 + cr_->rankInDefaultCommunicator);
      }
  
 -    systemInfo_ = getSystemInfo(mdlog_, MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 -                                cr->mpiDefaultCommunicator, options_, mtop_, ir_, box, xGlobal);
 +    systemInfo_ = getSystemInfo(mdlog_,
 +                                MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 +                                cr->mpiDefaultCommunicator,
 +                                options_,
 +                                mtop_,
 +                                ir_,
 +                                box,
 +                                updateGroupingPerMoleculeType,
 +                                useUpdateGroups,
 +                                maxUpdateGroupRadius,
 +                                xGlobal);
  
      const int  numRanksRequested         = cr_->sizeOfDefaultCommunicator;
      const bool checkForLargePrimeFactors = (options_.numCells[0] <= 0);
 -    checkForValidRankCountRequests(numRanksRequested, EEL_PME(ir_.coulombtype),
 -                                   options_.numPmeRanks, checkForLargePrimeFactors);
 +
 +
 +    /* Checks for ability to use PME-only ranks */
 +    auto separatePmeRanksPermitted = checkForSeparatePmeRanks(
 +            notifiers_, options_, numRanksRequested, useGpuForNonbonded, useGpuForPme);
 +
 +    /* Checks for validity of requested Ranks setup */
 +    checkForValidRankCountRequests(numRanksRequested,
 +                                   EEL_PME(ir_.coulombtype) | EVDW_PME(ir_.vdwtype),
 +                                   options_.numPmeRanks,
 +                                   separatePmeRanksPermitted,
 +                                   checkForLargePrimeFactors);
  
      // DD grid setup uses a more different cell size limit for
      // automated setup than the one in systemInfo_. The latter is used
      // in set_dd_limits() to configure DLB, for example.
      const real gridSetupCellsizeLimit =
 -            getDDGridSetupCellSizeLimit(mdlog_, !isDlbDisabled(ddSettings_.initialDlbState),
 -                                        options_.dlbScaling, ir_, systemInfo_.cellsizeLimit);
 -    ddGridSetup_ =
 -            getDDGridSetup(mdlog_, MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 -                           cr->mpiDefaultCommunicator, numRanksRequested, options_, ddSettings_,
 -                           systemInfo_, gridSetupCellsizeLimit, mtop_, ir_, box, xGlobal, &ddbox_);
 -    checkDDGridSetup(ddGridSetup_, MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 -                     cr->mpiDefaultCommunicator, cr->sizeOfDefaultCommunicator, options_,
 -                     ddSettings_, systemInfo_, gridSetupCellsizeLimit, ddbox_);
 +            getDDGridSetupCellSizeLimit(mdlog_,
 +                                        !isDlbDisabled(ddSettings_.initialDlbState),
 +                                        options_.dlbScaling,
 +                                        ir_,
 +                                        systemInfo_.cellsizeLimit);
 +    ddGridSetup_ = getDDGridSetup(mdlog_,
 +                                  MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 +                                  cr->mpiDefaultCommunicator,
 +                                  numRanksRequested,
 +                                  options_,
 +                                  ddSettings_,
 +                                  systemInfo_,
 +                                  gridSetupCellsizeLimit,
 +                                  mtop_,
 +                                  ir_,
 +                                  separatePmeRanksPermitted,
 +                                  box,
 +                                  xGlobal,
 +                                  &ddbox_);
 +    checkDDGridSetup(ddGridSetup_,
 +                     MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 +                     cr->mpiDefaultCommunicator,
 +                     cr->sizeOfDefaultCommunicator,
 +                     options_,
 +                     ddSettings_,
 +                     systemInfo_,
 +                     gridSetupCellsizeLimit,
 +                     ddbox_);
  
      cr_->npmenodes = ddGridSetup_.numPmeOnlyRanks;
  
-     ddRankSetup_ = getDDRankSetup(mdlog_, cr_->sizeOfDefaultCommunicator, ddGridSetup_, ir_);
 -    ddRankSetup_ = getDDRankSetup(mdlog_, cr_->sizeOfDefaultCommunicator, options_.rankOrder,
 -                                  ddGridSetup_, ir_);
++    ddRankSetup_ = getDDRankSetup(
++            mdlog_, cr_->sizeOfDefaultCommunicator, options_.rankOrder, ddGridSetup_, ir_);
  
      /* Generate the group communicator, also decides the duty of each rank */
 -    cartSetup_ = makeGroupCommunicators(mdlog_, ddSettings_, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
 +    cartSetup_ = makeGroupCommunicators(
 +            mdlog_, ddSettings_, options_.rankOrder, ddRankSetup_, cr_, ddCellIndex_, &pmeRanks_);
  }
  
  gmx_domdec_t* DomainDecompositionBuilder::Impl::build(LocalAtomSetManager* atomSets)
      dd->comm->ddRankSetup        = ddRankSetup_;
      dd->comm->cartesianRankSetup = cartSetup_;
  
 -    set_dd_limits(mdlog_, MASTER(cr_) ? DDRole::Master : DDRole::Agent, dd, options_, ddSettings_,
 -                  systemInfo_, ddGridSetup_, ddRankSetup_.numPPRanks, &mtop_, &ir_, ddbox_);
 +    set_dd_limits(mdlog_,
 +                  MASTER(cr_) ? DDRole::Master : DDRole::Agent,
 +                  dd,
 +                  options_,
 +                  ddSettings_,
 +                  systemInfo_,
 +                  ddGridSetup_,
 +                  ddRankSetup_.numPPRanks,
 +                  mtop_,
 +                  ir_,
 +                  ddbox_);
  
      setupGroupCommunication(mdlog_, ddSettings_, pmeRanks_, cr_, mtop_.natoms, dd);
  
      if (thisRankHasDuty(cr_, DUTY_PP))
      {
 -        set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, &mtop_, &ir_, &ddbox_);
 +        set_ddgrid_parameters(mdlog_, dd, options_.dlbScaling, mtop_, ir_, &ddbox_);
  
          setup_neighbor_relations(dd);
      }
  
      /* Set overallocation to avoid frequent reallocation of arrays */
 -    set_over_alloc_dd(TRUE);
 +    set_over_alloc_dd(true);
  
      dd->atomSets = atomSets;
  
 +    dd->localTopologyChecker = std::make_unique<LocalTopologyChecker>(
 +            mdlog_, cr_, mtop_, dd->comm->systemInfo.useUpdateGroups);
 +
      return dd;
  }
  
 -DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&      mdlog,
 -                                                       t_commrec*           cr,
 -                                                       const DomdecOptions& options,
 -                                                       const MdrunOptions&  mdrunOptions,
 -                                                       const gmx_mtop_t&    mtop,
 -                                                       const t_inputrec&    ir,
 -                                                       const matrix         box,
 -                                                       ArrayRef<const RVec> xGlobal) :
 -    impl_(new Impl(mdlog, cr, options, mdrunOptions, mtop, ir, box, xGlobal))
 +DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&           mdlog,
 +                                                       t_commrec*                cr,
 +                                                       const DomdecOptions&      options,
 +                                                       const MdrunOptions&       mdrunOptions,
 +                                                       const gmx_mtop_t&         mtop,
 +                                                       const t_inputrec&         ir,
 +                                                       const MDModulesNotifiers& notifiers,
 +                                                       const matrix              box,
 +                                                       ArrayRef<const RangePartitioning> updateGroupingPerMoleculeType,
 +                                                       const bool           useUpdateGroups,
 +                                                       const real           maxUpdateGroupRadius,
 +                                                       ArrayRef<const RVec> xGlobal,
 +                                                       const bool           useGpuForNonbonded,
 +                                                       const bool           useGpuForPme) :
 +    impl_(new Impl(mdlog,
 +                   cr,
 +                   options,
 +                   mdrunOptions,
 +                   mtop,
 +                   ir,
 +                   notifiers,
 +                   box,
 +                   updateGroupingPerMoleculeType,
 +                   useUpdateGroups,
 +                   maxUpdateGroupRadius,
 +                   xGlobal,
 +                   useGpuForNonbonded,
 +                   useGpuForPme))
  {
  }
  
@@@ -3063,7 -3074,9 +3068,7 @@@ DomainDecompositionBuilder::~DomainDeco
  static gmx_bool test_dd_cutoff(const t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
  {
      gmx_ddbox_t ddbox;
 -    int         d, dim, np;
 -    real        inv_cell_size;
 -    int         LocallyLimited;
 +    int         LocallyLimited = 0;
  
      const auto* dd = cr->dd;
  
  
      LocallyLimited = 0;
  
 -    for (d = 0; d < dd->ndim; d++)
 +    for (int d = 0; d < dd->ndim; d++)
      {
 -        dim = dd->dim[d];
 +        const int dim = dd->dim[d];
  
 -        inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
 +        real inv_cell_size = DD_CELL_MARGIN * dd->numCells[dim] / ddbox.box_size[dim];
          if (dd->unitCellInfo.ddBoxIsDynamic)
          {
              inv_cell_size *= DD_PRES_SCALE_MARGIN;
          }
  
 -        np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
 +        const int np = 1 + static_cast<int>(cutoffRequested * inv_cell_size * ddbox.skew_fac[dim]);
  
          if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
          {
          /* If DLB is not active yet, we don't need to check the grid jumps.
           * Actually we shouldn't, because then the grid jump data is not set.
           */
 -        if (isDlbOn(dd->comm) && check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
 +        if (isDlbOn(dd->comm) && gmx::check_grid_jump(0, dd, cutoffRequested, &ddbox, FALSE))
          {
              LocallyLimited = 1;
          }
      return TRUE;
  }
  
 -gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
 +bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested)
  {
 -    gmx_bool bCutoffAllowed;
 -
 -    bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
 +    bool bCutoffAllowed = test_dd_cutoff(cr, box, x, cutoffRequested);
  
      if (bCutoffAllowed)
      {
@@@ -3163,14 -3178,9 +3168,14 @@@ void constructGpuHaloExchange(const gmx
          for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
          {
              cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
 -                    cr.dd, d, cr.mpi_comm_mysim, deviceStreamManager.context(),
 +                    cr.dd,
 +                    d,
 +                    cr.mpi_comm_mygroup,
 +                    deviceStreamManager.context(),
                      deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
 -                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle));
 +                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
 +                    pulse,
 +                    wcycle));
          }
      }
  }
@@@ -3212,31 -3222,41 +3217,70 @@@ void communicateGpuHaloForces(const t_c
      }
  }
  
 -        const auto& updateGrouping = dd.comm->systemInfo.updateGroupingPerMoleculetype[molblock.type];
 +const gmx::LocalTopologyChecker& dd_localTopologyChecker(const gmx_domdec_t& dd)
 +{
 +    return *dd.localTopologyChecker;
 +}
 +
 +gmx::LocalTopologyChecker* dd_localTopologyChecker(gmx_domdec_t* dd)
 +{
 +    return dd->localTopologyChecker.get();
 +}
 +
 +void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* state_local)
 +{
 +    std::array<int, 5> buf;
 +
 +    if (DDMASTER(dd))
 +    {
 +        buf[0] = state_global->flags;
 +        buf[1] = state_global->ngtc;
 +        buf[2] = state_global->nnhpres;
 +        buf[3] = state_global->nhchainlength;
 +        buf[4] = state_global->dfhist ? state_global->dfhist->nlambda : 0;
 +    }
 +    dd_bcast(&dd, buf.size() * sizeof(int), buf.data());
 +
 +    init_gtc_state(state_local, buf[1], buf[2], buf[3]);
 +    init_dfhist_state(state_local, buf[4]);
 +    state_local->flags = buf[0];
 +}
++
+ void putUpdateGroupAtomsInSamePeriodicImage(const gmx_domdec_t&      dd,
+                                             const gmx_mtop_t&        mtop,
+                                             const matrix             box,
+                                             gmx::ArrayRef<gmx::RVec> positions)
+ {
+     int atomOffset = 0;
+     for (const gmx_molblock_t& molblock : mtop.molblock)
+     {
++        const auto& updateGrouping = dd.comm->systemInfo.updateGroupingsPerMoleculeType[molblock.type];
+         for (int mol = 0; mol < molblock.nmol; mol++)
+         {
+             for (int g = 0; g < updateGrouping.numBlocks(); g++)
+             {
+                 const auto& block     = updateGrouping.block(g);
+                 const int   atomBegin = atomOffset + block.begin();
+                 const int   atomEnd   = atomOffset + block.end();
+                 for (int a = atomBegin + 1; a < atomEnd; a++)
+                 {
+                     // Make sure that atoms in the same update group
+                     // are in the same periodic image after restarts.
+                     for (int d = DIM - 1; d >= 0; d--)
+                     {
+                         while (positions[a][d] - positions[atomBegin][d] > 0.5_real * box[d][d])
+                         {
+                             positions[a] -= box[d];
+                         }
+                         while (positions[a][d] - positions[atomBegin][d] < -0.5_real * box[d][d])
+                         {
+                             positions[a] += box[d];
+                         }
+                     }
+                 }
+             }
+             atomOffset += updateGrouping.fullRange().end();
+         }
+     }
+ }
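For reference, the per-atom wrapping performed by the loop above can be read in isolation as the following minimal sketch. It is not part of the GROMACS API: the helper name is hypothetical and plain arrays stand in for gmx::RVec and the matrix type, assuming a GROMACS-style triclinic box whose row d has non-zero components only up to dimension d.

// Sketch of the wrapping step: shift position x by whole box row vectors,
// highest dimension first, until each component lies within half a box
// length of the update-group reference position xRef.
void wrapIntoSameImageAsReference(double x[3], const double xRef[3], const double box[3][3])
{
    for (int d = 2; d >= 0; d--)
    {
        while (x[d] - xRef[d] > 0.5 * box[d][d])
        {
            for (int e = 0; e < 3; e++)
            {
                x[e] -= box[d][e]; // subtract the whole box row, as positions[a] -= box[d] does above
            }
        }
        while (x[d] - xRef[d] < -0.5 * box[d][d])
        {
            for (int e = 0; e < 3; e++)
            {
                x[e] += box[d][e];
            }
        }
    }
}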
diff --combined src/gromacs/domdec/domdec.h
index 20a7c6994563644e27c5e591c342e5268681a7fb,8939ac35e7278211cad903621b4194c2dc15922c..3fa388684230a4d90ce1406f56da1b07deb66732
  
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/math/vectypes.h"
 -#include "gromacs/utility/arrayref.h"
 -#include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/real.h"
  
 -struct cginfo_mb_t;
  struct gmx_domdec_t;
  struct gmx_ddbox_t;
  struct gmx_domdec_zones_t;
  struct gmx_localtop_t;
  struct gmx_mtop_t;
 -struct t_block;
 -struct t_blocka;
  struct t_commrec;
  struct t_forcerec;
  struct t_inputrec;
@@@ -84,15 -89,11 +84,15 @@@ class GpuEventSynchronizer
  
  namespace gmx
  {
 +struct AtomInfoWithinMoleculeBlock;
  class DeviceStreamManager;
  class ForceWithShiftForces;
 +class LocalTopologyChecker;
  class MDLogger;
  class RangePartitioning;
  class VirtualSitesHandler;
 +template<typename>
 +class ArrayRef;
  } // namespace gmx
  
  /*! \brief Returns the global topology atom number belonging to local atom index i.
   */
  int ddglatnr(const gmx_domdec_t* dd, int i);
  
 -/*! \brief Returns a list of update group partitioning for each molecule type or empty when update groups are not used */
 -gmx::ArrayRef<const gmx::RangePartitioning> getUpdateGroupingPerMoleculetype(const gmx_domdec_t& dd);
 -
  /*! \brief Store the global cg indices of the home cgs in state,
   *
   * This means it can be reset, even after a new DD partitioning.
   */
 -void dd_store_state(struct gmx_domdec_t* dd, t_state* state);
 +void dd_store_state(const gmx_domdec_t& dd, t_state* state);
  
  /*! \brief Returns a pointer to the gmx_domdec_zones_t struct */
  struct gmx_domdec_zones_t* domdec_zones(struct gmx_domdec_t* dd);
@@@ -119,13 -123,13 +119,13 @@@ int dd_numAtomsZones(const gmx_domdec_t
  int dd_numHomeAtoms(const gmx_domdec_t& dd);
  
  /*! \brief Returns the atom range in the local state for atoms that need to be present in mdatoms */
 -int dd_natoms_mdatoms(const gmx_domdec_t* dd);
 +int dd_natoms_mdatoms(const gmx_domdec_t& dd);
  
  /*! \brief Returns the atom range in the local state for atoms involved in virtual sites */
 -int dd_natoms_vsite(const gmx_domdec_t* dd);
 +int dd_natoms_vsite(const gmx_domdec_t& dd);
  
  /*! \brief Sets the atom range in the local state for atoms received in constraints communication */
 -void dd_get_constraint_range(const gmx_domdec_t* dd, int* at_start, int* at_end);
 +void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end);
  
  /*! \libinternal \brief Struct for passing around the number of PME domains */
  struct NumPmeDomains
@@@ -141,19 -145,31 +141,19 @@@ NumPmeDomains getNumPmeDomains(const gm
  std::vector<int> get_pme_ddranks(const t_commrec* cr, int pmenodeid);
  
  /*! \brief Returns the maximum shift for coordinate communication in PME, dim x */
 -int dd_pme_maxshift_x(const gmx_domdec_t* dd);
 +int dd_pme_maxshift_x(const gmx_domdec_t& dd);
  
  /*! \brief Returns the maximum shift for coordinate communication in PME, dim y */
 -int dd_pme_maxshift_y(const gmx_domdec_t* dd);
 -
 -/*! \brief Return whether constraints, not including settles, cross domain boundaries */
 -bool ddHaveSplitConstraints(const gmx_domdec_t& dd);
 +int dd_pme_maxshift_y(const gmx_domdec_t& dd);
  
  /*! \brief Return whether update groups are used */
  bool ddUsesUpdateGroups(const gmx_domdec_t& dd);
  
 -/*! \brief Initialize data structures for bonded interactions */
 -void dd_init_bondeds(FILE*                           fplog,
 -                     gmx_domdec_t*                   dd,
 -                     const gmx_mtop_t&               mtop,
 -                     const gmx::VirtualSitesHandler* vsite,
 -                     const t_inputrec*               ir,
 -                     gmx_bool                        bBCheck,
 -                     gmx::ArrayRef<cginfo_mb_t>      cginfo_mb);
 -
  /*! \brief Returns whether molecules are always whole, i.e. not broken by PBC */
  bool dd_moleculesAreAlwaysWhole(const gmx_domdec_t& dd);
  
  /*! \brief Returns if we need to do pbc for calculating bonded interactions */
 -gmx_bool dd_bonded_molpbc(const gmx_domdec_t* dd, PbcType pbcType);
 +bool dd_bonded_molpbc(const gmx_domdec_t& dd, PbcType pbcType);
  
  /*! \brief Change the DD non-bonded communication cut-off.
   *
   * \param[in] x                Position vector, used for computing the dimensions of the system
   * \param[in] cutoffRequested  The requested atom to atom cut-off distance, usually the pair-list cutoff distance
   */
 -gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested);
 +bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const gmx::RVec> x, real cutoffRequested);
  
  /*! \brief Set up communication for averaging GPU wait times over domains
   *
@@@ -222,12 -238,10 +222,12 @@@ void dd_move_x_constraints(struct gmx_d
                             const matrix             box,
                             gmx::ArrayRef<gmx::RVec> x0,
                             gmx::ArrayRef<gmx::RVec> x1,
 -                           gmx_bool                 bX1IsCoord);
 +                           bool                     bX1IsCoord);
  
  /*! \brief Communicates the coordinates involved in virtual sites */
  void dd_move_x_vsites(const gmx_domdec_t& dd, const matrix box, rvec* x);
 +/*! \brief Communicates the positions and velocities involved in virtual sites */
 +void dd_move_x_and_v_vsites(const gmx_domdec_t& dd, const matrix box, rvec* x, rvec* v);
  
  /*! \brief Returns the local atom count array for all constraints
   *
   */
  gmx::ArrayRef<const int> dd_constraints_nlocalatoms(const gmx_domdec_t* dd);
  
 -/* In domdec_top.c */
 -
 -/*! \brief Print error output when interactions are missing */
 -[[noreturn]] void dd_print_missing_interactions(const gmx::MDLogger&           mdlog,
 -                                                t_commrec*                     cr,
 -                                                int                            local_count,
 -                                                const gmx_mtop_t*              top_global,
 -                                                const gmx_localtop_t*          top_local,
 -                                                gmx::ArrayRef<const gmx::RVec> x,
 -                                                const matrix                   box);
 -
 -/*! \brief Generate and store the reverse topology */
 -void dd_make_reverse_top(FILE*                           fplog,
 -                         gmx_domdec_t*                   dd,
 -                         const gmx_mtop_t*               mtop,
 -                         const gmx::VirtualSitesHandler* vsite,
 -                         const t_inputrec*               ir,
 -                         gmx_bool                        bBCheck);
 -
 -/*! \brief Generate the local topology and virtual site data */
 -void dd_make_local_top(struct gmx_domdec_t*       dd,
 -                       struct gmx_domdec_zones_t* zones,
 -                       int                        npbcdim,
 -                       matrix                     box,
 -                       rvec                       cellsize_min,
 -                       const ivec                 npulse,
 -                       t_forcerec*                fr,
 -                       rvec*                      cgcm_or_x,
 -                       const gmx_mtop_t&          top,
 -                       gmx_localtop_t*            ltop);
 -
 -/*! \brief Sort ltop->ilist when we are doing free energy. */
 -void dd_sort_local_top(gmx_domdec_t* dd, const t_mdatoms* mdatoms, gmx_localtop_t* ltop);
 -
 -/*! \brief Construct local state */
 -void dd_init_local_state(struct gmx_domdec_t* dd, const t_state* state_global, t_state* local_state);
 +/*! \brief Const getter for the local topology checker
 + *
 + * \returns Const handle to local topology checker */
 +const gmx::LocalTopologyChecker& dd_localTopologyChecker(const gmx_domdec_t& dd);
  
 -/*! \brief Generate a list of links between atoms that are linked by bonded interactions
 +/*! \brief Getter for the local topology checker
   *
 - * Also stores whether atoms are linked in \p cginfo_mb.
 - */
 -t_blocka* makeBondedLinks(const gmx_mtop_t& mtop, gmx::ArrayRef<cginfo_mb_t> cginfo_mb);
 -
 -/*! \brief Calculate the maximum distance involved in 2-body and multi-body bonded interactions */
 -void dd_bonded_cg_distance(const gmx::MDLogger&           mdlog,
 -                           const gmx_mtop_t*              mtop,
 -                           const t_inputrec*              ir,
 -                           gmx::ArrayRef<const gmx::RVec> x,
 -                           const matrix                   box,
 -                           gmx_bool                       bBCheck,
 -                           real*                          r_2b,
 -                           real*                          r_mb);
 + * \returns Handle to local topology checker */
 +gmx::LocalTopologyChecker* dd_localTopologyChecker(gmx_domdec_t* dd);
 +
 +/*! \brief Construct local state */
 +void dd_init_local_state(const gmx_domdec_t& dd, const t_state* state_global, t_state* local_state);
  
  /*! \brief Construct the GPU halo exchange object(s).
   *
@@@ -291,4 -345,27 +291,27 @@@ void communicateGpuHaloCoordinates(cons
   */
  void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
  
+ /*! \brief Wraps the \c positions so that atoms from the same
+  * update group share the same periodic image wrt \c box.
+  *
+  * When DD and update groups are in use, the simulation master rank
+  * should call this to ensure that, e.g. when restarting a simulation
+  * that did not use update groups, the coordinates satisfy the new
+  * requirements.
+  *
+  * This function can probably be removed when even single-rank
+  * simulations use domain decomposition, because then the choice of
+  * whether update groups are used is probably going to be the same
+  * regardless of the rank count.
+  *
+  * \param[in]    dd         The DD manager
+  * \param[in]    mtop       The system topology
+  * \param[in]    box        The global system box
+  * \param[in]    positions  The global system positions
+  */
+ void putUpdateGroupAtomsInSamePeriodicImage(const gmx_domdec_t&      dd,
+                                             const gmx_mtop_t&        mtop,
+                                             const matrix             box,
+                                             gmx::ArrayRef<gmx::RVec> positions);
  #endif
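As a usage illustration only (this call site is an assumption, not code from the GROMACS sources), the pattern described in the comment above would look roughly like the sketch below, with cr, mtop, box and globalPositions assumed to be in scope on the master rank before the first partitioning.

// Hypothetical call site: wrap global coordinates after reading a checkpoint
// so that all atoms of an update group share one periodic image.
if (MASTER(cr) && ddUsesUpdateGroups(*cr->dd))
{
    putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, box, globalPositions);
}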
diff --combined src/gromacs/domdec/domdec_internal.h
index 40e67444615942618f1793650c51de821f04f3da,f2aedaec7a5d1c8a08cc27191e2fa8d243a7f0ac..d82bafa79fa44fe0323352cb3d839b8fcb6cee45
@@@ -60,6 -60,13 +60,13 @@@ struct t_commrec
  
  struct BalanceRegion;
  
+ namespace gmx
+ {
+ enum class DdRankOrder : int;
+ } // namespace
+ 
  //! Indices to communicate in a dimension
  struct gmx_domdec_ind_t
  {
@@@ -189,8 -196,6 +196,8 @@@ typedef struct gmx_domdec_sor
      std::vector<gmx_cgsort_t> moved;
      /**< Integer buffer for sorting */
      std::vector<int> intBuffer;
 +    /**< Int64 buffer for sorting */
 +    std::vector<int64_t> int64Buffer;
  } gmx_domdec_sort_t;
  
  /*! \brief Manages atom ranges and order for the local state atom vectors */
@@@ -282,7 -287,7 +289,7 @@@ enum class DlbStat
      offTemporarilyLocked, /**< DLB is off and temporarily can't turn on */
      onCanTurnOff,         /**< DLB is on and can turn off when slow */
      onUser,               /**< DLB is permanently on per user request */
 -    nr                    /**< The number of DLB states */
 +    Count                 /**< The number of DLB states */
  };
  
  /*! \brief The PME domain decomposition for one dimension */
@@@ -425,7 -430,7 +432,7 @@@ struct DDSystemInf
      //! True when update groups are used
      bool useUpdateGroups = false;
      //! Update atom grouping for each molecule type
 -    std::vector<gmx::RangePartitioning> updateGroupingPerMoleculetype;
 +    gmx::ArrayRef<const gmx::RangePartitioning> updateGroupingsPerMoleculeType;
      //! The maximum radius over all update groups
      real maxUpdateGroupRadius;
  
      real cellsizeLimit = 0;
  
      //! Can atoms connected by constraints be assigned to different domains?
 -    bool haveSplitConstraints = false;
 +    bool mayHaveSplitConstraints = false;
      //! Can atoms connected by settles be assigned to different domains?
 -    bool haveSplitSettles = false;
 +    bool mayHaveSplitSettles = false;
      //! Estimated communication range needed for constraints
      real constraintCommunicationRange = 0;
  
@@@ -499,6 -504,9 +506,9 @@@ struct DDSetting
  /*! \brief Information on how the DD ranks are set up */
  struct DDRankSetup
  {
+     /**< The rank ordering */
+     gmx::DdRankOrder rankOrder;
      /**< The number of particle-particle (non PME-only) ranks */
      int numPPRanks = 0;
      /**< The DD PP grid */
@@@ -585,35 -593,35 +595,35 @@@ struct gmx_domdec_comm_t // NOLINT (cla
      /**< Cut-off for multi-body interactions, also 2-body bonded when \p cutoff_mbody > \p cutoff */
      real cutoff_mbody = 0;
      /**< The minimum guaranteed cell-size, Cartesian indexing */
 -    rvec cellsize_min = {};
 +    gmx::RVec cellsize_min = { 0, 0, 0 };
      /**< The minimum guaranteed cell-size with dlb=auto */
 -    rvec cellsize_min_dlb = {};
 +    gmx::RVec cellsize_min_dlb = { 0, 0, 0 };
      /**< The lower limit for the DD cell size with DLB */
      real cellsize_limit = 0;
      /**< Effectively no NB cut-off limit with DLB for systems without PBC? */
 -    gmx_bool bVacDLBNoLimit = false;
 +    bool bVacDLBNoLimit = false;
  
      /** With PME load balancing we set limits on DLB */
 -    gmx_bool bPMELoadBalDLBLimits = false;
 +    bool bPMELoadBalDLBLimits = false;
      /** DLB needs to take into account that we want to allow this maximum
       *  cut-off (for PME load balancing), this could limit cell boundaries.
       */
      real PMELoadBal_max_cutoff = 0;
  
      /**< box lower corner, required with dim's without pbc and -gcom */
 -    rvec box0 = {};
 +    gmx::RVec box0 = { 0, 0, 0 };
      /**< box size, required with dim's without pbc and -gcom */
 -    rvec box_size = {};
 +    gmx::RVec box_size = { 0, 0, 0 };
  
      /**< The DD cell lower corner, in triclinic space */
 -    rvec cell_x0 = {};
 +    gmx::RVec cell_x0 = { 0, 0, 0 };
      /**< The DD cell upper corner, in triclinic space */
 -    rvec cell_x1 = {};
 +    gmx::RVec cell_x1 = { 0, 0, 0 };
  
      /**< The old \p cell_x0, to check cg displacements */
 -    rvec old_cell_x0 = {};
 +    gmx::RVec old_cell_x0 = { 0, 0, 0 };
      /**< The old \p cell_x1, to check cg displacements */
 -    rvec old_cell_x1 = {};
 +    gmx::RVec old_cell_x1 = { 0, 0, 0 };
  
      /** The communication setup and charge group boundaries for the zones */
      gmx_domdec_zones_t zones;
       * dynamic load balancing.
       */
      /**< Zone limits for dim 1 with staggered grids */
 -    gmx_ddzone_t zone_d1[2];
 +    std::array<gmx_ddzone_t, 2> zone_d1;
      /**< Zone limits for dim 2 with staggered grids */
      gmx_ddzone_t zone_d2[2][2];
  
      /** The coordinate/force communication setup and indices */
 -    gmx_domdec_comm_dim_t cd[DIM];
 +    std::array<gmx_domdec_comm_dim_t, DIM> cd;
      /** Restricts the maximum number of cells to communicate with in one dimension
       *
       * Dynamic load balancing is not permitted to change sizes if it
      int64_t master_cg_ddp_count = 0;
  
      /** The number of cg's received from the direct neighbors */
 -    int zone_ncg1[DD_MAXZONE] = { 0 };
 +    std::array<int, DD_MAXZONE> zone_ncg1 = { 0 };
  
      /** The atom ranges in the local state */
      DDAtomRanges atomRanges;
  
      /* Cycle counters over nstlist steps */
      /**< Total cycles counted */
 -    float cycl[ddCyclNr] = {};
 +    std::array<float, ddCyclNr> cycl = { 0 };
      /**< The number of cycle recordings */
 -    int cycl_n[ddCyclNr] = {};
 +    std::array<int, ddCyclNr> cycl_n = { 0 };
      /**< The maximum cycle count */
 -    float cycl_max[ddCyclNr] = {};
 +    std::array<float, ddCyclNr> cycl_max = { 0 };
      /**< Total flops counted */
      double flop = 0.0;
      /**< The number of flop recordings */
      /**< Max \p load_sum over the ranks */
      double load_max = 0.0;
      /**< Was load balancing limited, per DD dim */
 -    ivec load_lim = {};
 +    gmx::IVec load_lim = { 0, 0, 0 };
      /**< Total time on PP done during PME overlap time */
      double load_mdf = 0.0;
      /**< Total time on our PME-only rank */
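
The changes above follow one pattern: plain C arrays, rvec and ivec members become std::array, gmx::RVec and gmx::IVec with explicit zero initializers. A minimal standalone sketch of why the two initializer forms are equivalent; the aliases below stand in for the gmx types purely for illustration and are not the real definitions.

#include <array>
#include <cstdio>

constexpr int DIM = 3;                      // stand-in for the GROMACS DIM constant
using RVec        = std::array<float, DIM>; // stand-in for gmx::RVec, illustration only

int main()
{
    float              oldStyle[DIM] = {};          // value-initialized: all zeros
    RVec               newStyle      = { 0, 0, 0 }; // explicit zeros, same contents
    std::array<int, 4> counts        = { 0 };       // remaining elements are value-initialized
    std::printf("%g %g %d\n", oldStyle[0], newStyle[2], counts[3]); // prints 0 0 0
    return 0;
}
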
index 6bae98b5599a0f5af104622f6ecbe2e56368a6aa,5489ffac6f4ee0e0bb392491ef8dbd91e9306b4a..7238fa320f9bd253ee694eddac64ea6944803329
@@@ -51,7 -51,6 +51,7 @@@
  #include "gromacs/listed_forces/bonded.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/math/units.h"
 +#include "gromacs/math/utilities.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/math/vecdump.h"
  #include "gromacs/pbcutil/pbc.h"
@@@ -108,7 -107,7 +108,7 @@@ static int calc_RBbin(real phi, int gmx
  
  static int calc_Nbin(real phi, int multiplicity, real core_frac)
  {
 -    static const real r360 = 360 * DEG2RAD;
 +    static const real r360 = 360 * gmx::c_deg2Rad;
      real              rot_width, core_width, core_offset, low, hi;
      int               bin;
      /* with multiplicity 3 and core_frac 0.5
      {
          low = ((bin - 1) * rot_width) + core_offset;
          hi  = ((bin - 1) * rot_width) + core_offset + core_width;
 -        low *= DEG2RAD;
 -        hi *= DEG2RAD;
 +        low *= gmx::c_deg2Rad;
 +        hi *= gmx::c_deg2Rad;
          if ((phi > low) && (phi < hi))
          {
              return bin;
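
For orientation, a minimal standalone sketch of the binning arithmetic used in calc_Nbin above, with multiplicity 3 and core_frac 0.5. The core_offset formula is inferred from the bin boundaries in the surrounding comment and is an assumption, not code copied from the source.

// Illustration only: compute the core-region boundaries that calc_Nbin tests against,
// assuming rot_width = 360/multiplicity, core_width = core_frac*rot_width and
// core_offset = (rot_width - core_width)/2.
#include <cstdio>

int main()
{
    const int    multiplicity = 3;
    const double core_frac    = 0.5;
    const double rot_width    = 360.0 / multiplicity;          // 120 degrees per rotamer
    const double core_width   = core_frac * rot_width;         // 60-degree core region
    const double core_offset  = (rot_width - core_width) / 2;  // 30-degree shoulder each side
    for (int bin = 1; bin <= multiplicity; bin++)
    {
        const double low = (bin - 1) * rot_width + core_offset;
        const double hi  = low + core_width;
        std::printf("bin %d: %g to %g degrees\n", bin, low, hi); // 30-90, 150-210, 270-330
    }
    return 0;
}
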
@@@ -162,8 -161,8 +162,8 @@@ void ana_dih_trans(const char
          multiplicity[k] = 3;
      }
  
 -    low_ana_dih_trans(TRUE, fn_trans, TRUE, fn_histo, maxchi, dih, nlist, dlist, nframes, nangles,
 -                      grpname, multiplicity, time, bRb, 0.5, oenv);
 +    low_ana_dih_trans(
 +            TRUE, fn_trans, TRUE, fn_histo, maxchi, dih, nlist, dlist, nframes, nangles, grpname, multiplicity, time, bRb, 0.5, oenv);
      sfree(dlist);
      sfree(multiplicity);
  }
@@@ -195,7 -194,7 +195,7 @@@ void low_ana_dih_trans(gmx_boo
      int (*calc_bin)(real, int, real);
      real dt;
  
-     if (1 <= nframes)
+     if (nframes <= 1)
      {
          return;
      }
@@@ -576,7 -575,8 +576,7 @@@ void get_chi_product_traj(real*
              if (bAll)
              {
                  /* print cumulative rotamer vs time */
 -                print_one(oenv, "chiproduct", dlist[i].name, "chi product for",
 -                          "cumulative rotamer", nframes, time, chi_prtrj);
 +                print_one(oenv, "chiproduct", dlist[i].name, "chi product for", "cumulative rotamer", nframes, time, chi_prtrj);
              }
  
              /* make a histogram of cumulative rotamer occupancy too */
@@@ -698,12 -698,13 +698,12 @@@ static void calc_angles(struct t_pbc* p
  
      for (i = ix = 0; (ix < n3); i++, ix += 3)
      {
 -        ang[i] = bond_angle(x_s[index[ix]], x_s[index[ix + 1]], x_s[index[ix + 2]], pbc, r_ij, r_kj,
 -                            &costh, &t1, &t2);
 +        ang[i] = bond_angle(
 +                x_s[index[ix]], x_s[index[ix + 1]], x_s[index[ix + 2]], pbc, r_ij, r_kj, &costh, &t1, &t2);
      }
      if (debug)
      {
 -        fprintf(debug, "Angle[0]=%g, costh=%g, index0 = %d, %d, %d\n", ang[0], costh, index[0],
 -                index[1], index[2]);
 +        fprintf(debug, "Angle[0]=%g, costh=%g, index0 = %d, %d, %d\n", ang[0], costh, index[0], index[1], index[2]);
          pr_rvec(debug, 0, "rij", r_ij, DIM, TRUE);
          pr_rvec(debug, 0, "rkj", r_kj, DIM, TRUE);
      }
@@@ -717,7 -718,7 +717,7 @@@ static real calc_fraction(const real an
  
      for (i = 0; i < nangles; i++)
      {
 -        angle = angles[i] * RAD2DEG;
 +        angle = angles[i] * gmx::c_rad2Deg;
  
          if (angle > 135 && angle < 225)
          {
@@@ -746,19 -747,8 +746,19 @@@ static void calc_dihs(struct t_pbc* pbc
  
      for (i = ix = 0; (ix < n4); i++, ix += 4)
      {
 -        aaa = dih_angle(x_s[index[ix]], x_s[index[ix + 1]], x_s[index[ix + 2]], x_s[index[ix + 3]],
 -                        pbc, r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 +        aaa = dih_angle(x_s[index[ix]],
 +                        x_s[index[ix + 1]],
 +                        x_s[index[ix + 2]],
 +                        x_s[index[ix + 3]],
 +                        pbc,
 +                        r_ij,
 +                        r_kj,
 +                        r_kl,
 +                        m,
 +                        n,
 +                        &t1,
 +                        &t2,
 +                        &t3);
  
          ang[i] = aaa; /* not taking into account ryckaert bellemans yet */
      }
@@@ -782,7 -772,8 +782,7 @@@ void make_histo(FILE* log, int ndata, r
      dx = npoints / (maxx - minx);
      if (debug)
      {
 -        fprintf(debug, "Histogramming: ndata=%d, nhisto=%d, minx=%g,maxx=%g,dx=%g\n", ndata,
 -                npoints, minx, maxx, dx);
 +        fprintf(debug, "Histogramming: ndata=%d, nhisto=%d, minx=%g,maxx=%g,dx=%g\n", ndata, npoints, minx, maxx, dx);
      }
      for (i = 0; (i < ndata); i++)
      {
index 0abd978d194e4b0f3c4d20f97105441204848110,d049f74b8830713520d80f0c227db4e62bd04455..deed54bfdf0fb50b9b931a4e6d3ff918c0994a5a
@@@ -127,9 -127,10 +127,9 @@@ static gmx_bool bAllowed(real phi, rea
                                   "0000000000000000000000000000000000000000000000000000000000000",
                                   "0000000000000000000000000000000000000000000000000000000000000",
                                   "0000000000000000000000000000000000000000000000000000000000000" };
 -#define NPP asize(map)
 -    int x, y;
 +    int                x, y;
  
 -#define INDEX(ppp) (((static_cast<int>(360 + (ppp)*RAD2DEG)) % 360) / 6)
 +#define INDEX(ppp) (((static_cast<int>(360 + (ppp)*gmx::c_rad2Deg)) % 360) / 6)
      x = INDEX(phi);
      y = INDEX(psi);
  #undef INDEX
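
As a quick illustration of the INDEX mapping above: an angle in radians is converted to degrees, wrapped into [0, 360) and bucketed into 6-degree map columns. A minimal standalone version, with the conversion constant spelled out rather than taken from GROMACS; illustration only.

#include <cstdio>

int indexOf(double angleRad)
{
    const double rad2Deg = 180.0 / 3.14159265358979323846;
    return (static_cast<int>(360 + angleRad * rad2Deg) % 360) / 6;
}

int main()
{
    // phi = -1 rad (about -57.3 degrees) wraps to about 302.7 and lands in column 50;
    // phi = +1 rad lands in column 9.
    std::printf("%d %d\n", indexOf(-1.0), indexOf(1.0));
    return 0;
}
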
@@@ -595,7 -596,8 +595,7 @@@ static void histogramming(FILE
                      }
                      else if (debug)
                      {
 -                        fprintf(debug, "Res. %d has imcomplete occupancy or bfacs > %g\n",
 -                                dlist[i].resnr, bfac_max);
 +                        fprintf(debug, "Res. %d has incomplete occupancy or bfacs > %g\n", dlist[i].resnr, bfac_max);
                      }
                  }
                  else
                      normalize_histo(nbin, his_aa[Dih][i], (360.0 / nbin), normhisto);
                  }
  
-                 residue_name = rt->nameFromResidueIndex(i).c_str();
+                 std::string residueName = rt->nameFromResidueIndex(i);
+                 residue_name            = residueName.c_str();
                  switch (Dih)
                  {
                      case edPhi:
                          break;
                      default:
                          sprintf(hisfile, "histo-chi%d%s", Dih - NONCHI + 1, residue_name);
 -                        sprintf(title, "\\xc\\f{}\\s%d\\N Distribution for %s", Dih - NONCHI + 1,
 -                                residue_name);
 +                        sprintf(title, "\\xc\\f{}\\s%d\\N Distribution for %s", Dih - NONCHI + 1, residue_name);
                  }
                  std::strcpy(hhisfile, hisfile);
                  std::strcat(hhisfile, ".xvg");
@@@ -931,18 -935,16 +932,18 @@@ static void do_rama(in
              Psi = dlist[i].j0[edPsi];
              for (j = 0; (j < nf); j++)
              {
 -                phi = RAD2DEG * dih[Phi][j];
 -                psi = RAD2DEG * dih[Psi][j];
 +                phi = gmx::c_rad2Deg * dih[Phi][j];
 +                psi = gmx::c_rad2Deg * dih[Psi][j];
                  fprintf(fp, "%10g  %10g\n", phi, psi);
                  if (bViol)
                  {
 -                    fprintf(gp, "%d\n", static_cast<int>(!bAllowed(dih[Phi][j], RAD2DEG * dih[Psi][j])));
 +                    fprintf(gp,
 +                            "%d\n",
 +                            static_cast<int>(!bAllowed(dih[Phi][j], gmx::c_rad2Deg * dih[Psi][j])));
                  }
                  if (bOm)
                  {
 -                    omega = RAD2DEG * dih[Om][j];
 +                    omega = gmx::c_rad2Deg * dih[Om][j];
                      mat[static_cast<int>(((phi * NMAT) / 360) + gmx::exactDiv(NMAT, 2))]
                         [static_cast<int>(((psi * NMAT) / 360) + gmx::exactDiv(NMAT, 2))] += omega;
                  }
                  lo += 180;
                  hi += 180;
                  nlevels = 20;
 -                write_xpm3(fp, 0, "Omega/Ramachandran Plot", "Deg", "Phi", "Psi", NMAT, NMAT, axis,
 -                           axis, mat, lo, 180.0, hi, rlo, rmid, rhi, &nlevels);
 +                write_xpm3(fp,
 +                           0,
 +                           "Omega/Ramachandran Plot",
 +                           "Deg",
 +                           "Phi",
 +                           "Psi",
 +                           NMAT,
 +                           NMAT,
 +                           axis,
 +                           axis,
 +                           mat,
 +                           lo,
 +                           180.0,
 +                           hi,
 +                           rlo,
 +                           rmid,
 +                           rhi,
 +                           &nlevels);
                  gmx_ffclose(fp);
                  for (j = 0; (j < NMAT); j++)
                  {
          if ((has_dihedral(edChi1, &(dlist[i]))) && (has_dihedral(edChi2, &(dlist[i]))))
          {
              sprintf(fn, "ramaX1X2%s.xvg", dlist[i].name);
 -            fp  = rama_file(fn, "\\8c\\4\\s1\\N-\\8c\\4\\s2\\N Ramachandran Plot",
 -                           "\\8c\\4\\s1\\N (deg)", "\\8c\\4\\s2\\N (deg)", oenv);
 +            fp  = rama_file(fn,
 +                           "\\8c\\4\\s1\\N-\\8c\\4\\s2\\N Ramachandran Plot",
 +                           "\\8c\\4\\s1\\N (deg)",
 +                           "\\8c\\4\\s2\\N (deg)",
 +                           oenv);
              Xi1 = dlist[i].j0[edChi1];
              Xi2 = dlist[i].j0[edChi2];
              for (j = 0; (j < nf); j++)
              {
 -                fprintf(fp, "%10g  %10g\n", RAD2DEG * dih[Xi1][j], RAD2DEG * dih[Xi2][j]);
 +                fprintf(fp, "%10g  %10g\n", gmx::c_rad2Deg * dih[Xi1][j], gmx::c_rad2Deg * dih[Xi2][j]);
              }
              xvgrclose(fp);
          }
@@@ -1044,6 -1027,7 +1045,6 @@@ static void print_transitions(const cha
  
      /*  must correspond with enum in pp2shift.h:38 */
      char* leg[edMax];
 -#define NLEG asize(leg)
  
      leg[0] = gmx_strdup("Phi");
      leg[1] = gmx_strdup("Psi");
@@@ -1212,20 -1196,8 +1213,20 @@@ static void order_params(FILE
          z0 *= 10.0; /* nm -> angstrom */
          for (i = 0; (i < 10); i++)
          {
 -            gmx_fprintf_pdb_atomline(fp, epdbATOM, atoms->nr + 1 + i, "CA", ' ', "LEG", ' ',
 -                                     atoms->nres + 1, ' ', x0, y0, z0 + (1.2 * i), 0.0, -0.1 * i,
 +            gmx_fprintf_pdb_atomline(fp,
 +                                     PdbRecordType::Atom,
 +                                     atoms->nr + 1 + i,
 +                                     "CA",
 +                                     ' ',
 +                                     "LEG",
 +                                     ' ',
 +                                     atoms->nres + 1,
 +                                     ' ',
 +                                     x0,
 +                                     y0,
 +                                     z0 + (1.2 * i),
 +                                     0.0,
 +                                     -0.1 * i,
                                       "");
          }
          gmx_ffclose(fp);
@@@ -1464,8 -1436,8 +1465,8 @@@ int gmx_chi(int argc, char* argv[]
  
      npargs = asize(pa);
      ppa    = add_acf_pargs(&npargs, pa);
 -    if (!parse_common_args(&argc, argv, PCA_CAN_VIEW | PCA_CAN_TIME, NFILE, fnm, npargs, ppa,
 -                           asize(desc), desc, asize(bugs), bugs, &oenv))
 +    if (!parse_common_args(
 +                &argc, argv, PCA_CAN_VIEW | PCA_CAN_TIME, NFILE, fnm, npargs, ppa, asize(desc), desc, asize(bugs), bugs, &oenv))
      {
          sfree(ppa);
          return 0;
      snew(dih, ndih);
  
      /* COMPUTE ALL DIHEDRALS! */
 -    read_ang_dih(ftp2fn(efTRX, NFILE, fnm), FALSE, TRUE, FALSE, bPBC, 1, &idum, &nf, &time, isize,
 -                 index, &trans_frac, &aver_angle, dih, oenv);
 +    read_ang_dih(
 +            ftp2fn(efTRX, NFILE, fnm), FALSE, TRUE, FALSE, bPBC, 1, &idum, &nf, &time, isize, index, &trans_frac, &aver_angle, dih, oenv);
  
      dt = (time[nf - 1] - time[0]) / (nf - 1); /* might want this for corr or n. transit*/
      if (bCorr)
      }
  
      /* put angles in -M_PI to M_PI ! and correct phase factor for phi and psi
-      * pass nactdih instead of ndih to low_ana_dih_trans and get_chi_product_traj
+      * pass nactdih instead of ndih to low_ana_dih_trans
       * to prevent accessing off end of arrays when maxchi < 5 or 6. */
      nactdih = reset_em_all(nlist, dlist, nf, dih, maxchi);
  
      }
  
      /* Histogramming & J coupling constants & calc of S2 order params */
 -    histogramming(log, nbin, &rt, nf, maxchi, dih, nlist, dlist, index, bPhi, bPsi, bOmega, bChi,
 -                  bNormHisto, bSSHisto, ftp2fn(efDAT, NFILE, fnm), bfac_max, &atoms, bDo_jc,
 -                  opt2fn("-jc", NFILE, fnm), oenv);
 +    histogramming(log,
 +                  nbin,
 +                  &rt,
 +                  nf,
 +                  maxchi,
 +                  dih,
 +                  nlist,
 +                  dlist,
 +                  index,
 +                  bPhi,
 +                  bPsi,
 +                  bOmega,
 +                  bChi,
 +                  bNormHisto,
 +                  bSSHisto,
 +                  ftp2fn(efDAT, NFILE, fnm),
 +                  bfac_max,
 +                  &atoms,
 +                  bDo_jc,
 +                  opt2fn("-jc", NFILE, fnm),
 +                  oenv);
  
      /* transitions
       *
      }
  
  
 -    low_ana_dih_trans(bDo_ot, opt2fn("-ot", NFILE, fnm), bDo_oh, opt2fn("-oh", NFILE, fnm), maxchi, dih,
 -                      nlist, dlist, nf, nactdih, grpname, multiplicity, time, FALSE, core_frac, oenv);
 +    low_ana_dih_trans(bDo_ot,
 +                      opt2fn("-ot", NFILE, fnm),
 +                      bDo_oh,
 +                      opt2fn("-oh", NFILE, fnm),
 +                      maxchi,
 +                      dih,
 +                      nlist,
 +                      dlist,
 +                      nf,
 +                      nactdih,
 +                      grpname,
 +                      multiplicity,
 +                      time,
 +                      FALSE,
 +                      core_frac,
 +                      oenv);
  
      /* Order parameters */
 -    order_params(log, opt2fn("-o", NFILE, fnm), maxchi, nlist, dlist, ftp2fn_null(efPDB, NFILE, fnm),
 -                 bfac_init, &atoms, x, pbcType, box, bPhi, bPsi, bChi, oenv);
 +    order_params(log,
 +                 opt2fn("-o", NFILE, fnm),
 +                 maxchi,
 +                 nlist,
 +                 dlist,
 +                 ftp2fn_null(efPDB, NFILE, fnm),
 +                 bfac_init,
 +                 &atoms,
 +                 x,
 +                 pbcType,
 +                 box,
 +                 bPhi,
 +                 bPsi,
 +                 bChi,
 +                 oenv);
  
      /* Print ramachandran maps! */
      if (bRama)
          }
          mk_chi_lookup(chi_lookup, maxchi, nlist, dlist);
  
 -        get_chi_product_traj(dih, nf, nlist, maxchi, dlist, time, chi_lookup, multiplicity, FALSE,
 -                             bNormHisto, core_frac, bAll, opt2fn("-cp", NFILE, fnm), oenv);
 +        get_chi_product_traj(dih,
 +                             nf,
-                              nactdih,
++                             nlist,
 +                             maxchi,
 +                             dlist,
 +                             time,
 +                             chi_lookup,
 +                             multiplicity,
 +                             FALSE,
 +                             bNormHisto,
 +                             core_frac,
 +                             bAll,
 +                             opt2fn("-cp", NFILE, fnm),
 +                             oenv);
  
          for (i = 0; i < nlist; i++)
          {
      /* Correlation comes last because it messes up the angles */
      if (bCorr)
      {
 -        do_dihcorr(opt2fn("-corr", NFILE, fnm), nf, ndih, dih, dt, nlist, dlist, time, maxchi, bPhi,
 -                   bPsi, bChi, bOmega, oenv);
 +        do_dihcorr(opt2fn("-corr", NFILE, fnm), nf, ndih, dih, dt, nlist, dlist, time, maxchi, bPhi, bPsi, bChi, bOmega, oenv);
      }
  
  
index e3eb7ff8f227766936c457fa2636eca9d310aa18,8d19c598c2a03a6baf8aa6a3b940af7ea1badd16..2462285bef078b77394308accdff5101000d2387
  #include "gromacs/domdec/domdec_struct.h"
  #include "gromacs/domdec/gpuhaloexchange.h"
  #include "gromacs/domdec/localatomsetmanager.h"
 +#include "gromacs/domdec/makebondedlinks.h"
  #include "gromacs/domdec/partition.h"
 +#include "gromacs/domdec/reversetopology.h"
  #include "gromacs/ewald/ewald_utils.h"
 +#include "gromacs/ewald/pme.h"
  #include "gromacs/ewald/pme_gpu_program.h"
  #include "gromacs/ewald/pme_only.h"
  #include "gromacs/ewald/pme_pp_comm_gpu.h"
@@@ -84,7 -81,7 +84,7 @@@
  #include "gromacs/hardware/printhardware.h"
  #include "gromacs/imd/imd.h"
  #include "gromacs/listed_forces/disre.h"
 -#include "gromacs/listed_forces/gpubonded.h"
 +#include "gromacs/listed_forces/listed_forces_gpu.h"
  #include "gromacs/listed_forces/listed_forces.h"
  #include "gromacs/listed_forces/orires.h"
  #include "gromacs/math/functions.h"
  #include "gromacs/utility/keyvaluetree.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/loggerbuilder.h"
 -#include "gromacs/utility/mdmodulenotification.h"
 +#include "gromacs/utility/mdmodulesnotifiers.h"
  #include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/pleasecite.h"
  #include "gromacs/utility/programcontext.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/stringutil.h"
 +#include "gromacs/utility/mpiinfo.h"
  
  #include "isimulator.h"
  #include "membedholder.h"
@@@ -202,77 -198,19 +202,77 @@@ static DevelopmentFeatureFlags manageDe
  {
      DevelopmentFeatureFlags devFlags;
  
 -    // Some builds of GCC 5 give false positive warnings that these
 -    // getenv results are ignored when clearly they are used.
 -#pragma GCC diagnostic push
 -#pragma GCC diagnostic ignored "-Wunused-result"
 -
      devFlags.enableGpuBufferOps =
              GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 -    devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
 +    devFlags.enableGpuHaloExchange = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
      devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
 -    devFlags.enableGpuPmePPComm =
 -            GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
 +    devFlags.enableGpuPmePPComm = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
 +
 +    // The direct GPU communication path is being used with CUDA-aware MPI;
 +    // make sure the underlying MPI implementation is CUDA-aware.
 +    if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
 +    {
 +        const bool haveDetectedCudaAwareMpi =
 +                (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
 +        const bool forceCudaAwareMpi = (getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
 +
 +        if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi)
 +        {
 +            // CUDA-aware support was not detected in the MPI library, but the user has forced its use
 +            GMX_LOG(mdlog.warning)
 +                    .asParagraph()
 +                    .appendTextFormatted(
 +                            "This run has forced use of 'CUDA-aware MPI', but GROMACS "
 +                            "cannot determine whether the underlying MPI is CUDA-aware. "
 +                            "GROMACS recommends the latest OpenMPI version for CUDA-aware "
 +                            "support. If you observe failures at runtime, try unsetting the "
 +                            "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
 +        }
 +
 +        if (haveDetectedCudaAwareMpi || forceCudaAwareMpi)
 +        {
 +            devFlags.usingCudaAwareMpi = true;
 +            GMX_LOG(mdlog.warning)
 +                    .asParagraph()
 +                    .appendTextFormatted(
 +                            "Using CUDA-aware MPI for the 'GPU halo exchange' or 'GPU PME-PP "
 +                            "communications' feature.");
 +        }
 +        else
 +        {
 +            if (devFlags.enableGpuHaloExchange)
 +            {
 +                GMX_LOG(mdlog.warning)
 +                        .asParagraph()
 +                        .appendTextFormatted(
 +                                "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
 +                                "halo exchange' feature will not be enabled as GROMACS couldn't "
 +                                "detect CUDA-aware support in the underlying MPI implementation.");
 +                devFlags.enableGpuHaloExchange = false;
 +            }
 +            if (devFlags.enableGpuPmePPComm)
 +            {
 +                GMX_LOG(mdlog.warning)
 +                        .asParagraph()
 +                        .appendText(
 +                                "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
 +                                "'GPU PME-PP communications' feature will not be enabled as "
 +                                "GROMACS couldn't "
 +                                "detect CUDA-aware support in the underlying MPI implementation.");
 +                devFlags.enableGpuPmePPComm = false;
 +            }
  
 -#pragma GCC diagnostic pop
 +            GMX_LOG(mdlog.warning)
 +                    .asParagraph()
 +                    .appendTextFormatted(
 +                            "GROMACS recommends the latest OpenMPI version for CUDA-aware "
 +                            "support. "
 +                            "If you are certain about CUDA-aware support in your MPI library, "
 +                            "you can force its use by setting the "
 +                            "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
 +        }
 +    }
  
      if (devFlags.enableGpuBufferOps)
      {
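
To summarize the branching added above: when real MPI is in use and either GPU communication feature is requested, CUDA-aware MPI must be detected or forced, otherwise both features are switched back off. A minimal standalone sketch of that decision, with the detection call, logging and environment handling replaced by plain booleans; names here are illustrative, not the GROMACS API.

struct DevFlags
{
    bool enableGpuHaloExchange = false; // requested via GMX_GPU_DD_COMMS
    bool enableGpuPmePPComm    = false; // requested via GMX_GPU_PME_PP_COMMS
    bool usingCudaAwareMpi     = false;
};

// detectedCudaAwareMpi: CUDA-aware support found in the MPI library
// forcedCudaAwareMpi:   the user set GMX_FORCE_CUDA_AWARE_MPI
void resolveGpuCommFeatures(DevFlags* flags, bool detectedCudaAwareMpi, bool forcedCudaAwareMpi)
{
    if (!flags->enableGpuHaloExchange && !flags->enableGpuPmePPComm)
    {
        return; // neither feature requested, nothing to decide
    }
    if (detectedCudaAwareMpi || forcedCudaAwareMpi)
    {
        flags->usingCudaAwareMpi = true; // proceed, with a warning when only forced
    }
    else
    {
        flags->enableGpuHaloExchange = false; // fall back to the regular communication paths
        flags->enableGpuPmePPComm    = false;
    }
}
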
@@@ -434,7 -372,7 +434,7 @@@ static void mdrunner_start_fn(const voi
  {
      try
      {
 -        auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
 +        const auto* masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
          /* copy the arg list to make sure that it's thread-local. This
             doesn't copy pointed-to items, of course; fnm, cr and fplog
             are reset in the call below, all others should be const. */
@@@ -450,7 -388,8 +450,7 @@@ void Mdrunner::spawnThreads(int numThre
  #if GMX_THREAD_MPI
      /* now spawn new threads that start mdrunner_start_fn(), while
         the main thread returns. Thread affinity is handled later. */
 -    if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn,
 -                     static_cast<const void*>(this))
 +    if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn, static_cast<const void*>(this))
          != TMPI_SUCCESS)
      {
          GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
@@@ -474,14 -413,14 +474,14 @@@ static void prepare_verlet_scheme(FILE
                                    t_commrec*          cr,
                                    t_inputrec*         ir,
                                    int                 nstlist_cmdline,
 -                                  const gmx_mtop_t*   mtop,
 +                                  const gmx_mtop_t&   mtop,
                                    const matrix        box,
                                    bool                makeGpuPairList,
                                    const gmx::CpuInfo& cpuinfo)
  {
      // We checked the cut-offs in grompp, but double-check here.
      // We have PME+LJcutoff kernels for rcoulomb>rvdw.
 -    if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == eelCUT)
 +    if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == VanDerWaalsType::Cut)
      {
          GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
                             "With Verlet lists and PME we should have rcoulomb>=rvdw");
                             "With Verlet lists and no PME rcoulomb and rvdw should be identical");
      }
      /* For NVE simulations, we will retain the initial list buffer */
 -    if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0
 +        && !(EI_MD(ir->eI) && ir->etc == TemperatureCoupling::No))
      {
          /* Update the Verlet buffer size for the current run setup */
  
          VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
  
          const real rlist_new =
 -                calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
 +                calcVerletBufferSize(mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
  
          if (rlist_new != ir->rlist)
          {
              {
                  fprintf(fplog,
                          "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 -                        ir->rlist, rlist_new, listSetup.cluster_size_i, listSetup.cluster_size_j);
 +                        ir->rlist,
 +                        rlist_new,
 +                        listSetup.cluster_size_i,
 +                        listSetup.cluster_size_j);
              }
              ir->rlist = rlist_new;
          }
  
      if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
      {
 -        gmx_fatal(FARGS, "Can not set nstlist without %s",
 +        gmx_fatal(FARGS,
 +                  "Can not set nstlist without %s",
                    !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
      }
  
      if (EI_DYNAMICS(ir->eI))
      {
          /* Set or try nstlist values */
 -        increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
 +        increaseNstlist(fplog, cr, ir, nstlist_cmdline, &mtop, box, makeGpuPairList, cpuinfo);
      }
  }
  
@@@ -556,13 -490,11 +556,13 @@@ static void override_nsteps_cmdline(con
          {
              sprintf(sbuf_msg,
                      "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
 -                    gmx_step_str(nsteps_cmdline, sbuf_steps), fabs(nsteps_cmdline * ir->delta_t));
 +                    gmx_step_str(nsteps_cmdline, sbuf_steps),
 +                    fabs(nsteps_cmdline * ir->delta_t));
          }
          else
          {
 -            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
 +            sprintf(sbuf_msg,
 +                    "Overriding nsteps with value passed on the command line: %s steps",
                      gmx_step_str(nsteps_cmdline, sbuf_steps));
          }
  
@@@ -658,9 -590,9 +658,9 @@@ static TaskTarget findTaskTarget(const 
  static void finish_run(FILE*                     fplog,
                         const gmx::MDLogger&      mdlog,
                         const t_commrec*          cr,
 -                       const t_inputrec*         inputrec,
 +                       const t_inputrec&         inputrec,
                         t_nrnb                    nrnb[],
 -                       gmx_wallcycle_t           wcycle,
 +                       gmx_wallcycle*            wcycle,
                         gmx_walltime_accounting_t walltime_accounting,
                         nonbonded_verlet_t*       nbv,
                         const gmx_pme_t*          pme,
         Further, we only report performance for dynamical integrators,
         because those are the only ones for which we plan to
         consider doing any optimizations. */
 -    bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
 +    bool printReport = EI_DYNAMICS(inputrec.eI) && SIMMASTER(cr);
  
      if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
      {
          nrnbTotalStorage = std::make_unique<t_nrnb>();
          nrnb_tot         = nrnbTotalStorage.get();
  #if GMX_MPI
 -        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 +        MPI_Allreduce(nrnb->n.data(), nrnb_tot->n.data(), eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
  #endif
      }
      else
      {
  #if GMX_MPI
          /* reduce elapsed_time over all MPI ranks in the current simulation */
 -        MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM,
 -                      cr->mpi_comm_mysim);
 +        MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
          elapsed_time_over_all_ranks /= cr->nnodes;
          /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
           * current simulation. */
 -        MPI_Allreduce(&elapsed_time_over_all_threads, &elapsed_time_over_all_threads_over_all_ranks,
 -                      1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 +        MPI_Allreduce(&elapsed_time_over_all_threads,
 +                      &elapsed_time_over_all_threads_over_all_ranks,
 +                      1,
 +                      MPI_DOUBLE,
 +                      MPI_SUM,
 +                      cr->mpi_comm_mysim);
  #endif
      }
      else
       * to the code that handled the thread region, so that there's a
       * mechanism to keep cycle counting working during the transition
       * to task parallelism. */
 -    int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 -    int nthreads_pme = gmx_omp_nthreads_get(emntPME);
 -    wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP),
 -                                   nthreads_pp, nthreads_pme);
 +    int nthreads_pp  = gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded);
 +    int nthreads_pme = gmx_omp_nthreads_get(ModuleMultiThread::Pme);
 +    wallcycle_scale_by_num_threads(
 +            wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
      auto cycle_sum(wallcycle_sum(cr, wcycle));
  
      if (printReport)
      {
 -        auto nbnxn_gpu_timings =
 +        auto* nbnxn_gpu_timings =
                  (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
          gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
  
          {
              pme_gpu_get_timings(pme, &pme_gpu_timings);
          }
 -        wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
 -                        elapsed_time_over_all_ranks, wcycle, cycle_sum, nbnxn_gpu_timings,
 +        wallcycle_print(fplog,
 +                        mdlog,
 +                        cr->nnodes,
 +                        cr->npmenodes,
 +                        nthreads_pp,
 +                        nthreads_pme,
 +                        elapsed_time_over_all_ranks,
 +                        wcycle,
 +                        cycle_sum,
 +                        nbnxn_gpu_timings,
                          &pme_gpu_timings);
  
 -        if (EI_DYNAMICS(inputrec->eI))
 +        if (EI_DYNAMICS(inputrec.eI))
          {
 -            delta_t = inputrec->delta_t;
 +            delta_t = inputrec.delta_t;
          }
  
          if (fplog)
          {
 -            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
 +            print_perf(fplog,
 +                       elapsed_time_over_all_threads_over_all_ranks,
 +                       elapsed_time_over_all_ranks,
                         walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 -                       delta_t, nbfs, mflop);
 +                       delta_t,
 +                       nbfs,
 +                       mflop);
          }
          if (bWriteStat)
          {
 -            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
 +            print_perf(stderr,
 +                       elapsed_time_over_all_threads_over_all_ranks,
 +                       elapsed_time_over_all_ranks,
                         walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 -                       delta_t, nbfs, mflop);
 +                       delta_t,
 +                       nbfs,
 +                       mflop);
          }
      }
  }
  
  int Mdrunner::mdrunner()
  {
 -    matrix                    box;
 -    t_forcerec*               fr               = nullptr;
 -    real                      ewaldcoeff_q     = 0;
 -    real                      ewaldcoeff_lj    = 0;
 -    int                       nChargePerturbed = -1, nTypePerturbed = 0;
 -    gmx_wallcycle_t           wcycle;
 -    gmx_walltime_accounting_t walltime_accounting = nullptr;
 -    MembedHolder              membedHolder(filenames.size(), filenames.data());
 +    matrix                      box;
 +    std::unique_ptr<t_forcerec> fr;
 +    real                        ewaldcoeff_q     = 0;
 +    real                        ewaldcoeff_lj    = 0;
 +    int                         nChargePerturbed = -1, nTypePerturbed = 0;
 +    gmx_walltime_accounting_t   walltime_accounting = nullptr;
 +    MembedHolder                membedHolder(filenames.size(), filenames.data());
  
      /* CAUTION: threads may be started later on in this function, so
         cr doesn't reflect the final parallel state right now */
  
      gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
  
 -    std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo_->deviceInfoList, hw_opt.gpuIdsAvailable);
 -    const int        numDevicesToUse = gmx::ssize(gpuIdsToUse);
 +    std::vector<int> availableDevices =
 +            makeListOfAvailableDevices(hwinfo_->deviceInfoList, hw_opt.devicesSelectedByUser);
 +    const int numAvailableDevices = gmx::ssize(availableDevices);
  
      // Print citation requests after all software/hardware printing
      pleaseCiteGromacs(fplog);
          /* Read (nearly) all data required for the simulation
           * and keep the partly serialized tpr contents to send to other ranks later
           */
 -        applyGlobalSimulationState(*inputHolder_.get(), partialDeserializedTpr.get(),
 -                                   globalState.get(), inputrec.get(), &mtop);
 +        applyGlobalSimulationState(
 +                *inputHolder_.get(), partialDeserializedTpr.get(), globalState.get(), inputrec.get(), &mtop);
      }
  
      /* Check and update the hardware options for internal consistency */
 -    checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks,
 -                                  inputrec.get());
 +    checkAndUpdateHardwareOptions(
 +            mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks, inputrec.get());
  
      if (GMX_THREAD_MPI && isSimulationMasterRank)
      {
              // the number of GPUs to choose the number of ranks.
              auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
              useGpuForNonbonded         = decideWhetherToUseGpusForNonbondedWithThreadMpi(
 -                    nonbondedTarget, numDevicesToUse, userGpuTaskAssignment, emulateGpuNonbonded,
 +                    nonbondedTarget,
 +                    numAvailableDevices > 0,
 +                    userGpuTaskAssignment,
 +                    emulateGpuNonbonded,
                      canUseGpuForNonbonded,
                      gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
                      hw_opt.nthreads_tmpi);
 -            useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
 -                    useGpuForNonbonded, pmeTarget, numDevicesToUse, userGpuTaskAssignment, *hwinfo_,
 -                    *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
 +            useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(useGpuForNonbonded,
 +                                                                     pmeTarget,
 +                                                                     numAvailableDevices,
 +                                                                     userGpuTaskAssignment,
 +                                                                     *hwinfo_,
 +                                                                     *inputrec,
 +                                                                     hw_opt.nthreads_tmpi,
 +                                                                     domdecOptions.numPmeRanks);
          }
          GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  
           * TODO Over-writing the user-supplied value here does
           * prevent any possible subsequent checks from working
           * correctly. */
 -        hw_opt.nthreads_tmpi =
 -                get_nthreads_mpi(hwinfo_, &hw_opt, numDevicesToUse, useGpuForNonbonded, useGpuForPme,
 -                                 inputrec.get(), &mtop, mdlog, membedHolder.doMembed());
 +        hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo_,
 +                                                &hw_opt,
 +                                                numAvailableDevices,
 +                                                useGpuForNonbonded,
 +                                                useGpuForPme,
 +                                                inputrec.get(),
 +                                                mtop,
 +                                                mdlog,
 +                                                membedHolder.doMembed());
  
          // Now start the threads for thread MPI.
          spawnThreads(hw_opt.nthreads_tmpi);
              // On non-master ranks, allocate the object that will receive data in the following call.
              inputrec = std::make_unique<t_inputrec>();
          }
 -        init_parallel(cr->mpiDefaultCommunicator, MASTER(cr), inputrec.get(), &mtop,
 +        init_parallel(cr->mpiDefaultCommunicator,
 +                      MASTER(cr),
 +                      inputrec.get(),
 +                      &mtop,
                        partialDeserializedTpr.get());
      }
      GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
      partialDeserializedTpr.reset(nullptr);
  
 +    GMX_RELEASE_ASSERT(
 +            !inputrec->useConstantAcceleration,
 +            "Linear acceleration has been removed in GROMACS 2022, and was broken for many years "
 +            "before that. Use GROMACS 4.5 or earlier if you need this feature.");
 +
      // Now the number of ranks is known to all ranks, and each knows
      // the inputrec read by the master rank. The ranks can now all run
      // the task-deciding functions and will agree on the result
      // without needing to communicate.
 -    const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
 +    const bool useDomainDecomposition =
 +            (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == IntegrationAlgorithm::NM));
  
      // Note that these variables describe only their own node.
      //
          // assignment.
          auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
          useGpuForNonbonded         = decideWhetherToUseGpusForNonbonded(
 -                nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
 -                gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
 -        useGpuForPme = decideWhetherToUseGpusForPme(
 -                useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo_, *inputrec,
 -                cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected);
 -        useGpuForBonded = decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
 -                                                          bondedTarget, *inputrec, mtop,
 -                                                          domdecOptions.numPmeRanks, gpusWereDetected);
 +                nonbondedTarget,
 +                userGpuTaskAssignment,
 +                emulateGpuNonbonded,
 +                canUseGpuForNonbonded,
 +                gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI),
 +                gpusWereDetected);
 +        useGpuForPme    = decideWhetherToUseGpusForPme(useGpuForNonbonded,
 +                                                    pmeTarget,
 +                                                    userGpuTaskAssignment,
 +                                                    *hwinfo_,
 +                                                    *inputrec,
 +                                                    cr->sizeOfDefaultCommunicator,
 +                                                    domdecOptions.numPmeRanks,
 +                                                    gpusWereDetected);
 +        useGpuForBonded = decideWhetherToUseGpusForBonded(
 +                useGpuForNonbonded, useGpuForPme, bondedTarget, *inputrec, mtop, domdecOptions.numPmeRanks, gpusWereDetected);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
  
      const DevelopmentFeatureFlags devFlags =
              manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
  
 -    const bool useModularSimulator =
 -            checkUseModularSimulator(false, inputrec.get(), doRerun, mtop, ms, replExParams,
 -                                     nullptr, doEssentialDynamics, membedHolder.doMembed());
 +    const bool useModularSimulator = checkUseModularSimulator(false,
 +                                                              inputrec.get(),
 +                                                              doRerun,
 +                                                              mtop,
 +                                                              ms,
 +                                                              replExParams,
 +                                                              nullptr,
 +                                                              doEssentialDynamics,
 +                                                              membedHolder.doMembed());
  
      // Build restraints.
      // TODO: hide restraint implementation details from Mdrunner.
  
      // TODO: Error handling
      mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
 -    // now that the MdModules know their options, they know which callbacks to sign up to
 +    // now that the MDModules know their options, they know which callbacks to sign up to
      mdModules_->subscribeToSimulationSetupNotifications();
 -    const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
 +    const auto& setupNotifier = mdModules_->notifiers().simulationSetupNotifier_;
  
 +    // Notify MdModules of the existing logger
 +    setupNotifier.notify(mdlog);
 +
 +    // Notify MdModules of internal parameters, saved into KVT
      if (inputrec->internalParameters != nullptr)
      {
 -        mdModulesNotifier.notify(*inputrec->internalParameters);
 +        setupNotifier.notify(*inputrec->internalParameters);
 +    }
 +
 +    // Let MdModules know the .tpr filename
 +    {
 +        gmx::MdRunInputFilename mdRunInputFilename = { ftp2fn(efTPR, filenames.size(), filenames.data()) };
 +        setupNotifier.notify(mdRunInputFilename);
      }
  
      if (fplog != nullptr)
      if (SIMMASTER(cr))
      {
          /* In rerun, set velocities to zero if present */
 -        if (doRerun && ((globalState->flags & (1 << estV)) != 0))
 +        if (doRerun && ((globalState->flags & enumValueToBitMask(StateEntry::V)) != 0))
          {
              // rerun does not use velocities
              GMX_LOG(mdlog.info)
              {
                  clear_rvec(globalState->v[i]);
              }
 -            globalState->flags &= ~(1 << estV);
 +            globalState->flags &= ~enumValueToBitMask(StateEntry::V);
          }
  
          /* now make sure the state is initialized and propagated */
      /* NM and TPI parallelize over force/energy calculations, not atoms,
       * so we need to initialize and broadcast the global state.
       */
 -    if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
 +    if (inputrec->eI == IntegrationAlgorithm::NM || inputrec->eI == IntegrationAlgorithm::TPI)
      {
          if (!MASTER(cr))
          {
              globalState = std::make_unique<t_state>();
          }
 -        broadcastStateWithoutDynamics(cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr),
 -                                      globalState.get());
 +        broadcastStateWithoutDynamics(
 +                cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr), globalState.get());
      }
  
      /* A parallel command line option consistency check that we can
  #endif
      }
  
 -    if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || IntegrationAlgorithm::NM == inputrec->eI))
      {
          gmx_fatal(FARGS,
                    "The .mdp file specified an energy mininization or normal mode algorithm, and "
                    "these are not compatible with mdrun -rerun");
      }
  
 -    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
 -    {
 -        if (domdecOptions.numPmeRanks > 0)
 -        {
 -            gmx_fatal_collective(FARGS, cr->mpiDefaultCommunicator, MASTER(cr),
 -                                 "PME-only ranks are requested, but the system does not use PME "
 -                                 "for electrostatics or LJ");
 -        }
 -
 -        domdecOptions.numPmeRanks = 0;
 -    }
 -
 -    if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
 -    {
 -        /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
 -         * improve performance with many threads per GPU, since our OpenMP
 -         * scaling is bad, but it's difficult to automate the setup.
 -         */
 -        domdecOptions.numPmeRanks = 0;
 -    }
 -    if (useGpuForPme)
 -    {
 -        if (domdecOptions.numPmeRanks < 0)
 -        {
 -            domdecOptions.numPmeRanks = 0;
 -            // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
 -        }
 -        else
 -        {
 -            GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1,
 -                               "PME GPU decomposition is not supported");
 -        }
 -    }
 -
      /* NMR restraints must be initialized before load_checkpoint,
       * since with time averaging the history is added to t_state.
       * For proper consistency check we therefore need to extend
      /* This needs to be called before read_checkpoint to extend the state */
      t_disresdata* disresdata;
      snew(disresdata, 1);
 -    init_disres(fplog, &mtop, inputrec.get(), DisResRunMode::MDRun,
 +    init_disres(fplog,
 +                mtop,
 +                inputrec.get(),
 +                DisResRunMode::MDRun,
                  MASTER(cr) ? DDRole::Master : DDRole::Agent,
 -                PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mysim, ms, disresdata,
 -                globalState.get(), replExParams.exchangeInterval > 0);
 +                PAR(cr) ? NumRanks::Multiple : NumRanks::Single,
 +                cr->mpi_comm_mysim,
 +                ms,
 +                disresdata,
 +                globalState.get(),
 +                replExParams.exchangeInterval > 0);
  
 -    t_oriresdata* oriresdata;
 -    snew(oriresdata, 1);
 -    init_orires(fplog, &mtop, inputrec.get(), cr, ms, globalState.get(), oriresdata);
 +    std::unique_ptr<t_oriresdata> oriresData;
 +    if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
 +    {
 +        oriresData = std::make_unique<t_oriresdata>(fplog, mtop, *inputrec, cr, ms, globalState.get());
 +    }
  
 -    auto deform = prepareBoxDeformation(
 -            globalState != nullptr ? globalState->box : box, MASTER(cr) ? DDRole::Master : DDRole::Agent,
 -            PAR(cr) ? NumRanks::Multiple : NumRanks::Single, cr->mpi_comm_mygroup, *inputrec);
 +    auto deform = prepareBoxDeformation(globalState != nullptr ? globalState->box : box,
 +                                        MASTER(cr) ? DDRole::Master : DDRole::Agent,
 +                                        PAR(cr) ? NumRanks::Multiple : NumRanks::Single,
 +                                        cr->mpi_comm_mygroup,
 +                                        *inputrec);
  
  #if GMX_FAHCORE
      /* We have to remember the generation's first step before reading checkpoint.
  
          // Finish applying initial simulation state information from external sources on all ranks.
          // Reconcile checkpoint file data with Mdrunner state established up to this point.
 -        applyLocalState(*inputHolder_.get(), logFileHandle, cr, domdecOptions.numCells,
 -                        inputrec.get(), globalState.get(), &observablesHistory,
 -                        mdrunOptions.reproducible, mdModules_->notifier(),
 -                        modularSimulatorCheckpointData.get(), useModularSimulator);
 +        applyLocalState(*inputHolder_.get(),
 +                        logFileHandle,
 +                        cr,
 +                        domdecOptions.numCells,
 +                        inputrec.get(),
 +                        globalState.get(),
 +                        &observablesHistory,
 +                        mdrunOptions.reproducible,
 +                        mdModules_->notifiers(),
 +                        modularSimulatorCheckpointData.get(),
 +                        useModularSimulator);
          // TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
          //  invariants
          //  on all code paths.
          gmx_bcast(sizeof(box), box, cr->mpiDefaultCommunicator);
      }
  
 -    if (inputrec->cutoff_scheme != ecutsVERLET)
 +    if (inputrec->cutoff_scheme != CutoffScheme::Verlet)
      {
          gmx_fatal(FARGS,
                    "This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
       * increase rlist) tries to check if the newly chosen value fits with the DD scheme. As this is
       * run before any DD scheme is set up, this check is never executed. See #3334 for more details.
       */
 -    prepare_verlet_scheme(fplog, cr, inputrec.get(), nstlist_cmdline, &mtop, box,
 +    prepare_verlet_scheme(fplog,
 +                          cr,
 +                          inputrec.get(),
 +                          nstlist_cmdline,
 +                          mtop,
 +                          box,
                            useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
                            *hwinfo_->cpuInfo);
  
 +    // We need to decide on update groups early, as this affects
 +    // inter-domain communication distances.
 +    auto       updateGroupingsPerMoleculeType = makeUpdateGroupingsPerMoleculeType(mtop);
 +    const real maxUpdateGroupRadius           = computeMaxUpdateGroupRadius(
 +            mtop, updateGroupingsPerMoleculeType, maxReferenceTemperature(*inputrec));
 +    const real   cutoffMargin = std::sqrt(max_cutoff2(inputrec->pbcType, box)) - inputrec->rlist;
 +    UpdateGroups updateGroups = makeUpdateGroups(mdlog,
 +                                                 std::move(updateGroupingsPerMoleculeType),
 +                                                 maxUpdateGroupRadius,
 +                                                 useDomainDecomposition,
 +                                                 systemHasConstraintsOrVsites(mtop),
 +                                                 cutoffMargin);
 +
      // This builder is necessary while we have multi-part construction
      // of DD. Before DD is constructed, we use the existence of
      // the builder object to indicate that further construction of DD
      if (useDomainDecomposition)
      {
          ddBuilder = std::make_unique<DomainDecompositionBuilder>(
 -                mdlog, cr, domdecOptions, mdrunOptions, mtop, *inputrec, box,
 -                positionsFromStatePointer(globalState.get()));
 +                mdlog,
 +                cr,
 +                domdecOptions,
 +                mdrunOptions,
 +                mtop,
 +                *inputrec,
 +                mdModules_->notifiers(),
 +                box,
 +                updateGroups.updateGroupingPerMoleculeType(),
 +                updateGroups.useUpdateGroups(),
 +                updateGroups.maxUpdateGroupRadius(),
 +                positionsFromStatePointer(globalState.get()),
 +                useGpuForNonbonded,
 +                useGpuForPme);
      }
      else
      {
  
      // Produce the task assignment for this rank - done after DD is constructed
      GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
 -            gpuIdsToUse, userGpuTaskAssignment, *hwinfo_, simulationCommunicator, physicalNodeComm,
 -            nonbondedTarget, pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded,
 -            useGpuForPme, thisRankHasDuty(cr, DUTY_PP),
 +            availableDevices,
 +            userGpuTaskAssignment,
 +            *hwinfo_,
 +            simulationCommunicator,
 +            physicalNodeComm,
 +            nonbondedTarget,
 +            pmeTarget,
 +            bondedTarget,
 +            updateTarget,
 +            useGpuForNonbonded,
 +            useGpuForPme,
 +            thisRankHasDuty(cr, DUTY_PP),
              // TODO cr->duty & DUTY_PME should imply that a PME
              // algorithm is active, but currently does not.
              EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
          ddBuilder.reset(nullptr);
          // Note that local state still does not exist yet.
      }
+     // Ensure that all atoms within the same update group are in the
+     // same periodic image. Otherwise, a simulation that did not use
+     // update groups (e.g. a single-rank simulation) cannot always be
+     // correctly restarted in a way that does use update groups
+     // (e.g. a multi-rank simulation).
+     if (isSimulationMasterRank)
+     {
+         const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
+         if (useUpdateGroups)
+         {
+             putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
+         }
+     }
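
The comment above explains why update-group atoms must share a periodic image before domain decomposition starts. A minimal sketch of the idea for a single orthorhombic box dimension; this is an illustration only, not the GROMACS putUpdateGroupAtomsInSamePeriodicImage implementation.

#include <cmath>
#include <cstdio>
#include <vector>

// Shift every atom of one group by whole box lengths so that it lies within
// half a box length of the group's first atom, making the group contiguous.
void makeGroupWholeAlongDim(std::vector<double>* coords, double boxLength)
{
    const double anchor = (*coords)[0];
    for (double& x : *coords)
    {
        x -= boxLength * std::round((x - anchor) / boxLength);
    }
}

int main()
{
    std::vector<double> x = { 9.8, 0.1, 9.9 }; // group split across the boundary of a 10 nm box
    makeGroupWholeAlongDim(&x, 10.0);
    std::printf("%g %g %g\n", x[0], x[1], x[2]); // 9.8 10.1 9.9: all in the same image now
    return 0;
}
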
  
      // The GPU update is decided here because we need to know whether the constraints or
 -    // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
 +    // SETTLEs can span across the domain borders (i.e. whether or not update groups are
      // defined). This is only known after DD is initialized, hence decision on using GPU
      // update is done so late.
      try
      {
 -        const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
          const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
  
 -        useGpuForUpdate = decideWhetherToUseGpuForUpdate(
 -                useDomainDecomposition, useUpdateGroups, pmeRunMode, domdecOptions.numPmeRanks > 0,
 -                useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
 -                doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
 -                replExParams.exchangeInterval > 0, haveFrozenAtoms, doRerun, devFlags, mdlog);
 +        useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
 +                                                         updateGroups.useUpdateGroups(),
 +                                                         pmeRunMode,
 +                                                         domdecOptions.numPmeRanks > 0,
 +                                                         useGpuForNonbonded,
 +                                                         updateTarget,
 +                                                         gpusWereDetected,
 +                                                         *inputrec,
 +                                                         mtop,
 +                                                         doEssentialDynamics,
 +                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
 +                                                         haveFrozenAtoms,
 +                                                         doRerun,
 +                                                         devFlags,
 +                                                         mdlog);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
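The call above feeds the run setup into the GPU-update decision, which (as the preceding comment notes) can only be made once it is known whether constraints and SETTLEs stay within a domain. A simplified, hypothetical predicate of the same shape (gpuUpdateLooksFeasible and its exact conditions are illustrative only; the real decideWhetherToUseGpuForUpdate applies more checks and reports its reasoning via mdlog) might read:

// Illustrative sketch of a GPU-update eligibility check built from the flags
// passed above; not the actual GROMACS decision logic.
static bool gpuUpdateLooksFeasible(bool useDomainDecomposition,
                                   bool useUpdateGroups,
                                   bool useGpuForNonbonded,
                                   bool gpusWereDetected,
                                   bool haveFrozenAtoms,
                                   bool doRerun)
{
    // With domain decomposition, constraints/SETTLEs must not span domain
    // borders, so update groups have to be active for a GPU update.
    const bool constraintsLocalToDomains = !useDomainDecomposition || useUpdateGroups;
    return gpusWereDetected && useGpuForNonbonded && constraintsLocalToDomains
           && !haveFrozenAtoms && !doRerun;
}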
  
  
      MdrunScheduleWorkload runScheduleWork;
  
 -    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(
 -            devFlags, havePPDomainDecomposition(cr), useGpuForNonbonded, useModularSimulator,
 -            doRerun, EI_ENERGY_MINIMIZATION(inputrec->eI));
 +    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
 +                                                         havePPDomainDecomposition(cr),
 +                                                         useGpuForNonbonded,
 +                                                         useModularSimulator,
 +                                                         doRerun,
 +                                                         EI_ENERGY_MINIMIZATION(inputrec->eI));
  
      // Also populates the simulation constant workload description.
 -    runScheduleWork.simulationWork = createSimulationWorkload(
 -            *inputrec, disableNonbondedCalculation, devFlags, useGpuForNonbonded, pmeRunMode,
 -            useGpuForBonded, useGpuForUpdate, useGpuDirectHalo);
 +    runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
 +                                                              disableNonbondedCalculation,
 +                                                              devFlags,
 +                                                              havePPDomainDecomposition(cr),
 +                                                              useGpuForNonbonded,
 +                                                              pmeRunMode,
 +                                                              useGpuForBonded,
 +                                                              useGpuForUpdate,
 +                                                              useGpuDirectHalo);
  
      std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
  
      // where appropriate.
      if (!userGpuTaskAssignment.empty())
      {
 -        gpuTaskAssignments.logPerformanceHints(mdlog, numDevicesToUse);
 +        gpuTaskAssignments.logPerformanceHints(mdlog, numAvailableDevices);
      }
  
      if (PAR(cr))
                  .appendTextFormatted(
                          "This is simulation %d out of %d running as a composite GROMACS\n"
                          "multi-simulation job. Setup for this simulation:\n",
 -                        ms->simulationIndex_, ms->numSimulations_);
 +                        ms->simulationIndex_,
 +                        ms->numSimulations_);
      }
      GMX_LOG(mdlog.warning)
 -            .appendTextFormatted("Using %d MPI %s\n", cr->nnodes,
 +            .appendTextFormatted("Using %d MPI %s\n",
 +                                 cr->nnodes,
  #    if GMX_THREAD_MPI
                                   cr->nnodes == 1 ? "thread" : "threads"
  #    else
      // the OpenMP support.
      gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, FALSE);
      /* Check and update the number of OpenMP threads requested */
 -    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo_, cr, ms, physicalNodeComm.size_,
 -                                            pmeRunMode, mtop, *inputrec);
 -
 -    gmx_omp_nthreads_init(mdlog, cr, hwinfo_->nthreads_hw_avail, physicalNodeComm.size_,
 -                          hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
 -
 -    // Enable FP exception detection, but not in
 -    // Release mode and not for compilers with known buggy FP
 -    // exception support (clang with any optimization) or suspected
 -    // buggy FP exception support (gcc 7.* with optimization).
 -#if !defined NDEBUG                                                                         \
 -        && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
 -             && defined __OPTIMIZE__)
 -    const bool bEnableFPE = true;
 -#else
 -    const bool bEnableFPE = false;
 -#endif
 +    checkAndUpdateRequestedNumOpenmpThreads(
 +            &hw_opt, *hwinfo_, cr, ms, physicalNodeComm.size_, pmeRunMode, mtop, *inputrec);
 +
 +    gmx_omp_nthreads_init(mdlog,
 +                          cr,
 +                          hwinfo_->nthreads_hw_avail,
 +                          physicalNodeComm.size_,
 +                          hw_opt.nthreads_omp,
 +                          hw_opt.nthreads_omp_pme,
 +                          !thisRankHasDuty(cr, DUTY_PP));
 +
 +    const bool bEnableFPE = gmxShouldEnableFPExceptions();
      // FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
      if (bEnableFPE)
      {
      }
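The compiler-specific preprocessor block removed above is now hidden behind gmxShouldEnableFPExceptions(). A minimal sketch of such a helper, reusing the exact conditions deleted here (gmxShouldEnableFPExceptionsSketch is a hypothetical name; the actual GROMACS helper may be structured differently), could be:

// Sketch only: FP exception detection is enabled except in Release builds and
// for compilers with known buggy FP-exception support (optimized clang) or
// suspected buggy support (optimized gcc 7.x), mirroring the removed block.
static inline bool gmxShouldEnableFPExceptionsSketch()
{
#if !defined NDEBUG                                                                         \
        && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
             && defined __OPTIMIZE__)
    return true;
#else
    return false;
#endif
}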
  
      /* Now that we know the setup is consistent, check for efficiency */
 -    check_resource_division_efficiency(hwinfo_, gpuTaskAssignments.thisRankHasAnyGpuTask(),
 -                                       mdrunOptions.ntompOptionIsSet, cr, mdlog);
 +    check_resource_division_efficiency(
 +            hwinfo_, gpuTaskAssignments.thisRankHasAnyGpuTask(), mdrunOptions.ntompOptionIsSet, cr, mdlog);
  
      /* getting number of PP/PME threads on this MPI / tMPI rank.
         PME: env variable should be read only on one node to make sure it is
         identical everywhere;
       */
 -    const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
 -                                                                  : gmx_omp_nthreads_get(emntPME);
 -    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology,
 -                                  physicalNodeComm, mdlog);
 +    const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP)
 +                                             ? gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded)
 +                                             : gmx_omp_nthreads_get(ModuleMultiThread::Pme);
 +    checkHardwareOversubscription(
 +            numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology, physicalNodeComm, mdlog);
  
      // Enable Peer access between GPUs where available
      // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
          && (runScheduleWork.simulationWork.useGpuHaloExchange
              || runScheduleWork.simulationWork.useGpuPmePpCommunication))
      {
 -        setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
 +        setupGpuDevicePeerAccess(gpuTaskAssignments.deviceIdsAssigned(), mdlog);
      }
  
      if (hw_opt.threadAffinity != ThreadAffinity::Off)
          gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, TRUE);
  
          int numThreadsOnThisNode, intraNodeThreadOffset;
 -        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
 -                                 &intraNodeThreadOffset);
 +        analyzeThreadsOnThisNode(
 +                physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode, &intraNodeThreadOffset);
  
          /* Set the CPU affinity */
 -        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo_->hardwareTopology, numThreadsOnThisRank,
 -                                numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
 +        gmx_set_thread_affinity(mdlog,
 +                                cr,
 +                                &hw_opt,
 +                                *hwinfo_->hardwareTopology,
 +                                numThreadsOnThisRank,
 +                                numThreadsOnThisNode,
 +                                intraNodeThreadOffset,
 +                                nullptr);
      }
  
      if (mdrunOptions.timingOptions.resetStep > -1)
                          "The -resetstep functionality is deprecated, and may be removed in a "
                          "future version.");
      }
 -    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
 +    std::unique_ptr<gmx_wallcycle> wcycle =
 +            wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
  
      if (PAR(cr))
      {
          /* Master synchronizes its value of reset_counters with all nodes
           * including PME only nodes */
 -        int64_t reset_counters = wcycle_get_reset_counters(wcycle);
 +        int64_t reset_counters = wcycle_get_reset_counters(wcycle.get());
          gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
 -        wcycle_set_reset_counters(wcycle, reset_counters);
 +        wcycle_set_reset_counters(wcycle.get(), reset_counters);
      }
  
      // Membrane embedding must be initialized before we call init_forcerec()
 -    membedHolder.initializeMembed(fplog, filenames.size(), filenames.data(), &mtop, inputrec.get(),
 -                                  globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
 +    membedHolder.initializeMembed(fplog,
 +                                  filenames.size(),
 +                                  filenames.data(),
 +                                  &mtop,
 +                                  inputrec.get(),
 +                                  globalState.get(),
 +                                  cr,
 +                                  &mdrunOptions.checkpointOptions.period);
  
      const bool               thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
      std::unique_ptr<MDAtoms> mdAtoms;
      std::unique_ptr<VirtualSitesHandler> vsite;
 -    std::unique_ptr<GpuBonded>           gpuBonded;
  
      t_nrnb nrnb;
      if (thisRankHasDuty(cr, DUTY_PP))
      {
 -        mdModulesNotifier.notify(*cr);
 -        mdModulesNotifier.notify(&atomSets);
 -        mdModulesNotifier.notify(inputrec->pbcType);
 -        mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
 +        setupNotifier.notify(*cr);
 +        setupNotifier.notify(&atomSets);
 +        setupNotifier.notify(mtop);
 +        setupNotifier.notify(inputrec->pbcType);
 +        setupNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
          /* Initiate forcerecord */
 -        fr                 = new t_forcerec;
 +        fr                 = std::make_unique<t_forcerec>();
          fr->forceProviders = mdModules_->initForceProviders();
 -        init_forcerec(fplog, mdlog, fr, inputrec.get(), &mtop, cr, box,
 +        init_forcerec(fplog,
 +                      mdlog,
 +                      runScheduleWork.simulationWork,
 +                      fr.get(),
 +                      *inputrec,
 +                      mtop,
 +                      cr,
 +                      box,
                        opt2fn("-table", filenames.size(), filenames.data()),
                        opt2fn("-tablep", filenames.size(), filenames.data()),
 -                      opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
 +                      opt2fns("-tableb", filenames.size(), filenames.data()),
 +                      pforce);
          // Dirty hack; to fix this properly, disres and orires should be made MDModules
          fr->fcdata->disres = disresdata;
 -        fr->fcdata->orires = oriresdata;
 +        fr->fcdata->orires.swap(oriresData);
  
          // Save a handle to device stream manager to use elsewhere in the code
          // TODO: Forcerec is not a correct place to store it.
                      "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
                      "communications.");
              fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
 -                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
 +                    cr->mpi_comm_mysim,
 +                    cr->dd->pme_nodeid,
 +                    &cr->dd->pmeForceReceiveBuffer,
 +                    deviceStreamManager->context(),
                      deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
          }
  
 -        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec.get(), fr, cr, *hwinfo_,
 +        fr->nbv = Nbnxm::init_nb_verlet(mdlog,
 +                                        *inputrec,
 +                                        *fr,
 +                                        cr,
 +                                        *hwinfo_,
                                          runScheduleWork.simulationWork.useGpuNonbonded,
 -                                        deviceStreamManager.get(), &mtop, box, wcycle);
 +                                        deviceStreamManager.get(),
 +                                        mtop,
 +                                        box,
 +                                        wcycle.get());
          // TODO: Move the logic below to a GPU bonded builder
          if (runScheduleWork.simulationWork.useGpuBonded)
          {
              GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
                                 "GPU device stream manager should be valid in order to use GPU "
                                 "version of bonded forces.");
 -            gpuBonded = std::make_unique<GpuBonded>(
 -                    mtop.ffparams, fr->ic->epsfac * fr->fudgeQQ, deviceStreamManager->context(),
 -                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
 -            fr->gpuBonded = gpuBonded.get();
 +            fr->listedForcesGpu = std::make_unique<ListedForcesGpu>(
 +                    mtop.ffparams,
 +                    fr->ic->epsfac * fr->fudgeQQ,
 +                    deviceStreamManager->context(),
 +                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
 +                    wcycle.get());
          }
  
          /* Initialize the mdAtoms structure.
          }
  
          /* Initialize the virtual site communication */
 -        vsite = makeVirtualSitesHandler(mtop, cr, fr->pbcType);
 +        vsite = makeVirtualSitesHandler(
 +                mtop, cr, fr->pbcType, updateGroups.updateGroupingPerMoleculeType());
  
          calc_shifts(box, fr->shift_vec);
  
                  constructVirtualSitesGlobal(mtop, globalState->x);
              }
          }
 +        // Make the DD reverse topology, now that any vsites present are available
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_make_reverse_top(fplog, cr->dd, mtop, vsite.get(), *inputrec, domdecOptions.ddBondedChecking);
 +        }
  
          if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
          {
                  const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
                                                               ? &deviceStreamManager->context()
                                                               : nullptr;
 -                const DeviceStream* pmeStream =
 +                const DeviceStream*  pmeStream =
                          runScheduleWork.simulationWork.useGpuPme
 -                                ? &deviceStreamManager->stream(DeviceStreamType::Pme)
 -                                : nullptr;
 -
 -                pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec.get(),
 -                                       nChargePerturbed != 0, nTypePerturbed != 0,
 -                                       mdrunOptions.reproducible, ewaldcoeff_q, ewaldcoeff_lj,
 -                                       gmx_omp_nthreads_get(emntPME), pmeRunMode, nullptr,
 -                                       deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
 +                                 ? &deviceStreamManager->stream(DeviceStreamType::Pme)
 +                                 : nullptr;
 +
 +                pmedata = gmx_pme_init(cr,
 +                                       getNumPmeDomains(cr->dd),
 +                                       inputrec.get(),
 +                                       nChargePerturbed != 0,
 +                                       nTypePerturbed != 0,
 +                                       mdrunOptions.reproducible,
 +                                       ewaldcoeff_q,
 +                                       ewaldcoeff_lj,
 +                                       gmx_omp_nthreads_get(ModuleMultiThread::Pme),
 +                                       pmeRunMode,
 +                                       nullptr,
 +                                       deviceContext,
 +                                       pmeStream,
 +                                       pmeGpuProgram.get(),
 +                                       mdlog);
              }
              GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
          }
      if (thisRankHasDuty(cr, DUTY_PP))
      {
          /* Assumes uniform use of the number of OpenMP threads */
 -        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
 +        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Default));
  
          if (inputrec->bPull)
          {
              /* Initialize pull code */
 -            pull_work = init_pull(fplog, inputrec->pull.get(), inputrec.get(), &mtop, cr, &atomSets,
 +            pull_work = init_pull(fplog,
 +                                  inputrec->pull.get(),
 +                                  inputrec.get(),
 +                                  mtop,
 +                                  cr,
 +                                  &atomSets,
                                    inputrec->fepvals->init_lambda);
              if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
              {
          if (inputrec->bRot)
          {
              /* Initialize enforced rotation code */
 -            enforcedRotation = init_rot(fplog, inputrec.get(), filenames.size(), filenames.data(),
 -                                        cr, &atomSets, globalState.get(), &mtop, oenv, mdrunOptions,
 +            enforcedRotation = init_rot(fplog,
 +                                        inputrec.get(),
 +                                        filenames.size(),
 +                                        filenames.data(),
 +                                        cr,
 +                                        &atomSets,
 +                                        globalState.get(),
 +                                        mtop,
 +                                        oenv,
 +                                        mdrunOptions,
                                          startingBehavior);
          }
  
          t_swap* swap = nullptr;
 -        if (inputrec->eSwapCoords != eswapNO)
 +        if (inputrec->eSwapCoords != SwapType::No)
          {
              /* Initialize ion swapping code */
 -            swap = init_swapcoords(fplog, inputrec.get(),
 +            swap = init_swapcoords(fplog,
 +                                   inputrec.get(),
                                     opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
 -                                   &mtop, globalState.get(), &observablesHistory, cr, &atomSets,
 -                                   oenv, mdrunOptions, startingBehavior);
 +                                   mtop,
 +                                   globalState.get(),
 +                                   &observablesHistory,
 +                                   cr,
 +                                   &atomSets,
 +                                   oenv,
 +                                   mdrunOptions,
 +                                   startingBehavior);
          }
  
          /* Let makeConstraints know whether we have essential dynamics constraints. */
 -        auto constr = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr,
 -                                      ms, &nrnb, wcycle, fr->bMolPBC);
 +        auto constr = makeConstraints(mtop,
 +                                      *inputrec,
 +                                      pull_work,
 +                                      doEssentialDynamics,
 +                                      fplog,
 +                                      cr,
 +                                      updateGroups.useUpdateGroups(),
 +                                      ms,
 +                                      &nrnb,
 +                                      wcycle.get(),
 +                                      fr->bMolPBC);
  
          /* Energy terms and groups */
          gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
  
          // cos acceleration is only supported by md, but older tpr
          // files might still combine it with other integrators
 -        GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
 +        GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == IntegrationAlgorithm::MD,
                             "cos_acceleration is only supported by integrator=md");
  
          /* Kinetic energy data */
 -        gmx_ekindata_t ekind;
 -        init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
 +        gmx_ekindata_t ekind(inputrec->opts.ngtc,
 +                             inputrec->cos_accel,
 +                             gmx_omp_nthreads_get(ModuleMultiThread::Update));
  
          /* Set up interactive MD (IMD) */
 -        auto imdSession =
 -                makeImdSession(inputrec.get(), cr, wcycle, &enerd, ms, &mtop, mdlog,
 -                               MASTER(cr) ? globalState->x.rvec_array() : nullptr, filenames.size(),
 -                               filenames.data(), oenv, mdrunOptions.imdOptions, startingBehavior);
 +        auto imdSession = makeImdSession(inputrec.get(),
 +                                         cr,
 +                                         wcycle.get(),
 +                                         &enerd,
 +                                         ms,
 +                                         mtop,
 +                                         mdlog,
 +                                         MASTER(cr) ? globalState->x : gmx::ArrayRef<gmx::RVec>(),
 +                                         filenames.size(),
 +                                         filenames.data(),
 +                                         oenv,
 +                                         mdrunOptions.imdOptions,
 +                                         startingBehavior);
  
          if (DOMAINDECOMP(cr))
          {
              GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
 -            /* This call is not included in init_domain_decomposition mainly
 -             * because fr->cginfo_mb is set later.
 +            /* This call is not included in init_domain_decomposition
 +             * because fr->atomInfoForEachMoleculeBlock is set later.
               */
 -            dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec.get(),
 -                            domdecOptions.checkBondedInteractions, fr->cginfo_mb);
 +            makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock);
          }
  
          if (runScheduleWork.simulationWork.useGpuBufferOps)
          {
              fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
                      deviceStreamManager->context(),
 -                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
 +                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal),
 +                    wcycle.get());
              fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
                      deviceStreamManager->context(),
 -                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
 +                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal),
 +                    wcycle.get());
          }
  
          std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
              && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
                  || runScheduleWork.simulationWork.useGpuBufferOps))
          {
 -            GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
 -                                                      ? GpuApiCallBehavior::Async
 -                                                      : GpuApiCallBehavior::Sync;
 +            GpuApiCallBehavior transferKind =
 +                    (inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator)
 +                            ? GpuApiCallBehavior::Async
 +                            : GpuApiCallBehavior::Sync;
              GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
                                 "GPU device stream manager should be initialized to use GPU.");
              stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
 -                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
 +                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get());
              fr->stateGpu = stateGpu.get();
          }
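The Async/Sync choice above controls how host-device transfers are issued for the GPU state propagator. A hypothetical wrapper (gpuTransferSketch is not part of the StatePropagatorDataGpu API) showing what that distinction means at the CUDA runtime level, assuming a CUDA build:

// Illustrative sketch: issue a host-to-device copy either asynchronously on a
// stream or synchronously, depending on the behavior selected above.
#include <cuda_runtime.h>

void gpuTransferSketch(void* devPtr, const void* hostPtr, size_t bytes, bool asyncBehavior, cudaStream_t stream)
{
    if (asyncBehavior)
    {
        // Asynchronous copy: returns immediately, ordered on the given stream.
        cudaMemcpyAsync(devPtr, hostPtr, bytes, cudaMemcpyHostToDevice, stream);
    }
    else
    {
        // Synchronous copy: blocks the host until the transfer completes.
        cudaMemcpy(devPtr, hostPtr, bytes, cudaMemcpyHostToDevice);
    }
}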
  
  
  
          simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
 -        simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
 +        simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
          simulatorBuilder.add(ConstraintsParam(
 -                constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
 -                vsite.get()));
 +                constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
          // TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
 -        simulatorBuilder.add(LegacyInput(static_cast<int>(filenames.size()), filenames.data(),
 -                                         inputrec.get(), fr));
 +        simulatorBuilder.add(LegacyInput(
 +                static_cast<int>(filenames.size()), filenames.data(), inputrec.get(), fr.get()));
          simulatorBuilder.add(ReplicaExchangeParameters(replExParams));
          simulatorBuilder.add(InteractiveMD(imdSession.get()));
 -        simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifier()));
 +        simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifiers()));
          simulatorBuilder.add(CenterOfMassPulling(pull_work));
          // Todo move to an MDModule
          simulatorBuilder.add(IonSwapping(swap));
 -        simulatorBuilder.add(TopologyData(&mtop, mdAtoms.get()));
 +        simulatorBuilder.add(TopologyData(mtop, mdAtoms.get()));
          simulatorBuilder.add(BoxDeformationHandle(deform.get()));
          simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
  
      {
          GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
          /* do PME only */
 -        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
 -        gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec.get(), pmeRunMode,
 +        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Pme));
 +        gmx_pmeonly(pmedata,
 +                    cr,
 +                    &nrnb,
 +                    wcycle.get(),
 +                    walltime_accounting,
 +                    inputrec.get(),
 +                    pmeRunMode,
 +                    runScheduleWork.simulationWork.useGpuPmePpCommunication,
                      deviceStreamManager.get());
      }
  
 -    wallcycle_stop(wcycle, ewcRUN);
 +    wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
  
      /* Finish up, write some stuff
       * if rerunMD, don't write last frame again
       */
 -    finish_run(fplog, mdlog, cr, inputrec.get(), &nrnb, wcycle, walltime_accounting,
 -               fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
 +    finish_run(fplog,
 +               mdlog,
 +               cr,
 +               *inputrec,
 +               &nrnb,
 +               wcycle.get(),
 +               walltime_accounting,
 +               fr ? fr->nbv.get() : nullptr,
 +               pmedata,
 +               EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
  
 -    // clean up cycle counter
 -    wallcycle_destroy(wcycle);
  
      deviceStreamManager.reset(nullptr);
      // Free PME data
      mdAtoms.reset(nullptr);
      globalState.reset(nullptr);
      mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
 -    gpuBonded.reset(nullptr);
 -    /* Free pinned buffers in *fr */
 -    delete fr;
 -    fr = nullptr;
 +    fr.reset(nullptr);         // destruct forcerec before GPU resources are released
      // TODO convert to C++ so we can get rid of these frees
      sfree(disresdata);
 -    sfree(oriresdata);
  
      if (!hwinfo_->deviceInfoList.empty())
      {
      {
          physicalNodeComm.barrier();
      }
 -    releaseDevice(deviceInfo);
 +
 +    if (!devFlags.usingCudaAwareMpi)
 +    {
 +        // Don't reset the GPU when using CUDA-aware MPI:
 +        // UCX creates CUDA buffers that are cleaned up as part of MPI_Finalize(),
 +        // and resetting the device before MPI_Finalize() results in crashes inside UCX.
 +        releaseDevice(deviceInfo);
 +    }
  
      /* Does what it says */
      print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());