From 07369a19f453f2bfa5e96fcc833f64e7d435913d Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Wed, 30 Sep 2020 15:13:58 +0000
Subject: [PATCH] Move computeSlowForces into stepWork

Also moved a flag into forcerec and fixed documentation.
---
 docs/user-guide/mdp-options.rst               |  42 ++-
 src/gromacs/fileio/tpxio.cpp                  |  25 ++
 src/gromacs/gmxpreprocess/grompp.cpp          |   2 +
 src/gromacs/gmxpreprocess/readir.cpp          | 131 +++++++++
 src/gromacs/gmxpreprocess/readir.h            |   2 +
 ...arametersWithValuesIncludingAssignment.xml |   2 +
 .../GetIrTest_AcceptsElectricField.xml        |   2 +
 ...IrTest_AcceptsElectricFieldOscillating.xml |   2 +
 .../GetIrTest_AcceptsElectricFieldPulsed.xml  |   2 +
 .../refdata/GetIrTest_AcceptsEmptyLines.xml   |   2 +
 .../GetIrTest_AcceptsImplicitSolventNo.xml    |   2 +
 .../GetIrTest_AcceptsKeyWithoutValue.xml      |   2 +
 .../tests/refdata/GetIrTest_AcceptsMimic.xml  |   2 +
 ...IrTest_HandlesDifferentKindsOfMdpLines.xml |   2 +
 src/gromacs/listed_forces/gpubonded_impl.cpp  |   4 +
 src/gromacs/mdlib/constr.cpp                  |   2 +
 src/gromacs/mdlib/force.cpp                   |   3 +-
 src/gromacs/mdlib/force.h                     |  13 +
 src/gromacs/mdlib/force_flags.h               |   4 +-
 src/gromacs/mdlib/forcerec.cpp                |  73 ++++-
 src/gromacs/mdlib/sim_util.cpp                | 278 ++++++++++++------
 src/gromacs/mdlib/update.cpp                  | 116 +++++++-
 src/gromacs/mdlib/update.h                    |  10 +
 src/gromacs/mdrun/md.cpp                      |  37 ++-
 src/gromacs/mdrun/shellfc.cpp                 |   4 +-
 src/gromacs/mdtypes/CMakeLists.txt            |   1 +
 src/gromacs/mdtypes/forcebuffers.cpp          |  21 +-
 src/gromacs/mdtypes/forcebuffers.h            |  53 +++-
 src/gromacs/mdtypes/forcerec.h                |   9 +-
 src/gromacs/mdtypes/inputrec.cpp              |  72 ++++-
 src/gromacs/mdtypes/inputrec.h                |   6 +
 src/gromacs/mdtypes/multipletimestepping.cpp  |  97 ++++++
 src/gromacs/mdtypes/multipletimestepping.h    |  82 ++++++
 src/gromacs/mdtypes/simulation_workload.h     |   9 +-
 src/gromacs/mdtypes/tests/forcebuffers.cpp    |   4 +-
 src/gromacs/nbnxm/pairlist_tuning.cpp         |  46 ++-
 src/gromacs/nbnxm/pairlistparams.cpp          |   3 +-
 src/gromacs/nbnxm/pairlistparams.h            |   2 +
 src/gromacs/nbnxm/pairlistsets.h              |   3 +-
 src/gromacs/nbnxm/prunekerneldispatch.cpp     |   3 +-
 src/gromacs/pulling/pull.cpp                  |   4 +
 src/gromacs/taskassignment/decidegpuusage.cpp |   5 +
 .../decidesimulationworkload.cpp              |  21 +-
 src/programs/mdrun/tests/CMakeLists.txt       |   1 +
 .../mdrun/tests/multiple_time_stepping.cpp    | 194 ++++++++++++
 .../mdrun/tests/simulatorcomparison.cpp       |   5 +-
 .../mdrun/tests/simulatorcomparison.h         |   4 +-
 47 files changed, 1242 insertions(+), 167 deletions(-)
 create mode 100644 src/gromacs/mdtypes/multipletimestepping.cpp
 create mode 100644 src/gromacs/mdtypes/multipletimestepping.h
 create mode 100644 src/programs/mdrun/tests/multiple_time_stepping.cpp

diff --git a/docs/user-guide/mdp-options.rst b/docs/user-guide/mdp-options.rst
index 0fea1a8b57..d6f9d3c63f 100644
--- a/docs/user-guide/mdp-options.rst
+++ b/docs/user-guide/mdp-options.rst
@@ -231,6 +231,45 @@ Run control
          same simulation. This option is generally useful to set only
          when coping with a crashed simulation where files were lost.
 
+.. mdp:: mts
+
+   .. mdp-value:: no
+
+      Evaluate all forces at every integration step.
+
+   .. mdp-value:: yes
+
+      Use a multiple timing-stepping integrator to evaluate some forces, as specified
+      by :mdp:`mts-level2-forces` every :mdp:`mts-level2-factor` integration
+      steps. All other forces are evaluated at every step.
+
+.. mdp:: mts-levels
+
+        (2)
+	The number of levels for the multiple time-stepping scheme.
+	Currently only 2 is supported.
+
+.. mdp:: mts-level2-forces
+
+   (longrange-nonbonded nonbonded pair dihedral)
+   A list of force groups that will be evaluated only every
+   :mdp:`mts-level2-factor` steps. Supported entries are:
+   ``longrange-nonbonded``, ``nonbonded``, ``pair``, ``dihedral`` and
+   ``angle``. With ``pair`` the listed pair forces (such as 1-4) are
+   selected. With ``dihedral`` all dihedrals are selected, including cmap.
+   All other forces, including all restraints, are evaluated and
+   integrated every step. When PME or Ewald is used for electrostatics
+   and/or LJ interactions, ``longrange-nonbonded`` has to be entered here.
+   The default value should work well for most standard atomistic simulations
+   and in particular for replacing virtual site treatment for increasing
+   the time step.
+
+.. mdp:: mts-level2-factor
+
+      (2) [steps]
+      Interval for computing the forces in level 2 of the multiple time-stepping
+      scheme
+
 .. mdp:: comm-mode
 
    .. mdp-value:: Linear
@@ -1649,7 +1688,8 @@ pull-coord2-vec, pull-coord2-k, and so on.
       Center of mass pulling using a constraint between the reference
       group and one or more groups. The setup is identical to the
       option umbrella, except for the fact that a rigid constraint is
-      applied instead of a harmonic potential.
+      applied instead of a harmonic potential. Note that this type is
+      not supported in combination with multiple time stepping.
 
    .. mdp-value:: constant-force
 
diff --git a/src/gromacs/fileio/tpxio.cpp b/src/gromacs/fileio/tpxio.cpp
index c762178c8b..9ebd98f4b3 100644
--- a/src/gromacs/fileio/tpxio.cpp
+++ b/src/gromacs/fileio/tpxio.cpp
@@ -58,6 +58,7 @@
 #include "gromacs/mdtypes/awh_params.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/mdtypes/pull_params.h"
 #include "gromacs/mdtypes/state.h"
 #include "gromacs/pbcutil/boxutilities.h"
@@ -132,6 +133,7 @@ enum tpxv
     tpxv_AddSizeField, /**< Added field with information about the size of the serialized tpr file in bytes, excluding the header */
     tpxv_StoreNonBondedInteractionExclusionGroup, /**< Store the non bonded interaction exclusion group in the topology */
     tpxv_VSite1,                                  /**< Added 1 type virtual site */
+    tpxv_MTS,                                     /**< Added multiple time stepping */
     tpxv_Count                                    /**< the total number of tpxv versions */
 };
 
@@ -1082,6 +1084,29 @@ static void do_inputrec(gmx::ISerializer* serializer, t_inputrec* ir, int file_v
 
     serializer->doInt(&ir->simulation_part);
 
+    if (file_version >= tpxv_MTS)
+    {
+        serializer->doBool(&ir->useMts);
+        int numLevels = ir->mtsLevels.size();
+        if (ir->useMts)
+        {
+            serializer->doInt(&numLevels);
+        }
+        ir->mtsLevels.resize(numLevels);
+        for (auto& mtsLevel : ir->mtsLevels)
+        {
+            int forceGroups = mtsLevel.forceGroups.to_ulong();
+            serializer->doInt(&forceGroups);
+            mtsLevel.forceGroups = std::bitset<static_cast<int>(gmx::MtsForceGroups::Count)>(forceGroups);
+            serializer->doInt(&mtsLevel.stepFactor);
+        }
+    }
+    else
+    {
+        ir->useMts = false;
+        ir->mtsLevels.clear();
+    }
+
     if (file_version >= 67)
     {
         serializer->doInt(&ir->nstcalcenergy);
diff --git a/src/gromacs/gmxpreprocess/grompp.cpp b/src/gromacs/gmxpreprocess/grompp.cpp
index ae10d65d75..6fb2264e27 100644
--- a/src/gromacs/gmxpreprocess/grompp.cpp
+++ b/src/gromacs/gmxpreprocess/grompp.cpp
@@ -1835,6 +1835,7 @@ int gmx_grompp(int argc, char* argv[])
     snew(opts, 1);
     snew(opts->include, STRLEN);
     snew(opts->define, STRLEN);
+    snew(opts->mtsLevel2Forces, STRLEN);
 
     gmx::LoggerBuilder builder;
     builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
@@ -2389,6 +2390,7 @@ int gmx_grompp(int argc, char* argv[])
     sfree(opts->wall_atomtype[0]);
     sfree(opts->wall_atomtype[1]);
     sfree(opts->include);
+    sfree(opts->mtsLevel2Forces);
     sfree(opts);
     for (auto& mol : mi)
     {
diff --git a/src/gromacs/gmxpreprocess/readir.cpp b/src/gromacs/gmxpreprocess/readir.cpp
index 6aa80244ba..835eb4c4c6 100644
--- a/src/gromacs/gmxpreprocess/readir.cpp
+++ b/src/gromacs/gmxpreprocess/readir.cpp
@@ -59,6 +59,7 @@
 #include "gromacs/mdrun/mdmodules.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/mdtypes/pull_params.h"
 #include "gromacs/options/options.h"
 #include "gromacs/options/treesupport.h"
@@ -232,6 +233,89 @@ static void process_interaction_modifier(int* eintmod)
     }
 }
 
+static void checkMtsRequirement(const t_inputrec& ir, const char* param, const int nstValue, warninp_t wi)
+{
+    GMX_RELEASE_ASSERT(ir.mtsLevels.size() >= 2, "Need at least two levels for MTS");
+    const int mtsFactor = ir.mtsLevels.back().stepFactor;
+    if (nstValue % mtsFactor != 0)
+    {
+        auto message = gmx::formatString(
+                "With MTS, %s = %d should be a multiple of mts-factor = %d", param, nstValue, mtsFactor);
+        warning_error(wi, message.c_str());
+    }
+}
+
+static void setupMtsLevels(gmx::ArrayRef<gmx::MtsLevel> mtsLevels,
+                           const t_inputrec&            ir,
+                           const t_gromppopts&          opts,
+                           warninp_t                    wi)
+{
+    if (!(ir.eI == eiMD || ir.eI == eiSD1))
+    {
+        auto message = gmx::formatString(
+                "Multiple time stepping is only supported with integrators %s and %s",
+                ei_names[eiMD], ei_names[eiSD1]);
+        warning_error(wi, message.c_str());
+    }
+    if (opts.numMtsLevels != 2)
+    {
+        warning_error(wi, "Only mts-levels = 2 is supported");
+    }
+    else
+    {
+        const std::vector<std::string> inputForceGroups = gmx::splitString(opts.mtsLevel2Forces);
+        auto&                          forceGroups      = mtsLevels[1].forceGroups;
+        for (const auto& inputForceGroup : inputForceGroups)
+        {
+            bool found     = false;
+            int  nameIndex = 0;
+            for (const auto& forceGroupName : gmx::mtsForceGroupNames)
+            {
+                if (gmx::equalCaseInsensitive(inputForceGroup, forceGroupName))
+                {
+                    forceGroups.set(nameIndex);
+                    found = true;
+                }
+                nameIndex++;
+            }
+            if (!found)
+            {
+                auto message =
+                        gmx::formatString("Unknown MTS force group '%s'", inputForceGroup.c_str());
+                warning_error(wi, message.c_str());
+            }
+        }
+
+        if (mtsLevels[1].stepFactor <= 1)
+        {
+            gmx_fatal(FARGS, "mts-factor should be larger than 1");
+        }
+
+        // Make the level 0 use the complement of the force groups of group 1
+        mtsLevels[0].forceGroups = ~mtsLevels[1].forceGroups;
+        mtsLevels[0].stepFactor  = 1;
+
+        if ((EEL_FULL(ir.coulombtype) || EVDW_PME(ir.vdwtype))
+            && !mtsLevels[1].forceGroups[static_cast<int>(gmx::MtsForceGroups::LongrangeNonbonded)])
+        {
+            warning_error(wi,
+                          "With long-range electrostatics and/or LJ treatment, the long-range part "
+                          "has to be part of the mts-level2-forces");
+        }
+
+        if (ir.nstcalcenergy > 0)
+        {
+            checkMtsRequirement(ir, "nstcalcenergy", ir.nstcalcenergy, wi);
+        }
+        checkMtsRequirement(ir, "nstenergy", ir.nstenergy, wi);
+        checkMtsRequirement(ir, "nstlog", ir.nstlog, wi);
+        if (ir.efep != efepNO)
+        {
+            checkMtsRequirement(ir, "nstdhdl", ir.fepvals->nstdhdl, wi);
+        }
+    }
+}
+
 void check_ir(const char*                   mdparin,
               const gmx::MdModulesNotifier& mdModulesNotifier,
               t_inputrec*                   ir,
@@ -537,6 +621,12 @@ void check_ir(const char*                   mdparin,
             {
                 ir->nstpcouple = ir_optimal_nstpcouple(ir);
             }
+            if (ir->useMts && ir->nstpcouple % ir->mtsLevels.back().stepFactor != 0)
+            {
+                warning_error(wi,
+                              "With multiple time stepping, nstpcouple should be a mutiple of "
+                              "mts-factor");
+            }
         }
 
         if (ir->nstcalcenergy > 0)
@@ -1883,6 +1973,24 @@ void get_ir(const char*     mdparin,
     printStringNoNewline(
             &inp, "Part index is updated automatically on checkpointing (keeps files separate)");
     ir->simulation_part = get_eint(&inp, "simulation-part", 1, wi);
+    printStringNoNewline(&inp, "Multiple time-stepping");
+    ir->useMts = (get_eeenum(&inp, "mts", yesno_names, wi) != 0);
+    if (ir->useMts)
+    {
+        opts->numMtsLevels = get_eint(&inp, "mts-levels", 2, wi);
+        ir->mtsLevels.resize(2);
+        gmx::MtsLevel& mtsLevel = ir->mtsLevels[1];
+        setStringEntry(&inp, "mts-level2-forces", opts->mtsLevel2Forces,
+                       "longrange-nonbonded nonbonded pair dihedral");
+        mtsLevel.stepFactor = get_eint(&inp, "mts-level2-factor", 2, wi);
+
+        // We clear after reading without dynamics to not force the user to remove MTS mdp options
+        if (!EI_DYNAMICS(ir->eI))
+        {
+            ir->useMts = false;
+            ir->mtsLevels.clear();
+        }
+    }
     printStringNoNewline(&inp, "mode for center of mass motion removal");
     ir->comm_mode = get_eeenum(&inp, "comm-mode", ecm_names, wi);
     printStringNoNewline(&inp, "number of steps for center of mass motion removal");
@@ -2113,6 +2221,20 @@ void get_ir(const char*     mdparin,
     {
         snew(ir->pull, 1);
         inputrecStrings->pullGroupNames = read_pullparams(&inp, ir->pull, wi);
+
+        if (ir->useMts)
+        {
+            for (int c = 0; c < ir->pull->ncoord; c++)
+            {
+                if (ir->pull->coord[c].eType == epullCONSTRAINT)
+                {
+                    warning_error(wi,
+                                  "Constraint COM pulling is not supported in combination with "
+                                  "multiple time stepping");
+                    break;
+                }
+            }
+        }
     }
 
     /* AWH biasing
@@ -2639,6 +2761,12 @@ void get_ir(const char*     mdparin,
         }
     }
 
+    /* Set up MTS levels, this needs to happen before checking AWH parameters */
+    if (ir->useMts)
+    {
+        setupMtsLevels(ir->mtsLevels, *ir, *opts, wi);
+    }
+
     if (ir->bDoAwh)
     {
         gmx::checkAwhParams(ir->awhParams, ir, wi);
@@ -4061,6 +4189,9 @@ static void check_combination_rules(const t_inputrec* ir, const gmx_mtop_t* mtop
 
 void triple_check(const char* mdparin, t_inputrec* ir, gmx_mtop_t* sys, warninp_t wi)
 {
+    // Not meeting MTS requirements should have resulted in a fatal error, so we can assert here
+    gmx::assertMtsRequirements(*ir);
+
     char                      err_buf[STRLEN];
     int                       i, m, c, nmol;
     bool                      bCharge, bAcc;
diff --git a/src/gromacs/gmxpreprocess/readir.h b/src/gromacs/gmxpreprocess/readir.h
index 939b1cd3fc..a08b10810c 100644
--- a/src/gromacs/gmxpreprocess/readir.h
+++ b/src/gromacs/gmxpreprocess/readir.h
@@ -90,6 +90,8 @@ struct t_gromppopts
     bool  bGenPairs;
     real  tempi;
     int   seed;
+    int   numMtsLevels;
+    char* mtsLevel2Forces;
     bool  bOrire;
     bool  bMorse;
     char* wall_atomtype[2];
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsDefineParametersWithValuesIncludingAssignment.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsDefineParametersWithValuesIncludingAssignment.xml
index f71418a53a..3c19abb8dc 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsDefineParametersWithValuesIncludingAssignment.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsDefineParametersWithValuesIncludingAssignment.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricField.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricField.xml
index ae3fb3f47c..9e9fe25635 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricField.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricField.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldOscillating.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldOscillating.xml
index 1315d0e103..a029bb7b3e 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldOscillating.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldOscillating.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldPulsed.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldPulsed.xml
index 93e200d5cd..ec1827434b 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldPulsed.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsElectricFieldPulsed.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsEmptyLines.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsEmptyLines.xml
index f403b671d2..27cc46aa67 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsEmptyLines.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsEmptyLines.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsImplicitSolventNo.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsImplicitSolventNo.xml
index f403b671d2..27cc46aa67 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsImplicitSolventNo.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsImplicitSolventNo.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsKeyWithoutValue.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsKeyWithoutValue.xml
index f403b671d2..27cc46aa67 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsKeyWithoutValue.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsKeyWithoutValue.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsMimic.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsMimic.xml
index 96add6c35d..fb7c36b8d3 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsMimic.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_AcceptsMimic.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init-step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_HandlesDifferentKindsOfMdpLines.xml b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_HandlesDifferentKindsOfMdpLines.xml
index a78794df85..8c0e339019 100644
--- a/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_HandlesDifferentKindsOfMdpLines.xml
+++ b/src/gromacs/gmxpreprocess/tests/refdata/GetIrTest_HandlesDifferentKindsOfMdpLines.xml
@@ -20,6 +20,8 @@ nsteps                   = 0
 init_step                = 0
 ; Part index is updated automatically on checkpointing (keeps files separate)
 simulation-part          = 1
+; Multiple time-stepping
+mts                      = no
 ; mode for center of mass motion removal
 comm-mode                = Linear
 ; number of steps for center of mass motion removal
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp
index af62aac7a0..ff62325723 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cpp
+++ b/src/gromacs/listed_forces/gpubonded_impl.cpp
@@ -149,6 +149,10 @@ bool inputSupportsGpuBondeds(const t_inputrec& ir, const gmx_mtop_t& mtop, std::
     {
         errorReasons.emplace_back("MiMiC");
     }
+    if (ir.useMts)
+    {
+        errorReasons.emplace_back("Cannot run with multiple time stepping");
+    }
     if (ir.opts.ngener > 1)
     {
         errorReasons.emplace_back("Cannot run with multiple energy groups");
diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp
index 6f591d62b4..49d6823209 100644
--- a/src/gromacs/mdlib/constr.cpp
+++ b/src/gromacs/mdlib/constr.cpp
@@ -928,6 +928,8 @@ void Constraints::Impl::setConstraints(gmx_localtop_t* top,
             {
                 // We are using the local topology, so there are only
                 // F_CONSTR constraints.
+                GMX_RELEASE_ASSERT(idef->il[F_CONSTRNC].empty(),
+                                   "Here we should not have no-connect constraints");
                 make_shake_sblock_dd(shaked.get(), idef->il[F_CONSTR]);
             }
             else
diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp
index f7c5a486cf..2bd8088a6a 100644
--- a/src/gromacs/mdlib/force.cpp
+++ b/src/gromacs/mdlib/force.cpp
@@ -122,7 +122,8 @@ void calculateLongRangeNonbondeds(t_forcerec*                   fr,
     /* Do long-range electrostatics and/or LJ-PME
      * and compute PME surface terms when necessary.
      */
-    if (computePmeOnCpu || fr->ic->eeltype == eelEWALD || haveEwaldSurfaceTerm)
+    if ((computePmeOnCpu || fr->ic->eeltype == eelEWALD || haveEwaldSurfaceTerm)
+        && stepWork.computeNonbondedForces)
     {
         int  status = 0;
         real Vlr_q = 0, Vlr_lj = 0;
diff --git a/src/gromacs/mdlib/force.h b/src/gromacs/mdlib/force.h
index c8cfb79c16..63f80e8839 100644
--- a/src/gromacs/mdlib/force.h
+++ b/src/gromacs/mdlib/force.h
@@ -76,6 +76,19 @@ class StepWorkload;
 class VirtualSitesHandler;
 } // namespace gmx
 
+/* Perform the force and, if requested, energy computation
+ *
+ * Without multiple time stepping the force is returned in force->force().
+ *
+ * With multiple time stepping the behavior depends on the integration step.
+ * At fast steps (step % mtsFactor != 0), the fast force is returned in
+ * force->force(). The force->forceMtsCombined() buffer is unused.
+ * At slow steps, the normal force is returned in force->force(),
+ * unless the \p GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE is set in \p legacyFlags.
+ * A MTS-combined force, F_fast + mtsFactor*F_slow, is always returned in
+ * force->forceMtsCombined(). This forceMts can be used directly in a standard
+ * leap-frog integrator to do multiple time stepping.
+ */
 void do_force(FILE*                               log,
               const t_commrec*                    cr,
               const gmx_multisim_t*               ms,
diff --git a/src/gromacs/mdlib/force_flags.h b/src/gromacs/mdlib/force_flags.h
index 10f3969aaf..dd138740e0 100644
--- a/src/gromacs/mdlib/force_flags.h
+++ b/src/gromacs/mdlib/force_flags.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2012,2014,2015,2017,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2017,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -57,6 +57,8 @@
 #define GMX_FORCE_ENERGY (1u << 9u)
 /* Calculate dHdl */
 #define GMX_FORCE_DHDL (1u << 10u)
+/* Tells whether only the MTS combined force buffer is needed and not the normal force buffer */
+#define GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE (1u << 11u)
 
 /* Normally one want all energy terms and forces */
 #define GMX_FORCE_ALLFORCES (GMX_FORCE_LISTED | GMX_FORCE_NONBONDED | GMX_FORCE_FORCES)
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index 6144b3a2fa..0a9f0dc89f 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -82,9 +82,8 @@
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/md_enums.h"
-#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/nbnxm/nbnxm.h"
-#include "gromacs/nbnxm/nbnxm_geometry.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/tables/forcetable.h"
@@ -615,7 +614,10 @@ void forcerec_set_ranges(t_forcerec* fr, int natoms_force, int natoms_force_cons
     fr->natoms_force        = natoms_force;
     fr->natoms_force_constr = natoms_force_constr;
 
-    fr->forceHelperBuffers->resize(natoms_f_novirsum);
+    for (auto& forceHelperBuffers : fr->forceHelperBuffers)
+    {
+        forceHelperBuffers.resize(natoms_f_novirsum);
+    }
 }
 
 static real cutoff_inf(real cutoff)
@@ -1156,11 +1158,26 @@ void init_forcerec(FILE*                            fp,
     /* 1-4 interaction electrostatics */
     fr->fudgeQQ = mtop->ffparams.fudgeQQ;
 
-    const bool haveDirectVirialContributions =
-            (EEL_FULL(ic->eeltype) || EVDW_PME(ic->vdwtype) || fr->forceProviders->hasForceProvider()
-             || gmx_mtop_ftype_count(mtop, F_POSRES) > 0 || gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0
-             || ir->nwall > 0 || ir->bPull || ir->bRot || ir->bIMD);
-    fr->forceHelperBuffers = std::make_unique<ForceHelperBuffers>(haveDirectVirialContributions);
+    // Multiple time stepping
+    fr->useMts = ir->useMts;
+
+    if (fr->useMts)
+    {
+        gmx::assertMtsRequirements(*ir);
+    }
+
+    const bool haveDirectVirialContributionsFast =
+            fr->forceProviders->hasForceProvider() || gmx_mtop_ftype_count(mtop, F_POSRES) > 0
+            || gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 || ir->nwall > 0 || ir->bPull || ir->bRot
+            || ir->bIMD;
+    const bool haveDirectVirialContributionsSlow = EEL_FULL(ic->eeltype) || EVDW_PME(ic->vdwtype);
+    for (int i = 0; i < (fr->useMts ? 2 : 1); i++)
+    {
+        bool haveDirectVirialContributions =
+                (((!fr->useMts || i == 0) && haveDirectVirialContributionsFast)
+                 || ((!fr->useMts || i == 1) && haveDirectVirialContributionsSlow));
+        fr->forceHelperBuffers.emplace_back(haveDirectVirialContributions);
+    }
 
     if (fr->shift_vec == nullptr)
     {
@@ -1264,9 +1281,43 @@ void init_forcerec(FILE*                            fp,
     }
 
     /* Initialize the thread working data for bonded interactions */
-    fr->listedForces.emplace_back(
-            mtop->ffparams, mtop->groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
-            gmx_omp_nthreads_get(emntBonded), ListedForces::interactionSelectionAll(), fp);
+    if (fr->useMts)
+    {
+        // Add one ListedForces object for each MTS level
+        bool isFirstLevel = true;
+        for (const auto& mtsLevel : ir->mtsLevels)
+        {
+            ListedForces::InteractionSelection interactionSelection;
+            const auto&                        forceGroups = mtsLevel.forceGroups;
+            if (forceGroups[static_cast<int>(gmx::MtsForceGroups::Pair)])
+            {
+                interactionSelection.set(static_cast<int>(ListedForces::InteractionGroup::Pairs));
+            }
+            if (forceGroups[static_cast<int>(gmx::MtsForceGroups::Dihedral)])
+            {
+                interactionSelection.set(static_cast<int>(ListedForces::InteractionGroup::Dihedrals));
+            }
+            if (forceGroups[static_cast<int>(gmx::MtsForceGroups::Angle)])
+            {
+                interactionSelection.set(static_cast<int>(ListedForces::InteractionGroup::Angles));
+            }
+            if (isFirstLevel)
+            {
+                interactionSelection.set(static_cast<int>(ListedForces::InteractionGroup::Rest));
+                isFirstLevel = false;
+            }
+            fr->listedForces.emplace_back(
+                    mtop->ffparams, mtop->groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
+                    gmx_omp_nthreads_get(emntBonded), interactionSelection, fp);
+        }
+    }
+    else
+    {
+        // Add one ListedForces object with all listed interactions
+        fr->listedForces.emplace_back(
+                mtop->ffparams, mtop->groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
+                gmx_omp_nthreads_get(emntBonded), ListedForces::interactionSelectionAll(), fp);
+    }
 
     // QM/MM initialization if requested
     if (ir->bQMMM)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index eed67b4540..2af9a25280 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -44,6 +44,7 @@
 #include <cstring>
 
 #include <array>
+#include <optional>
 
 #include "gromacs/applied_forces/awh/awh.h"
 #include "gromacs/domdec/dlbtiming.h"
@@ -92,6 +93,7 @@
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/mdtypes/state.h"
 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
@@ -716,7 +718,8 @@ static void launchPmeGpuFftAndGather(gmx_pme_t*               pmedata,
  *
  * \param[in]     nbv              Nonbonded verlet structure
  * \param[in,out] pmedata          PME module data
- * \param[in,out] forceOutputs     Output buffer for the forces and virial
+ * \param[in,out] forceOutputsNonbonded  Force outputs for the non-bonded forces and shift forces
+ * \param[in,out] forceOutputsPme  Force outputs for the PME forces and virial
  * \param[in,out] enerd            Energy data structure results are reduced into
  * \param[in]     lambdaQ          The Coulomb lambda of the current system state.
  * \param[in]     stepWork         Step schedule flags
@@ -724,7 +727,8 @@ static void launchPmeGpuFftAndGather(gmx_pme_t*               pmedata,
  */
 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
                                         gmx_pme_t*          pmedata,
-                                        gmx::ForceOutputs*  forceOutputs,
+                                        gmx::ForceOutputs*  forceOutputsNonbonded,
+                                        gmx::ForceOutputs*  forceOutputsPme,
                                         gmx_enerdata_t*     enerd,
                                         const real          lambdaQ,
                                         const StepWorkload& stepWork,
@@ -733,10 +737,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
     bool isPmeGpuDone = false;
     bool isNbGpuDone  = false;
 
-
-    gmx::ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
-    gmx::ForceWithVirial&      forceWithVirial      = forceOutputs->forceWithVirial();
-
     gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
 
     while (!isPmeGpuDone || !isNbGpuDone)
@@ -745,22 +745,24 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
         {
             GpuTaskCompletion completionType =
                     (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial,
-                                                   enerd, lambdaQ, completionType);
+            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle,
+                                                   &forceOutputsPme->forceWithVirial(), enerd,
+                                                   lambdaQ, completionType);
         }
 
         if (!isNbGpuDone)
         {
+            auto&             forceBuffersNonbonded = forceOutputsNonbonded->forceWithShiftForces();
             GpuTaskCompletion completionType =
                     (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
             isNbGpuDone = Nbnxm::gpu_try_finish_task(
                     nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
-                    enerd->grpp.ener[egCOULSR].data(), forceWithShiftForces.shiftForces(),
+                    enerd->grpp.ener[egCOULSR].data(), forceBuffersNonbonded.shiftForces(),
                     completionType, wcycle);
 
             if (isNbGpuDone)
             {
-                nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShiftForces.force());
+                nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceBuffersNonbonded.force());
             }
         }
     }
@@ -769,8 +771,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
 /*! \brief Set up the different force buffers; also does clearing.
  *
  * \param[in] forceHelperBuffers  Helper force buffers
- * \param[in] pull_work The pull work object.
- * \param[in] inputrec  input record
  * \param[in] force     force array
  * \param[in] stepWork  Step schedule flags
  * \param[out] wcycle   wallcycle recording structure
@@ -778,8 +778,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
  * \returns             Cleared force output structure
  */
 static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceHelperBuffers,
-                                      pull_t*                             pull_work,
-                                      const t_inputrec&                   inputrec,
                                       gmx::ArrayRefWithPadding<gmx::RVec> force,
                                       const StepWorkload&                 stepWork,
                                       gmx_wallcycle_t                     wcycle)
@@ -823,11 +821,6 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceH
         clearRVecs(forceWithVirial.force_, true);
     }
 
-    if (inputrec.bPull && pull_have_constraint(pull_work))
-    {
-        clear_pull_forces(pull_work);
-    }
-
     wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER);
 
     return ForceOutputs(forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(),
@@ -878,25 +871,34 @@ static DomainLifetimeWorkload setupDomainLifetimeWorkload(const t_inputrec&
 /*! \brief Set up force flag stuct from the force bitmask.
  *
  * \param[in]      legacyFlags          Force bitmask flags used to construct the new flags
+ * \param[in]      mtsLevels            The multiple time-stepping levels, either empty or 2 levels
+ * \param[in]      step                 The current MD step
  * \param[in]      simulationWork       Simulation workload description.
  * \param[in]      rankHasPmeDuty       If this rank computes PME.
  *
  * \returns New Stepworkload description.
  */
-static StepWorkload setupStepWorkload(const int                 legacyFlags,
-                                      const SimulationWorkload& simulationWork,
-                                      const bool                rankHasPmeDuty)
+static StepWorkload setupStepWorkload(const int                     legacyFlags,
+                                      ArrayRef<const gmx::MtsLevel> mtsLevels,
+                                      const int64_t                 step,
+                                      const SimulationWorkload&     simulationWork,
+                                      const bool                    rankHasPmeDuty)
 {
+    GMX_ASSERT(mtsLevels.empty() || mtsLevels.size() == 2, "Expect 0 or 2 MTS levels");
+    const bool computeSlowForces = (mtsLevels.empty() || step % mtsLevels[1].stepFactor == 0);
+
     StepWorkload flags;
     flags.stateChanged        = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
     flags.haveDynamicBox      = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
     flags.doNeighborSearch    = ((legacyFlags & GMX_FORCE_NS) != 0);
+    flags.computeSlowForces   = computeSlowForces;
     flags.computeVirial       = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
     flags.computeEnergy       = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
     flags.computeForces       = ((legacyFlags & GMX_FORCE_FORCES) != 0);
     flags.computeListedForces = ((legacyFlags & GMX_FORCE_LISTED) != 0);
     flags.computeNonbondedForces =
-            ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded;
+            ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && simulationWork.computeNonbonded
+            && !(simulationWork.computeNonbondedAtMtsLevel1 && !computeSlowForces);
     flags.computeDhdl = ((legacyFlags & GMX_FORCE_DHDL) != 0);
 
     if (simulationWork.useGpuBufferOps)
@@ -906,10 +908,9 @@ static StepWorkload setupStepWorkload(const int                 legacyFlags,
     }
     flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
     // on virial steps the CPU reduction path is taken
-    flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !flags.computeVirial;
-    flags.useGpuPmeFReduction = flags.useGpuFBufferOps
-                                && (simulationWork.useGpuPme
-                                    && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication));
+    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
+    flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps && simulationWork.useGpuPme
+                                && (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication);
 
     return flags;
 }
@@ -929,7 +930,7 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                     int64_t                           step,
                                     gmx_wallcycle_t                   wcycle)
 {
-    if (runScheduleWork.simulationWork.useGpuNonbonded)
+    if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
     {
         /* Launch pruning before buffer clearing because the API overhead of the
          * clear kernel launches can leave the GPU idle while it could be running
@@ -1008,6 +1009,27 @@ static void reduceAndUpdateMuTot(DipoleData*                   dipoleData,
     }
 }
 
+/*! \brief Combines MTS level0 and level1 force buffes into a full and MTS-combined force buffer.
+ *
+ * \param[in]     numAtoms        The number of atoms to combine forces for
+ * \param[in,out] forceMtsLevel0  Input: F_level0, output: F_level0 + F_level1
+ * \param[in,out] forceMts        Input: F_level1, output: F_level0 + mtsFactor * F_level1
+ * \param[in]     mtsFactor       The factor between the level0 and level1 time step
+ */
+static void combineMtsForces(const int      numAtoms,
+                             ArrayRef<RVec> forceMtsLevel0,
+                             ArrayRef<RVec> forceMts,
+                             const real     mtsFactor)
+{
+    const int gmx_unused numThreads = gmx_omp_nthreads_get(emntDefault);
+#pragma omp parallel for num_threads(numThreads) schedule(static)
+    for (int i = 0; i < numAtoms; i++)
+    {
+        const RVec forceMtsLevel0Tmp = forceMtsLevel0[i];
+        forceMtsLevel0[i] += forceMts[i];
+        forceMts[i] = forceMtsLevel0Tmp + mtsFactor * forceMts[i];
+    }
+}
 
 /*! \brief Setup for the local and non-local GPU force reductions:
  * reinitialization plus the registration of forces and dependencies.
@@ -1120,19 +1142,19 @@ void do_force(FILE*                               fplog,
                "The size of the force buffer should be at least the number of atoms to compute "
                "forces for");
 
-    nonbonded_verlet_t*          nbv      = fr->nbv.get();
-    interaction_const_t*         ic       = fr->ic;
+    nonbonded_verlet_t*  nbv = fr->nbv.get();
+    interaction_const_t* ic  = fr->ic;
+
     gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
     const SimulationWorkload& simulationWork = runScheduleWork->simulationWork;
 
-
-    runScheduleWork->stepWork =
-            setupStepWorkload(legacyFlags, simulationWork, thisRankHasDuty(cr, DUTY_PME));
+    runScheduleWork->stepWork    = setupStepWorkload(legacyFlags, inputrec->mtsLevels, step,
+                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
     const StepWorkload& stepWork = runScheduleWork->stepWork;
 
-
-    const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
+    const bool useGpuPmeOnThisRank =
+            simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1179,7 +1201,7 @@ void do_force(FILE*                               fplog,
 
     // If coordinates are to be sent to PME task from CPU memory, perform that send here.
     // Otherwise the send will occur after H2D coordinate transfer.
-    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu)
+    if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu && stepWork.computeSlowForces)
     {
         /* Send particle coordinates to the pme nodes */
         if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
@@ -1364,7 +1386,7 @@ void do_force(FILE*                               fplog,
             setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication);
         }
     }
-    else if (!EI_TPI(inputrec->eI))
+    else if (!EI_TPI(inputrec->eI) && stepWork.computeNonbondedForces)
     {
         if (stepWork.useGpuXBufferOps)
         {
@@ -1385,7 +1407,7 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    if (simulationWork.useGpuNonbonded)
+    if (simulationWork.useGpuNonbonded && (stepWork.computeNonbondedForces || domainWork.haveGpuBondedWork))
     {
         ddBalanceRegionHandler.openBeforeForceComputationGpu();
 
@@ -1516,7 +1538,7 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    if (simulationWork.useGpuNonbonded)
+    if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
     {
         /* launch D2H copy-back F */
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
@@ -1589,10 +1611,35 @@ void do_force(FILE*                               fplog,
      */
     wallcycle_start(wcycle, ewcFORCE);
 
-    // Set up and clear force outputs.
-    // We use std::move to keep the compiler happy, it has no effect.
-    ForceOutputs forceOut = setupForceOutputs(fr->forceHelperBuffers.get(), pull_work, *inputrec,
-                                              std::move(force), stepWork, wcycle);
+    /* Set up and clear force outputs:
+     * forceOutMtsLevel0:  everything except what is in the other two outputs
+     * forceOutMtsLevel1:  PME-mesh and listed-forces group 1
+     * forceOutNonbonded: non-bonded forces
+     * Without multiple time stepping all point to the same object.
+     * With multiple time-stepping the use is different for MTS fast (level0 only) and slow steps.
+     */
+    ForceOutputs forceOutMtsLevel0 =
+            setupForceOutputs(&fr->forceHelperBuffers[0], force, stepWork, wcycle);
+
+    // Force output for MTS combined forces, only set at level1 MTS steps
+    std::optional<ForceOutputs> forceOutMts =
+            (fr->useMts && stepWork.computeSlowForces)
+                    ? std::optional(setupForceOutputs(&fr->forceHelperBuffers[1],
+                                                      forceView->forceMtsCombinedWithPadding(),
+                                                      stepWork, wcycle))
+                    : std::nullopt;
+
+    ForceOutputs* forceOutMtsLevel1 =
+            fr->useMts ? (stepWork.computeSlowForces ? &forceOutMts.value() : nullptr) : &forceOutMtsLevel0;
+
+    const bool nonbondedAtMtsLevel1 = runScheduleWork->simulationWork.computeNonbondedAtMtsLevel1;
+
+    ForceOutputs* forceOutNonbonded = nonbondedAtMtsLevel1 ? forceOutMtsLevel1 : &forceOutMtsLevel0;
+
+    if (inputrec->bPull && pull_have_constraint(pull_work))
+    {
+        clear_pull_forces(pull_work);
+    }
 
     /* We calculate the non-bonded forces, when done on the CPU, here.
      * We do this before calling do_force_lowlevel, because in that
@@ -1609,26 +1656,26 @@ void do_force(FILE*                               fplog,
         do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle);
     }
 
-    if (fr->efep != efepNO)
+    if (fr->efep != efepNO && stepWork.computeNonbondedForces)
     {
         /* Calculate the local and non-local free energy interactions here.
          * Happens here on the CPU both with and without GPU.
          */
         nbv->dispatchFreeEnergyKernel(InteractionLocality::Local, fr,
                                       as_rvec_array(x.unpaddedArrayRef().data()),
-                                      &forceOut.forceWithShiftForces(), *mdatoms, inputrec->fepvals,
-                                      lambda, enerd, stepWork, nrnb);
+                                      &forceOutNonbonded->forceWithShiftForces(), *mdatoms,
+                                      inputrec->fepvals, lambda, enerd, stepWork, nrnb);
 
         if (havePPDomainDecomposition(cr))
         {
             nbv->dispatchFreeEnergyKernel(InteractionLocality::NonLocal, fr,
                                           as_rvec_array(x.unpaddedArrayRef().data()),
-                                          &forceOut.forceWithShiftForces(), *mdatoms,
+                                          &forceOutNonbonded->forceWithShiftForces(), *mdatoms,
                                           inputrec->fepvals, lambda, enerd, stepWork, nrnb);
         }
     }
 
-    if (!useOrEmulateGpuNb)
+    if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
     {
         if (havePPDomainDecomposition(cr))
         {
@@ -1643,7 +1690,8 @@ void do_force(FILE*                               fplog,
              * communication with calculation with domain decomposition.
              */
             wallcycle_stop(wcycle, ewcFORCE);
-            nbv->atomdata_add_nbat_f_to_f(AtomLocality::All, forceOut.forceWithShiftForces().force());
+            nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
+                                          forceOutNonbonded->forceWithShiftForces().force());
             wallcycle_start_nocount(wcycle, ewcFORCE);
         }
 
@@ -1652,8 +1700,8 @@ void do_force(FILE*                               fplog,
         {
             /* This is not in a subcounter because it takes a
                negligible and constant-sized amount of time */
-            nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat,
-                                                     forceOut.forceWithShiftForces().shiftForces());
+            nbnxn_atomdata_add_nbat_fshift_to_fshift(
+                    *nbv->nbat, forceOutNonbonded->forceWithShiftForces().shiftForces());
         }
     }
 
@@ -1670,7 +1718,7 @@ void do_force(FILE*                               fplog,
     {
         /* foreign lambda component for walls */
         real dvdl_walls = do_walls(*inputrec, *fr, box, *mdatoms, x.unpaddedConstArrayRef(),
-                                   &forceOut.forceWithVirial(), lambda[efptVDW],
+                                   &forceOutMtsLevel0.forceWithVirial(), lambda[efptVDW],
                                    enerd->grpp.ener[egLJSR].data(), nrnb);
         enerd->dvdl_lin[efptVDW] += dvdl_walls;
     }
@@ -1697,8 +1745,10 @@ void do_force(FILE*                               fplog,
             set_pbc_dd(&pbc, fr->pbcType, DOMAINDECOMP(cr) ? cr->dd->numCells : nullptr, TRUE, box);
         }
 
-        for (auto& listedForces : fr->listedForces)
+        for (int mtsIndex = 0; mtsIndex < (fr->useMts && stepWork.computeSlowForces ? 2 : 1); mtsIndex++)
         {
+            ListedForces& listedForces = fr->listedForces[mtsIndex];
+            ForceOutputs& forceOut     = (mtsIndex == 0 ? forceOutMtsLevel0 : *forceOutMtsLevel1);
             listedForces.calculate(
                     wcycle, box, inputrec->fepvals, cr, ms, x, xWholeMolecules, fr->fcdata.get(),
                     hist, &forceOut, fr, &pbc, enerd, nrnb, lambda.data(), mdatoms,
@@ -1706,9 +1756,13 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    calculateLongRangeNonbondeds(fr, inputrec, cr, nrnb, wcycle, mdatoms, x.unpaddedConstArrayRef(),
-                                 &forceOut.forceWithVirial(), enerd, box, lambda.data(),
-                                 as_rvec_array(dipoleData.muStateAB), stepWork, ddBalanceRegionHandler);
+    if (stepWork.computeSlowForces)
+    {
+        calculateLongRangeNonbondeds(fr, inputrec, cr, nrnb, wcycle, mdatoms,
+                                     x.unpaddedConstArrayRef(), &forceOutMtsLevel1->forceWithVirial(),
+                                     enerd, box, lambda.data(), as_rvec_array(dipoleData.muStateAB),
+                                     stepWork, ddBalanceRegionHandler);
+    }
 
     wallcycle_stop(wcycle, ewcFORCE);
 
@@ -1733,16 +1787,19 @@ void do_force(FILE*                               fplog,
     }
 
     computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation, imdSession, pull_work, step, t,
-                         wcycle, fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda,
-                         stepWork, &forceOut.forceWithVirial(), enerd, ed, stepWork.doNeighborSearch);
-
+                         wcycle, fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda, stepWork,
+                         &forceOutMtsLevel0.forceWithVirial(), enerd, ed, stepWork.doNeighborSearch);
 
+    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
+               "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
+    GMX_ASSERT(!(nonbondedAtMtsLevel1 && useGpuForcesHaloExchange),
+               "The schedule below does not allow for nonbonded MTS with GPU halo exchange");
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
     float cycles_wait_gpu = 0;
-    if (useOrEmulateGpuNb)
+    if (useOrEmulateGpuNb && stepWork.computeNonbondedForces)
     {
-        auto& forceWithShiftForces = forceOut.forceWithShiftForces();
+        auto& forceWithShiftForces = forceOutNonbonded->forceWithShiftForces();
 
         /* wait for non-local forces (or calculate in emulation mode) */
         if (havePPDomainDecomposition(cr))
@@ -1771,7 +1828,7 @@ void do_force(FILE*                               fplog,
 
                 if (haveNonLocalForceContribInCpuBuffer)
                 {
-                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
+                    stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
                                               AtomLocality::NonLocal);
                 }
 
@@ -1780,7 +1837,7 @@ void do_force(FILE*                               fplog,
                 if (!useGpuForcesHaloExchange)
                 {
                     // copy from GPU input for dd_move_f()
-                    stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(),
+                    stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
                                                 AtomLocality::NonLocal);
                 }
             }
@@ -1789,7 +1846,6 @@ void do_force(FILE*                               fplog,
                 nbv->atomdata_add_nbat_f_to_f(AtomLocality::NonLocal, forceWithShiftForces.force());
             }
 
-
             if (fr->nbv->emulateGpu() && stepWork.computeVirial)
             {
                 nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat, forceWithShiftForces.shiftForces());
@@ -1797,6 +1853,20 @@ void do_force(FILE*                               fplog,
         }
     }
 
+    /* Combining the forces for multiple time stepping before the halo exchange, when possible,
+     * avoids an extra halo exchange (when DD is used) and post-processing step.
+     */
+    const bool combineMtsForcesBeforeHaloExchange =
+            (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
+             && (legacyFlags & GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE) != 0
+             && !(stepWork.computeVirial || simulationWork.useGpuNonbonded || useGpuPmeOnThisRank));
+    if (combineMtsForcesBeforeHaloExchange)
+    {
+        const int numAtoms = havePPDomainDecomposition(cr) ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr;
+        combineMtsForces(numAtoms, force.unpaddedArrayRef(), forceView->forceMtsCombined(),
+                         inputrec->mtsLevels[1].stepFactor);
+    }
+
     if (havePPDomainDecomposition(cr))
     {
         /* We are done with the CPU compute.
@@ -1808,12 +1878,12 @@ void do_force(FILE*                               fplog,
 
         if (stepWork.computeForces)
         {
-
             if (useGpuForcesHaloExchange)
             {
                 if (domainWork.haveCpuLocalForceWork)
                 {
-                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
+                    stateGpu->copyForcesToGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
+                                              AtomLocality::Local);
                 }
                 communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
             }
@@ -1823,7 +1893,18 @@ void do_force(FILE*                               fplog,
                 {
                     stateGpu->waitForcesReadyOnHost(AtomLocality::NonLocal);
                 }
-                dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+
+                // Without MTS or with MTS at slow steps with uncombined forces we need to
+                // communicate the fast forces
+                if (!fr->useMts || !combineMtsForcesBeforeHaloExchange)
+                {
+                    dd_move_f(cr->dd, &forceOutMtsLevel0.forceWithShiftForces(), wcycle);
+                }
+                // With MTS we need to communicate the slow or combined (in forceOutMtsLevel1) forces
+                if (fr->useMts && stepWork.computeSlowForces)
+                {
+                    dd_move_f(cr->dd, &forceOutMtsLevel1->forceWithShiftForces(), wcycle);
+                }
             }
         }
     }
@@ -1834,18 +1915,18 @@ void do_force(FILE*                               fplog,
                              && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
     if (alternateGpuWait)
     {
-        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, lambda[efptCOUL],
-                                    stepWork, wcycle);
+        alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, forceOutNonbonded,
+                                    forceOutMtsLevel1, enerd, lambda[efptCOUL], stepWork, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPmeOnThisRank)
     {
-        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd,
-                                lambda[efptCOUL]);
+        pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle,
+                                &forceOutMtsLevel1->forceWithVirial(), enerd, lambda[efptCOUL]);
     }
 
     /* Wait for local GPU NB outputs on the non-alternating wait path */
-    if (!alternateGpuWait && simulationWork.useGpuNonbonded)
+    if (!alternateGpuWait && stepWork.computeNonbondedForces && simulationWork.useGpuNonbonded)
     {
         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
          * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
@@ -1855,7 +1936,8 @@ void do_force(FILE*                               fplog,
         const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
         const float waitCycles               = Nbnxm::gpu_wait_finish_task(
                 nbv->gpu_nbv, stepWork, AtomLocality::Local, enerd->grpp.ener[egLJSR].data(),
-                enerd->grpp.ener[egCOULSR].data(), forceOut.forceWithShiftForces().shiftForces(), wcycle);
+                enerd->grpp.ener[egCOULSR].data(),
+                forceOutNonbonded->forceWithShiftForces().shiftForces(), wcycle);
 
         if (ddBalanceRegionHandler.useBalancingRegion())
         {
@@ -1886,13 +1968,13 @@ void do_force(FILE*                               fplog,
 
     // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
         && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
+        pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
                                simulationWork.useGpuPmePpCommunication,
                                stepWork.useGpuPmeFReduction, wcycle);
     }
@@ -1900,12 +1982,14 @@ void do_force(FILE*                               fplog,
 
     /* Do the nonbonded GPU (or emulation) force buffer reduction
      * on the non-alternating path. */
+    GMX_ASSERT(!(nonbondedAtMtsLevel1 && stepWork.useGpuFBufferOps),
+               "The schedule below does not allow for nonbonded MTS with GPU buffer ops");
     if (useOrEmulateGpuNb && !alternateGpuWait)
     {
-        gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
-
         if (stepWork.useGpuFBufferOps)
         {
+            ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
+
             // Flag to specify whether the CPU force buffer has contributions to
             // local atoms. This depends on whether there are CPU-based force tasks
             // or when DD is active the halo exchange has resulted in contributions
@@ -1932,7 +2016,10 @@ void do_force(FILE*                               fplog,
                 stateGpu->copyForcesToGpu(forceWithShift, locality);
             }
 
-            fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
+            if (stepWork.computeNonbondedForces)
+            {
+                fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
+            }
 
             // Copy forces to host if they are needed for update or if virtual sites are enabled.
             // If there are vsites, we need to copy forces every step to spread vsite forces on host.
@@ -1946,8 +2033,9 @@ void do_force(FILE*                               fplog,
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
             }
         }
-        else
+        else if (stepWork.computeNonbondedForces)
         {
+            ArrayRef<gmx::RVec> forceWithShift = forceOutNonbonded->forceWithShiftForces().force();
             nbv->atomdata_add_nbat_f_to_f(AtomLocality::Local, forceWithShift);
         }
     }
@@ -1960,27 +2048,49 @@ void do_force(FILE*                               fplog,
         dd_force_flop_stop(cr->dd, nrnb);
     }
 
+    const bool haveCombinedMtsForces = (stepWork.computeForces && fr->useMts && stepWork.computeSlowForces
+                                        && combineMtsForcesBeforeHaloExchange);
     if (stepWork.computeForces)
     {
-        postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOut,
+        postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutMtsLevel0,
                                         vir_force, *mdatoms, *fr, vsite, stepWork);
+
+        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+        {
+            postProcessForceWithShiftForces(nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1,
+                                            vir_force, *mdatoms, *fr, vsite, stepWork);
+        }
     }
 
     // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
-        && !simulationWork.useGpuUpdate)
+        && !simulationWork.useGpuUpdate && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
+        pme_receive_force_ener(fr, cr, &forceOutMtsLevel1->forceWithVirial(), enerd,
                                simulationWork.useGpuPmePpCommunication, false, wcycle);
     }
 
     if (stepWork.computeForces)
     {
-        postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOut, vir_force,
-                          mdatoms, fr, vsite, stepWork);
+        /* If we don't use MTS or if we already combined the MTS forces before, we only
+         * need to post-process one ForceOutputs object here, called forceOutCombined,
+         * otherwise we have to post-process two outputs and then combine them.
+         */
+        ForceOutputs& forceOutCombined = (haveCombinedMtsForces ? forceOutMts.value() : forceOutMtsLevel0);
+        postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), &forceOutCombined,
+                          vir_force, mdatoms, fr, vsite, stepWork);
+
+        if (fr->useMts && stepWork.computeSlowForces && !haveCombinedMtsForces)
+        {
+            postProcessForces(cr, step, nrnb, wcycle, box, x.unpaddedArrayRef(), forceOutMtsLevel1,
+                              vir_force, mdatoms, fr, vsite, stepWork);
+
+            combineMtsForces(mdatoms->homenr, force.unpaddedArrayRef(),
+                             forceView->forceMtsCombined(), inputrec->mtsLevels[1].stepFactor);
+        }
     }
 
     if (stepWork.computeEnergy)
diff --git a/src/gromacs/mdlib/update.cpp b/src/gromacs/mdlib/update.cpp
index 0cc8ca8e5b..46f848d4e7 100644
--- a/src/gromacs/mdlib/update.cpp
+++ b/src/gromacs/mdlib/update.cpp
@@ -150,6 +150,12 @@ public:
                                bool              do_log,
                                bool              do_ene);
 
+    void update_for_constraint_virial(const t_inputrec&                                inputRecord,
+                                      const t_mdatoms&                                 md,
+                                      const t_state&                                   state,
+                                      const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
+                                      const gmx_ekindata_t&                            ekind);
+
     void update_temperature_constants(const t_inputrec& inputRecord);
 
     const std::vector<bool>& getAndersenRandomizeGroup() const { return sd_.randomize_group; }
@@ -235,6 +241,15 @@ void Update::update_sd_second_half(const t_inputrec& inputRecord,
                                         constr, do_log, do_ene);
 }
 
+void Update::update_for_constraint_virial(const t_inputrec& inputRecord,
+                                          const t_mdatoms&  md,
+                                          const t_state&    state,
+                                          const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
+                                          const gmx_ekindata_t&                            ekind)
+{
+    return impl_->update_for_constraint_virial(inputRecord, md, state, f, ekind);
+}
+
 void Update::update_temperature_constants(const t_inputrec& inputRecord)
 {
     return impl_->update_temperature_constants(inputRecord);
@@ -252,6 +267,13 @@ static void clearVsiteVelocities(int start, int nrend, const unsigned short* par
     }
 }
 
+/*! \brief Sets whether we store the updated velocities */
+enum class StoreUpdatedVelocities
+{
+    yes, //!< Store the updated velocities
+    no   //!< Do not store the updated velocities
+};
+
 /*! \brief Sets the number of different temperature coupling values */
 enum class NumTempScaleValues
 {
@@ -273,6 +295,7 @@ enum class ApplyParrinelloRahmanVScaling
 
 /*! \brief Integrate using leap-frog with T-scaling and optionally diagonal Parrinello-Rahman p-coupling
  *
+ * \tparam       storeUpdatedVelocities Tells whether we should store the updated velocities
  * \tparam       numTempScaleValues     The number of different T-couple values
  * \tparam       applyPRVScaling        Apply Parrinello-Rahman velocity scaling
  * \param[in]    start                  Index of first atom to update
@@ -293,7 +316,7 @@ enum class ApplyParrinelloRahmanVScaling
  * Note that we might get even better SIMD acceleration when we introduce
  * aligned (and padded) memory, possibly with some hints for the compilers.
  */
-template<NumTempScaleValues numTempScaleValues, ApplyParrinelloRahmanVScaling applyPRVScaling>
+template<StoreUpdatedVelocities storeUpdatedVelocities, NumTempScaleValues numTempScaleValues, ApplyParrinelloRahmanVScaling applyPRVScaling>
 static void updateMDLeapfrogSimple(int         start,
                                    int         nrend,
                                    real        dt,
@@ -335,7 +358,10 @@ static void updateMDLeapfrogSimple(int         start,
             {
                 vNew -= dtPressureCouple * pRVScaleMatrixDiagonal[d] * v[a][d];
             }
-            v[a][d]      = vNew;
+            if (storeUpdatedVelocities == StoreUpdatedVelocities::yes)
+            {
+                v[a][d] = vNew;
+            }
             xprime[a][d] = x[a][d] + vNew * dt;
         }
     }
@@ -399,6 +425,7 @@ static inline void simdStoreRvecs(rvec* r, int index, SimdReal r0, SimdReal r1,
 
 /*! \brief Integrate using leap-frog with single group T-scaling and SIMD
  *
+ * \tparam       storeUpdatedVelocities Tells whether we should store the updated velocities
  * \param[in]    start                  Index of first atom to update
  * \param[in]    nrend                  Last atom to update: \p nrend - 1
  * \param[in]    dt                     The time step
@@ -409,6 +436,7 @@ static inline void simdStoreRvecs(rvec* r, int index, SimdReal r0, SimdReal r1,
  * \param[inout] v                      Velocities
  * \param[in]    f                      Forces
  */
+template<StoreUpdatedVelocities storeUpdatedVelocities>
 static void updateMDLeapfrogSimpleSimd(int         start,
                                        int         nrend,
                                        real        dt,
@@ -441,7 +469,10 @@ static void updateMDLeapfrogSimpleSimd(int         start,
         v1 = fma(f1 * invMass1, timestep, lambdaSystem * v1);
         v2 = fma(f2 * invMass2, timestep, lambdaSystem * v2);
 
-        simdStoreRvecs(v, a, v0, v1, v2);
+        if (storeUpdatedVelocities == StoreUpdatedVelocities::yes)
+        {
+            simdStoreRvecs(v, a, v0, v1, v2);
+        }
 
         SimdReal x0, x1, x2;
         simdLoadRvecs(x, a, &x0, &x1, &x2);
@@ -700,13 +731,15 @@ static void do_update_md(int         start,
 
             if (haveSingleTempScaleValue)
             {
-                updateMDLeapfrogSimple<NumTempScaleValues::single, ApplyParrinelloRahmanVScaling::diagonal>(
+                updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::single,
+                                       ApplyParrinelloRahmanVScaling::diagonal>(
                         start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, diagM, x,
                         xprime, v, f);
             }
             else
             {
-                updateMDLeapfrogSimple<NumTempScaleValues::multiple, ApplyParrinelloRahmanVScaling::diagonal>(
+                updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::multiple,
+                                       ApplyParrinelloRahmanVScaling::diagonal>(
                         start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, diagM, x,
                         xprime, v, f);
             }
@@ -724,25 +757,58 @@ static void do_update_md(int         start,
                 /* Check if we can use invmass instead of invMassPerDim */
                 if (!md->havePartiallyFrozenAtoms)
                 {
-                    updateMDLeapfrogSimpleSimd(start, nrend, dt, md->invmass, tcstat, x, xprime, v, f);
+                    updateMDLeapfrogSimpleSimd<StoreUpdatedVelocities::yes>(
+                            start, nrend, dt, md->invmass, tcstat, x, xprime, v, f);
                 }
                 else
 #endif
                 {
-                    updateMDLeapfrogSimple<NumTempScaleValues::single, ApplyParrinelloRahmanVScaling::no>(
+                    updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::single,
+                                           ApplyParrinelloRahmanVScaling::no>(
                             start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, nullptr,
                             x, xprime, v, f);
                 }
             }
             else
             {
-                updateMDLeapfrogSimple<NumTempScaleValues::multiple, ApplyParrinelloRahmanVScaling::no>(
+                updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::multiple,
+                                       ApplyParrinelloRahmanVScaling::no>(
                         start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, nullptr, x,
                         xprime, v, f);
             }
         }
     }
 }
+/*! \brief Handles the Leap-frog MD x and v integration */
+static void doUpdateMDDoNotUpdateVelocities(int         start,
+                                            int         nrend,
+                                            real        dt,
+                                            const rvec* gmx_restrict x,
+                                            rvec* gmx_restrict xprime,
+                                            rvec* gmx_restrict v,
+                                            const rvec* gmx_restrict f,
+                                            const t_mdatoms&         md,
+                                            const gmx_ekindata_t&    ekind)
+{
+    GMX_ASSERT(nrend == start || xprime != x,
+               "For SIMD optimization certain compilers need to have xprime != x");
+
+    gmx::ArrayRef<const t_grp_tcstat> tcstat = ekind.tcstat;
+
+    /* Check if we can use invmass instead of invMassPerDim */
+#if GMX_HAVE_SIMD_UPDATE
+    if (!md.havePartiallyFrozenAtoms)
+    {
+        updateMDLeapfrogSimpleSimd<StoreUpdatedVelocities::no>(start, nrend, dt, md.invmass, tcstat,
+                                                               x, xprime, v, f);
+    }
+    else
+#endif
+    {
+        updateMDLeapfrogSimple<StoreUpdatedVelocities::no, NumTempScaleValues::single, ApplyParrinelloRahmanVScaling::no>(
+                start, nrend, dt, dt, md.invMassPerDim, tcstat, nullptr, nullptr, x, xprime, v, f);
+    }
+}
 
 static void do_update_vv_vel(int                  start,
                              int                  nrend,
@@ -1494,3 +1560,37 @@ void Update::Impl::update_coords(const t_inputrec&
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
 }
+
+void Update::Impl::update_for_constraint_virial(const t_inputrec& inputRecord,
+                                                const t_mdatoms&  md,
+                                                const t_state&    state,
+                                                const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
+                                                const gmx_ekindata_t& ekind)
+{
+    GMX_ASSERT(inputRecord.eI == eiMD || inputRecord.eI == eiSD1,
+               "Only leap-frog is supported here");
+
+    // Cast to real for faster code, no loss in precision
+    const real dt = inputRecord.delta_t;
+
+    const int nth = gmx_omp_nthreads_get(emntUpdate);
+
+#pragma omp parallel for num_threads(nth) schedule(static)
+    for (int th = 0; th < nth; th++)
+    {
+        try
+        {
+            int start_th, end_th;
+            getThreadAtomRange(nth, th, md.homenr, &start_th, &end_th);
+
+            const rvec* x_rvec  = state.x.rvec_array();
+            rvec*       xp_rvec = xp_.rvec_array();
+            rvec*       v_rvec  = const_cast<rvec*>(state.v.rvec_array());
+            const rvec* f_rvec  = as_rvec_array(f.unpaddedConstArrayRef().data());
+
+            doUpdateMDDoNotUpdateVelocities(start_th, end_th, dt, x_rvec, xp_rvec, v_rvec, f_rvec,
+                                            md, ekind);
+        }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+    }
+}
diff --git a/src/gromacs/mdlib/update.h b/src/gromacs/mdlib/update.h
index 3f122b59f7..514f8e7648 100644
--- a/src/gromacs/mdlib/update.h
+++ b/src/gromacs/mdlib/update.h
@@ -166,6 +166,16 @@ public:
                                gmx::Constraints* constr,
                                bool              do_log,
                                bool              do_ene);
+
+    /*! \brief Performs a leap-frog update without updating \p state so the constrain virial
+     * can be computed.
+     */
+    void update_for_constraint_virial(const t_inputrec&                                inputRecord,
+                                      const t_mdatoms&                                 md,
+                                      const t_state&                                   state,
+                                      const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
+                                      const gmx_ekindata_t&                            ekind);
+
     /*! \brief Update pre-computed constants that depend on the reference temperature for coupling.
      *
      * This could change e.g. in simulated annealing.
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index 4d306410e5..e2f5581f3a 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -120,6 +120,7 @@
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/mdtypes/mdatom.h"
 #include "gromacs/mdtypes/mdrunoptions.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/mdtypes/observableshistory.h"
 #include "gromacs/mdtypes/pullhistory.h"
 #include "gromacs/mdtypes/simulation_workload.h"
@@ -330,9 +331,9 @@ void gmx::LegacySimulator::do_md()
     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 
-    ForceBuffers f(((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
-                           ? PinningPolicy::PinnedIfSupported
-                           : PinningPolicy::CannotBePinned);
+    ForceBuffers f(fr->useMts, ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
+                                       ? PinningPolicy::PinnedIfSupported
+                                       : PinningPolicy::CannotBePinned);
     if (DOMAINDECOMP(cr))
     {
         stateInstance = std::make_unique<t_state>();
@@ -919,6 +920,10 @@ void gmx::LegacySimulator::do_md()
         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
+        if (fr->useMts && !do_per_step(step, ir->nstfout))
+        {
+            force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
+        }
 
         if (shellfc)
         {
@@ -1300,13 +1305,31 @@ void gmx::LegacySimulator::do_md()
         }
         else
         {
-            upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
-                              M, etrtPOSITION, cr, constr != nullptr);
+            /* With multiple time stepping we need to do an additional normal
+             * update step to obtain the virial, as the actual MTS integration
+             * using an acceleration where the slow forces are multiplied by mtsFactor.
+             * Using that acceleration would result in a virial with the slow
+             * force contribution would be a factor mtsFactor too large.
+             */
+            if (fr->useMts && bCalcVir && constr != nullptr)
+            {
+                upd.update_for_constraint_virial(*ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
+
+                constrain_coordinates(constr, do_log, do_ene, step, state,
+                                      upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
+            }
+
+            ArrayRefWithPadding<const RVec> forceCombined =
+                    (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
+                            ? f.view().forceMtsCombinedWithPadding()
+                            : f.view().forceWithPadding();
+            upd.update_coords(*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M,
+                              etrtPOSITION, cr, constr != nullptr);
 
             wallcycle_stop(wcycle, ewcUPDATE);
 
-            constrain_coordinates(constr, do_log, do_ene, step, state,
-                                  upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
+            constrain_coordinates(constr, do_log, do_ene, step, state, upd.xp()->arrayRefWithPadding(),
+                                  &dvdl_constr, bCalcVir && !fr->useMts, shake_vir);
 
             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
                                       constr, do_log, do_ene);
diff --git a/src/gromacs/mdrun/shellfc.cpp b/src/gromacs/mdrun/shellfc.cpp
index d08e7f8660..a2c8b86f08 100644
--- a/src/gromacs/mdrun/shellfc.cpp
+++ b/src/gromacs/mdrun/shellfc.cpp
@@ -1020,7 +1020,7 @@ void relax_shell_flexcon(FILE*                         fplog,
         pr_rvecs(debug, 0, "x b4 do_force", as_rvec_array(x.data()), homenr);
     }
     int                   shellfc_flags = force_flags | (bVerbose ? GMX_FORCE_ENERGY : 0);
-    gmx::ForceBuffersView forceViewInit = gmx::ForceBuffersView(forceWithPadding[Min]);
+    gmx::ForceBuffersView forceViewInit = gmx::ForceBuffersView(forceWithPadding[Min], {}, false);
     do_force(fplog, cr, ms, inputrec, nullptr, enforcedRotation, imdSession, pull_work, mdstep,
              nrnb, wcycle, top, box, xPadded, hist, &forceViewInit, force_vir, md, enerd, lambda,
              fr, runScheduleWork, vsite, mu_tot, t, nullptr,
@@ -1107,7 +1107,7 @@ void relax_shell_flexcon(FILE*                         fplog,
             pr_rvecs(debug, 0, "RELAX: pos[Try]  ", as_rvec_array(pos[Try].data()), homenr);
         }
         /* Try the new positions */
-        gmx::ForceBuffersView forceViewTry = gmx::ForceBuffersView(forceWithPadding[Try]);
+        gmx::ForceBuffersView forceViewTry = gmx::ForceBuffersView(forceWithPadding[Try], {}, false);
         do_force(fplog, cr, ms, inputrec, nullptr, enforcedRotation, imdSession, pull_work, 1, nrnb, wcycle,
                  top, box, posWithPadding[Try], hist, &forceViewTry, force_vir, md, enerd, lambda, fr,
                  runScheduleWork, vsite, mu_tot, t, nullptr, shellfc_flags, ddBalanceRegionHandler);
diff --git a/src/gromacs/mdtypes/CMakeLists.txt b/src/gromacs/mdtypes/CMakeLists.txt
index bc55056149..ac7b8d191c 100644
--- a/src/gromacs/mdtypes/CMakeLists.txt
+++ b/src/gromacs/mdtypes/CMakeLists.txt
@@ -42,6 +42,7 @@ file(GLOB MDTYPES_SOURCES
     inputrec.cpp
     interaction_const.cpp
     md_enums.cpp
+    multipletimestepping.cpp
     observableshistory.cpp
     state.cpp)
 
diff --git a/src/gromacs/mdtypes/forcebuffers.cpp b/src/gromacs/mdtypes/forcebuffers.cpp
index e76c02ad43..ee0c0d0d91 100644
--- a/src/gromacs/mdtypes/forcebuffers.cpp
+++ b/src/gromacs/mdtypes/forcebuffers.cpp
@@ -49,7 +49,19 @@
 namespace gmx
 {
 
-ForceBuffers::ForceBuffers(PinningPolicy pinningPolicy) : force_({}, { pinningPolicy }), view_({})
+ForceBuffers::ForceBuffers() :
+    force_({}),
+    forceMtsCombined_({}),
+    view_({}, {}, false),
+    useForceMtsCombined_(false)
+{
+}
+
+ForceBuffers::ForceBuffers(const bool useForceMtsCombined, const PinningPolicy pinningPolicy) :
+    force_({}, { pinningPolicy }),
+    forceMtsCombined_({}),
+    view_({}, {}, useForceMtsCombined),
+    useForceMtsCombined_(useForceMtsCombined)
 {
 }
 
@@ -72,7 +84,12 @@ PinningPolicy ForceBuffers::pinningPolicy() const
 void ForceBuffers::resize(int numAtoms)
 {
     force_.resizeWithPadding(numAtoms);
-    view_ = ForceBuffersView(force_.arrayRefWithPadding());
+    if (useForceMtsCombined_)
+    {
+        forceMtsCombined_.resizeWithPadding(numAtoms);
+    }
+    view_ = ForceBuffersView(force_.arrayRefWithPadding(), forceMtsCombined_.arrayRefWithPadding(),
+                             useForceMtsCombined_);
 }
 
 } // namespace gmx
diff --git a/src/gromacs/mdtypes/forcebuffers.h b/src/gromacs/mdtypes/forcebuffers.h
index 2d20ff2a2b..63176345de 100644
--- a/src/gromacs/mdtypes/forcebuffers.h
+++ b/src/gromacs/mdtypes/forcebuffers.h
@@ -66,7 +66,14 @@ class ForceBuffersView
 {
 public:
     //! Constructor, creates a view to \p force
-    ForceBuffersView(const ArrayRefWithPadding<RVec>& force) : force_(force) {}
+    ForceBuffersView(const ArrayRefWithPadding<RVec>& force,
+                     const ArrayRefWithPadding<RVec>& forceMtsCombined,
+                     const bool                       useForceMtsCombined) :
+        force_(force),
+        forceMtsCombined_(forceMtsCombined),
+        useForceMtsCombined_(useForceMtsCombined)
+    {
+    }
 
     //! Copy constructor deleted to avoid creating non-const from const
     ForceBuffersView(const ForceBuffersView& o) = delete;
@@ -91,23 +98,57 @@ public:
     //! Returns an ArrayRefWithPadding to the force buffer
     ArrayRefWithPadding<RVec> forceWithPadding() { return force_; }
 
+    //! Returns a const arrayref to the MTS force buffer without padding
+    ArrayRef<const RVec> forceMtsCombined() const
+    {
+        GMX_ASSERT(useForceMtsCombined_, "Need the MTS buffer");
+        return forceMtsCombined_.unpaddedConstArrayRef();
+    }
+
+    //! Returns an arrayref to the MTS force buffer without padding
+    ArrayRef<RVec> forceMtsCombined()
+    {
+        GMX_ASSERT(useForceMtsCombined_, "Need the MTS buffer");
+        return forceMtsCombined_.unpaddedArrayRef();
+    }
+
+    //! Returns an ArrayRefWithPadding to the MTS force buffer
+    ArrayRefWithPadding<RVec> forceMtsCombinedWithPadding()
+    {
+        GMX_ASSERT(useForceMtsCombined_, "Need the MTS buffer");
+        return forceMtsCombined_;
+    }
+
 private:
     //! The force buffer
     ArrayRefWithPadding<RVec> force_;
+    //! The force buffer for combined fast and slow forces with MTS
+    ArrayRefWithPadding<RVec> forceMtsCombined_;
+    //! Wether we use forceMtsCombined_
+    bool useForceMtsCombined_;
 };
 
 /*! \libinternal \brief Object that holds the force buffers
  *
+ * Contains a normal force buffer and optionally a force buffer for combined fast and slow
+ * forces for use with multiple time stepping.
  * More buffers can be added when needed. Those should also be added
  * to ForceBuffersView.
- * Can be pinned for efficient transfer to/from GPUs.
+ * The force buffer (not forceMtsCombined) can be pinned for efficient transfer to/from GPUs.
  * All access happens through the ForceBuffersView object.
  */
 class ForceBuffers
 {
 public:
-    //! Constructor, creates an empty force buffer with pinning not active
-    ForceBuffers(PinningPolicy pinningPolicy = PinningPolicy::CannotBePinned);
+    //! Constructor, creates an empty force buffer with pinning not active and no MTS force buffer
+    ForceBuffers();
+
+    /*! \brief Constructor, with options for using the MTS force buffer and the pinning policy
+     *
+     * \param[in] useForceMtsCombined  Whether to enable use of the forceMtsCombined buffer
+     * \param[in] pinningPolicy        The pinning policy for the force (not MTS) buffer
+     */
+    ForceBuffers(bool useForceMtsCombined, PinningPolicy pinningPolicy);
 
     //! Copy constructor deleted, but could be implemented
     ForceBuffers(const ForceBuffers& o) = delete;
@@ -141,8 +182,12 @@ public:
 private:
     //! The force buffer
     PaddedHostVector<RVec> force_;
+    //! Force buffer with combined fast and slow forces for use with multiple time stepping
+    PaddedHostVector<RVec> forceMtsCombined_;
     //! The view to the force buffer
     ForceBuffersView view_;
+    //! Wether we use forceMtsCombined_
+    bool useForceMtsCombined_;
 };
 
 } // namespace gmx
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h
index 4fa0a03f80..98ad9ff1ab 100644
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -257,8 +257,8 @@ struct t_forcerec
     /* The number of atoms participating in force calculation and constraints */
     int natoms_force_constr = 0;
 
-    /* Helper buffer for ForceOutputs */
-    std::unique_ptr<ForceHelperBuffers> forceHelperBuffers;
+    /* List of helper buffers for ForceOutputs, one for each time step with MTS */
+    std::vector<ForceHelperBuffers> forceHelperBuffers;
 
     /* Data for PPPM/PME/Ewald */
     struct gmx_pme_t* pmedata                = nullptr;
@@ -299,14 +299,15 @@ struct t_forcerec
     real userreal3 = 0;
     real userreal4 = 0;
 
+    /* Tells whether we use multiple time stepping, computing some forces less frequently */
+    bool useMts = false;
+
     /* Data for special listed force calculations */
     std::unique_ptr<t_fcdata> fcdata;
 
     // The listed forces calculation data, 1 entry or multiple entries with multiple time stepping
     std::vector<ListedForces> listedForces;
 
-    static constexpr size_t c_listedForcesFast = 0;
-
     /* TODO: Replace the pointer by an object once we got rid of C */
     gmx::GpuBonded* gpuBonded = nullptr;
 
diff --git a/src/gromacs/mdtypes/inputrec.cpp b/src/gromacs/mdtypes/inputrec.cpp
index e1f25314bd..4f0ec9ca8f 100644
--- a/src/gromacs/mdtypes/inputrec.cpp
+++ b/src/gromacs/mdtypes/inputrec.cpp
@@ -44,11 +44,13 @@
 #include <cstring>
 
 #include <algorithm>
+#include <numeric>
 
 #include "gromacs/math/veccompare.h"
 #include "gromacs/math/vecdump.h"
 #include "gromacs/mdtypes/awh_params.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/mdtypes/pull_params.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/utility/compare.h"
@@ -75,7 +77,7 @@ const int nstmin_berendsen_tcouple = 5;
 const int nstmin_berendsen_pcouple = 10;
 const int nstmin_harmonic          = 20;
 
-/* Default values for nst T- and P- coupling intervals, used when the are no other
+/* Default values for T- and P- coupling intervals, used when the are no other
  * restrictions.
  */
 constexpr int c_defaultNstTCouple = 10;
@@ -109,6 +111,11 @@ int ir_optimal_nstcalcenergy(const t_inputrec* ir)
         nst = c_defaultNstCalcEnergy;
     }
 
+    if (ir->useMts)
+    {
+        nst = std::lcm(nst, ir->mtsLevels.back().stepFactor);
+    }
+
     return nst;
 }
 
@@ -196,29 +203,41 @@ int pcouple_min_integration_steps(int epc)
 
 int ir_optimal_nstpcouple(const t_inputrec* ir)
 {
-    int nmin, nwanted, n;
+    const int minIntegrationSteps = pcouple_min_integration_steps(ir->epc);
 
-    nmin = pcouple_min_integration_steps(ir->epc);
+    const int nwanted = c_defaultNstPCouple;
 
-    nwanted = c_defaultNstPCouple;
+    // With multiple time-stepping we can only compute the pressure at slowest steps
+    const int minNstPCouple = (ir->useMts ? ir->mtsLevels.back().stepFactor : 1);
 
-    if (nmin == 0 || ir->delta_t * nwanted <= ir->tau_p)
+    int n;
+    if (minIntegrationSteps == 0 || ir->delta_t * nwanted <= ir->tau_p)
     {
         n = nwanted;
     }
     else
     {
-        n = static_cast<int>(ir->tau_p / (ir->delta_t * nmin) + 0.001);
-        if (n < 1)
+        n = static_cast<int>(ir->tau_p / (ir->delta_t * minIntegrationSteps) + 0.001);
+        if (n < minNstPCouple)
         {
-            n = 1;
+            n = minNstPCouple;
         }
-        while (nwanted % n != 0)
+        // Without MTS we try to make nstpcouple a "nice" number
+        if (!ir->useMts)
         {
-            n--;
+            while (nwanted % n != 0)
+            {
+                n--;
+            }
         }
     }
 
+    // With MTS, nstpcouple should be a multiple of the slowest MTS interval
+    if (ir->useMts)
+    {
+        n = n - (n % minNstPCouple);
+    }
+
     return n;
 }
 
@@ -819,6 +838,30 @@ void pr_inputrec(FILE* fp, int indent, const char* title, const t_inputrec* ir,
         PSTEP("nsteps", ir->nsteps);
         PSTEP("init-step", ir->init_step);
         PI("simulation-part", ir->simulation_part);
+        PS("mts", EBOOL(ir->useMts));
+        if (ir->useMts)
+        {
+            for (int mtsIndex = 1; mtsIndex < static_cast<int>(ir->mtsLevels.size()); mtsIndex++)
+            {
+                const auto&       mtsLevel = ir->mtsLevels[mtsIndex];
+                const std::string forceKey = gmx::formatString("mts-level%d-forces", mtsIndex + 1);
+                std::string       forceGroups;
+                for (int i = 0; i < static_cast<int>(gmx::MtsForceGroups::Count); i++)
+                {
+                    if (mtsLevel.forceGroups[i])
+                    {
+                        if (!forceGroups.empty())
+                        {
+                            forceGroups += " ";
+                        }
+                        forceGroups += gmx::mtsForceGroupNames[i];
+                    }
+                }
+                PS(forceKey.c_str(), forceGroups.c_str());
+                const std::string factorKey = gmx::formatString("mts-level%d-factor", mtsIndex + 1);
+                PI(factorKey.c_str(), mtsLevel.stepFactor);
+            }
+        }
         PS("comm-mode", ECOM(ir->comm_mode));
         PI("nstcomm", ir->nstcomm);
 
@@ -1287,6 +1330,15 @@ void cmp_inputrec(FILE* fp, const t_inputrec* ir1, const t_inputrec* ir2, real f
     cmp_int64(fp, "inputrec->nsteps", ir1->nsteps, ir2->nsteps);
     cmp_int64(fp, "inputrec->init_step", ir1->init_step, ir2->init_step);
     cmp_int(fp, "inputrec->simulation_part", -1, ir1->simulation_part, ir2->simulation_part);
+    cmp_int(fp, "inputrec->mts", -1, static_cast<int>(ir1->useMts), static_cast<int>(ir2->useMts));
+    if (ir1->useMts && ir2->useMts)
+    {
+        cmp_int(fp, "inputrec->mts-levels", -1, ir1->mtsLevels.size(), ir2->mtsLevels.size());
+        cmp_int(fp, "inputrec->mts-level2-forces", -1, ir1->mtsLevels[1].forceGroups.to_ulong(),
+                ir2->mtsLevels[1].forceGroups.to_ulong());
+        cmp_int(fp, "inputrec->mts-level2-factor", -1, ir1->mtsLevels[1].stepFactor,
+                ir2->mtsLevels[1].stepFactor);
+    }
     cmp_int(fp, "inputrec->pbcType", -1, static_cast<int>(ir1->pbcType), static_cast<int>(ir2->pbcType));
     cmp_bool(fp, "inputrec->bPeriodicMols", -1, ir1->bPeriodicMols, ir2->bPeriodicMols);
     cmp_int(fp, "inputrec->cutoff_scheme", -1, ir1->cutoff_scheme, ir2->cutoff_scheme);
diff --git a/src/gromacs/mdtypes/inputrec.h b/src/gromacs/mdtypes/inputrec.h
index ef4302a590..40ed79ea8d 100644
--- a/src/gromacs/mdtypes/inputrec.h
+++ b/src/gromacs/mdtypes/inputrec.h
@@ -41,6 +41,7 @@
 #include <cstdio>
 
 #include <memory>
+#include <vector>
 
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdtypes/md_enums.h"
@@ -59,6 +60,7 @@ namespace gmx
 class Awh;
 struct AwhParams;
 class KeyValueTreeObject;
+struct MtsLevel;
 } // namespace gmx
 
 enum class PbcType;
@@ -348,6 +350,10 @@ struct t_inputrec // NOLINT (clang-analyzer-optin.performance.Padding)
     double init_t;
     //! Time step (ps)
     double delta_t;
+    //! Whether we use multiple time stepping
+    bool useMts;
+    //! The multiple time stepping levels
+    std::vector<gmx::MtsLevel> mtsLevels;
     //! Precision of x in compressed trajectory file
     real x_compression_precision;
     //! Requested fourier_spacing, when nk? not set
diff --git a/src/gromacs/mdtypes/multipletimestepping.cpp b/src/gromacs/mdtypes/multipletimestepping.cpp
new file mode 100644
index 0000000000..be56600040
--- /dev/null
+++ b/src/gromacs/mdtypes/multipletimestepping.cpp
@@ -0,0 +1,97 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "multipletimestepping.h"
+
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/utility/gmxassert.h"
+
+namespace gmx
+{
+
+int nonbondedMtsFactor(const t_inputrec& ir)
+{
+    GMX_RELEASE_ASSERT(!ir.useMts || ir.mtsLevels.size() == 2, "Only 2 MTS levels supported here");
+
+    if (ir.useMts && ir.mtsLevels[1].forceGroups[static_cast<int>(MtsForceGroups::Nonbonded)])
+    {
+        return ir.mtsLevels[1].stepFactor;
+    }
+    else
+    {
+        return 1;
+    }
+}
+
+void assertMtsRequirements(const t_inputrec& ir)
+{
+    if (!ir.useMts)
+    {
+        return;
+    }
+
+    GMX_RELEASE_ASSERT(ir.mtsLevels.size() >= 2, "Need at least two levels for MTS");
+
+    GMX_RELEASE_ASSERT(ir.mtsLevels[0].stepFactor == 1, "Base MTS step should be 1");
+
+    GMX_RELEASE_ASSERT(
+            (!EEL_FULL(ir.coulombtype) && !EVDW_PME(ir.vdwtype))
+                    || ir.mtsLevels.back().forceGroups[static_cast<int>(MtsForceGroups::LongrangeNonbonded)],
+            "Long-range nonbondeds should be in the highest MTS level");
+
+    for (const auto& mtsLevel : ir.mtsLevels)
+    {
+        const int mtsFactor = mtsLevel.stepFactor;
+        GMX_RELEASE_ASSERT(ir.nstcalcenergy % mtsFactor == 0,
+                           "nstcalcenergy should be a multiple of mtsFactor");
+        GMX_RELEASE_ASSERT(ir.nstenergy % mtsFactor == 0,
+                           "nstenergy should be a multiple of mtsFactor");
+        GMX_RELEASE_ASSERT(ir.nstlog % mtsFactor == 0, "nstlog should be a multiple of mtsFactor");
+        GMX_RELEASE_ASSERT(ir.epc == epcNO || ir.nstpcouple % mtsFactor == 0,
+                           "nstpcouple should be a multiple of mtsFactor");
+        GMX_RELEASE_ASSERT(ir.efep == efepNO || ir.fepvals->nstdhdl % mtsFactor == 0,
+                           "nstdhdl should be a multiple of mtsFactor");
+        if (ir.mtsLevels.back().forceGroups[static_cast<int>(gmx::MtsForceGroups::Nonbonded)])
+        {
+            GMX_RELEASE_ASSERT(ir.nstlist % ir.mtsLevels.back().stepFactor == 0,
+                               "With multiple time stepping for the non-bonded pair interactions, "
+                               "nstlist should be a "
+                               "multiple of mtsFactor");
+        }
+    }
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdtypes/multipletimestepping.h b/src/gromacs/mdtypes/multipletimestepping.h
new file mode 100644
index 0000000000..a1a953a3a7
--- /dev/null
+++ b/src/gromacs/mdtypes/multipletimestepping.h
@@ -0,0 +1,82 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_MULTIPLETIMESTEPPING_H
+#define GMX_MULTIPLETIMESTEPPING_H
+
+#include <bitset>
+
+#include "gromacs/utility/enumerationhelpers.h"
+
+struct t_inputrec;
+
+namespace gmx
+{
+
+//! Force group available for selection for multiple time step integration
+enum class MtsForceGroups : int
+{
+    LongrangeNonbonded, //!< PME-mesh or Ewald for electrostatics and/or LJ
+    Nonbonded,          //!< Non-bonded pair interactions
+    Pair,               //!< Bonded pair interactions
+    Dihedral,           //!< Dihedrals, including cmap (not restraints)
+    Angle,              //! Bonded angle potentials (not restraints)
+    Count               //! The number of groups above
+};
+
+static const gmx::EnumerationArray<MtsForceGroups, std::string> mtsForceGroupNames = {
+    "longrange-nonbonded", "nonbonded", "pair", "dihedral", "angle"
+};
+
+//! Setting for a single level for multiple time step integration
+struct MtsLevel
+{
+    //! The force group selection for this level;
+    std::bitset<static_cast<int>(MtsForceGroups::Count)> forceGroups;
+    //! The factor between the base, fastest, time step and the time step for this level
+    int stepFactor;
+};
+
+/*! \brief Returns the interval in steps at which the non-bonded pair forces are calculated
+ *
+ * Note: returns 1 when multiple time-stepping is not activated.
+ */
+int nonbondedMtsFactor(const t_inputrec& ir);
+
+//! (Release) Asserts that all multiple time-stepping requirements on \p ir are fulfilled
+void assertMtsRequirements(const t_inputrec& ir);
+
+} // namespace gmx
+
+#endif /* GMX_MULTIPLETIMESTEPPING_H */
diff --git a/src/gromacs/mdtypes/simulation_workload.h b/src/gromacs/mdtypes/simulation_workload.h
index 0763e6f125..f2d08c5847 100644
--- a/src/gromacs/mdtypes/simulation_workload.h
+++ b/src/gromacs/mdtypes/simulation_workload.h
@@ -69,6 +69,8 @@ public:
     bool haveDynamicBox = false;
     //! Whether neighbor searching needs to be done this step
     bool doNeighborSearch = false;
+    //! Whether the slow forces need to be computed this step (in addition to the faster forces)
+    bool computeSlowForces = false;
     //! Whether virial needs to be computed this step
     bool computeVirial = false;
     //! Whether energies need to be computed this step this step
@@ -111,10 +113,9 @@ class DomainLifetimeWorkload
 public:
     //! Whether the current nstlist step-range has bonded work to run on a GPU.
     bool haveGpuBondedWork = false;
-    //! Whether the current nstlist step-range has bonded work to run on he CPU.
+    //! Whether the current nstlist step-range has bonded work to run on the CPU.
     bool haveCpuBondedWork = false;
-    //! Whether the current nstlist step-range has listed forces work to run on he CPU.
-    //  Note: currently this is haveCpuBondedWork | haveRestraintsWork
+    //! Whether the current nstlist step-range has listed (bonded + restraints) forces work to run on the CPU.
     bool haveCpuListedForceWork = false;
     //! Whether the current nstlist step-range has special forces on the CPU.
     bool haveSpecialForces = false;
@@ -142,6 +143,8 @@ class SimulationWorkload
 public:
     //! Whether to compute nonbonded pair interactions
     bool computeNonbonded = false;
+    //! Wether nonbonded pair forces are to be computed at slow MTS steps only
+    bool computeNonbondedAtMtsLevel1 = false;
     //! Whether total dipole needs to be computed
     bool computeMuTot = false;
     //! If we have calculation of short range nonbondeds on CPU
diff --git a/src/gromacs/mdtypes/tests/forcebuffers.cpp b/src/gromacs/mdtypes/tests/forcebuffers.cpp
index d23b91023c..504d42f12b 100644
--- a/src/gromacs/mdtypes/tests/forcebuffers.cpp
+++ b/src/gromacs/mdtypes/tests/forcebuffers.cpp
@@ -64,7 +64,7 @@ TEST(ForceBuffers, ConstructsUnpinned)
 
 TEST(ForceBuffers, ConstructsPinned)
 {
-    ForceBuffers forceBuffers(PinningPolicy::PinnedIfSupported);
+    ForceBuffers forceBuffers(false, PinningPolicy::PinnedIfSupported);
 
     EXPECT_EQ(forceBuffers.pinningPolicy(), PinningPolicy::PinnedIfSupported);
 }
@@ -123,7 +123,7 @@ TEST(ForceBuffers, CopyWorks)
 
 TEST(ForceBuffers, CopyDoesNotPin)
 {
-    ForceBuffers forceBuffers(PinningPolicy::PinnedIfSupported);
+    ForceBuffers forceBuffers(false, PinningPolicy::PinnedIfSupported);
 
     ForceBuffers forceBuffersCopy;
     forceBuffersCopy = forceBuffers;
diff --git a/src/gromacs/nbnxm/pairlist_tuning.cpp b/src/gromacs/nbnxm/pairlist_tuning.cpp
index e4c90428c2..6d0afa41ce 100644
--- a/src/gromacs/nbnxm/pairlist_tuning.cpp
+++ b/src/gromacs/nbnxm/pairlist_tuning.cpp
@@ -59,6 +59,7 @@
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/inputrec.h"
 #include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/mdtypes/state.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/topology/topology.h"
@@ -152,25 +153,33 @@ void increaseNstlist(FILE*               fp,
     const char* dd_err  = "Can not increase nstlist because of domain decomposition limitations";
     char        buf[STRLEN];
 
+    /* When most of the computation, and in particular the non-bondeds is only
+     * performed every ir->mtsFactor steps due to multiple time stepping,
+     * we scale all nstlist values by this factor.
+     */
+    const int mtsFactor = gmx::nonbondedMtsFactor(*ir);
+
     if (nstlist_cmdline <= 0)
     {
-        if (ir->nstlist == 1)
+        if (ir->nstlist <= mtsFactor)
         {
-            /* The user probably set nstlist=1 for a reason,
-             * don't mess with the settings.
+            /* The user probably set nstlist<=mtsFactor for a reason,
+             * don't mess with the settings, except when < mtsFactor.
              */
+            ir->nstlist = mtsFactor;
+
             return;
         }
 
         /* With a GPU and fixed nstlist suggest tuning nstlist */
-        if (fp != nullptr && useOrEmulateGpuForNonbondeds && ir->nstlist < nstlist_try[0]
+        if (fp != nullptr && useOrEmulateGpuForNonbondeds && ir->nstlist < nstlist_try[0] * mtsFactor
             && !supportsDynamicPairlistGenerationInterval(*ir))
         {
             fprintf(fp, nstl_gpu, ir->nstlist);
         }
 
         nstlist_ind = 0;
-        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
+        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind] * mtsFactor)
         {
             nstlist_ind++;
         }
@@ -249,10 +258,10 @@ void increaseNstlist(FILE*               fp,
     VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
 
     /* Allow rlist to make the list a given factor larger than the list
-     * would be with the reference value for nstlist (10).
+     * would be with the reference value for nstlist (10*mtsFactor).
      */
     nstlist_prev = ir->nstlist;
-    ir->nstlist  = nbnxnReferenceNstlist;
+    ir->nstlist  = nbnxnReferenceNstlist * mtsFactor;
     const real rlistWithReferenceNstlist =
             calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
     ir->nstlist = nstlist_prev;
@@ -273,11 +282,12 @@ void increaseNstlist(FILE*               fp,
     {
         if (nstlist_cmdline <= 0)
         {
-            ir->nstlist = nstlist_try[nstlist_ind];
+            ir->nstlist = nstlist_try[nstlist_ind] * mtsFactor;
         }
 
         /* Set the pair-list buffer size in ir */
-        rlist_new = calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
+        rlist_new = calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - mtsFactor,
+                                         -1, listSetup);
 
         /* Does rlist fit in the box? */
         bBox = (gmx::square(rlist_new) < max_cutoff2(ir->pbcType, box));
@@ -394,7 +404,17 @@ static void setDynamicPairlistPruningParameters(const t_inputrec*          ir,
                                                 const interaction_const_t* ic,
                                                 PairlistParams*            listParams)
 {
-    listParams->lifetime = ir->nstlist - 1;
+    /* When applying multiple time stepping to the non-bonded forces,
+     * we only compute them every mtsFactor steps, so all parameters here
+     * should be a multiple of mtsFactor.
+     */
+    listParams->mtsFactor = gmx::nonbondedMtsFactor(*ir);
+
+    const int mtsFactor = listParams->mtsFactor;
+
+    GMX_RELEASE_ASSERT(ir->nstlist % mtsFactor == 0, "nstlist should be a multiple of mtsFactor");
+
+    listParams->lifetime = ir->nstlist - mtsFactor;
 
     /* When nstlistPrune was set by the user, we need to execute one loop
      * iteration to determine rlistInner.
@@ -407,9 +427,9 @@ static void setDynamicPairlistPruningParameters(const t_inputrec*          ir,
     {
         /* Dynamic pruning on the GPU is performed on the list for
          * the next step on the coordinates of the current step,
-         * so the list lifetime is nstlistPrune (not the usual nstlist-1).
+         * so the list lifetime is nstlistPrune (not the usual nstlist-mtsFactor).
          */
-        int listLifetime         = tunedNstlistPrune - (useGpuList ? 0 : 1);
+        int listLifetime         = tunedNstlistPrune - (useGpuList ? 0 : mtsFactor);
         listParams->nstlistPrune = tunedNstlistPrune;
         listParams->rlistInner   = calcVerletBufferSize(*mtop, det(box), *ir, tunedNstlistPrune,
                                                       listLifetime, -1, listSetup);
@@ -418,7 +438,7 @@ static void setDynamicPairlistPruningParameters(const t_inputrec*          ir,
          * every c_nbnxnGpuRollingListPruningInterval steps,
          * so keep nstlistPrune a multiple of the interval.
          */
-        tunedNstlistPrune += useGpuList ? c_nbnxnGpuRollingListPruningInterval : 1;
+        tunedNstlistPrune += (useGpuList ? c_nbnxnGpuRollingListPruningInterval : 1) * mtsFactor;
     } while (!userSetNstlistPrune && tunedNstlistPrune < ir->nstlist
              && listParams->rlistInner == interactionCutoff);
 
diff --git a/src/gromacs/nbnxm/pairlistparams.cpp b/src/gromacs/nbnxm/pairlistparams.cpp
index 26f6fec3f9..32f2299a32 100644
--- a/src/gromacs/nbnxm/pairlistparams.cpp
+++ b/src/gromacs/nbnxm/pairlistparams.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -60,6 +60,7 @@ PairlistParams::PairlistParams(const Nbnxm::KernelType kernelType,
     rlistInner(rlist),
     haveMultipleDomains(haveMultipleDomains),
     useDynamicPruning(false),
+    mtsFactor(1),
     nstlistPrune(-1),
     numRollingPruningParts(1),
     lifetime(-1)
diff --git a/src/gromacs/nbnxm/pairlistparams.h b/src/gromacs/nbnxm/pairlistparams.h
index 92826f070d..9eefac4a9d 100644
--- a/src/gromacs/nbnxm/pairlistparams.h
+++ b/src/gromacs/nbnxm/pairlistparams.h
@@ -130,6 +130,8 @@ struct PairlistParams
     bool haveMultipleDomains;
     //! Are we using dynamic pair-list pruning
     bool useDynamicPruning;
+    //! The interval in steps for computing non-bonded interactions, =1 without MTS
+    int mtsFactor;
     //! Pair-list dynamic pruning interval
     int nstlistPrune;
     //! The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning
diff --git a/src/gromacs/nbnxm/pairlistsets.h b/src/gromacs/nbnxm/pairlistsets.h
index d3ca0bdb8e..0119680c72 100644
--- a/src/gromacs/nbnxm/pairlistsets.h
+++ b/src/gromacs/nbnxm/pairlistsets.h
@@ -109,7 +109,8 @@ public:
         const int age = numStepsWithPairlist(step);
 
         return (params_.useDynamicPruning && age > 0 && age < params_.lifetime
-                && (params_.haveMultipleDomains || age % 2 == 0));
+                && step % params_.mtsFactor == 0
+                && (params_.haveMultipleDomains || age % (2 * params_.mtsFactor) == 0));
     }
 
     //! Changes the pair-list outer and inner radius
diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp
index 7b55de4137..1ad8ef347a 100644
--- a/src/gromacs/nbnxm/prunekerneldispatch.cpp
+++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp
@@ -101,7 +101,8 @@ void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
 
-    const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % 2 == 0);
+    const bool stepIsEven =
+            (pairlistSets().numStepsWithPairlist(step) % (2 * pairlistSets().params().mtsFactor) == 0);
 
     Nbnxm::gpu_launch_kernel_pruneonly(
             gpu_nbv, stepIsEven ? gmx::InteractionLocality::Local : gmx::InteractionLocality::NonLocal,
diff --git a/src/gromacs/pulling/pull.cpp b/src/gromacs/pulling/pull.cpp
index a68c4ecb57..67cf362e79 100644
--- a/src/gromacs/pulling/pull.cpp
+++ b/src/gromacs/pulling/pull.cpp
@@ -1988,6 +1988,10 @@ struct pull_t* init_pull(FILE*                     fplog,
                           epull_names[epullUMBRELLA]);
             }
 
+            GMX_RELEASE_ASSERT(
+                    !ir->useMts,
+                    "Constraint pulling can not be combined with multiple time stepping");
+
             pull->bConstraint = TRUE;
         }
         else
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index 1083c67fa4..85dfe60e93 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -588,6 +588,11 @@ bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecom
         }
     }
 
+    if (inputrec.useMts)
+    {
+        errorMessage += "Multiple time stepping is not supported.\n";
+    }
+
     if (inputrec.eConstrAlg == econtSHAKE && hasAnyConstraints && gmx_mtop_ftype_count(mtop, F_CONSTR) > 0)
     {
         errorMessage += "SHAKE constraints are not supported.\n";
diff --git a/src/gromacs/taskassignment/decidesimulationworkload.cpp b/src/gromacs/taskassignment/decidesimulationworkload.cpp
index 60abedf026..06b1eeb29d 100644
--- a/src/gromacs/taskassignment/decidesimulationworkload.cpp
+++ b/src/gromacs/taskassignment/decidesimulationworkload.cpp
@@ -44,6 +44,7 @@
 #include "decidesimulationworkload.h"
 
 #include "gromacs/ewald/pme.h"
+#include "gromacs/mdtypes/multipletimestepping.h"
 #include "gromacs/taskassignment/decidegpuusage.h"
 #include "gromacs/taskassignment/taskassignment.h"
 #include "gromacs/utility/arrayref.h"
@@ -61,15 +62,19 @@ SimulationWorkload createSimulationWorkload(const t_inputrec& inputrec,
 {
     SimulationWorkload simulationWorkload;
     simulationWorkload.computeNonbonded = !disableNonbondedCalculation;
-    simulationWorkload.computeMuTot     = inputrecNeedMutot(&inputrec);
-    simulationWorkload.useCpuNonbonded  = !useGpuForNonbonded;
-    simulationWorkload.useGpuNonbonded  = useGpuForNonbonded;
-    simulationWorkload.useCpuPme        = (pmeRunMode == PmeRunMode::CPU);
+    simulationWorkload.computeNonbondedAtMtsLevel1 =
+            simulationWorkload.computeNonbonded && inputrec.useMts
+            && inputrec.mtsLevels.back().forceGroups[static_cast<int>(MtsForceGroups::Nonbonded)];
+    simulationWorkload.computeMuTot    = inputrecNeedMutot(&inputrec);
+    simulationWorkload.useCpuNonbonded = !useGpuForNonbonded;
+    simulationWorkload.useGpuNonbonded = useGpuForNonbonded;
+    simulationWorkload.useCpuPme       = (pmeRunMode == PmeRunMode::CPU);
     simulationWorkload.useGpuPme = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
-    simulationWorkload.useGpuPmeFft       = (pmeRunMode == PmeRunMode::Mixed);
-    simulationWorkload.useGpuBonded       = useGpuForBonded;
-    simulationWorkload.useGpuUpdate       = useGpuForUpdate;
-    simulationWorkload.useGpuBufferOps    = devFlags.enableGpuBufferOps || useGpuForUpdate;
+    simulationWorkload.useGpuPmeFft    = (pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuBonded    = useGpuForBonded;
+    simulationWorkload.useGpuUpdate    = useGpuForUpdate;
+    simulationWorkload.useGpuBufferOps = (devFlags.enableGpuBufferOps || useGpuForUpdate)
+                                         && !simulationWorkload.computeNonbondedAtMtsLevel1;
     simulationWorkload.useGpuHaloExchange = devFlags.enableGpuHaloExchange;
     simulationWorkload.useGpuPmePpCommunication =
             devFlags.enableGpuPmePPComm && (pmeRunMode == PmeRunMode::GPU);
diff --git a/src/programs/mdrun/tests/CMakeLists.txt b/src/programs/mdrun/tests/CMakeLists.txt
index b131fba9d4..4b5257839c 100644
--- a/src/programs/mdrun/tests/CMakeLists.txt
+++ b/src/programs/mdrun/tests/CMakeLists.txt
@@ -101,6 +101,7 @@ set(exename "mdrun-test")
 gmx_add_gtest_executable(${exename}
     CPP_SOURCE_FILES
         ewaldsurfaceterm.cpp
+        multiple_time_stepping.cpp
         orires.cpp
         simulator.cpp
         swapcoords.cpp
diff --git a/src/programs/mdrun/tests/multiple_time_stepping.cpp b/src/programs/mdrun/tests/multiple_time_stepping.cpp
new file mode 100644
index 0000000000..44d6ffd990
--- /dev/null
+++ b/src/programs/mdrun/tests/multiple_time_stepping.cpp
@@ -0,0 +1,194 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Tests to compare that multiple time stepping is (nearly) identical to normal integration.
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup module_mdrun_integration_tests
+ */
+#include "gmxpre.h"
+
+#include "gromacs/topology/ifunc.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "testutils/mpitest.h"
+#include "testutils/setenv.h"
+#include "testutils/simulationdatabase.h"
+
+#include "moduletest.h"
+#include "simulatorcomparison.h"
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+
+/*! \brief Test fixture base for two integration schemes
+ *
+ * This test ensures that integration with(out) different multiple time stepping
+ * scheems (called via different mdp options) yield near identical energies,
+ * forces and virial at step 0 and similar energies and virial after 4 steps.
+ */
+using MtsComparisonTestParams = std::tuple<std::string, std::string>;
+class MtsComparisonTest : public MdrunTestFixture, public ::testing::WithParamInterface<MtsComparisonTestParams>
+{
+};
+
+//! Returns set of energy terms to compare with associated tolerances
+EnergyTermsToCompare energyTermsToCompare(const real energyTol, const real virialTol)
+{
+    return EnergyTermsToCompare{ { { interaction_function[F_EPOT].longname,
+                                     relativeToleranceAsFloatingPoint(100.0, energyTol) },
+                                   { "Vir-XX", relativeToleranceAsFloatingPoint(30.0, virialTol) },
+                                   { "Vir-YY", relativeToleranceAsFloatingPoint(30.0, virialTol) },
+                                   { "Vir-ZZ", relativeToleranceAsFloatingPoint(30.0, virialTol) } } };
+}
+
+TEST_P(MtsComparisonTest, WithinTolerances)
+{
+    auto params         = GetParam();
+    auto simulationName = std::get<0>(params);
+    auto mtsScheme      = std::get<1>(params);
+
+    // Note that there should be no relevant limitation on MPI ranks and OpenMP threads
+    SCOPED_TRACE(formatString("Comparing for '%s' no MTS with MTS scheme '%s'",
+                              simulationName.c_str(), mtsScheme.c_str()));
+
+    const int numSteps         = 4;
+    auto      sharedMdpOptions = gmx::formatString(
+            "integrator   = md\n"
+            "dt           = 0.001\n"
+            "nsteps       = %d\n"
+            "verlet-buffer-tolerance = -1\n"
+            "rlist        = 1.0\n"
+            "coulomb-type = PME\n"
+            "vdw-type     = cut-off\n"
+            "rcoulomb     = 0.9\n"
+            "rvdw         = 0.9\n"
+            "constraints  = h-bonds\n",
+            numSteps);
+
+    // set nstfout to > numSteps so we only write forces at step 0
+    const int nstfout       = 2 * numSteps;
+    auto      refMdpOptions = sharedMdpOptions
+                         + gmx::formatString(
+                                   "mts       = no\n"
+                                   "nstcalcenergy = %d\n"
+                                   "nstenergy = %d\n"
+                                   "nstxout   = 0\n"
+                                   "nstvout   = 0\n"
+                                   "nstfout   = %d\n",
+                                   numSteps, numSteps, nstfout);
+
+    auto mtsMdpOptions = sharedMdpOptions
+                         + gmx::formatString(
+                                   "mts        = yes\n"
+                                   "mts-levels = 2\n"
+                                   "mts-level2-forces = %s\n"
+                                   "mts-level2-factor = 2\n"
+                                   "nstcalcenergy = %d\n"
+                                   "nstenergy  = %d\n"
+                                   "nstxout    = 0\n"
+                                   "nstvout    = 0\n"
+                                   "nstfout    = %d\n",
+                                   mtsScheme.c_str(), numSteps, numSteps, nstfout);
+
+    // At step 0 the energy and virial should only differ due to rounding errors
+    EnergyTermsToCompare energyTermsToCompareStep0 = energyTermsToCompare(0.001, 0.01);
+    EnergyTermsToCompare energyTermsToCompareAllSteps =
+            energyTermsToCompare(mtsScheme == "pme" ? 0.015 : 0.04, mtsScheme == "pme" ? 0.1 : 0.2);
+
+    // Specify how trajectory frame matching must work.
+    TrajectoryFrameMatchSettings trajectoryMatchSettings{ true,
+                                                          true,
+                                                          true,
+                                                          ComparisonConditions::NoComparison,
+                                                          ComparisonConditions::NoComparison,
+                                                          ComparisonConditions::MustCompare };
+    TrajectoryTolerances trajectoryTolerances = TrajectoryComparison::s_defaultTrajectoryTolerances;
+
+    // Build the functor that will compare reference and test
+    // trajectory frames in the chosen way.
+    TrajectoryComparison trajectoryComparison{ trajectoryMatchSettings, trajectoryTolerances };
+
+    // Set file names
+    auto simulator1TrajectoryFileName = fileManager_.getTemporaryFilePath("sim1.trr");
+    auto simulator1EdrFileName        = fileManager_.getTemporaryFilePath("sim1.edr");
+    auto simulator2TrajectoryFileName = fileManager_.getTemporaryFilePath("sim2.trr");
+    auto simulator2EdrFileName        = fileManager_.getTemporaryFilePath("sim2.edr");
+
+    // Run grompp
+    runner_.tprFileName_ = fileManager_.getTemporaryFilePath("sim.tpr");
+    runner_.useTopGroAndNdxFromDatabase(simulationName);
+    runner_.useStringAsMdpFile(refMdpOptions);
+    runGrompp(&runner_);
+
+    // Do first mdrun
+    runner_.fullPrecisionTrajectoryFileName_ = simulator1TrajectoryFileName;
+    runner_.edrFileName_                     = simulator1EdrFileName;
+    runMdrun(&runner_);
+
+    runner_.useStringAsMdpFile(mtsMdpOptions);
+    runGrompp(&runner_);
+
+    // Do second mdrun
+    runner_.fullPrecisionTrajectoryFileName_ = simulator2TrajectoryFileName;
+    runner_.edrFileName_                     = simulator2EdrFileName;
+    runMdrun(&runner_);
+
+    // Compare simulation results at step 0, which should be indentical
+    compareEnergies(simulator1EdrFileName, simulator2EdrFileName, energyTermsToCompareStep0,
+                    MaxNumFrames(1));
+    compareTrajectories(simulator1TrajectoryFileName, simulator2TrajectoryFileName, trajectoryComparison);
+
+    // Compare energies at the last step (and step 0 again) with lower tolerance
+    compareEnergies(simulator1EdrFileName, simulator2EdrFileName, energyTermsToCompareAllSteps,
+                    MaxNumFrames::compareAllFrames());
+}
+
+INSTANTIATE_TEST_CASE_P(
+        MultipleTimeSteppingIsNearSingleTimeStepping,
+        MtsComparisonTest,
+        ::testing::Combine(::testing::Values("ala"),
+                           ::testing::Values("longrange-nonbonded",
+                                             "longrange-nonbonded nonbonded pair dihedral")));
+
+} // namespace
+} // namespace test
+} // namespace gmx
diff --git a/src/programs/mdrun/tests/simulatorcomparison.cpp b/src/programs/mdrun/tests/simulatorcomparison.cpp
index c832d5ab9f..b4c72451aa 100644
--- a/src/programs/mdrun/tests/simulatorcomparison.cpp
+++ b/src/programs/mdrun/tests/simulatorcomparison.cpp
@@ -86,11 +86,12 @@ void runMdrun(SimulationRunner* runner, const std::vector<SimulationOptionTuple>
 
 void compareEnergies(const std::string&          edr1Name,
                      const std::string&          edr2Name,
-                     const EnergyTermsToCompare& energyTermsToCompare)
+                     const EnergyTermsToCompare& energyTermsToCompare,
+                     const MaxNumFrames          maxNumFrames)
 {
     // Build the functor that will compare energy frames on the chosen
     // energy terms.
-    EnergyComparison energyComparison(energyTermsToCompare, MaxNumFrames::compareAllFrames());
+    EnergyComparison energyComparison(energyTermsToCompare, maxNumFrames);
 
     // Build the manager that will present matching pairs of frames to compare.
     //
diff --git a/src/programs/mdrun/tests/simulatorcomparison.h b/src/programs/mdrun/tests/simulatorcomparison.h
index b31566dff9..bc23c312c1 100644
--- a/src/programs/mdrun/tests/simulatorcomparison.h
+++ b/src/programs/mdrun/tests/simulatorcomparison.h
@@ -48,6 +48,7 @@
 
 #include <string>
 
+#include "comparison_helpers.h"
 #include "energycomparison.h"
 #include "trajectorycomparison.h"
 
@@ -67,7 +68,8 @@ void runMdrun(SimulationRunner*                         runner,
 
 void compareEnergies(const std::string&          edr1Name,
                      const std::string&          edr2Name,
-                     const EnergyTermsToCompare& energyTermsToCompare);
+                     const EnergyTermsToCompare& energyTermsToCompare,
+                     MaxNumFrames                maxNumFrams = MaxNumFrames::compareAllFrames());
 
 void compareTrajectories(const std::string&          trajectory1Name,
                          const std::string&          trajectory2Name,
-- 
2.22.0