Merge branch release-2016

[alexxy/gromacs.git] / src / gromacs / mdlib / update.cpp
diff --git a/src/gromacs/mdlib/update.cpp b/src/gromacs/mdlib/update.cpp

index b4e2bdfbc74a9da3aac2a361cfd74ba796ea9a65..0f1374b73ec07ae1e299517380eea288ea843209 100644 (file)
--- a/src/gromacs/mdlib/update.cpp
+++ b/src/gromacs/mdlib/update.cpp
@@ -64,12 +64,14 @@
  #include "gromacs/mdtypes/group.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
  #include "gromacs/pbcutil/boxutilities.h"
  #include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/random/tabulatednormaldistribution.h"
  #include "gromacs/random/threefry.h"
+#include "gromacs/simd/simd.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
@@ -78,6 +80,8 @@
  #include "gromacs/utility/gmxomp.h"
  #include "gromacs/utility/smalloc.h"
  
+using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
+
  /*For debugging, start at v(-dt/2) for velolcity verlet -- uncomment next line */
  /*#define STARTFROMDT2*/
  
@@ -102,177 +106,544 @@ typedef struct {
  
  struct gmx_update_t
  {
-    gmx_stochd_t *sd;
+    gmx_stochd_t     *sd;
      /* xprime for constraint algorithms */
-    rvec         *xp;
-    int           xp_nalloc;
+    PaddedRVecVector  xp;
  
      /* Variables for the deform algorithm */
-    gmx_int64_t     deformref_step;
-    matrix          deformref_box;
+    gmx_int64_t       deformref_step;
+    matrix            deformref_box;
  };
  
+static bool isTemperatureCouplingStep(gmx_int64_t step, const t_inputrec *ir)
+{
+    /* We should only couple after a step where energies were determined (for leapfrog versions)
+       or the step energies are determined, for velocity verlet versions */
+    int offset;
+    if (EI_VV(ir->eI))
+    {
+        offset = 0;
+    }
+    else
+    {
+        offset = 1;
+    }
+    return ir->etc != etcNO &&
+           (ir->nsttcouple == 1 ||
+            do_per_step(step + ir->nsttcouple - offset, ir->nsttcouple));
+}
  
-static void do_update_md(int start, int nrend,
-                         double dt, int nstpcouple,
-                         t_grp_tcstat *tcstat,
-                         double nh_vxi[],
-                         gmx_bool bNEMD, t_grp_acc *gstat, rvec accel[],
-                         ivec nFreeze[],
-                         real invmass[],
-                         unsigned short ptype[], unsigned short cFREEZE[],
-                         unsigned short cACC[], unsigned short cTC[],
-                         rvec x[], rvec xprime[], rvec v[],
-                         rvec f[], matrix M,
-                         gmx_bool bNH, gmx_bool bPR)
+static bool isPressureCouplingStep(gmx_int64_t step, const t_inputrec *ir)
  {
-    double imass, w_dt;
-    int    gf = 0, ga = 0, gt = 0;
-    rvec   vrel;
-    real   vn, vv, va, vb, vnrel;
-    real   lg, vxi = 0, u;
-    int    n, d;
+    GMX_ASSERT(ir->epc != epcMTTK, "MTTK pressure coupling is not handled here");
  
-    if (bNH || bPR)
+    int offset;
+    if (ir->epc == epcBERENDSEN)
      {
-        /* Update with coupling to extended ensembles, used for
-         * Nose-Hoover and Parrinello-Rahman coupling
-         * Nose-Hoover uses the reversible leap-frog integrator from
-         * Holian et al. Phys Rev E 52(3) : 2338, 1995
-         */
-        for (n = start; n < nrend; n++)
+        offset = 0;
+    }
+    else
+    {
+        offset = 1;
+    }
+    /* We should only couple after a step where pressures were determined */
+    return ir->epc != etcNO &&
+           (ir->nstpcouple == 1 ||
+            do_per_step(step + ir->nstpcouple - offset, ir->nstpcouple));
+}
+
+/*! \brief Sets the number of different temperature coupling values */
+enum class NumTempScaleValues
+{
+    single,   //!< Single T-scaling value (either one group or all values =1)
+    multiple  //!< Multiple T-scaling values, need to use T-group indices
+};
+
+/*! \brief Sets if to apply no or diagonal Parrinello-Rahman pressure scaling
+ *
+ * Note that this enum is only used in updateMDLeapfrogSimple(), which does
+ * not handle fully anistropic Parrinello-Rahman scaling, so we only have
+ * options \p no and \p diagonal here and no anistropic option.
+ */
+enum class ApplyParrinelloRahmanVScaling
+{
+    no,       //!< Do not apply velocity scaling (not a PR-coupling run or step)
+    diagonal  //!< Apply velocity scaling using a diagonal matrix
+};
+
+/*! \brief Integrate using leap-frog with T-scaling and optionally diagonal Parrinello-Rahman p-coupling
+ *
+ * \tparam       numTempScaleValues     The number of different T-couple values
+ * \tparam       applyPRVScaling        Apply Parrinello-Rahman velocity scaling
+ * \param[in]    start                  Index of first atom to update
+ * \param[in]    nrend                  Last atom to update: \p nrend - 1
+ * \param[in]    dt                     The time step
+ * \param[in]    dtPressureCouple       Time step for pressure coupling
+ * \param[in]    invMassPerDim          1/mass per atom and dimension
+ * \param[in]    tcstat                 Temperature coupling information
+ * \param[in]    cTC                    T-coupling group index per atom
+ * \param[in]    pRVScaleMatrixDiagonal Parrinello-Rahman v-scale matrix diagonal
+ * \param[in]    x                      Input coordinates
+ * \param[out]   xprime                 Updated coordinates
+ * \param[inout] v                      Velocities
+ * \param[in]    f                      Forces
+ *
+ * We expect this template to get good SIMD acceleration by most compilers,
+ * unlike the more complex general template.
+ * Note that we might get even better SIMD acceleration when we introduce
+ * aligned (and padded) memory, possibly with some hints for the compilers.
+ */
+template<NumTempScaleValues            numTempScaleValues,
+         ApplyParrinelloRahmanVScaling applyPRVScaling>
+static void
+updateMDLeapfrogSimple(int                       start,
+                       int                       nrend,
+                       real                      dt,
+                       real                      dtPressureCouple,
+                       const rvec * gmx_restrict invMassPerDim,
+                       const t_grp_tcstat      * tcstat,
+                       const unsigned short    * cTC,
+                       const rvec                pRVScaleMatrixDiagonal,
+                       const rvec * gmx_restrict x,
+                       rvec       * gmx_restrict xprime,
+                       rvec       * gmx_restrict v,
+                       const rvec * gmx_restrict f)
+{
+    real lambdaGroup;
+
+    if (numTempScaleValues == NumTempScaleValues::single)
+    {
+        lambdaGroup = tcstat[0].lambda;
+    }
+
+    for (int a = start; a < nrend; a++)
+    {
+        if (numTempScaleValues == NumTempScaleValues::multiple)
          {
-            imass = invmass[n];
-            if (cFREEZE)
-            {
-                gf   = cFREEZE[n];
-            }
-            if (cACC)
-            {
-                ga   = cACC[n];
-            }
-            if (cTC)
-            {
-                gt   = cTC[n];
-            }
-            lg   = tcstat[gt].lambda;
-            if (bNH)
+            lambdaGroup = tcstat[cTC[a]].lambda;
+        }
+
+        for (int d = 0; d < DIM; d++)
+        {
+            /* Note that using rvec invMassPerDim results in more efficient
+             * SIMD code, but this increases the cache pressure.
+             * For large systems with PME on the CPU this slows down the
+             * (then already slow) update by 20%. If all data remains in cache,
+             * using rvec is much faster.
+             */
+            real vNew = lambdaGroup*v[a][d] + f[a][d]*invMassPerDim[a][d]*dt;
+
+            if (applyPRVScaling == ApplyParrinelloRahmanVScaling::diagonal)
              {
-                vxi   = nh_vxi[gt];
+                vNew -= dtPressureCouple*pRVScaleMatrixDiagonal[d]*v[a][d];
              }
-            rvec_sub(v[n], gstat[ga].u, vrel);
+            v[a][d]      = vNew;
+            xprime[a][d] = x[a][d] + vNew*dt;
+        }
+    }
+}
  
-            for (d = 0; d < DIM; d++)
-            {
-                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
-                {
-                    vnrel = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
-                                              - nstpcouple*iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
-                    /* do not scale the mean velocities u */
-                    vn             = gstat[ga].u[d] + accel[ga][d]*dt + vnrel;
-                    v[n][d]        = vn;
-                    xprime[n][d]   = x[n][d]+vn*dt;
-                }
-                else
+#if GMX_SIMD && GMX_SIMD_HAVE_REAL
+#define GMX_HAVE_SIMD_UPDATE 1
+#else
+#define GMX_HAVE_SIMD_UPDATE 0
+#endif
+
+#if GMX_HAVE_SIMD_UPDATE
+
+/*! \brief Load (aligned) the contents of GMX_SIMD_REAL_WIDTH rvec elements sequentially into 3 SIMD registers
+ *
+ * The loaded output is:
+ * \p r0: { r[index][XX], r[index][YY], ... }
+ * \p r1: { ... }
+ * \p r2: { ..., r[index+GMX_SIMD_REAL_WIDTH-1][YY], r[index+GMX_SIMD_REAL_WIDTH-1][ZZ] }
+ *
+ * \param[in]  r      Real to an rvec array, has to be aligned to SIMD register width
+ * \param[in]  index  Index of the first rvec triplet of reals to load
+ * \param[out] r0     Pointer to first SIMD register
+ * \param[out] r1     Pointer to second SIMD register
+ * \param[out] r2     Pointer to third SIMD register
+ */
+static inline void simdLoadRvecs(const rvec *r,
+                                 int         index,
+                                 SimdReal   *r0,
+                                 SimdReal   *r1,
+                                 SimdReal   *r2)
+{
+    const real *realPtr = r[index];
+
+    GMX_ASSERT(isSimdAligned(realPtr), "Pointer should be SIMD aligned");
+
+    *r0 = simdLoad(realPtr + 0*GMX_SIMD_REAL_WIDTH);
+    *r1 = simdLoad(realPtr + 1*GMX_SIMD_REAL_WIDTH);
+    *r2 = simdLoad(realPtr + 2*GMX_SIMD_REAL_WIDTH);
+}
+
+/*! \brief Store (aligned) 3 SIMD registers sequentially to GMX_SIMD_REAL_WIDTH rvec elements
+ *
+ * The stored output is:
+ * \p r[index] = { { r0[0], r0[1], ... }
+ * ...
+ * \p r[index+GMX_SIMD_REAL_WIDTH-1] =  { ... , r2[GMX_SIMD_REAL_WIDTH-2], r2[GMX_SIMD_REAL_WIDTH-1] }
+ *
+ * \param[out] r      Pointer to an rvec array, has to be aligned to SIMD register width
+ * \param[in]  index  Index of the first rvec triplet of reals to store to
+ * \param[in]  r0     First SIMD register
+ * \param[in]  r1     Second SIMD register
+ * \param[in]  r2     Third SIMD register
+ */
+static inline void simdStoreRvecs(rvec     *r,
+                                  int       index,
+                                  SimdReal  r0,
+                                  SimdReal  r1,
+                                  SimdReal  r2)
+{
+    real *realPtr = r[index];
+
+    GMX_ASSERT(isSimdAligned(realPtr), "Pointer should be SIMD aligned");
+
+    store(realPtr + 0*GMX_SIMD_REAL_WIDTH, r0);
+    store(realPtr + 1*GMX_SIMD_REAL_WIDTH, r1);
+    store(realPtr + 2*GMX_SIMD_REAL_WIDTH, r2);
+}
+
+/*! \brief Integrate using leap-frog with single group T-scaling and SIMD
+ *
+ * \param[in]    start                  Index of first atom to update
+ * \param[in]    nrend                  Last atom to update: \p nrend - 1
+ * \param[in]    dt                     The time step
+ * \param[in]    invMass                1/mass per atom
+ * \param[in]    tcstat                 Temperature coupling information
+ * \param[in]    x                      Input coordinates
+ * \param[out]   xprime                 Updated coordinates
+ * \param[inout] v                      Velocities
+ * \param[in]    f                      Forces
+ */
+static void
+updateMDLeapfrogSimpleSimd(int                       start,
+                           int                       nrend,
+                           real                      dt,
+                           const real * gmx_restrict invMass,
+                           const t_grp_tcstat      * tcstat,
+                           const rvec * gmx_restrict x,
+                           rvec       * gmx_restrict xprime,
+                           rvec       * gmx_restrict v,
+                           const rvec * gmx_restrict f)
+{
+    SimdReal                  timestep(dt);
+    SimdReal                  lambdaSystem(tcstat[0].lambda);
+
+    /* We declare variables here, since code is often slower when declaring them inside the loop */
+
+    /* Note: We should implement a proper PaddedVector, so we don't need this check */
+    GMX_ASSERT(isSimdAligned(invMass), "invMass should be aligned");
+
+    for (int a = start; a < nrend; a += GMX_SIMD_REAL_WIDTH)
+    {
+        SimdReal invMass0, invMass1, invMass2;
+        expandScalarsToTriplets(simdLoad(invMass + a),
+                                &invMass0, &invMass1, &invMass2);
+
+        SimdReal v0, v1, v2;
+        SimdReal f0, f1, f2;
+        simdLoadRvecs(v, a, &v0, &v1, &v2);
+        simdLoadRvecs(f, a, &f0, &f1, &f2);
+
+        v0 = fma(f0*invMass0, timestep, lambdaSystem*v0);
+        v1 = fma(f1*invMass1, timestep, lambdaSystem*v1);
+        v2 = fma(f2*invMass2, timestep, lambdaSystem*v2);
+
+        simdStoreRvecs(v, a, v0, v1, v2);
+
+        SimdReal x0, x1, x2;
+        simdLoadRvecs(x, a, &x0, &x1, &x2);
+
+        SimdReal xprime0 = fma(v0, timestep, x0);
+        SimdReal xprime1 = fma(v1, timestep, x1);
+        SimdReal xprime2 = fma(v2, timestep, x2);
+
+        simdStoreRvecs(xprime, a, xprime0, xprime1, xprime2);
+    }
+}
+
+#endif // GMX_HAVE_SIMD_UPDATE
+
+/*! \brief Sets the NEMD acceleration type */
+enum class AccelerationType
+{
+    none, group, cosine
+};
+
+/*! \brief Integrate using leap-frog with support for everything
+ *
+ * \tparam       accelerationType       Type of NEMD acceleration
+ * \param[in]    start                  Index of first atom to update
+ * \param[in]    nrend                  Last atom to update: \p nrend - 1
+ * \param[in]    doNoseHoover           If to apply Nose-Hoover T-coupling
+ * \param[in]    dt                     The time step
+ * \param[in]    dtPressureCouple       Time step for pressure coupling, is 0 when no pressure coupling should be applied at this step
+ * \param[in]    ir                     The input parameter record
+ * \param[in]    md                     Atom properties
+ * \param[in]    ekind                  Kinetic energy data
+ * \param[in]    box                    The box dimensions
+ * \param[in]    x                      Input coordinates
+ * \param[out]   xprime                 Updated coordinates
+ * \param[inout] v                      Velocities
+ * \param[in]    f                      Forces
+ * \param[in]    nh_vxi                 Nose-Hoover velocity scaling factors
+ * \param[in]    M                      Parrinello-Rahman scaling matrix
+ */
+template<AccelerationType accelerationType>
+static void
+updateMDLeapfrogGeneral(int                         start,
+                        int                         nrend,
+                        bool                        doNoseHoover,
+                        real                        dt,
+                        real                        dtPressureCouple,
+                        const t_inputrec          * ir,
+                        const t_mdatoms           * md,
+                        const gmx_ekindata_t      * ekind,
+                        const matrix                box,
+                        const rvec   * gmx_restrict x,
+                        rvec         * gmx_restrict xprime,
+                        rvec         * gmx_restrict v,
+                        const rvec   * gmx_restrict f,
+                        const double * gmx_restrict nh_vxi,
+                        const matrix                M)
+{
+    /* This is a version of the leap-frog integrator that supports
+     * all combinations of T-coupling, P-coupling and NEMD.
+     * Nose-Hoover uses the reversible leap-frog integrator from
+     * Holian et al. Phys Rev E 52(3) : 2338, 1995
+     */
+
+    const unsigned short    * cTC           = md->cTC;
+    const t_grp_tcstat      * tcstat        = ekind->tcstat;
+
+    const unsigned short    * cACC          = md->cACC;
+    const rvec              * accel         = ir->opts.acc;
+    const t_grp_acc         * grpstat       = ekind->grpstat;
+
+    const rvec * gmx_restrict invMassPerDim = md->invMassPerDim;
+
+    /* Initialize group values, changed later when multiple groups are used */
+    int  ga       = 0;
+    int  gt       = 0;
+    real factorNH = 0;
+
+    for (int n = start; n < nrend; n++)
+    {
+        if (cTC)
+        {
+            gt  = cTC[n];
+        }
+        real lg = tcstat[gt].lambda;
+
+        rvec vRel;
+        real cosineZ, vCosine;
+#ifdef __INTEL_COMPILER
+#pragma warning( disable : 280 )
+#endif
+        switch (accelerationType)
+        {
+            case AccelerationType::none:
+                copy_rvec(v[n], vRel);
+                break;
+            case AccelerationType::group:
+                if (cACC)
                  {
-                    v[n][d]        = 0.0;
-                    xprime[n][d]   = x[n][d];
+                    ga = cACC[n];
                  }
+                /* Avoid scaling the group velocity */
+                rvec_sub(v[n], grpstat[ga].u, vRel);
+                break;
+            case AccelerationType::cosine:
+                cosineZ = std::cos(x[n][ZZ]*static_cast<real>(M_PI)/box[ZZ][ZZ]);
+                vCosine = cosineZ*ekind->cosacc.vcos;
+                /* Avoid scaling the cosine profile velocity */
+                copy_rvec(v[n], vRel);
+                vRel[XX] -= vCosine;
+                break;
+        }
+
+        if (doNoseHoover)
+        {
+            /* Here we account for multiple time stepping, by increasing
+             * the Nose-Hoover correction by nsttcouple
+             */
+            factorNH = 0.5*ir->nsttcouple*dt*nh_vxi[gt];
+        }
+
+        for (int d = 0; d < DIM; d++)
+        {
+            real vNew =
+                (lg*vRel[d] + (f[n][d]*invMassPerDim[n][d]*dt - factorNH*vRel[d]
+                               - dtPressureCouple*iprod(M[d], vRel)))/(1 + factorNH);
+            switch (accelerationType)
+            {
+                case AccelerationType::none:
+                    break;
+                case AccelerationType::group:
+                    /* Add back the mean velocity and apply acceleration */
+                    vNew += grpstat[ga].u[d] + accel[ga][d]*dt;
+                    break;
+                case AccelerationType::cosine:
+                    if (d == XX)
+                    {
+                        /* Add back the mean velocity and apply acceleration */
+                        vNew += vCosine + cosineZ*ekind->cosacc.cos_accel*dt;
+                    }
+                    break;
              }
+            v[n][d]       = vNew;
+            xprime[n][d]  = x[n][d] + vNew*dt;
          }
      }
-    else if (cFREEZE != NULL ||
-             nFreeze[0][XX] || nFreeze[0][YY] || nFreeze[0][ZZ] ||
-             bNEMD)
+}
+
+/*! \brief Handles the Leap-frog MD x and v integration */
+static void do_update_md(int                         start,
+                         int                         nrend,
+                         gmx_int64_t                 step,
+                         real                        dt,
+                         const t_inputrec          * ir,
+                         const t_mdatoms           * md,
+                         const gmx_ekindata_t      * ekind,
+                         const matrix                box,
+                         const rvec   * gmx_restrict x,
+                         rvec         * gmx_restrict xprime,
+                         rvec         * gmx_restrict v,
+                         const rvec   * gmx_restrict f,
+                         const double * gmx_restrict nh_vxi,
+                         const matrix                M)
+{
+    GMX_ASSERT(nrend == start || xprime != x, "For SIMD optimization certain compilers need to have xprime != x");
+
+    /* Note: Berendsen pressure scaling is handled after do_update_md() */
+    bool doTempCouple       = isTemperatureCouplingStep(step, ir);
+    bool doNoseHoover       = (ir->etc == etcNOSEHOOVER && doTempCouple);
+    bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && isPressureCouplingStep(step, ir));
+    bool doPROffDiagonal    = (doParrinelloRahman && (M[YY][XX] != 0 || M[ZZ][XX] != 0 || M[ZZ][YY] != 0));
+
+    real dtPressureCouple   = (doParrinelloRahman ? ir->nstpcouple*dt : 0);
+
+    /* NEMD (also cosine) acceleration is applied in updateMDLeapFrogGeneral */
+    bool doAcceleration     = (ekind->bNEMD || ekind->cosacc.cos_accel != 0);
+
+    if (doNoseHoover || doPROffDiagonal || doAcceleration)
      {
-        /* Update with Berendsen/v-rescale coupling and freeze or NEMD */
-        for (n = start; n < nrend; n++)
+        if (!doAcceleration)
          {
-            w_dt = invmass[n]*dt;
-            if (cFREEZE)
-            {
-                gf   = cFREEZE[n];
-            }
-            if (cACC)
+            updateMDLeapfrogGeneral<AccelerationType::none>
+                (start, nrend, doNoseHoover, dt, dtPressureCouple,
+                ir, md, ekind, box, x, xprime, v, f, nh_vxi, M);
+        }
+        else if (ekind->bNEMD)
+        {
+            updateMDLeapfrogGeneral<AccelerationType::group>
+                (start, nrend, doNoseHoover, dt, dtPressureCouple,
+                ir, md, ekind, box, x, xprime, v, f, nh_vxi, M);
+        }
+        else
+        {
+            updateMDLeapfrogGeneral<AccelerationType::cosine>
+                (start, nrend, doNoseHoover, dt, dtPressureCouple,
+                ir, md, ekind, box, x, xprime, v, f, nh_vxi, M);
+        }
+    }
+    else
+    {
+        /* We determine if we have a single T-coupling lambda value for all
+         * atoms. That allows for better SIMD acceleration in the template.
+         * If we do not do temperature coupling (in the run or this step),
+         * all scaling values are 1, so we effectively have a single value.
+         */
+        bool haveSingleTempScaleValue = (!doTempCouple || ekind->ngtc == 1);
+
+        /* Extract some pointers needed by all cases */
+        const unsigned short *cTC           = md->cTC;
+        const t_grp_tcstat   *tcstat        = ekind->tcstat;
+        const rvec           *invMassPerDim = md->invMassPerDim;
+
+        if (doParrinelloRahman)
+        {
+            GMX_ASSERT(!doPROffDiagonal, "updateMDLeapfrogSimple only support diagonal Parrinello-Rahman scaling matrices");
+
+            rvec diagM;
+            for (int d = 0; d < DIM; d++)
              {
-                ga   = cACC[n];
+                diagM[d] = M[d][d];
              }
-            if (cTC)
+
+            if (haveSingleTempScaleValue)
              {
-                gt   = cTC[n];
+                updateMDLeapfrogSimple
+                <NumTempScaleValues::single,
+                 ApplyParrinelloRahmanVScaling::diagonal>
+                    (start, nrend, dt, dtPressureCouple,
+                    invMassPerDim, tcstat, cTC, diagM, x, xprime, v, f);
              }
-            lg   = tcstat[gt].lambda;
-
-            for (d = 0; d < DIM; d++)
+            else
              {
-                vn             = v[n][d];
-                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
-                {
-                    vv             = lg*vn + f[n][d]*w_dt;
-
-                    /* do not scale the mean velocities u */
-                    u              = gstat[ga].u[d];
-                    va             = vv + accel[ga][d]*dt;
-                    vb             = va + (1.0-lg)*u;
-                    v[n][d]        = vb;
-                    xprime[n][d]   = x[n][d]+vb*dt;
-                }
-                else
-                {
-                    v[n][d]        = 0.0;
-                    xprime[n][d]   = x[n][d];
-                }
+                updateMDLeapfrogSimple
+                <NumTempScaleValues::multiple,
+                 ApplyParrinelloRahmanVScaling::diagonal>
+                    (start, nrend, dt, dtPressureCouple,
+                    invMassPerDim, tcstat, cTC, diagM, x, xprime, v, f);
              }
          }
-    }
-    else
-    {
-        /* Plain update with Berendsen/v-rescale coupling */
-        for (n = start; n < nrend; n++)
+        else
          {
-            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+            if (haveSingleTempScaleValue)
              {
-                w_dt = invmass[n]*dt;
-                if (cTC)
+                /* Note that modern compilers are pretty good at vectorizing
+                 * updateMDLeapfrogSimple(). But the SIMD version will still
+                 * be faster because invMass lowers the cache pressure
+                 * compared to invMassPerDim.
+                 */
+#if GMX_HAVE_SIMD_UPDATE
+                /* Check if we can use invmass instead of invMassPerDim */
+                if (!md->havePartiallyFrozenAtoms)
                  {
-                    gt = cTC[n];
+                    updateMDLeapfrogSimpleSimd
+                        (start, nrend, dt, md->invmass, tcstat, x, xprime, v, f);
                  }
-                lg = tcstat[gt].lambda;
-
-                for (d = 0; d < DIM; d++)
+                else
+#endif
                  {
-                    vn           = lg*v[n][d] + f[n][d]*w_dt;
-                    v[n][d]      = vn;
-                    xprime[n][d] = x[n][d] + vn*dt;
+                    updateMDLeapfrogSimple
+                    <NumTempScaleValues::single,
+                     ApplyParrinelloRahmanVScaling::no>
+                        (start, nrend, dt, dtPressureCouple,
+                        invMassPerDim, tcstat, cTC, nullptr, x, xprime, v, f);
                  }
              }
              else
              {
-                for (d = 0; d < DIM; d++)
-                {
-                    v[n][d]        = 0.0;
-                    xprime[n][d]   = x[n][d];
-                }
+                updateMDLeapfrogSimple
+                <NumTempScaleValues::multiple,
+                 ApplyParrinelloRahmanVScaling::no>
+                    (start, nrend, dt, dtPressureCouple,
+                    invMassPerDim, tcstat, cTC, nullptr, x, xprime, v, f);
              }
          }
      }
  }
  
-static void do_update_vv_vel(int start, int nrend, double dt,
+static void do_update_vv_vel(int start, int nrend, real dt,
                               rvec accel[], ivec nFreeze[], real invmass[],
                               unsigned short ptype[], unsigned short cFREEZE[],
-                             unsigned short cACC[], rvec v[], rvec f[],
+                             unsigned short cACC[], rvec v[], const rvec f[],
                               gmx_bool bExtended, real veta, real alpha)
  {
-    double w_dt;
      int    gf = 0, ga = 0;
      int    n, d;
-    double g, mv1, mv2;
+    real   g, mv1, mv2;
  
      if (bExtended)
      {
          g        = 0.25*dt*veta*alpha;
-        mv1      = exp(-g);
+        mv1      = std::exp(-g);
          mv2      = gmx::series_sinhx(g);
      }
      else
@@ -282,7 +653,7 @@ static void do_update_vv_vel(int start, int nrend, double dt,
      }
      for (n = start; n < nrend; n++)
      {
-        w_dt = invmass[n]*dt;
+        real w_dt = invmass[n]*dt;
          if (cFREEZE)
          {
              gf   = cFREEZE[n];
@@ -306,21 +677,21 @@ static void do_update_vv_vel(int start, int nrend, double dt,
      }
  } /* do_update_vv_vel */
  
-static void do_update_vv_pos(int start, int nrend, double dt,
+static void do_update_vv_pos(int start, int nrend, real dt,
                               ivec nFreeze[],
                               unsigned short ptype[], unsigned short cFREEZE[],
-                             rvec x[], rvec xprime[], rvec v[],
+                             const rvec x[], rvec xprime[], rvec v[],
                               gmx_bool bExtended, real veta)
  {
      int    gf = 0;
      int    n, d;
-    double g, mr1, mr2;
+    real   g, mr1, mr2;
  
      /* Would it make more sense if Parrinello-Rahman was put here? */
      if (bExtended)
      {
          g        = 0.5*dt*veta;
-        mr1      = exp(g);
+        mr1      = std::exp(g);
          mr2      = gmx::series_sinhx(g);
      }
      else
@@ -351,114 +722,6 @@ static void do_update_vv_pos(int start, int nrend, double dt,
      }
  } /* do_update_vv_pos */
  
-static void do_update_visc(int start, int nrend,
-                           double dt, int nstpcouple,
-                           t_grp_tcstat *tcstat,
-                           double nh_vxi[],
-                           real invmass[],
-                           unsigned short ptype[], unsigned short cTC[],
-                           rvec x[], rvec xprime[], rvec v[],
-                           rvec f[], matrix M, matrix box, real
-                           cos_accel, real vcos,
-                           gmx_bool bNH, gmx_bool bPR)
-{
-    double imass, w_dt;
-    int    gt = 0;
-    real   vn, vc;
-    real   lg, vxi = 0, vv;
-    real   fac, cosz;
-    rvec   vrel;
-    int    n, d;
-
-    fac = 2*M_PI/(box[ZZ][ZZ]);
-
-    if (bNH || bPR)
-    {
-        /* Update with coupling to extended ensembles, used for
-         * Nose-Hoover and Parrinello-Rahman coupling
-         */
-        for (n = start; n < nrend; n++)
-        {
-            imass = invmass[n];
-            if (cTC)
-            {
-                gt   = cTC[n];
-            }
-            lg   = tcstat[gt].lambda;
-            cosz = cos(fac*x[n][ZZ]);
-
-            copy_rvec(v[n], vrel);
-
-            vc            = cosz*vcos;
-            vrel[XX]     -= vc;
-            if (bNH)
-            {
-                vxi        = nh_vxi[gt];
-            }
-            for (d = 0; d < DIM; d++)
-            {
-                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
-                {
-                    vn  = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
-                                            - nstpcouple*iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
-                    if (d == XX)
-                    {
-                        vn += vc + dt*cosz*cos_accel;
-                    }
-                    v[n][d]        = vn;
-                    xprime[n][d]   = x[n][d]+vn*dt;
-                }
-                else
-                {
-                    xprime[n][d]   = x[n][d];
-                }
-            }
-        }
-    }
-    else
-    {
-        /* Classic version of update, used with berendsen coupling */
-        for (n = start; n < nrend; n++)
-        {
-            w_dt = invmass[n]*dt;
-            if (cTC)
-            {
-                gt   = cTC[n];
-            }
-            lg   = tcstat[gt].lambda;
-            cosz = cos(fac*x[n][ZZ]);
-
-            for (d = 0; d < DIM; d++)
-            {
-                vn             = v[n][d];
-
-                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
-                {
-                    if (d == XX)
-                    {
-                        vc           = cosz*vcos;
-                        /* Do not scale the cosine velocity profile */
-                        vv           = vc + lg*(vn - vc + f[n][d]*w_dt);
-                        /* Add the cosine accelaration profile */
-                        vv          += dt*cosz*cos_accel;
-                    }
-                    else
-                    {
-                        vv           = lg*(vn + f[n][d]*w_dt);
-                    }
-                    v[n][d]        = vv;
-                    xprime[n][d]   = x[n][d]+vv*dt;
-                }
-                else
-                {
-                    v[n][d]        = 0.0;
-                    xprime[n][d]   = x[n][d];
-                }
-            }
-        }
-    }
-}
-
  static gmx_stochd_t *init_stochd(const t_inputrec *ir)
  {
      gmx_stochd_t   *sd;
@@ -483,7 +746,7 @@ static gmx_stochd_t *init_stochd(const t_inputrec *ir)
          {
              if (opts->tau_t[gt] > 0)
              {
-                sdc[gt].em  = exp(-ir->delta_t/opts->tau_t[gt]);
+                sdc[gt].em  = std::exp(-ir->delta_t/opts->tau_t[gt]);
              }
              else
              {
@@ -550,9 +813,7 @@ void update_temperature_constants(gmx_update_t *upd, const t_inputrec *ir)
  
  gmx_update_t *init_update(const t_inputrec *ir)
  {
-    gmx_update_t *upd;
-
-    snew(upd, 1);
+    gmx_update_t *upd = new(gmx_update_t);
  
      if (ir->eI == eiBD || EI_SD(ir->eI) || ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc))
      {
@@ -561,31 +822,25 @@ gmx_update_t *init_update(const t_inputrec *ir)
  
      update_temperature_constants(upd, ir);
  
-    upd->xp        = NULL;
-    upd->xp_nalloc = 0;
+    upd->xp.resize(0);
  
      return upd;
  }
  
-void update_realloc(gmx_update_t *upd, int state_nalloc)
+void update_realloc(gmx_update_t *upd, int natoms)
  {
      GMX_ASSERT(upd, "upd must be allocated before its fields can be reallocated");
-    if (state_nalloc > upd->xp_nalloc)
-    {
-        upd->xp_nalloc = state_nalloc;
-        /* We need to allocate one element extra, since we might use
-         * (unaligned) 4-wide SIMD loads to access rvec entries. */
-        srenew(upd->xp, upd->xp_nalloc + 1);
-    }
+
+    upd->xp.resize(gmx::paddedRVecVectorSize(natoms));
  }
  
  static void do_update_sd1(gmx_stochd_t *sd,
-                          int start, int nrend, double dt,
+                          int start, int nrend, real dt,
                            rvec accel[], ivec nFreeze[],
                            real invmass[], unsigned short ptype[],
                            unsigned short cFREEZE[], unsigned short cACC[],
                            unsigned short cTC[],
-                          rvec x[], rvec xprime[], rvec v[], rvec f[],
+                          const rvec x[], rvec xprime[], rvec v[], const rvec f[],
                            gmx_bool bDoConstr,
                            gmx_bool bFirstHalfConstr,
                            gmx_int64_t step, int seed, int* gatindex)
@@ -724,12 +979,12 @@ static void do_update_sd1(gmx_stochd_t *sd,
      }
  }
  
-static void do_update_bd(int start, int nrend, double dt,
+static void do_update_bd(int start, int nrend, real dt,
                           ivec nFreeze[],
                           real invmass[], unsigned short ptype[],
                           unsigned short cFREEZE[], unsigned short cTC[],
-                         rvec x[], rvec xprime[], rvec v[],
-                         rvec f[], real friction_coefficient,
+                         const rvec x[], rvec xprime[], rvec v[],
+                         const rvec f[], real friction_coefficient,
                           real *rf, gmx_int64_t step, int seed,
                           int* gatindex)
  {
@@ -791,17 +1046,23 @@ static void do_update_bd(int start, int nrend, double dt,
  }
  
  static void dump_it_all(FILE gmx_unused *fp, const char gmx_unused *title,
-                        int gmx_unused natoms, rvec gmx_unused x[], rvec gmx_unused xp[],
-                        rvec gmx_unused v[], rvec gmx_unused f[])
+                        int gmx_unused natoms,
+                        const gmx_unused PaddedRVecVector *x,
+                        const gmx_unused PaddedRVecVector *xp,
+                        const gmx_unused PaddedRVecVector *v,
+                        const gmx_unused PaddedRVecVector *f)
  {
  #ifdef DEBUG
      if (fp)
      {
          fprintf(fp, "%s\n", title);
-        pr_rvecs(fp, 0, "x", x, natoms);
-        pr_rvecs(fp, 0, "xp", xp, natoms);
-        pr_rvecs(fp, 0, "v", v, natoms);
-        pr_rvecs(fp, 0, "f", f, natoms);
+        pr_rvecs(fp, 0, "x", as_rvec_array(x->data()), natoms);
+        pr_rvecs(fp, 0, "xp", as_rvec_array(xp->data()), natoms);
+        pr_rvecs(fp, 0, "v", as_rvec_array(v->data()), natoms);
+        if (f != NULL)
+        {
+            pr_rvecs(fp, 0, "f", as_rvec_array(f->data()), natoms);
+        }
      }
  #endif
  }
@@ -957,7 +1218,7 @@ static void calc_ke_part_visc(matrix box, rvec x[], rvec v[],
  
          /* Note that the times of x and v differ by half a step */
          /* MRS -- would have to be changed for VV */
-        cosz         = cos(fac*x[n][ZZ]);
+        cosz         = std::cos(fac*x[n][ZZ]);
          /* Calculate the amplitude of the new velocity profile */
          mvcos       += 2*cosz*md->massT[n]*v[n][XX];
  
@@ -1000,11 +1261,11 @@ void calc_ke_part(t_state *state, t_grpopts *opts, t_mdatoms *md,
  {
      if (ekind->cosacc.cos_accel == 0)
      {
-        calc_ke_part_normal(state->v, opts, md, ekind, nrnb, bEkinAveVel);
+        calc_ke_part_normal(as_rvec_array(state->v.data()), opts, md, ekind, nrnb, bEkinAveVel);
      }
      else
      {
-        calc_ke_part_visc(state->box, state->x, state->v, opts, md, ekind, nrnb, bEkinAveVel);
+        calc_ke_part_visc(state->box, as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), opts, md, ekind, nrnb, bEkinAveVel);
      }
  }
  
@@ -1014,9 +1275,9 @@ extern void init_ekinstate(ekinstate_t *ekinstate, const t_inputrec *ir)
      snew(ekinstate->ekinh, ekinstate->ekin_n);
      snew(ekinstate->ekinf, ekinstate->ekin_n);
      snew(ekinstate->ekinh_old, ekinstate->ekin_n);
-    snew(ekinstate->ekinscalef_nhc, ekinstate->ekin_n);
-    snew(ekinstate->ekinscaleh_nhc, ekinstate->ekin_n);
-    snew(ekinstate->vscale_nhc, ekinstate->ekin_n);
+    ekinstate->ekinscalef_nhc.resize(ekinstate->ekin_n);
+    ekinstate->ekinscaleh_nhc.resize(ekinstate->ekin_n);
+    ekinstate->vscale_nhc.resize(ekinstate->ekin_n);
      ekinstate->dekindl = 0;
      ekinstate->mvcos   = 0;
  }
@@ -1157,122 +1418,96 @@ void update_tcouple(gmx_int64_t       step,
                      t_mdatoms        *md)
  
  {
-    gmx_bool   bTCouple = FALSE;
-    real       dttc;
-    int        i, offset;
+    bool doTemperatureCoupling = false;
  
      /* if using vv with trotter decomposition methods, we do this elsewhere in the code */
-    if (inputrec->etc != etcNO &&
-        !(inputrecNvtTrotter(inputrec) || inputrecNptTrotter(inputrec) || inputrecNphTrotter(inputrec)))
+    if (!(EI_VV(inputrec->eI) &&
+          (inputrecNvtTrotter(inputrec) || inputrecNptTrotter(inputrec) || inputrecNphTrotter(inputrec))))
      {
-        /* We should only couple after a step where energies were determined (for leapfrog versions)
-           or the step energies are determined, for velocity verlet versions */
-
-        if (EI_VV(inputrec->eI))
-        {
-            offset = 0;
-        }
-        else
-        {
-            offset = 1;
-        }
-        bTCouple = (inputrec->nsttcouple == 1 ||
-                    do_per_step(step+inputrec->nsttcouple-offset,
-                                inputrec->nsttcouple));
+        doTemperatureCoupling = isTemperatureCouplingStep(step, inputrec);
      }
  
-    if (bTCouple)
+    if (doTemperatureCoupling)
      {
-        dttc = inputrec->nsttcouple*inputrec->delta_t;
+        real dttc = inputrec->nsttcouple*inputrec->delta_t;
  
          switch (inputrec->etc)
          {
              case etcNO:
                  break;
              case etcBERENDSEN:
-                berendsen_tcoupl(inputrec, ekind, dttc);
+                berendsen_tcoupl(inputrec, ekind, dttc, state->therm_integral);
                  break;
              case etcNOSEHOOVER:
                  nosehoover_tcoupl(&(inputrec->opts), ekind, dttc,
-                                  state->nosehoover_xi, state->nosehoover_vxi, MassQ);
+                                  state->nosehoover_xi.data(), state->nosehoover_vxi.data(), MassQ);
                  break;
              case etcVRESCALE:
                  vrescale_tcoupl(inputrec, step, ekind, dttc,
-                                state->therm_integral);
+                                state->therm_integral.data());
                  break;
          }
          /* rescale in place here */
          if (EI_VV(inputrec->eI))
          {
-            rescale_velocities(ekind, md, 0, md->homenr, state->v);
+            rescale_velocities(ekind, md, 0, md->homenr, as_rvec_array(state->v.data()));
          }
      }
      else
      {
          /* Set the T scaling lambda to 1 to have no scaling */
-        for (i = 0; (i < inputrec->opts.ngtc); i++)
+        for (int i = 0; (i < inputrec->opts.ngtc); i++)
          {
              ekind->tcstat[i].lambda = 1.0;
          }
      }
  }
  
-void update_pcouple(FILE             *fplog,
-                    gmx_int64_t       step,
-                    t_inputrec       *inputrec,
-                    t_state          *state,
-                    matrix            pcoupl_mu,
-                    matrix            M,
-                    gmx_bool          bInitStep)
+/*! \brief Computes the atom range for a thread to operate on, ensured SIMD aligned ranges
+ *
+ * \param[in]  numThreads   The number of threads to divide atoms over
+ * \param[in]  threadIndex  The thread to get the range for
+ * \param[in]  numAtoms     The total number of atoms (on this rank)
+ * \param[out] startAtom    The start of the atom range
+ * \param[out] endAtom      The end of the atom range, note that this is in general not a multiple of the SIMD width
+ */
+static void getThreadAtomRange(int numThreads, int threadIndex, int numAtoms,
+                               int *startAtom, int *endAtom)
  {
-    gmx_bool   bPCouple = FALSE;
-    real       dtpc     = 0;
-    int        i;
-
-    /* if using Trotter pressure, we do this in coupling.c, so we leave it false. */
-    if (inputrec->epc != epcNO && (!(inputrecNptTrotter(inputrec) || inputrecNphTrotter(inputrec))))
-    {
-        /* We should only couple after a step where energies were determined */
-        bPCouple = (inputrec->nstpcouple == 1 ||
-                    do_per_step(step+inputrec->nstpcouple-1,
-                                inputrec->nstpcouple));
-    }
+#if GMX_HAVE_SIMD_UPDATE
+    constexpr int blockSize = GMX_SIMD_REAL_WIDTH;
+#else
+    constexpr int blockSize = 1;
+#endif
+    int           numBlocks = (numAtoms + blockSize - 1)/blockSize;
  
-    clear_mat(pcoupl_mu);
-    for (i = 0; i < DIM; i++)
+    *startAtom    = ((numBlocks* threadIndex     )/numThreads)*blockSize;
+    *endAtom      = ((numBlocks*(threadIndex + 1))/numThreads)*blockSize;
+    if (threadIndex == numThreads - 1)
      {
-        pcoupl_mu[i][i] = 1.0;
+        *endAtom  = numAtoms;
      }
+}
  
-    clear_mat(M);
-
-    if (bPCouple)
+void update_pcouple_before_coordinates(FILE             *fplog,
+                                       gmx_int64_t       step,
+                                       const t_inputrec *inputrec,
+                                       t_state          *state,
+                                       matrix            parrinellorahmanMu,
+                                       matrix            M,
+                                       gmx_bool          bInitStep)
+{
+    /* Berendsen P-coupling is completely handled after the coordinate update.
+     * Trotter P-coupling is handled by separate calls to trotter_update().
+     */
+    if (inputrec->epc == epcPARRINELLORAHMAN &&
+        isPressureCouplingStep(step, inputrec))
      {
-        dtpc = inputrec->nstpcouple*inputrec->delta_t;
+        real dtpc = inputrec->nstpcouple*inputrec->delta_t;
  
-        switch (inputrec->epc)
-        {
-            /* We can always pcoupl, even if we did not sum the energies
-             * the previous step, since state->pres_prev is only updated
-             * when the energies have been summed.
-             */
-            case (epcNO):
-                break;
-            case (epcBERENDSEN):
-                if (!bInitStep)
-                {
-                    berendsen_pcoupl(fplog, step, inputrec, dtpc, state->pres_prev, state->box,
-                                     pcoupl_mu);
-                }
-                break;
-            case (epcPARRINELLORAHMAN):
-                parrinellorahman_pcoupl(fplog, step, inputrec, dtpc, state->pres_prev,
-                                        state->box, state->box_rel, state->boxv,
-                                        M, pcoupl_mu, bInitStep);
-                break;
-            default:
-                break;
-        }
+        parrinellorahman_pcoupl(fplog, step, inputrec, dtpc, state->pres_prev,
+                                state->box, state->box_rel, state->boxv,
+                                M, parrinellorahmanMu, bInitStep);
      }
  }
  
@@ -1284,7 +1519,7 @@ void update_constraints(FILE             *fplog,
                          t_state          *state,
                          gmx_bool          bMolPBC,
                          t_graph          *graph,
-                        rvec              force[],   /* forces on home particles */
+                        PaddedRVecVector *force,     /* forces on home particles */
                          t_idef           *idef,
                          tensor            vir_part,
                          t_commrec        *cr,
@@ -1296,8 +1531,6 @@ void update_constraints(FILE             *fplog,
                          gmx_bool          bCalcVir)
  {
      gmx_bool             bLastStep, bLog = FALSE, bEner = FALSE, bDoConstr = FALSE;
-    double               dt;
-    int                  start, homenr, nrend, i;
      tensor               vir_con;
      int                  nth, th;
  
@@ -1313,11 +1546,16 @@ void update_constraints(FILE             *fplog,
      /* for now, SD update is here -- though it really seems like it
         should be reformulated as a velocity verlet method, since it has two parts */
  
-    start  = 0;
-    homenr = md->homenr;
-    nrend  = start+homenr;
+    int  homenr = md->homenr;
  
-    dt   = inputrec->delta_t;
+    /* Cast delta_t from double to real to make the integrators faster.
+     * The only reason for having delta_t double is to get accurate values
+     * for t=delta_t*step when step is larger than float precision.
+     * For integration dt the accuracy of real suffices, since with
+     * integral += dt*integrand the increment is nearly always (much) smaller
+     * than the integral (and the integrand has real precision).
+     */
+    real dt     = inputrec->delta_t;
  
      /*
       *  Steps (7C, 8C)
@@ -1342,28 +1580,28 @@ void update_constraints(FILE             *fplog,
          wallcycle_start(wcycle, ewcCONSTR);
          if (EI_VV(inputrec->eI) && bFirstHalf)
          {
-            constrain(NULL, bLog, bEner, constr, idef,
+            constrain(nullptr, bLog, bEner, constr, idef,
                        inputrec, cr, step, 1, 1.0, md,
-                      state->x, state->v, state->v,
+                      as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
                        bMolPBC, state->box,
                        state->lambda[efptBONDED], dvdlambda,
-                      NULL, bCalcVir ? &vir_con : NULL, nrnb, econqVeloc);
+                      nullptr, bCalcVir ? &vir_con : nullptr, nrnb, econqVeloc);
          }
          else
          {
-            constrain(NULL, bLog, bEner, constr, idef,
+            constrain(nullptr, bLog, bEner, constr, idef,
                        inputrec, cr, step, 1, 1.0, md,
-                      state->x, upd->xp, NULL,
+                      as_rvec_array(state->x.data()), as_rvec_array(upd->xp.data()), nullptr,
                        bMolPBC, state->box,
                        state->lambda[efptBONDED], dvdlambda,
-                      state->v, bCalcVir ? &vir_con : NULL, nrnb, econqCoord);
+                      as_rvec_array(state->v.data()), bCalcVir ? &vir_con : nullptr, nrnb, econqCoord);
          }
          wallcycle_stop(wcycle, ewcCONSTR);
  
          where();
  
          dump_it_all(fplog, "After Shake",
-                    state->natoms, state->x, upd->xp, state->v, force);
+                    state->natoms, &state->x, &upd->xp, &state->v, force);
  
          if (bCalcVir)
          {
@@ -1389,9 +1627,7 @@ void update_constraints(FILE             *fplog,
              try
              {
                  int start_th, end_th;
-
-                start_th = start + ((nrend-start)* th   )/nth;
-                end_th   = start + ((nrend-start)*(th+1))/nth;
+                getThreadAtomRange(nth, th, homenr, &start_th, &end_th);
  
                  /* The second part of the SD integration */
                  do_update_sd1(upd->sd,
@@ -1399,10 +1635,10 @@ void update_constraints(FILE             *fplog,
                                inputrec->opts.acc, inputrec->opts.nFreeze,
                                md->invmass, md->ptype,
                                md->cFREEZE, md->cACC, md->cTC,
-                              state->x, upd->xp, state->v, force,
+                              as_rvec_array(state->x.data()), as_rvec_array(upd->xp.data()), as_rvec_array(state->v.data()), as_rvec_array(force->data()),
                                bDoConstr, FALSE,
                                step, inputrec->ld_seed,
-                              DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
+                              DOMAINDECOMP(cr) ? cr->dd->gatindex : nullptr);
              }
              GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
          }
@@ -1414,12 +1650,12 @@ void update_constraints(FILE             *fplog,
              /* Constrain the coordinates upd->xp for half a time step */
              wallcycle_start(wcycle, ewcCONSTR);
  
-            constrain(NULL, bLog, bEner, constr, idef,
+            constrain(nullptr, bLog, bEner, constr, idef,
                        inputrec, cr, step, 1, 0.5, md,
-                      state->x, upd->xp, NULL,
+                      as_rvec_array(state->x.data()), as_rvec_array(upd->xp.data()), nullptr,
                        bMolPBC, state->box,
                        state->lambda[efptBONDED], dvdlambda,
-                      state->v, NULL, nrnb, econqCoord);
+                      as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
  
              wallcycle_stop(wcycle, ewcCONSTR);
          }
@@ -1437,7 +1673,7 @@ void update_constraints(FILE             *fplog,
           */
          wallcycle_start_nocount(wcycle, ewcUPDATE);
  
-        if (md->cFREEZE != NULL && constr != NULL)
+        if (md->cFREEZE != nullptr && constr != nullptr)
          {
              /* If we have atoms that are frozen along some, but not all
               * dimensions, the constraints will have moved them also along
@@ -1458,7 +1694,7 @@ void update_constraints(FILE             *fplog,
              }
              if (partialFreezeAndConstraints)
              {
-                for (int i = start; i < nrend; i++)
+                for (int i = 0; i < homenr; i++)
                  {
                      int g = md->cFREEZE[i];
  
@@ -1475,7 +1711,7 @@ void update_constraints(FILE             *fplog,
  
          if (graph && (graph->nnodes > 0))
          {
-            unshift_x(graph, state->box, state->x, upd->xp);
+            unshift_x(graph, state->box, as_rvec_array(state->x.data()), as_rvec_array(upd->xp.data()));
              if (TRICLINIC(state->box))
              {
                  inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
@@ -1487,42 +1723,44 @@ void update_constraints(FILE             *fplog,
          }
          else
          {
+            /* The copy is performance sensitive, so use a bare pointer */
+            rvec *xp = as_rvec_array(upd->xp.data());
  #ifndef __clang_analyzer__
              // cppcheck-suppress unreadVariable
              nth = gmx_omp_nthreads_get(emntUpdate);
  #endif
  #pragma omp parallel for num_threads(nth) schedule(static)
-            for (i = start; i < nrend; i++)
+            for (int i = 0; i < homenr; i++)
              {
                  // Trivial statement, does not throw
-                copy_rvec(upd->xp[i], state->x[i]);
+                copy_rvec(xp[i], state->x[i]);
              }
          }
          wallcycle_stop(wcycle, ewcUPDATE);
  
          dump_it_all(fplog, "After unshift",
-                    state->natoms, state->x, upd->xp, state->v, force);
+                    state->natoms, &state->x, &upd->xp, &state->v, force);
      }
  /* ############# END the update of velocities and positions ######### */
  }
  
-void update_box(FILE             *fplog,
-                gmx_int64_t       step,
-                t_inputrec       *inputrec,  /* input record and box stuff     */
-                t_mdatoms        *md,
-                t_state          *state,
-                rvec              force[],   /* forces on home particles */
-                matrix            pcoupl_mu,
-                t_nrnb           *nrnb,
-                gmx_update_t     *upd)
+void update_pcouple_after_coordinates(FILE             *fplog,
+                                      gmx_int64_t       step,
+                                      const t_inputrec *inputrec,
+                                      const t_mdatoms  *md,
+                                      const matrix      pressure,
+                                      const matrix      forceVirial,
+                                      const matrix      constraintVirial,
+                                      const matrix      parrinellorahmanMu,
+                                      t_state          *state,
+                                      t_nrnb           *nrnb,
+                                      gmx_update_t     *upd)
  {
-    double               dt;
-    int                  start, homenr, i, n, m;
+    int  start  = 0;
+    int  homenr = md->homenr;
  
-    start  = 0;
-    homenr = md->homenr;
-
-    dt = inputrec->delta_t;
+    /* Cast to real for faster code, no loss in precision (see comment above) */
+    real dt     = inputrec->delta_t;
  
      where();
  
@@ -1532,35 +1770,41 @@ void update_box(FILE             *fplog,
          case (epcNO):
              break;
          case (epcBERENDSEN):
-            /* We should only scale after a step where the pressure (kinetic
-             * energy and virial) was calculated. This happens after the
-             * coordinate update, whereas the current routine is called before
-             * that, so we scale when step % nstpcouple = 1 instead of 0.
-             */
-            if (inputrec->nstpcouple == 1 || (step % inputrec->nstpcouple == 1))
+            if (isPressureCouplingStep(step, inputrec))
              {
-                berendsen_pscale(inputrec, pcoupl_mu, state->box, state->box_rel,
-                                 start, homenr, state->x, md->cFREEZE, nrnb);
+                real   dtpc = inputrec->nstpcouple*dt;
+                matrix mu;
+                berendsen_pcoupl(fplog, step, inputrec, dtpc,
+                                 pressure, state->box,
+                                 forceVirial, constraintVirial,
+                                 mu, &state->baros_integral);
+                berendsen_pscale(inputrec, mu, state->box, state->box_rel,
+                                 start, homenr, as_rvec_array(state->x.data()),
+                                 md->cFREEZE, nrnb);
              }
              break;
          case (epcPARRINELLORAHMAN):
-            /* The box velocities were updated in do_pr_pcoupl in the update
-             * iteration, but we dont change the box vectors until we get here
-             * since we need to be able to shift/unshift above.
-             */
-            for (i = 0; i < DIM; i++)
+            if (isPressureCouplingStep(step, inputrec))
              {
-                for (m = 0; m <= i; m++)
+                /* The box velocities were updated in do_pr_pcoupl,
+                 * but we dont change the box vectors until we get here
+                 * since we need to be able to shift/unshift above.
+                 */
+                real dtpc = inputrec->nstpcouple*dt;
+                for (int i = 0; i < DIM; i++)
                  {
-                    state->box[i][m] += dt*state->boxv[i][m];
+                    for (int m = 0; m <= i; m++)
+                    {
+                        state->box[i][m] += dtpc*state->boxv[i][m];
+                    }
                  }
-            }
-            preserve_box_shape(inputrec, state->box_rel, state->box);
+                preserve_box_shape(inputrec, state->box_rel, state->box);
  
-            /* Scale the coordinates */
-            for (n = start; (n < start+homenr); n++)
-            {
-                tmvmul_ur0(pcoupl_mu, state->x[n], state->x[n]);
+                /* Scale the coordinates */
+                for (int n = start; n < start + homenr; n++)
+                {
+                    tmvmul_ur0(parrinellorahmanMu, state->x[n], state->x[n]);
+                }
              }
              break;
          case (epcMTTK):
@@ -1571,7 +1815,7 @@ void update_box(FILE             *fplog,
                         ln V_new = ln V_old + 3*dt*veta => V_new = V_old*exp(3*dt*veta) =>
                         Side length scales as exp(veta*dt) */
  
-                    msmul(state->box, exp(state->veta*dt), state->box);
+                    msmul(state->box, std::exp(state->veta*dt), state->box);
  
                      /* Relate veta to boxv.  veta = d(eta)/dT = (1/DIM)*1/V dV/dT.
                         o               If we assume isotropic scaling, and box length scaling
@@ -1594,11 +1838,11 @@ void update_box(FILE             *fplog,
  
      if (inputrecDeform(inputrec))
      {
-        deform(upd, start, homenr, state->x, state->box, inputrec, step);
+        deform(upd, start, homenr, as_rvec_array(state->x.data()), state->box, inputrec, step);
      }
      where();
      dump_it_all(fplog, "After update",
-                state->natoms, state->x, upd->xp, state->v, force);
+                state->natoms, &state->x, &upd->xp, &state->v, nullptr);
  }
  
  void update_coords(FILE             *fplog,
@@ -1606,7 +1850,7 @@ void update_coords(FILE             *fplog,
                     t_inputrec       *inputrec,  /* input record and box stuff  */
                     t_mdatoms        *md,
                     t_state          *state,
-                   rvec             *f,    /* forces on home particles */
+                   PaddedRVecVector *f,    /* forces on home particles */
                     t_fcdata         *fcd,
                     gmx_ekindata_t   *ekind,
                     matrix            M,
@@ -1615,12 +1859,7 @@ void update_coords(FILE             *fplog,
                     t_commrec        *cr, /* these shouldn't be here -- need to think about it */
                     gmx_constr_t      constr)
  {
-    gmx_bool          bNH, bPR, bDoConstr = FALSE;
-    double            dt, alpha;
-    int               start, homenr, nrend;
-    int               nth, th;
-
-    bDoConstr = (NULL != constr);
+    gmx_bool bDoConstr = (nullptr != constr);
  
      /* Running the velocity half does nothing except for velocity verlet */
      if ((UpdatePart == etrtVELOCITY1 || UpdatePart == etrtVELOCITY2) &&
@@ -1629,11 +1868,10 @@ void update_coords(FILE             *fplog,
          gmx_incons("update_coords called for velocity without VV integrator");
      }
  
-    start  = 0;
-    homenr = md->homenr;
-    nrend  = start+homenr;
+    int  homenr = md->homenr;
  
-    dt   = inputrec->delta_t;
+    /* Cast to real for faster code, no loss in precision (see comment above) */
+    real dt     = inputrec->delta_t;
  
      /* We need to update the NMR restraint history when time averaging is used */
      if (state->flags & (1<<estDISRE_RM3TAV))
@@ -1645,54 +1883,45 @@ void update_coords(FILE             *fplog,
          update_orires_history(fcd, &state->hist);
      }
  
-
-    bNH = inputrec->etc == etcNOSEHOOVER;
-    bPR = ((inputrec->epc == epcPARRINELLORAHMAN) || (inputrec->epc == epcMTTK));
-
      /* ############# START The update of velocities and positions ######### */
      where();
      dump_it_all(fplog, "Before update",
-                state->natoms, state->x, upd->xp, state->v, f);
+                state->natoms, &state->x, &upd->xp, &state->v, f);
  
-    nth = gmx_omp_nthreads_get(emntUpdate);
+    int nth = gmx_omp_nthreads_get(emntUpdate);
  
-#pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
-    for (th = 0; th < nth; th++)
+#pragma omp parallel for num_threads(nth) schedule(static)
+    for (int th = 0; th < nth; th++)
      {
          try
          {
              int start_th, end_th;
+            getThreadAtomRange(nth, th, homenr, &start_th, &end_th);
+
+#ifndef NDEBUG
+            /* Strictly speaking, we would only need this check with SIMD
+             * and for the actual SIMD width. But since the code currently
+             * always adds padding for GMX_REAL_MAX_SIMD_WIDTH, we check that.
+             */
+            size_t homenrSimdPadded = ((homenr + GMX_REAL_MAX_SIMD_WIDTH - 1)/GMX_REAL_MAX_SIMD_WIDTH)*GMX_REAL_MAX_SIMD_WIDTH;
+            GMX_ASSERT(state->x.size() >= homenrSimdPadded, "state->x needs to be padded for SIMD access");
+            GMX_ASSERT(upd->xp.size()  >= homenrSimdPadded, "upd->xp needs to be padded for SIMD access");
+            GMX_ASSERT(state->v.size() >= homenrSimdPadded, "state->v needs to be padded for SIMD access");
+            GMX_ASSERT(f->size()       >= homenrSimdPadded, "f needs to be padded for SIMD access");
+#endif
  
-            start_th = start + ((nrend-start)* th   )/nth;
-            end_th   = start + ((nrend-start)*(th+1))/nth;
+            const rvec *x_rvec  = as_rvec_array(state->x.data());
+            rvec       *xp_rvec = as_rvec_array(upd->xp.data());
+            rvec       *v_rvec  = as_rvec_array(state->v.data());
+            const rvec *f_rvec  = as_rvec_array(f->data());
  
              switch (inputrec->eI)
              {
                  case (eiMD):
-                    if (ekind->cosacc.cos_accel == 0)
-                    {
-                        do_update_md(start_th, end_th,
-                                     dt, inputrec->nstpcouple,
-                                     ekind->tcstat, state->nosehoover_vxi,
-                                     ekind->bNEMD, ekind->grpstat, inputrec->opts.acc,
-                                     inputrec->opts.nFreeze,
-                                     md->invmass, md->ptype,
-                                     md->cFREEZE, md->cACC, md->cTC,
-                                     state->x, upd->xp, state->v, f, M,
-                                     bNH, bPR);
-                    }
-                    else
-                    {
-                        do_update_visc(start_th, end_th,
-                                       dt, inputrec->nstpcouple,
-                                       ekind->tcstat, state->nosehoover_vxi,
-                                       md->invmass, md->ptype,
-                                       md->cTC, state->x, upd->xp, state->v, f, M,
-                                       state->box,
-                                       ekind->cosacc.cos_accel,
-                                       ekind->cosacc.vcos,
-                                       bNH, bPR);
-                    }
+                    do_update_md(start_th, end_th, step, dt,
+                                 inputrec, md, ekind, state->box,
+                                 x_rvec, xp_rvec, v_rvec, f_rvec,
+                                 state->nosehoover_vxi.data(), M);
                      break;
                  case (eiSD1):
                      /* With constraints, the SD1 update is done in 2 parts */
@@ -1701,22 +1930,28 @@ void update_coords(FILE             *fplog,
                                    inputrec->opts.acc, inputrec->opts.nFreeze,
                                    md->invmass, md->ptype,
                                    md->cFREEZE, md->cACC, md->cTC,
-                                  state->x, upd->xp, state->v, f,
+                                  x_rvec, xp_rvec, v_rvec, f_rvec,
                                    bDoConstr, TRUE,
-                                  step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
+                                  step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : nullptr);
                      break;
                  case (eiBD):
                      do_update_bd(start_th, end_th, dt,
                                   inputrec->opts.nFreeze, md->invmass, md->ptype,
                                   md->cFREEZE, md->cTC,
-                                 state->x, upd->xp, state->v, f,
+                                 x_rvec, xp_rvec, v_rvec, f_rvec,
                                   inputrec->bd_fric,
                                   upd->sd->bd_rf,
-                                 step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
+                                 step, inputrec->ld_seed, DOMAINDECOMP(cr) ? cr->dd->gatindex : nullptr);
                      break;
                  case (eiVV):
                  case (eiVVAK):
-                    alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0. */
+                {
+                    gmx_bool bExtended = (inputrec->etc == etcNOSEHOOVER ||
+                                          inputrec->epc == epcPARRINELLORAHMAN ||
+                                          inputrec->epc == epcMTTK);
+
+                    /* assuming barostat coupled to group 0 */
+                    real alpha = 1.0 + DIM/static_cast<real>(inputrec->opts.nrdf[0]);
                      switch (UpdatePart)
                      {
                          case etrtVELOCITY1:
@@ -1725,18 +1960,19 @@ void update_coords(FILE             *fplog,
                                               inputrec->opts.acc, inputrec->opts.nFreeze,
                                               md->invmass, md->ptype,
                                               md->cFREEZE, md->cACC,
-                                             state->v, f,
-                                             (bNH || bPR), state->veta, alpha);
+                                             v_rvec, f_rvec,
+                                             bExtended, state->veta, alpha);
                              break;
                          case etrtPOSITION:
                              do_update_vv_pos(start_th, end_th, dt,
                                               inputrec->opts.nFreeze,
                                               md->ptype, md->cFREEZE,
-                                             state->x, upd->xp, state->v,
-                                             (bNH || bPR), state->veta);
+                                             x_rvec, xp_rvec, v_rvec,
+                                             bExtended, state->veta);
                              break;
                      }
                      break;
+                }
                  default:
                      gmx_fatal(FARGS, "Don't know how to update coordinates");
                      break;
@@ -1808,7 +2044,7 @@ extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_int64_t step, co
  
      real rate = (ir->delta_t)/ir->opts.tau_t[0];
  
-    if (ir->etc == etcANDERSEN && constr != NULL)
+    if (ir->etc == etcANDERSEN && constr != nullptr)
      {
          /* Currently, Andersen thermostat does not support constrained
             systems. Functionality exists in the andersen_tcoupl