Apply re-formatting to C++ in src/ tree.
[alexxy/gromacs.git] / src / gromacs / mdlib / update.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "update.h"
41
42 #include <cmath>
43 #include <cstdio>
44
45 #include <algorithm>
46 #include <memory>
47
48 #include "gromacs/domdec/domdec_struct.h"
49 #include "gromacs/fileio/confio.h"
50 #include "gromacs/gmxlib/network.h"
51 #include "gromacs/gmxlib/nrnb.h"
52 #include "gromacs/listed_forces/disre.h"
53 #include "gromacs/listed_forces/orires.h"
54 #include "gromacs/math/functions.h"
55 #include "gromacs/math/invertmatrix.h"
56 #include "gromacs/math/paddedvector.h"
57 #include "gromacs/math/units.h"
58 #include "gromacs/math/vec.h"
59 #include "gromacs/math/vecdump.h"
60 #include "gromacs/mdlib/boxdeformation.h"
61 #include "gromacs/mdlib/constr.h"
62 #include "gromacs/mdlib/gmx_omp_nthreads.h"
63 #include "gromacs/mdlib/mdatoms.h"
64 #include "gromacs/mdlib/stat.h"
65 #include "gromacs/mdlib/tgroup.h"
66 #include "gromacs/mdtypes/commrec.h"
67 #include "gromacs/mdtypes/fcdata.h"
68 #include "gromacs/mdtypes/group.h"
69 #include "gromacs/mdtypes/inputrec.h"
70 #include "gromacs/mdtypes/md_enums.h"
71 #include "gromacs/mdtypes/mdatom.h"
72 #include "gromacs/mdtypes/state.h"
73 #include "gromacs/pbcutil/boxutilities.h"
74 #include "gromacs/pbcutil/pbc.h"
75 #include "gromacs/pulling/pull.h"
76 #include "gromacs/random/tabulatednormaldistribution.h"
77 #include "gromacs/random/threefry.h"
78 #include "gromacs/simd/simd.h"
79 #include "gromacs/timing/wallcycle.h"
80 #include "gromacs/topology/atoms.h"
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/fatalerror.h"
83 #include "gromacs/utility/futil.h"
84 #include "gromacs/utility/gmxassert.h"
85 #include "gromacs/utility/gmxomp.h"
86 #include "gromacs/utility/smalloc.h"
87
88 using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
89
90 struct gmx_sd_const_t
91 {
92     double em = 0;
93 };
94
95 struct gmx_sd_sigma_t
96 {
97     real V = 0;
98 };
99
100 struct gmx_stochd_t
101 {
102     /* Brownian dynamics: random force scaling factor per T-coupling group */
103     std::vector<real> bd_rf;
104     /* Stochastic dynamics: friction constant and noise amplitude per T-coupling group */
105     std::vector<gmx_sd_const_t> sdc;
106     std::vector<gmx_sd_sigma_t> sdsig;
107     /* Andersen temperature control: per-group randomization flags and Boltzmann factors */
108     std::vector<bool> randomize_group;
109     std::vector<real> boltzfac;
110
111     explicit gmx_stochd_t(const t_inputrec& inputRecord);
112 };
113
114 //! pImpled implementation for Update
115 class Update::Impl
116 {
117 public:
118     //! Constructor
119     Impl(const t_inputrec& inputRecord, BoxDeformation* boxDeformation);
120     //! Destructor
121     ~Impl() = default;
122
123     void update_coords(const t_inputrec&                                inputRecord,
124                        int64_t                                          step,
125                        const t_mdatoms*                                 md,
126                        t_state*                                         state,
127                        const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
128                        const t_fcdata&                                  fcdata,
129                        const gmx_ekindata_t*                            ekind,
130                        const matrix                                     M,
131                        int                                              UpdatePart,
132                        const t_commrec*                                 cr,
133                        bool                                             haveConstraints);
134
135     void finish_update(const t_inputrec& inputRecord,
136                        const t_mdatoms*  md,
137                        t_state*          state,
138                        gmx_wallcycle_t   wcycle,
139                        bool              haveConstraints);
140
141     void update_sd_second_half(const t_inputrec& inputRecord,
142                                int64_t           step,
143                                real*             dvdlambda,
144                                const t_mdatoms*  md,
145                                t_state*          state,
146                                const t_commrec*  cr,
147                                t_nrnb*           nrnb,
148                                gmx_wallcycle_t   wcycle,
149                                gmx::Constraints* constr,
150                                bool              do_log,
151                                bool              do_ene);
152
153     void update_for_constraint_virial(const t_inputrec&                                inputRecord,
154                                       const t_mdatoms&                                 md,
155                                       const t_state&                                   state,
156                                       const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
157                                       const gmx_ekindata_t&                            ekind);
158
159     void update_temperature_constants(const t_inputrec& inputRecord);
160
161     const std::vector<bool>& getAndersenRandomizeGroup() const { return sd_.randomize_group; }
162
163     const std::vector<real>& getBoltzmanFactor() const { return sd_.boltzfac; }
164
165     PaddedVector<RVec>* xp() { return &xp_; }
166
167     BoxDeformation* deform() const { return deform_; }
168
169 private:
170     //! stochastic dynamics struct
171     gmx_stochd_t sd_;
172     //! xprime for constraint algorithms
173     PaddedVector<RVec> xp_;
174     //! Box deformation handler (or nullptr if inactive).
175     BoxDeformation* deform_ = nullptr;
176 };
177
178 Update::Update(const t_inputrec& inputRecord, BoxDeformation* boxDeformation) :
179     impl_(new Impl(inputRecord, boxDeformation)) {}
180
181 Update::~Update() {}
182
183 const std::vector<bool>& Update::getAndersenRandomizeGroup() const
184 {
185     return impl_->getAndersenRandomizeGroup();
186 }
187
188 const std::vector<real>& Update::getBoltzmanFactor() const
189 {
190     return impl_->getBoltzmanFactor();
191 }
192
193 PaddedVector<RVec>* Update::xp()
194 {
195     return impl_->xp();
196 }
197
198 BoxDeformation* Update::deform() const
199 {
200     return impl_->deform();
201 }
202
203 void Update::update_coords(const t_inputrec&                                inputRecord,
204                            int64_t                                          step,
205                            const t_mdatoms*                                 md,
206                            t_state*                                         state,
207                            const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
208                            const t_fcdata&                                  fcdata,
209                            const gmx_ekindata_t*                            ekind,
210                            const matrix                                     M,
211                            int                                              updatePart,
212                            const t_commrec*                                 cr,
213                            const bool                                       haveConstraints)
214 {
215     return impl_->update_coords(
216             inputRecord, step, md, state, f, fcdata, ekind, M, updatePart, cr, haveConstraints);
217 }
218
219 void Update::finish_update(const t_inputrec& inputRecord,
220                            const t_mdatoms*  md,
221                            t_state*          state,
222                            gmx_wallcycle_t   wcycle,
223                            const bool        haveConstraints)
224 {
225     return impl_->finish_update(inputRecord, md, state, wcycle, haveConstraints);
226 }
227
228 void Update::update_sd_second_half(const t_inputrec& inputRecord,
229                                    int64_t           step,
230                                    real*             dvdlambda,
231                                    const t_mdatoms*  md,
232                                    t_state*          state,
233                                    const t_commrec*  cr,
234                                    t_nrnb*           nrnb,
235                                    gmx_wallcycle_t   wcycle,
236                                    gmx::Constraints* constr,
237                                    bool              do_log,
238                                    bool              do_ene)
239 {
240     return impl_->update_sd_second_half(
241             inputRecord, step, dvdlambda, md, state, cr, nrnb, wcycle, constr, do_log, do_ene);
242 }
243
244 void Update::update_for_constraint_virial(const t_inputrec& inputRecord,
245                                           const t_mdatoms&  md,
246                                           const t_state&    state,
247                                           const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
248                                           const gmx_ekindata_t&                            ekind)
249 {
250     return impl_->update_for_constraint_virial(inputRecord, md, state, f, ekind);
251 }
252
253 void Update::update_temperature_constants(const t_inputrec& inputRecord)
254 {
255     return impl_->update_temperature_constants(inputRecord);
256 }
257
258 /*! \brief Sets the velocities of virtual sites to zero */
259 static void clearVsiteVelocities(int start, int nrend, const unsigned short* particleType, rvec* gmx_restrict v)
260 {
261     for (int a = start; a < nrend; a++)
262     {
263         if (particleType[a] == eptVSite)
264         {
265             clear_rvec(v[a]);
266         }
267     }
268 }
269
270 /*! \brief Sets whether we store the updated velocities */
271 enum class StoreUpdatedVelocities
272 {
273     yes, //!< Store the updated velocities
274     no   //!< Do not store the updated velocities
275 };
276
277 /*! \brief Sets the number of different temperature coupling values */
278 enum class NumTempScaleValues
279 {
280     single,  //!< Single T-scaling value (either one group or all values =1)
281     multiple //!< Multiple T-scaling values, need to use T-group indices
282 };
283
284 /*! \brief Sets whether to apply no or diagonal Parrinello-Rahman pressure scaling
285  *
286  * Note that this enum is only used in updateMDLeapfrogSimple(), which does
287  * not handle fully anisotropic Parrinello-Rahman scaling, so we only have
288  * options \p no and \p diagonal here and no anisotropic option.
289  */
290 enum class ApplyParrinelloRahmanVScaling
291 {
292     no,      //!< Do not apply velocity scaling (not a PR-coupling run or step)
293     diagonal //!< Apply velocity scaling using a diagonal matrix
294 };
295
296 /*! \brief Integrate using leap-frog with T-scaling and optionally diagonal Parrinello-Rahman p-coupling
297  *
298  * \tparam       storeUpdatedVelocities Tells whether we should store the updated velocities
299  * \tparam       numTempScaleValues     The number of different T-couple values
300  * \tparam       applyPRVScaling        Apply Parrinello-Rahman velocity scaling
301  * \param[in]    start                  Index of first atom to update
302  * \param[in]    nrend                  Last atom to update: \p nrend - 1
303  * \param[in]    dt                     The time step
304  * \param[in]    dtPressureCouple       Time step for pressure coupling
305  * \param[in]    invMassPerDim          1/mass per atom and dimension
306  * \param[in]    tcstat                 Temperature coupling information
307  * \param[in]    cTC                    T-coupling group index per atom
308  * \param[in]    pRVScaleMatrixDiagonal Parrinello-Rahman v-scale matrix diagonal
309  * \param[in]    x                      Input coordinates
310  * \param[out]   xprime                 Updated coordinates
311  * \param[inout] v                      Velocities
312  * \param[in]    f                      Forces
313  *
314  * We expect this template to get good SIMD acceleration by most compilers,
315  * unlike the more complex general template.
316  * Note that we might get even better SIMD acceleration when we introduce
317  * aligned (and padded) memory, possibly with some hints for the compilers.
318  */
319 template<StoreUpdatedVelocities storeUpdatedVelocities, NumTempScaleValues numTempScaleValues, ApplyParrinelloRahmanVScaling applyPRVScaling>
320 static void updateMDLeapfrogSimple(int         start,
321                                    int         nrend,
322                                    real        dt,
323                                    real        dtPressureCouple,
324                                    const rvec* gmx_restrict          invMassPerDim,
325                                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
326                                    const unsigned short*             cTC,
327                                    const rvec                        pRVScaleMatrixDiagonal,
328                                    const rvec* gmx_restrict x,
329                                    rvec* gmx_restrict xprime,
330                                    rvec* gmx_restrict v,
331                                    const rvec* gmx_restrict f)
332 {
333     real lambdaGroup;
334
335     if (numTempScaleValues == NumTempScaleValues::single)
336     {
337         lambdaGroup = tcstat[0].lambda;
338     }
339
340     for (int a = start; a < nrend; a++)
341     {
342         if (numTempScaleValues == NumTempScaleValues::multiple)
343         {
344             lambdaGroup = tcstat[cTC[a]].lambda;
345         }
346
347         for (int d = 0; d < DIM; d++)
348         {
349             /* Note that using rvec invMassPerDim results in more efficient
350              * SIMD code, but this increases the cache pressure.
351              * For large systems with PME on the CPU this slows down the
352              * (then already slow) update by 20%. If all data remains in cache,
353              * using rvec is much faster.
354              */
355             real vNew = lambdaGroup * v[a][d] + f[a][d] * invMassPerDim[a][d] * dt;
356
357             if (applyPRVScaling == ApplyParrinelloRahmanVScaling::diagonal)
358             {
359                 vNew -= dtPressureCouple * pRVScaleMatrixDiagonal[d] * v[a][d];
360             }
361             if (storeUpdatedVelocities == StoreUpdatedVelocities::yes)
362             {
363                 v[a][d] = vNew;
364             }
365             xprime[a][d] = x[a][d] + vNew * dt;
366         }
367     }
368 }
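
/* For reference, a scalar sketch of the update performed per atom a and dimension d
 * by updateMDLeapfrogSimple() above (this restates the loop body above, it is not a
 * separate code path):
 *
 *   vNew         = lambda_T * v[a][d] + f[a][d] * invMassPerDim[a][d] * dt
 *                  - dtPressureCouple * diag(M)[d] * v[a][d]   (only with PR v-scaling)
 *   xprime[a][d] = x[a][d] + vNew * dt
 *
 * where lambda_T is the T-coupling scaling factor of the atom's T-coupling group
 * (effectively 1 when no temperature coupling is applied at this step).
 */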
369
370 #if GMX_SIMD && GMX_SIMD_HAVE_REAL
371 #    define GMX_HAVE_SIMD_UPDATE 1
372 #else
373 #    define GMX_HAVE_SIMD_UPDATE 0
374 #endif
375
376 #if GMX_HAVE_SIMD_UPDATE
377
378 /*! \brief Load (aligned) the contents of GMX_SIMD_REAL_WIDTH rvec elements sequentially into 3 SIMD registers
379  *
380  * The loaded output is:
381  * \p r0: { r[index][XX], r[index][YY], ... }
382  * \p r1: { ... }
383  * \p r2: { ..., r[index+GMX_SIMD_REAL_WIDTH-1][YY], r[index+GMX_SIMD_REAL_WIDTH-1][ZZ] }
384  *
385  * \param[in]  r      Pointer to an rvec array, has to be aligned to SIMD register width
386  * \param[in]  index  Index of the first rvec triplet of reals to load
387  * \param[out] r0     Pointer to first SIMD register
388  * \param[out] r1     Pointer to second SIMD register
389  * \param[out] r2     Pointer to third SIMD register
390  */
391 static inline void simdLoadRvecs(const rvec* r, int index, SimdReal* r0, SimdReal* r1, SimdReal* r2)
392 {
393     const real* realPtr = r[index];
394
395     GMX_ASSERT(isSimdAligned(realPtr), "Pointer should be SIMD aligned");
396
397     *r0 = simdLoad(realPtr + 0 * GMX_SIMD_REAL_WIDTH);
398     *r1 = simdLoad(realPtr + 1 * GMX_SIMD_REAL_WIDTH);
399     *r2 = simdLoad(realPtr + 2 * GMX_SIMD_REAL_WIDTH);
400 }
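
/* Illustrative example (not compiled): with GMX_SIMD_REAL_WIDTH == 4 and index i,
 * simdLoadRvecs() above fills the registers as
 *   r0 = { r[i][XX],   r[i][YY],   r[i][ZZ],   r[i+1][XX] }
 *   r1 = { r[i+1][YY], r[i+1][ZZ], r[i+2][XX], r[i+2][YY] }
 *   r2 = { r[i+2][ZZ], r[i+3][XX], r[i+3][YY], r[i+3][ZZ] }
 * i.e. 3*GMX_SIMD_REAL_WIDTH reals are loaded sequentially, not transposed per
 * dimension; simdStoreRvecs() below performs the inverse operation.
 */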
401
402 /*! \brief Store (aligned) 3 SIMD registers sequentially to GMX_SIMD_REAL_WIDTH rvec elements
403  *
404  * The stored output is:
405  * \p r[index] = { { r0[0], r0[1], ... }
406  * ...
407  * \p r[index+GMX_SIMD_REAL_WIDTH-1] =  { ... , r2[GMX_SIMD_REAL_WIDTH-2], r2[GMX_SIMD_REAL_WIDTH-1] }
408  *
409  * \param[out] r      Pointer to an rvec array, has to be aligned to SIMD register width
410  * \param[in]  index  Index of the first rvec triplet of reals to store to
411  * \param[in]  r0     First SIMD register
412  * \param[in]  r1     Second SIMD register
413  * \param[in]  r2     Third SIMD register
414  */
415 static inline void simdStoreRvecs(rvec* r, int index, SimdReal r0, SimdReal r1, SimdReal r2)
416 {
417     real* realPtr = r[index];
418
419     GMX_ASSERT(isSimdAligned(realPtr), "Pointer should be SIMD aligned");
420
421     store(realPtr + 0 * GMX_SIMD_REAL_WIDTH, r0);
422     store(realPtr + 1 * GMX_SIMD_REAL_WIDTH, r1);
423     store(realPtr + 2 * GMX_SIMD_REAL_WIDTH, r2);
424 }
425
426 /*! \brief Integrate using leap-frog with single group T-scaling and SIMD
427  *
428  * \tparam       storeUpdatedVelocities Tells whether we should store the updated velocities
429  * \param[in]    start                  Index of first atom to update
430  * \param[in]    nrend                  Last atom to update: \p nrend - 1
431  * \param[in]    dt                     The time step
432  * \param[in]    invMass                1/mass per atom
433  * \param[in]    tcstat                 Temperature coupling information
434  * \param[in]    x                      Input coordinates
435  * \param[out]   xprime                 Updated coordinates
436  * \param[inout] v                      Velocities
437  * \param[in]    f                      Forces
438  */
439 template<StoreUpdatedVelocities storeUpdatedVelocities>
440 static void updateMDLeapfrogSimpleSimd(int         start,
441                                        int         nrend,
442                                        real        dt,
443                                        const real* gmx_restrict          invMass,
444                                        gmx::ArrayRef<const t_grp_tcstat> tcstat,
445                                        const rvec* gmx_restrict x,
446                                        rvec* gmx_restrict xprime,
447                                        rvec* gmx_restrict v,
448                                        const rvec* gmx_restrict f)
449 {
450     SimdReal timestep(dt);
451     SimdReal lambdaSystem(tcstat[0].lambda);
452
453     /* We declare variables here, since code is often slower when declaring them inside the loop */
454
455     /* Note: We should implement a proper PaddedVector, so we don't need this check */
456     GMX_ASSERT(isSimdAligned(invMass), "invMass should be aligned");
457
458     for (int a = start; a < nrend; a += GMX_SIMD_REAL_WIDTH)
459     {
460         SimdReal invMass0, invMass1, invMass2;
461         expandScalarsToTriplets(simdLoad(invMass + a), &invMass0, &invMass1, &invMass2);
462
463         SimdReal v0, v1, v2;
464         SimdReal f0, f1, f2;
465         simdLoadRvecs(v, a, &v0, &v1, &v2);
466         simdLoadRvecs(f, a, &f0, &f1, &f2);
467
468         v0 = fma(f0 * invMass0, timestep, lambdaSystem * v0);
469         v1 = fma(f1 * invMass1, timestep, lambdaSystem * v1);
470         v2 = fma(f2 * invMass2, timestep, lambdaSystem * v2);
471
472         if (storeUpdatedVelocities == StoreUpdatedVelocities::yes)
473         {
474             simdStoreRvecs(v, a, v0, v1, v2);
475         }
476
477         SimdReal x0, x1, x2;
478         simdLoadRvecs(x, a, &x0, &x1, &x2);
479
480         SimdReal xprime0 = fma(v0, timestep, x0);
481         SimdReal xprime1 = fma(v1, timestep, x1);
482         SimdReal xprime2 = fma(v2, timestep, x2);
483
484         simdStoreRvecs(xprime, a, xprime0, xprime1, xprime2);
485     }
486 }
487
488 #endif // GMX_HAVE_SIMD_UPDATE
489
490 /*! \brief Sets the NEMD acceleration type */
491 enum class AccelerationType
492 {
493     none,   //!< No NEMD acceleration
494     group,  //!< Acceleration applied per acceleration group
495     cosine  //!< Cosine acceleration profile along z
496 };
497
498 /*! \brief Integrate using leap-frog with support for everything.
499  *
500  * \tparam        accelerationType  Type of NEMD acceleration.
501  * \param[in]     start             Index of first atom to update.
502  * \param[in]     nrend             Last atom to update: \p nrend - 1.
503  * \param[in]     doNoseHoover      Whether to apply Nose-Hoover T-coupling.
504  * \param[in]     dt                The time step.
505  * \param[in]     dtPressureCouple  Time step for pressure coupling, is 0 when no pressure
506  *                                  coupling should be applied at this step.
507  * \param[in]     accel             Acceleration per group.
508  * \param[in]     md                Atom properties.
509  * \param[in]     ekind             Kinetic energy data.
510  * \param[in]     box               The box dimensions.
511  * \param[in]     x                 Input coordinates.
512  * \param[out]    xprime            Updated coordinates.
513  * \param[inout]  v                 Velocities.
514  * \param[in]     f                 Forces.
515  * \param[in]     nh_vxi            Nose-Hoover velocity scaling factors.
516  * \param[in]     nsttcouple        Frequency of the temperature coupling steps.
517  * \param[in]     M                 Parrinello-Rahman scaling matrix.
518  */
519 template<AccelerationType accelerationType>
520 static void updateMDLeapfrogGeneral(int                   start,
521                                     int                   nrend,
522                                     bool                  doNoseHoover,
523                                     real                  dt,
524                                     real                  dtPressureCouple,
525                                     const rvec*           accel,
526                                     const t_mdatoms*      md,
527                                     const gmx_ekindata_t* ekind,
528                                     const matrix          box,
529                                     const rvec* gmx_restrict x,
530                                     rvec* gmx_restrict xprime,
531                                     rvec* gmx_restrict v,
532                                     const rvec* gmx_restrict f,
533                                     const double* gmx_restrict nh_vxi,
534                                     const int                  nsttcouple,
535                                     const matrix               M)
536 {
537     /* This is a version of the leap-frog integrator that supports
538      * all combinations of T-coupling, P-coupling and NEMD.
539      * Nose-Hoover uses the reversible leap-frog integrator from
540      * Holian et al. Phys Rev E 52(3) : 2338, 1995
541      */
542
543     gmx::ArrayRef<const t_grp_tcstat> tcstat  = ekind->tcstat;
544     gmx::ArrayRef<const t_grp_acc>    grpstat = ekind->grpstat;
545     const unsigned short*             cTC     = md->cTC;
546     const unsigned short*             cACC    = md->cACC;
547
548     const rvec* gmx_restrict invMassPerDim = md->invMassPerDim;
549
550     /* Initialize group values, changed later when multiple groups are used */
551     int ga = 0;
552     int gt = 0;
553
554     real omega_Z = 2 * static_cast<real>(M_PI) / box[ZZ][ZZ];
555
556     for (int n = start; n < nrend; n++)
557     {
558         if (cTC)
559         {
560             gt = cTC[n];
561         }
562         real lg = tcstat[gt].lambda;
563
564         rvec vRel;
565         real cosineZ, vCosine;
566 #ifdef __INTEL_COMPILER
567 #    pragma warning(disable : 280)
568 #endif
569         switch (accelerationType)
570         {
571             case AccelerationType::none: copy_rvec(v[n], vRel); break;
572             case AccelerationType::group:
573                 if (cACC)
574                 {
575                     ga = cACC[n];
576                 }
577                 /* Avoid scaling the group velocity */
578                 rvec_sub(v[n], grpstat[ga].u, vRel);
579                 break;
580             case AccelerationType::cosine:
581                 cosineZ = std::cos(x[n][ZZ] * omega_Z);
582                 vCosine = cosineZ * ekind->cosacc.vcos;
583                 /* Avoid scaling the cosine profile velocity */
584                 copy_rvec(v[n], vRel);
585                 vRel[XX] -= vCosine;
586                 break;
587         }
588
589         real factorNH = 0.0;
590         if (doNoseHoover)
591         {
592             /* Here we account for multiple time stepping, by increasing
593              * the Nose-Hoover correction by nsttcouple
594              * TODO: This can be pre-computed.
595              */
596             factorNH = 0.5 * nsttcouple * dt * nh_vxi[gt];
597         }
598
599         for (int d = 0; d < DIM; d++)
600         {
601             real vNew = (lg * vRel[d]
602                          + (f[n][d] * invMassPerDim[n][d] * dt - factorNH * vRel[d]
603                             - dtPressureCouple * iprod(M[d], vRel)))
604                         / (1 + factorNH);
605             switch (accelerationType)
606             {
607                 case AccelerationType::none: break;
608                 case AccelerationType::group:
609                     /* Add back the mean velocity and apply acceleration */
610                     vNew += grpstat[ga].u[d] + accel[ga][d] * dt;
611                     break;
612                 case AccelerationType::cosine:
613                     if (d == XX)
614                     {
615                         /* Add back the mean velocity and apply acceleration */
616                         vNew += vCosine + cosineZ * ekind->cosacc.cos_accel * dt;
617                     }
618                     break;
619             }
620             v[n][d]      = vNew;
621             xprime[n][d] = x[n][d] + vNew * dt;
622         }
623     }
624 }
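
/* For reference, a sketch of the velocity update applied per dimension d in the
 * loop above (restating the code):
 *
 *   vNew = ( lg * vRel[d] + f[n][d] * invMassPerDim[n][d] * dt
 *            - factorNH * vRel[d] - dtPressureCouple * (M[d] . vRel) ) / (1 + factorNH)
 *
 * with factorNH = 0.5 * nsttcouple * dt * nh_vxi[gt] for Nose-Hoover coupling and 0
 * otherwise, after which the mean group or cosine-profile velocity plus the
 * acceleration term is added back for the NEMD cases.
 */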
625
626 /*! \brief Handles the Leap-frog MD x and v integration */
627 static void do_update_md(int         start,
628                          int         nrend,
629                          real        dt,
630                          int64_t     step,
631                          const rvec* gmx_restrict x,
632                          rvec* gmx_restrict xprime,
633                          rvec* gmx_restrict v,
634                          const rvec* gmx_restrict f,
635                          const rvec* gmx_restrict accel,
636                          const int                etc,
637                          const int                epc,
638                          const int                nsttcouple,
639                          const int                nstpcouple,
640                          const t_mdatoms*         md,
641                          const gmx_ekindata_t*    ekind,
642                          const matrix             box,
643                          const double* gmx_restrict nh_vxi,
644                          const matrix               M)
645 {
646     GMX_ASSERT(nrend == start || xprime != x,
647                "For SIMD optimization certain compilers need to have xprime != x");
648
649     /* Note: Berendsen pressure scaling is handled after do_update_md() */
650     bool doTempCouple = (etc != etcNO && do_per_step(step + nsttcouple - 1, nsttcouple));
651     bool doNoseHoover = (etc == etcNOSEHOOVER && doTempCouple);
652     bool doParrinelloRahman =
653             (epc == epcPARRINELLORAHMAN && do_per_step(step + nstpcouple - 1, nstpcouple));
654     bool doPROffDiagonal = (doParrinelloRahman && (M[YY][XX] != 0 || M[ZZ][XX] != 0 || M[ZZ][YY] != 0));
655
656     real dtPressureCouple = (doParrinelloRahman ? nstpcouple * dt : 0);
657
658     /* NEMD (also cosine) acceleration is applied in updateMDLeapfrogGeneral */
659     bool doAcceleration = (ekind->bNEMD || ekind->cosacc.cos_accel != 0);
660
661     if (doNoseHoover || doPROffDiagonal || doAcceleration)
662     {
663         matrix stepM;
664         if (!doParrinelloRahman)
665         {
666             /* We should not apply PR scaling at this step */
667             clear_mat(stepM);
668         }
669         else
670         {
671             copy_mat(M, stepM);
672         }
673
674         if (!doAcceleration)
675         {
676             updateMDLeapfrogGeneral<AccelerationType::none>(
677                     start, nrend, doNoseHoover, dt, dtPressureCouple, accel, md, ekind, box, x, xprime, v, f, nh_vxi, nsttcouple, stepM);
678         }
679         else if (ekind->bNEMD)
680         {
681             updateMDLeapfrogGeneral<AccelerationType::group>(
682                     start, nrend, doNoseHoover, dt, dtPressureCouple, accel, md, ekind, box, x, xprime, v, f, nh_vxi, nsttcouple, stepM);
683         }
684         else
685         {
686             updateMDLeapfrogGeneral<AccelerationType::cosine>(
687                     start, nrend, doNoseHoover, dt, dtPressureCouple, accel, md, ekind, box, x, xprime, v, f, nh_vxi, nsttcouple, stepM);
688         }
689     }
690     else
691     {
692         /* Use a simple and thus more efficient integration loop. */
693         /* The simple loop does not check for particle type (so it can
694          * be vectorized), which means we need to clear the velocities
695          * of virtual sites in advance, when present. Note that vsite
696          * velocities are computed after update and constraints from
697          * their displacement.
698          */
699         if (md->haveVsites)
700         {
701             /* Note: The overhead of this loop is completely negligible */
702             clearVsiteVelocities(start, nrend, md->ptype, v);
703         }
704
705         /* We determine if we have a single T-coupling lambda value for all
706          * atoms. That allows for better SIMD acceleration in the template.
707          * If we do not do temperature coupling (in the run or this step),
708          * all scaling values are 1, so we effectively have a single value.
709          */
710         bool haveSingleTempScaleValue = (!doTempCouple || ekind->ngtc == 1);
711
712         /* Extract some pointers needed by all cases */
713         const unsigned short*             cTC           = md->cTC;
714         gmx::ArrayRef<const t_grp_tcstat> tcstat        = ekind->tcstat;
715         const rvec*                       invMassPerDim = md->invMassPerDim;
716
717         if (doParrinelloRahman)
718         {
719             GMX_ASSERT(!doPROffDiagonal,
720                        "updateMDLeapfrogSimple only supports diagonal Parrinello-Rahman scaling "
721                        "matrices");
722
723             rvec diagM;
724             for (int d = 0; d < DIM; d++)
725             {
726                 diagM[d] = M[d][d];
727             }
728
729             if (haveSingleTempScaleValue)
730             {
731                 updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::single, ApplyParrinelloRahmanVScaling::diagonal>(
732                         start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, diagM, x, xprime, v, f);
733             }
734             else
735             {
736                 updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::multiple, ApplyParrinelloRahmanVScaling::diagonal>(
737                         start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, diagM, x, xprime, v, f);
738             }
739         }
740         else
741         {
742             if (haveSingleTempScaleValue)
743             {
744                 /* Note that modern compilers are pretty good at vectorizing
745                  * updateMDLeapfrogSimple(). But the SIMD version will still
746                  * be faster because invMass lowers the cache pressure
747                  * compared to invMassPerDim.
748                  */
749 #if GMX_HAVE_SIMD_UPDATE
750                 /* Check if we can use invmass instead of invMassPerDim */
751                 if (!md->havePartiallyFrozenAtoms)
752                 {
753                     updateMDLeapfrogSimpleSimd<StoreUpdatedVelocities::yes>(
754                             start, nrend, dt, md->invmass, tcstat, x, xprime, v, f);
755                 }
756                 else
757 #endif
758                 {
759                     updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::single, ApplyParrinelloRahmanVScaling::no>(
760                             start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, nullptr, x, xprime, v, f);
761                 }
762             }
763             else
764             {
765                 updateMDLeapfrogSimple<StoreUpdatedVelocities::yes, NumTempScaleValues::multiple, ApplyParrinelloRahmanVScaling::no>(
766                         start, nrend, dt, dtPressureCouple, invMassPerDim, tcstat, cTC, nullptr, x, xprime, v, f);
767             }
768         }
769     }
770 }
771 /*! \brief Handles the Leap-frog MD x integration without storing the updated velocities */
772 static void doUpdateMDDoNotUpdateVelocities(int         start,
773                                             int         nrend,
774                                             real        dt,
775                                             const rvec* gmx_restrict x,
776                                             rvec* gmx_restrict xprime,
777                                             rvec* gmx_restrict v,
778                                             const rvec* gmx_restrict f,
779                                             const t_mdatoms&         md,
780                                             const gmx_ekindata_t&    ekind)
781 {
782     GMX_ASSERT(nrend == start || xprime != x,
783                "For SIMD optimization certain compilers need to have xprime != x");
784
785     gmx::ArrayRef<const t_grp_tcstat> tcstat = ekind.tcstat;
786
787     /* Check if we can use invmass instead of invMassPerDim */
788 #if GMX_HAVE_SIMD_UPDATE
789     if (!md.havePartiallyFrozenAtoms)
790     {
791         updateMDLeapfrogSimpleSimd<StoreUpdatedVelocities::no>(
792                 start, nrend, dt, md.invmass, tcstat, x, xprime, v, f);
793     }
794     else
795 #endif
796     {
797         updateMDLeapfrogSimple<StoreUpdatedVelocities::no, NumTempScaleValues::single, ApplyParrinelloRahmanVScaling::no>(
798                 start, nrend, dt, dt, md.invMassPerDim, tcstat, nullptr, nullptr, x, xprime, v, f);
799     }
800 }
801
802 static void do_update_vv_vel(int                  start,
803                              int                  nrend,
804                              real                 dt,
805                              const rvec           accel[],
806                              const ivec           nFreeze[],
807                              const real           invmass[],
808                              const unsigned short ptype[],
809                              const unsigned short cFREEZE[],
810                              const unsigned short cACC[],
811                              rvec                 v[],
812                              const rvec           f[],
813                              gmx_bool             bExtended,
814                              real                 veta,
815                              real                 alpha)
816 {
817     int  gf = 0, ga = 0;
818     int  n, d;
819     real g, mv1, mv2;
820
821     if (bExtended)
822     {
823         g   = 0.25 * dt * veta * alpha;
824         mv1 = std::exp(-g);
825         mv2 = gmx::series_sinhx(g);
826     }
827     else
828     {
829         mv1 = 1.0;
830         mv2 = 1.0;
831     }
832     for (n = start; n < nrend; n++)
833     {
834         real w_dt = invmass[n] * dt;
835         if (cFREEZE)
836         {
837             gf = cFREEZE[n];
838         }
839         if (cACC)
840         {
841             ga = cACC[n];
842         }
843
844         for (d = 0; d < DIM; d++)
845         {
846             if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
847             {
848                 v[n][d] = mv1 * (mv1 * v[n][d] + 0.5 * (w_dt * mv2 * f[n][d])) + 0.5 * accel[ga][d] * dt;
849             }
850             else
851             {
852                 v[n][d] = 0.0;
853             }
854         }
855     }
856 } /* do_update_vv_vel */
857
858 static void do_update_vv_pos(int                  start,
859                              int                  nrend,
860                              real                 dt,
861                              const ivec           nFreeze[],
862                              const unsigned short ptype[],
863                              const unsigned short cFREEZE[],
864                              const rvec           x[],
865                              rvec                 xprime[],
866                              const rvec           v[],
867                              gmx_bool             bExtended,
868                              real                 veta)
869 {
870     int  gf = 0;
871     int  n, d;
872     real g, mr1, mr2;
873
874     /* Would it make more sense if Parrinello-Rahman was put here? */
875     if (bExtended)
876     {
877         g   = 0.5 * dt * veta;
878         mr1 = std::exp(g);
879         mr2 = gmx::series_sinhx(g);
880     }
881     else
882     {
883         mr1 = 1.0;
884         mr2 = 1.0;
885     }
886
887     for (n = start; n < nrend; n++)
888     {
889
890         if (cFREEZE)
891         {
892             gf = cFREEZE[n];
893         }
894
895         for (d = 0; d < DIM; d++)
896         {
897             if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
898             {
899                 xprime[n][d] = mr1 * (mr1 * x[n][d] + mr2 * dt * v[n][d]);
900             }
901             else
902             {
903                 xprime[n][d] = x[n][d];
904             }
905         }
906     }
907 } /* do_update_vv_pos */
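
/* For reference, a sketch of the extended-ensemble scaling applied above, assuming
 * gmx::series_sinhx(x) evaluates sinh(x)/x: with bExtended, do_update_vv_vel() and
 * do_update_vv_pos() update
 *   v' = exp(-2g) * v + exp(-g) * (sinh(g)/g) * 0.5 * dt * f/m,  g = 0.25 * dt * veta * alpha
 *   x' = exp(2h) * x  + exp(h)  * (sinh(h)/h) * dt * v,          h = 0.5  * dt * veta
 * (plus the 0.5 * accel * dt term for the velocities); without bExtended the scaling
 * factors reduce to 1.
 */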
908
909 gmx_stochd_t::gmx_stochd_t(const t_inputrec& inputRecord)
910 {
911     const t_grpopts* opts = &inputRecord.opts;
912     const int        ngtc = opts->ngtc;
913
914     if (inputRecord.eI == eiBD)
915     {
916         bd_rf.resize(ngtc);
917     }
918     else if (EI_SD(inputRecord.eI))
919     {
920         sdc.resize(ngtc);
921         sdsig.resize(ngtc);
922
923         for (int gt = 0; gt < ngtc; gt++)
924         {
925             if (opts->tau_t[gt] > 0)
926             {
927                 sdc[gt].em = std::exp(-inputRecord.delta_t / opts->tau_t[gt]);
928             }
929             else
930             {
931                 /* No friction and noise on this group */
932                 sdc[gt].em = 1;
933             }
934         }
935     }
936     else if (ETC_ANDERSEN(inputRecord.etc))
937     {
938         randomize_group.resize(ngtc);
939         boltzfac.resize(ngtc);
940
941         /* For now, assume that all groups, if randomized, are randomized at the same rate, i.e. tau_t is the same,
942          * since constraint groups don't necessarily match up with temperature groups. This is checked in readir.c. */
943
944         for (int gt = 0; gt < ngtc; gt++)
945         {
946             real reft = std::max<real>(0, opts->ref_t[gt]);
947             if ((opts->tau_t[gt] > 0)
948                 && (reft > 0)) /* tau_t or ref_t = 0 means that no randomization is done */
949             {
950                 randomize_group[gt] = true;
951                 boltzfac[gt]        = BOLTZ * opts->ref_t[gt];
952             }
953             else
954             {
955                 randomize_group[gt] = false;
956             }
957         }
958     }
959 }
960
961 void Update::Impl::update_temperature_constants(const t_inputrec& inputRecord)
962 {
963     if (inputRecord.eI == eiBD)
964     {
965         if (inputRecord.bd_fric != 0)
966         {
967             for (int gt = 0; gt < inputRecord.opts.ngtc; gt++)
968             {
969                 sd_.bd_rf[gt] = std::sqrt(2.0 * BOLTZ * inputRecord.opts.ref_t[gt]
970                                           / (inputRecord.bd_fric * inputRecord.delta_t));
971             }
972         }
973         else
974         {
975             for (int gt = 0; gt < inputRecord.opts.ngtc; gt++)
976             {
977                 sd_.bd_rf[gt] = std::sqrt(2.0 * BOLTZ * inputRecord.opts.ref_t[gt]);
978             }
979         }
980     }
981     if (inputRecord.eI == eiSD1)
982     {
983         for (int gt = 0; gt < inputRecord.opts.ngtc; gt++)
984         {
985             real kT = BOLTZ * inputRecord.opts.ref_t[gt];
986             /* The mass is accounted for later, since this differs per atom */
987             sd_.sdsig[gt].V = std::sqrt(kT * (1 - sd_.sdc[gt].em * sd_.sdc[gt].em));
988         }
989     }
990 }
991
992 Update::Impl::Impl(const t_inputrec& inputRecord, BoxDeformation* boxDeformation) :
993     sd_(inputRecord),
994     deform_(boxDeformation)
995 {
996     update_temperature_constants(inputRecord);
997     xp_.resizeWithPadding(0);
998 }
999
1000 void Update::setNumAtoms(int numAtoms)
1001 {
1002
1003     impl_->xp()->resizeWithPadding(numAtoms);
1004 }
1005
1006 /*! \brief Sets the SD update type */
1007 enum class SDUpdate : int
1008 {
1009     ForcesOnly,
1010     FrictionAndNoiseOnly,
1011     Combined
1012 };
1013
1014 /*! \brief SD integrator update
1015  *
1016  * Two phases are required in the general case of a constrained
1017  * update, the first phase from the contribution of forces, before
1018  * applying constraints, and then a second phase applying the friction
1019  * and noise, and then further constraining. For details, see
1020  * Goga2012.
1021  *
1022  * Without constraints, the two phases can be combined, for
1023  * efficiency.
1024  *
1025  * Thus three instantiations of this templated function will be made,
1026  * two with only one contribution, and one with both contributions. */
1027 template<SDUpdate updateType>
1028 static void doSDUpdateGeneral(const gmx_stochd_t&  sd,
1029                               int                  start,
1030                               int                  nrend,
1031                               real                 dt,
1032                               const rvec           accel[],
1033                               const ivec           nFreeze[],
1034                               const real           invmass[],
1035                               const unsigned short ptype[],
1036                               const unsigned short cFREEZE[],
1037                               const unsigned short cACC[],
1038                               const unsigned short cTC[],
1039                               const rvec           x[],
1040                               rvec                 xprime[],
1041                               rvec                 v[],
1042                               const rvec           f[],
1043                               int64_t              step,
1044                               int                  seed,
1045                               const int*           gatindex)
1046 {
1047     // cTC, cACC and cFREEZE can be nullptr any time, but various
1048     // instantiations do not make sense with particular pointer
1049     // values.
1050     if (updateType == SDUpdate::ForcesOnly)
1051     {
1052         GMX_ASSERT(f != nullptr, "SD update with only forces requires forces");
1053         GMX_ASSERT(cTC == nullptr, "SD update with only forces cannot handle temperature groups");
1054     }
1055     if (updateType == SDUpdate::FrictionAndNoiseOnly)
1056     {
1057         GMX_ASSERT(f == nullptr, "SD update with only noise cannot handle forces");
1058         GMX_ASSERT(cACC == nullptr, "SD update with only noise cannot handle acceleration groups");
1059     }
1060     if (updateType == SDUpdate::Combined)
1061     {
1062         GMX_ASSERT(f != nullptr, "SD update with forces and noise requires forces");
1063     }
1064
1065     // Even 0 bits internal counter gives 2x64 ints (more than enough for three table lookups)
1066     gmx::ThreeFry2x64<0>                       rng(seed, gmx::RandomDomain::UpdateCoordinates);
1067     gmx::TabulatedNormalDistribution<real, 14> dist;
1068
1069     for (int n = start; n < nrend; n++)
1070     {
1071         int globalAtomIndex = gatindex ? gatindex[n] : n;
1072         rng.restart(step, globalAtomIndex);
1073         dist.reset();
1074
1075         real inverseMass = invmass[n];
1076         real invsqrtMass = std::sqrt(inverseMass);
1077
1078         int freezeGroup       = cFREEZE ? cFREEZE[n] : 0;
1079         int accelerationGroup = cACC ? cACC[n] : 0;
1080         int temperatureGroup  = cTC ? cTC[n] : 0;
1081
1082         for (int d = 0; d < DIM; d++)
1083         {
1084             if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[freezeGroup][d])
1085             {
1086                 if (updateType == SDUpdate::ForcesOnly)
1087                 {
1088                     real vn = v[n][d] + (inverseMass * f[n][d] + accel[accelerationGroup][d]) * dt;
1089                     v[n][d] = vn;
1090                     // Simple position update.
1091                     xprime[n][d] = x[n][d] + v[n][d] * dt;
1092                 }
1093                 else if (updateType == SDUpdate::FrictionAndNoiseOnly)
1094                 {
1095                     real vn = v[n][d];
1096                     v[n][d] = (vn * sd.sdc[temperatureGroup].em
1097                                + invsqrtMass * sd.sdsig[temperatureGroup].V * dist(rng));
1098                     // The previous phase already updated the
1099                     // positions with a full v*dt term that must
1100                     // now be half removed.
1101                     xprime[n][d] = xprime[n][d] + 0.5 * (v[n][d] - vn) * dt;
1102                 }
1103                 else
1104                 {
1105                     real vn = v[n][d] + (inverseMass * f[n][d] + accel[accelerationGroup][d]) * dt;
1106                     v[n][d] = (vn * sd.sdc[temperatureGroup].em
1107                                + invsqrtMass * sd.sdsig[temperatureGroup].V * dist(rng));
1108                     // Here we include half of the friction+noise
1109                     // update of v into the position update.
1110                     xprime[n][d] = x[n][d] + 0.5 * (vn + v[n][d]) * dt;
1111                 }
1112             }
1113             else
1114             {
1115                 // When using constraints, the update is split into
1116                 // two phases, but we only need to zero the update of
1117                 // virtual, shell or frozen particles in at most one
1118                 // of the phases.
1119                 if (updateType != SDUpdate::FrictionAndNoiseOnly)
1120                 {
1121                     v[n][d]      = 0.0;
1122                     xprime[n][d] = x[n][d];
1123                 }
1124             }
1125         }
1126     }
1127 }
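
/* For reference, a sketch of the combined SD update above (restating the code, per
 * dimension, for a non-frozen real atom):
 *   em    = exp(-dt / tau_t)          (sd.sdc[tg].em, set in gmx_stochd_t)
 *   sigma = sqrt(kT * (1 - em^2))     (sd.sdsig[tg].V, set in update_temperature_constants())
 *   v1 = v + (f/m + a) * dt
 *   v' = em * v1 + sigma / sqrt(m) * xi          with xi ~ N(0, 1)
 *   x' = x + 0.5 * (v1 + v') * dt
 * With constraints, the force part (ForcesOnly) and the friction+noise part
 * (FrictionAndNoiseOnly) are applied in two separate phases, as described in the
 * comment above doSDUpdateGeneral().
 */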
1128
1129 static void do_update_sd(int         start,
1130                          int         nrend,
1131                          real        dt,
1132                          int64_t     step,
1133                          const rvec* gmx_restrict x,
1134                          rvec* gmx_restrict xprime,
1135                          rvec* gmx_restrict v,
1136                          const rvec* gmx_restrict f,
1137                          const rvec               accel[],
1138                          const ivec               nFreeze[],
1139                          const real               invmass[],
1140                          const unsigned short     ptype[],
1141                          const unsigned short     cFREEZE[],
1142                          const unsigned short     cACC[],
1143                          const unsigned short     cTC[],
1144                          int                      seed,
1145                          const t_commrec*         cr,
1146                          const gmx_stochd_t&      sd,
1147                          bool                     haveConstraints)
1148 {
1149     if (haveConstraints)
1150     {
1151         // With constraints, the SD update is done in 2 parts
1152         doSDUpdateGeneral<SDUpdate::ForcesOnly>(
1153                 sd, start, nrend, dt, accel, nFreeze, invmass, ptype, cFREEZE, cACC, nullptr, x, xprime, v, f, step, seed, nullptr);
1154     }
1155     else
1156     {
1157         doSDUpdateGeneral<SDUpdate::Combined>(sd,
1158                                               start,
1159                                               nrend,
1160                                               dt,
1161                                               accel,
1162                                               nFreeze,
1163                                               invmass,
1164                                               ptype,
1165                                               cFREEZE,
1166                                               cACC,
1167                                               cTC,
1168                                               x,
1169                                               xprime,
1170                                               v,
1171                                               f,
1172                                               step,
1173                                               seed,
1174                                               DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr);
1175     }
1176 }
1177
1178 static void do_update_bd(int         start,
1179                          int         nrend,
1180                          real        dt,
1181                          int64_t     step,
1182                          const rvec* gmx_restrict x,
1183                          rvec* gmx_restrict xprime,
1184                          rvec* gmx_restrict v,
1185                          const rvec* gmx_restrict f,
1186                          const ivec               nFreeze[],
1187                          const real               invmass[],
1188                          const unsigned short     ptype[],
1189                          const unsigned short     cFREEZE[],
1190                          const unsigned short     cTC[],
1191                          real                     friction_coefficient,
1192                          const real*              rf,
1193                          int                      seed,
1194                          const int*               gatindex)
1195 {
1196     /* Note: these appear to be full-step velocities */
1197     int  gf = 0, gt = 0;
1198     real vn;
1199     real invfr = 0;
1200     int  n, d;
1201     // Even 0 bits of internal counter gives 2x64 ints per stream.
1202     // Each 64-bit value is enough for 4 normal distribution table numbers.
1203     gmx::ThreeFry2x64<0>                       rng(seed, gmx::RandomDomain::UpdateCoordinates);
1204     gmx::TabulatedNormalDistribution<real, 14> dist;
1205
1206     if (friction_coefficient != 0)
1207     {
1208         invfr = 1.0 / friction_coefficient;
1209     }
1210
1211     for (n = start; (n < nrend); n++)
1212     {
1213         int ng = gatindex ? gatindex[n] : n;
1214
1215         rng.restart(step, ng);
1216         dist.reset();
1217
1218         if (cFREEZE)
1219         {
1220             gf = cFREEZE[n];
1221         }
1222         if (cTC)
1223         {
1224             gt = cTC[n];
1225         }
1226         for (d = 0; (d < DIM); d++)
1227         {
1228             if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
1229             {
1230                 if (friction_coefficient != 0)
1231                 {
1232                     vn = invfr * f[n][d] + rf[gt] * dist(rng);
1233                 }
1234                 else
1235                 {
1236                     /* NOTE: invmass = 2/(mass*friction_constant*dt) */
1237                     vn = 0.5 * invmass[n] * f[n][d] * dt
1238                          + std::sqrt(0.5 * invmass[n]) * rf[gt] * dist(rng);
1239                 }
1240
1241                 v[n][d]      = vn;
1242                 xprime[n][d] = x[n][d] + vn * dt;
1243             }
1244             else
1245             {
1246                 v[n][d]      = 0.0;
1247                 xprime[n][d] = x[n][d];
1248             }
1249         }
1250     }
1251 }
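
/* For reference (sketch of the step above): with a non-zero friction coefficient
 * gamma (bd_fric), the Brownian dynamics position update is
 *   x' = x + dt * ( f/gamma + rf[gt] * xi )      with xi ~ N(0, 1)
 * where rf[gt] = sqrt(2 kB T / (gamma * dt)) is set in update_temperature_constants(),
 * so the random displacement has variance 2 kB T dt / gamma. With bd_fric == 0, a
 * per-atom friction derived from the mass is used instead, through the redefined
 * invmass noted in the loop.
 */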
1252
1253 extern void init_ekinstate(ekinstate_t* ekinstate, const t_inputrec* ir)
1254 {
1255     ekinstate->ekin_n = ir->opts.ngtc;
1256     snew(ekinstate->ekinh, ekinstate->ekin_n);
1257     snew(ekinstate->ekinf, ekinstate->ekin_n);
1258     snew(ekinstate->ekinh_old, ekinstate->ekin_n);
1259     ekinstate->ekinscalef_nhc.resize(ekinstate->ekin_n);
1260     ekinstate->ekinscaleh_nhc.resize(ekinstate->ekin_n);
1261     ekinstate->vscale_nhc.resize(ekinstate->ekin_n);
1262     ekinstate->dekindl          = 0;
1263     ekinstate->mvcos            = 0;
1264     ekinstate->hasReadEkinState = false;
1265 }
1266
1267 void update_ekinstate(ekinstate_t* ekinstate, const gmx_ekindata_t* ekind)
1268 {
1269     int i;
1270
1271     for (i = 0; i < ekinstate->ekin_n; i++)
1272     {
1273         copy_mat(ekind->tcstat[i].ekinh, ekinstate->ekinh[i]);
1274         copy_mat(ekind->tcstat[i].ekinf, ekinstate->ekinf[i]);
1275         copy_mat(ekind->tcstat[i].ekinh_old, ekinstate->ekinh_old[i]);
1276         ekinstate->ekinscalef_nhc[i] = ekind->tcstat[i].ekinscalef_nhc;
1277         ekinstate->ekinscaleh_nhc[i] = ekind->tcstat[i].ekinscaleh_nhc;
1278         ekinstate->vscale_nhc[i]     = ekind->tcstat[i].vscale_nhc;
1279     }
1280
1281     copy_mat(ekind->ekin, ekinstate->ekin_total);
1282     ekinstate->dekindl = ekind->dekindl;
1283     ekinstate->mvcos   = ekind->cosacc.mvcos;
1284 }
1285
1286 void restore_ekinstate_from_state(const t_commrec* cr, gmx_ekindata_t* ekind, const ekinstate_t* ekinstate)
1287 {
1288     int i, n;
1289
1290     if (MASTER(cr))
1291     {
1292         for (i = 0; i < ekinstate->ekin_n; i++)
1293         {
1294             copy_mat(ekinstate->ekinh[i], ekind->tcstat[i].ekinh);
1295             copy_mat(ekinstate->ekinf[i], ekind->tcstat[i].ekinf);
1296             copy_mat(ekinstate->ekinh_old[i], ekind->tcstat[i].ekinh_old);
1297             ekind->tcstat[i].ekinscalef_nhc = ekinstate->ekinscalef_nhc[i];
1298             ekind->tcstat[i].ekinscaleh_nhc = ekinstate->ekinscaleh_nhc[i];
1299             ekind->tcstat[i].vscale_nhc     = ekinstate->vscale_nhc[i];
1300         }
1301
1302         copy_mat(ekinstate->ekin_total, ekind->ekin);
1303
1304         ekind->dekindl      = ekinstate->dekindl;
1305         ekind->cosacc.mvcos = ekinstate->mvcos;
1306         n                   = ekinstate->ekin_n;
1307     }
1308
1309     if (PAR(cr))
1310     {
1311         gmx_bcast(sizeof(n), &n, cr->mpi_comm_mygroup);
1312         for (i = 0; i < n; i++)
1313         {
1314             gmx_bcast(DIM * DIM * sizeof(ekind->tcstat[i].ekinh[0][0]),
1315                       ekind->tcstat[i].ekinh[0],
1316                       cr->mpi_comm_mygroup);
1317             gmx_bcast(DIM * DIM * sizeof(ekind->tcstat[i].ekinf[0][0]),
1318                       ekind->tcstat[i].ekinf[0],
1319                       cr->mpi_comm_mygroup);
1320             gmx_bcast(DIM * DIM * sizeof(ekind->tcstat[i].ekinh_old[0][0]),
1321                       ekind->tcstat[i].ekinh_old[0],
1322                       cr->mpi_comm_mygroup);
1323
1324             gmx_bcast(sizeof(ekind->tcstat[i].ekinscalef_nhc),
1325                       &(ekind->tcstat[i].ekinscalef_nhc),
1326                       cr->mpi_comm_mygroup);
1327             gmx_bcast(sizeof(ekind->tcstat[i].ekinscaleh_nhc),
1328                       &(ekind->tcstat[i].ekinscaleh_nhc),
1329                       cr->mpi_comm_mygroup);
1330             gmx_bcast(sizeof(ekind->tcstat[i].vscale_nhc), &(ekind->tcstat[i].vscale_nhc), cr->mpi_comm_mygroup);
1331         }
1332         gmx_bcast(DIM * DIM * sizeof(ekind->ekin[0][0]), ekind->ekin[0], cr->mpi_comm_mygroup);
1333
1334         gmx_bcast(sizeof(ekind->dekindl), &ekind->dekindl, cr->mpi_comm_mygroup);
1335         gmx_bcast(sizeof(ekind->cosacc.mvcos), &ekind->cosacc.mvcos, cr->mpi_comm_mygroup);
1336     }
1337 }
1338
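/* Note: atoms are assigned to threads in blocks of blockSize (the SIMD real
 * width when SIMD updates are enabled, otherwise 1), so SIMD update kernels
 * never have a block split across two threads; the last thread is extended to
 * cover any remainder. Example: numAtoms=10, blockSize=4, numThreads=3 gives
 * 3 blocks and the ranges [0,4), [4,8) and [8,10).
 */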
1339 void getThreadAtomRange(int numThreads, int threadIndex, int numAtoms, int* startAtom, int* endAtom)
1340 {
1341 #if GMX_HAVE_SIMD_UPDATE
1342     constexpr int blockSize = GMX_SIMD_REAL_WIDTH;
1343 #else
1344     constexpr int blockSize = 1;
1345 #endif
1346     int numBlocks = (numAtoms + blockSize - 1) / blockSize;
1347
1348     *startAtom = ((numBlocks * threadIndex) / numThreads) * blockSize;
1349     *endAtom   = ((numBlocks * (threadIndex + 1)) / numThreads) * blockSize;
1350     if (threadIndex == numThreads - 1)
1351     {
1352         *endAtom = numAtoms;
1353     }
1354 }
1355
1356 void Update::Impl::update_sd_second_half(const t_inputrec& inputRecord,
1357                                          int64_t           step,
1358                                          real*             dvdlambda,
1359                                          const t_mdatoms*  md,
1360                                          t_state*          state,
1361                                          const t_commrec*  cr,
1362                                          t_nrnb*           nrnb,
1363                                          gmx_wallcycle_t   wcycle,
1364                                          gmx::Constraints* constr,
1365                                          bool              do_log,
1366                                          bool              do_ene)
1367 {
1368     if (!constr)
1369     {
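        // Without constraints the complete SD update, including friction and
        // noise, was already applied in update_coords(), so there is nothing
        // left to do here.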
1370         return;
1371     }
1372     if (inputRecord.eI == eiSD1)
1373     {
1374         int homenr = md->homenr;
1375
1376         /* Cast delta_t from double to real to make the integrators faster.
1377          * The only reason for having delta_t double is to get accurate values
1378          * for t=delta_t*step when step is larger than float precision.
1379          * For integration dt the accuracy of real suffices, since with
1380          * integral += dt*integrand the increment is nearly always (much) smaller
1381          * than the integral (and the integrand has real precision).
1382          */
1383         real dt = inputRecord.delta_t;
1384
1385         wallcycle_start(wcycle, ewcUPDATE);
1386
1387         int nth = gmx_omp_nthreads_get(emntUpdate);
1388
1389 #pragma omp parallel for num_threads(nth) schedule(static)
1390         for (int th = 0; th < nth; th++)
1391         {
1392             try
1393             {
1394                 int start_th, end_th;
1395                 getThreadAtomRange(nth, th, homenr, &start_th, &end_th);
1396
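                // Apply only the friction and noise part of the SD update here;
                // the force and velocity part was applied in update_coords()
                // before the first constraint call.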
1397                 doSDUpdateGeneral<SDUpdate::FrictionAndNoiseOnly>(
1398                         sd_,
1399                         start_th,
1400                         end_th,
1401                         dt,
1402                         inputRecord.opts.acc,
1403                         inputRecord.opts.nFreeze,
1404                         md->invmass,
1405                         md->ptype,
1406                         md->cFREEZE,
1407                         nullptr,
1408                         md->cTC,
1409                         state->x.rvec_array(),
1410                         xp_.rvec_array(),
1411                         state->v.rvec_array(),
1412                         nullptr,
1413                         step,
1414                         inputRecord.ld_seed,
1415                         DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr);
1416             }
1417             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1418         }
1419         inc_nrnb(nrnb, eNR_UPDATE, homenr);
1420         wallcycle_stop(wcycle, ewcUPDATE);
1421
1422         /* Constrain the coordinates xp_ for half a time step */
1423         bool computeVirial = false;
1424         constr->apply(do_log,
1425                       do_ene,
1426                       step,
1427                       1,
1428                       0.5,
1429                       state->x.arrayRefWithPadding(),
1430                       xp_.arrayRefWithPadding(),
1431                       ArrayRef<RVec>(),
1432                       state->box,
1433                       state->lambda[efptBONDED],
1434                       dvdlambda,
1435                       state->v.arrayRefWithPadding(),
1436                       computeVirial,
1437                       nullptr,
1438                       ConstraintVariable::Positions);
1439     }
1440 }
1441
1442 void Update::Impl::finish_update(const t_inputrec& inputRecord,
1443                                  const t_mdatoms*  md,
1444                                  t_state*          state,
1445                                  gmx_wallcycle_t   wcycle,
1446                                  const bool        haveConstraints)
1447 {
1448     /* NOTE: Currently we always integrate to a temporary buffer and
1449      * then copy the results back here.
1450      */
1451
1452     wallcycle_start_nocount(wcycle, ewcUPDATE);
1453
1454     const int homenr = md->homenr;
1455     auto      xp     = makeConstArrayRef(xp_).subArray(0, homenr);
1456     auto      x      = makeArrayRef(state->x).subArray(0, homenr);
1457
1458     if (md->havePartiallyFrozenAtoms && haveConstraints)
1459     {
1460         /* For atoms that are frozen along some, but not all, dimensions,
1461          * the constraints may still have moved them along the frozen dimensions.
1462          * To keep those degrees of freedom frozen, we do not copy them back here.
1463          */
1464         const ivec* nFreeze = inputRecord.opts.nFreeze;
1465
1466         for (int i = 0; i < homenr; i++)
1467         {
1468             const int g = md->cFREEZE[i];
1469
1470             for (int d = 0; d < DIM; d++)
1471             {
1472                 if (nFreeze[g][d] == 0)
1473                 {
1474                     x[i][d] = xp[i][d];
1475                 }
1476             }
1477         }
1478     }
1479     else
1480     {
1481         /* There are either no frozen atoms, or only fully frozen atoms, which
1482          * the update has not moved, so we can simply copy all coordinates.
1483          */
1484         int gmx_unused nth = gmx_omp_nthreads_get(emntUpdate);
1485 #pragma omp parallel for num_threads(nth) schedule(static)
1486         for (int i = 0; i < homenr; i++)
1487         {
1488             // Trivial assignment that cannot throw, so no try/catch is needed in this OpenMP loop
1489             x[i] = xp[i];
1490         }
1491     }
1492
1493     wallcycle_stop(wcycle, ewcUPDATE);
1494 }
1495
1496 void Update::Impl::update_coords(const t_inputrec&                                inputRecord,
1497                                  int64_t                                          step,
1498                                  const t_mdatoms*                                 md,
1499                                  t_state*                                         state,
1500                                  const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
1501                                  const t_fcdata&                                  fcdata,
1502                                  const gmx_ekindata_t*                            ekind,
1503                                  const matrix                                     M,
1504                                  int                                              updatePart,
1505                                  const t_commrec*                                 cr,
1506                                  const bool                                       haveConstraints)
1507 {
1508     /* The separate velocity half-steps are only meaningful for velocity Verlet integrators */
1509     if ((updatePart == etrtVELOCITY1 || updatePart == etrtVELOCITY2) && !EI_VV(inputRecord.eI))
1510     {
1511         gmx_incons("update_coords called for velocity without VV integrator");
1512     }
1513
1514     int homenr = md->homenr;
1515
1516     /* Cast to real for faster code, no loss in precision (see comment above) */
1517     real dt = inputRecord.delta_t;
1518
1519     /* We need to update the NMR restraint history when time averaging is used */
1520     if (state->flags & (1 << estDISRE_RM3TAV))
1521     {
1522         update_disres_history(*fcdata.disres, &state->hist);
1523     }
1524     if (state->flags & (1 << estORIRE_DTAV))
1525     {
1526         update_orires_history(*fcdata.orires, &state->hist);
1527     }
1528
1529     /* ############# START The update of velocities and positions ######### */
1530     int nth = gmx_omp_nthreads_get(emntUpdate);
1531
1532 #pragma omp parallel for num_threads(nth) schedule(static)
1533     for (int th = 0; th < nth; th++)
1534     {
1535         try
1536         {
1537             int start_th, end_th;
1538             getThreadAtomRange(nth, th, homenr, &start_th, &end_th);
1539
1540             const rvec* x_rvec  = state->x.rvec_array();
1541             rvec*       xp_rvec = xp_.rvec_array();
1542             rvec*       v_rvec  = state->v.rvec_array();
1543             const rvec* f_rvec  = as_rvec_array(f.unpaddedConstArrayRef().data());
1544
1545             switch (inputRecord.eI)
1546             {
1547                 case (eiMD):
1548                     do_update_md(start_th,
1549                                  end_th,
1550                                  dt,
1551                                  step,
1552                                  x_rvec,
1553                                  xp_rvec,
1554                                  v_rvec,
1555                                  f_rvec,
1556                                  inputRecord.opts.acc,
1557                                  inputRecord.etc,
1558                                  inputRecord.epc,
1559                                  inputRecord.nsttcouple,
1560                                  inputRecord.nstpcouple,
1561                                  md,
1562                                  ekind,
1563                                  state->box,
1564                                  state->nosehoover_vxi.data(),
1565                                  M);
1566                     break;
1567                 case (eiSD1):
1568                     do_update_sd(start_th,
1569                                  end_th,
1570                                  dt,
1571                                  step,
1572                                  x_rvec,
1573                                  xp_rvec,
1574                                  v_rvec,
1575                                  f_rvec,
1576                                  inputRecord.opts.acc,
1577                                  inputRecord.opts.nFreeze,
1578                                  md->invmass,
1579                                  md->ptype,
1580                                  md->cFREEZE,
1581                                  md->cACC,
1582                                  md->cTC,
1583                                  inputRecord.ld_seed,
1584                                  cr,
1585                                  sd_,
1586                                  haveConstraints);
1587                     break;
1588                 case (eiBD):
1589                     do_update_bd(start_th,
1590                                  end_th,
1591                                  dt,
1592                                  step,
1593                                  x_rvec,
1594                                  xp_rvec,
1595                                  v_rvec,
1596                                  f_rvec,
1597                                  inputRecord.opts.nFreeze,
1598                                  md->invmass,
1599                                  md->ptype,
1600                                  md->cFREEZE,
1601                                  md->cTC,
1602                                  inputRecord.bd_fric,
1603                                  sd_.bd_rf.data(),
1604                                  inputRecord.ld_seed,
1605                                  DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr);
1606                     break;
1607                 case (eiVV):
1608                 case (eiVVAK):
1609                 {
1610                     gmx_bool bExtended = (inputRecord.etc == etcNOSEHOOVER || inputRecord.epc == epcPARRINELLORAHMAN
1611                                           || inputRecord.epc == epcMTTK);
1612
1613                     /* assuming the barostat is coupled to temperature-coupling group 0 */
1614                     real alpha = 1.0 + DIM / static_cast<real>(inputRecord.opts.nrdf[0]);
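                    /* alpha is the MTTK-style correction factor 1 + d/Nf that
                     * enters the velocity update when an extended-ensemble
                     * barostat is coupled; Nf is the number of degrees of
                     * freedom of temperature-coupling group 0.
                     */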
1615                     switch (updatePart)
1616                     {
1617                         case etrtVELOCITY1:
1618                         case etrtVELOCITY2:
1619                             do_update_vv_vel(start_th,
1620                                              end_th,
1621                                              dt,
1622                                              inputRecord.opts.acc,
1623                                              inputRecord.opts.nFreeze,
1624                                              md->invmass,
1625                                              md->ptype,
1626                                              md->cFREEZE,
1627                                              md->cACC,
1628                                              v_rvec,
1629                                              f_rvec,
1630                                              bExtended,
1631                                              state->veta,
1632                                              alpha);
1633                             break;
1634                         case etrtPOSITION:
1635                             do_update_vv_pos(start_th,
1636                                              end_th,
1637                                              dt,
1638                                              inputRecord.opts.nFreeze,
1639                                              md->ptype,
1640                                              md->cFREEZE,
1641                                              x_rvec,
1642                                              xp_rvec,
1643                                              v_rvec,
1644                                              bExtended,
1645                                              state->veta);
1646                             break;
1647                     }
1648                     break;
1649                 }
1650                 default: gmx_fatal(FARGS, "Don't know how to update coordinates");
1651             }
1652         }
1653         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1654     }
1655 }
1656
1657 void Update::Impl::update_for_constraint_virial(const t_inputrec& inputRecord,
1658                                                 const t_mdatoms&  md,
1659                                                 const t_state&    state,
1660                                                 const gmx::ArrayRefWithPadding<const gmx::RVec>& f,
1661                                                 const gmx_ekindata_t& ekind)
1662 {
1663     GMX_ASSERT(inputRecord.eI == eiMD || inputRecord.eI == eiSD1,
1664                "Only leap-frog style integrators (md, sd) are supported here");
1665
1666     // Cast to real for faster code, no loss in precision
1667     const real dt = inputRecord.delta_t;
1668
1669     const int nth = gmx_omp_nthreads_get(emntUpdate);
1670
1671 #pragma omp parallel for num_threads(nth) schedule(static)
1672     for (int th = 0; th < nth; th++)
1673     {
1674         try
1675         {
1676             int start_th, end_th;
1677             getThreadAtomRange(nth, th, md.homenr, &start_th, &end_th);
1678
1679             const rvec* x_rvec  = state.x.rvec_array();
1680             rvec*       xp_rvec = xp_.rvec_array();
1681             rvec*       v_rvec  = const_cast<rvec*>(state.v.rvec_array());
1682             const rvec* f_rvec  = as_rvec_array(f.unpaddedConstArrayRef().data());
1683
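            // Plain leap-frog position update that reads, but does not modify,
            // the velocities; the resulting xp_ is only used by a subsequent
            // constraint call to compute the constraint virial, which is why
            // v_rvec can safely be obtained through a const_cast above.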
1684             doUpdateMDDoNotUpdateVelocities(
1685                     start_th, end_th, dt, x_rvec, xp_rvec, v_rvec, f_rvec, md, ekind);
1686         }
1687         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1688     }
1689 }