src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53
  54 #include "gromacs/awh/awh.h"
  55 #include "gromacs/commandline/filenm.h"
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/domdec_struct.h"
  61 #include "gromacs/domdec/gpuhaloexchange.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/domdec/partition.h"
  64 #include "gromacs/essentialdynamics/edsam.h"
  65 #include "gromacs/ewald/pme_load_balancing.h"
  66 #include "gromacs/ewald/pme_pp.h"
  67 #include "gromacs/fileio/trxio.h"
  68 #include "gromacs/gmxlib/network.h"
  69 #include "gromacs/gmxlib/nrnb.h"
  70 #include "gromacs/gpu_utils/device_stream_manager.h"
  71 #include "gromacs/gpu_utils/gpu_utils.h"
  72 #include "gromacs/imd/imd.h"
  73 #include "gromacs/listed_forces/listed_forces.h"
  74 #include "gromacs/math/functions.h"
  75 #include "gromacs/math/utilities.h"
  76 #include "gromacs/math/vec.h"
  77 #include "gromacs/math/vectypes.h"
  78 #include "gromacs/mdlib/checkpointhandler.h"
  79 #include "gromacs/mdlib/compute_io.h"
  80 #include "gromacs/mdlib/constr.h"
  81 #include "gromacs/mdlib/coupling.h"
  82 #include "gromacs/mdlib/ebin.h"
  83 #include "gromacs/mdlib/enerdata_utils.h"
  84 #include "gromacs/mdlib/energyoutput.h"
  85 #include "gromacs/mdlib/expanded.h"
  86 #include "gromacs/mdlib/force.h"
  87 #include "gromacs/mdlib/force_flags.h"
  88 #include "gromacs/mdlib/forcerec.h"
  89 #include "gromacs/mdlib/md_support.h"
  90 #include "gromacs/mdlib/mdatoms.h"
  91 #include "gromacs/mdlib/mdoutf.h"
  92 #include "gromacs/mdlib/membed.h"
  93 #include "gromacs/mdlib/resethandler.h"
  94 #include "gromacs/mdlib/sighandler.h"
  95 #include "gromacs/mdlib/simulationsignal.h"
  96 #include "gromacs/mdlib/stat.h"
  97 #include "gromacs/mdlib/stophandler.h"
  98 #include "gromacs/mdlib/tgroup.h"
  99 #include "gromacs/mdlib/trajectory_writing.h"
 100 #include "gromacs/mdlib/update.h"
 101 #include "gromacs/mdlib/update_constrain_gpu.h"
 102 #include "gromacs/mdlib/vcm.h"
 103 #include "gromacs/mdlib/vsite.h"
 104 #include "gromacs/mdrunutility/handlerestart.h"
 105 #include "gromacs/mdrunutility/multisim.h"
 106 #include "gromacs/mdrunutility/printtime.h"
 107 #include "gromacs/mdtypes/awh_history.h"
 108 #include "gromacs/mdtypes/awh_params.h"
 109 #include "gromacs/mdtypes/commrec.h"
 110 #include "gromacs/mdtypes/df_history.h"
 111 #include "gromacs/mdtypes/energyhistory.h"
 112 #include "gromacs/mdtypes/fcdata.h"
 113 #include "gromacs/mdtypes/forcerec.h"
 114 #include "gromacs/mdtypes/group.h"
 115 #include "gromacs/mdtypes/inputrec.h"
 116 #include "gromacs/mdtypes/interaction_const.h"
 117 #include "gromacs/mdtypes/md_enums.h"
 118 #include "gromacs/mdtypes/mdatom.h"
 119 #include "gromacs/mdtypes/mdrunoptions.h"
 120 #include "gromacs/mdtypes/observableshistory.h"
 121 #include "gromacs/mdtypes/pullhistory.h"
 122 #include "gromacs/mdtypes/simulation_workload.h"
 123 #include "gromacs/mdtypes/state.h"
 124 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 125 #include "gromacs/modularsimulator/energydata.h"
 126 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 127 #include "gromacs/nbnxm/nbnxm.h"
 128 #include "gromacs/pbcutil/pbc.h"
 129 #include "gromacs/pulling/output.h"
 130 #include "gromacs/pulling/pull.h"
 131 #include "gromacs/swap/swapcoords.h"
 132 #include "gromacs/timing/wallcycle.h"
 133 #include "gromacs/timing/walltime_accounting.h"
 134 #include "gromacs/topology/atoms.h"
 135 #include "gromacs/topology/idef.h"
 136 #include "gromacs/topology/mtop_util.h"
 137 #include "gromacs/topology/topology.h"
 138 #include "gromacs/trajectory/trajectoryframe.h"
 139 #include "gromacs/utility/basedefinitions.h"
 140 #include "gromacs/utility/cstringutil.h"
 141 #include "gromacs/utility/fatalerror.h"
 142 #include "gromacs/utility/logger.h"
 143 #include "gromacs/utility/real.h"
 144 #include "gromacs/utility/smalloc.h"
 145
 146 #include "legacysimulator.h"
 147 #include "replicaexchange.h"
 148 #include "shellfc.h"
 149
 150 #if GMX_FAHCORE
 151 #    include "corewrap.h"
 152 #endif
 153
 154 using gmx::SimulationSignaller;
 155
 156 void gmx::LegacySimulator::do_md()
 157 {
 158     // TODO Historically, the EM and MD "integrators" used different
 159     // names for the t_inputrec *parameter, but these must have the
 160     // same name, now that it's a member of a struct. We use this ir
 161     // alias to avoid a large ripple of nearly useless changes.
 162     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 163     // will go away eventually.
 164     t_inputrec*  ir = inputrec;
 165     int64_t      step, step_rel;
 166     double       t, t0 = ir->init_t, lam0[efptNR];
 167     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 168     gmx_bool     bNS, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 169     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 170     gmx_bool     do_ene, do_log, do_verbose;
 171     gmx_bool     bMasterState;
 172     unsigned int force_flags;
 173     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 174     int    i, m;
 175     rvec   mu_tot;
 176     matrix pressureCouplingMu, M;
 177     gmx_repl_ex_t               repl_ex = nullptr;
 178     PaddedHostVector<gmx::RVec> f{};
 179     gmx_global_stat_t           gstat;
 180     gmx_shellfc_t*              shellfc;
 181     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 182     gmx_bool                    bTemp, bPres, bTrotter;
 183     real                        dvdl_constr;
 184     std::vector<RVec>           cbuf;
 185     matrix                      lastbox;
 186     int                         lamnew = 0;
 187     /* for FEP */
 188     int       nstfep = 0;
 189     double    cycles;
 190     real      saved_conserved_quantity = 0;
 191     real      last_ekin                = 0;
 192     t_extmass MassQ;
 193     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 194
 195     /* PME load balancing data for GPU kernels */
 196     gmx_bool bPMETune         = FALSE;
 197     gmx_bool bPMETunePrinting = FALSE;
 198
 199     bool bInteractiveMDstep = false;
 200
 201     /* Domain decomposition could incorrectly miss a bonded
 202        interaction, but checking for that requires a global
 203        communication stage, which does not otherwise happen in DD
 204        code. So we do that alongside the first global energy reduction
 205        after a new DD is made. These variables handle whether the
 206        check happens, and the result it returns. */
 207     bool shouldCheckNumberOfBondedInteractions = false;
 208     int  totalNumberOfBondedInteractions       = -1;
 209
 210     SimulationSignals signals;
 211     // Most global communnication stages don't propagate mdrun
 212     // signals, and will use this object to achieve that.
 213     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 214
 215     if (!mdrunOptions.writeConfout)
 216     {
 217         // This is on by default, and the main known use case for
 218         // turning it off is for convenience in benchmarking, which is
 219         // something that should not show up in the general user
 220         // interface.
 221         GMX_LOG(mdlog.info)
 222                 .asParagraph()
 223                 .appendText(
 224                         "The -noconfout functionality is deprecated, and may be removed in a "
 225                         "future version.");
 226     }
 227
 228     /* md-vv uses averaged full step velocities for T-control
 229        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 230        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 231     bTrotter = (EI_VV(ir->eI)
 232                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 233
 234     const bool bRerunMD = false;
 235
 236     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 237     bGStatEveryStep   = (nstglobalcomm == 1);
 238
 239     const SimulationGroups* groups = &top_global->groups;
 240
 241     std::unique_ptr<EssentialDynamics> ed = nullptr;
 242     if (opt2bSet("-ei", nfile, fnm))
 243     {
 244         /* Initialize essential dynamics sampling */
 245         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 246                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 247     }
 248     else if (observablesHistory->edsamHistory)
 249     {
 250         gmx_fatal(FARGS,
 251                   "The checkpoint is from a run with essential dynamics sampling, "
 252                   "but the current run did not specify the -ei option. "
 253                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 254     }
 255
 256     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 257     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 258     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda, lam0);
 259     Update     upd(*ir, deform);
 260     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 261     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 262
 263     const t_fcdata& fcdata = fr->listedForces->fcdata();
 264
 265     bool simulationsShareState = false;
 266     int  nstSignalComm         = nstglobalcomm;
 267     {
 268         // TODO This implementation of ensemble orientation restraints is nasty because
 269         // a user can't just do multi-sim with single-sim orientation restraints.
 270         bool usingEnsembleRestraints =
 271                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
 272         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 273
 274         // Replica exchange, ensemble restraints and AWH need all
 275         // simulations to remain synchronized, so they need
 276         // checkpoints and stop conditions to act on the same step, so
 277         // the propagation of such signals must take place between
 278         // simulations, not just within simulations.
 279         // TODO: Make algorithm initializers set these flags.
 280         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 281
 282         if (simulationsShareState)
 283         {
 284             // Inter-simulation signal communication does not need to happen
 285             // often, so we use a minimum of 200 steps to reduce overhead.
 286             const int c_minimumInterSimulationSignallingInterval = 200;
 287             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 288                             * nstglobalcomm;
 289         }
 290     }
 291
 292     if (startingBehavior != StartingBehavior::RestartWithAppending)
 293     {
 294         pleaseCiteCouplingAlgorithms(fplog, *ir);
 295     }
 296     gmx_mdoutf* outf =
 297             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 298                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 299     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 300                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 301
 302     gstat = global_stat_init(ir);
 303
 304     /* Check for polarizable models and flexible constraints */
 305     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 306                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 307
 308     {
 309         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 310         if ((io > 2000) && MASTER(cr))
 311         {
 312             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 313         }
 314     }
 315
 316     // Local state only becomes valid now.
 317     std::unique_ptr<t_state> stateInstance;
 318     t_state*                 state;
 319
 320
 321     gmx_localtop_t top(top_global->ffparams);
 322
 323     auto mdatoms = mdAtoms->mdatoms();
 324
 325     std::unique_ptr<UpdateConstrainGpu> integrator;
 326
 327     if (DOMAINDECOMP(cr))
 328     {
 329         stateInstance = std::make_unique<t_state>();
 330         state         = stateInstance.get();
 331         dd_init_local_state(cr->dd, state_global, state);
 332
 333         /* Distribute the charge groups over the nodes from the master node */
 334         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 335                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 336                             nrnb, nullptr, FALSE);
 337         shouldCheckNumberOfBondedInteractions = true;
 338         upd.setNumAtoms(state->natoms);
 339     }
 340     else
 341     {
 342         state_change_natoms(state_global, state_global->natoms);
 343         /* Copy the pointer to the global state */
 344         state = state_global;
 345
 346         /* Generate and initialize new topology */
 347         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 348
 349         upd.setNumAtoms(state->natoms);
 350     }
 351
 352     const auto& simulationWork     = runScheduleWork->simulationWork;
 353     const bool  useGpuForPme       = simulationWork.useGpuPme;
 354     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 355     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 356     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 357
 358     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 359
 360     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 361     if (useGpuForUpdate)
 362     {
 363         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 364                                    || constr->numConstraintsTotal() == 0,
 365                            "Constraints in domain decomposition are only supported with update "
 366                            "groups if using GPU update.\n");
 367         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 368                                    || constr->numConstraintsTotal() == 0,
 369                            "SHAKE is not supported with GPU update.");
 370         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 371                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 372                            "the GPU to use GPU update.\n");
 373         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 374                            "Only the md integrator is supported with the GPU update.\n");
 375         GMX_RELEASE_ASSERT(
 376                 ir->etc != etcNOSEHOOVER,
 377                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 378         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
 379                            "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
 380                            "with the GPU update.\n");
 381         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 382                            "Virtual sites are not supported with the GPU update.\n");
 383         GMX_RELEASE_ASSERT(ed == nullptr,
 384                            "Essential dynamics is not supported with the GPU update.\n");
 385         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 386                            "Constraints pulling is not supported with the GPU update.\n");
 387         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
 388                            "Orientation restraints are not supported with the GPU update.\n");
 389         GMX_RELEASE_ASSERT(
 390                 ir->efep == efepNO
 391                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 392                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 393                 "update.");
 394
 395         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 396         {
 397             GMX_LOG(mdlog.info)
 398                     .asParagraph()
 399                     .appendText("Updating coordinates and applying constraints on the GPU.");
 400         }
 401         else
 402         {
 403             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 404         }
 405         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 406                            "Device stream manager should be initialized in order to use GPU "
 407                            "update-constraints.");
 408         GMX_RELEASE_ASSERT(
 409                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 410                 "Update stream should be initialized in order to use GPU "
 411                 "update-constraints.");
 412         integrator = std::make_unique<UpdateConstrainGpu>(
 413                 *ir, *top_global, fr->deviceStreamManager->context(),
 414                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 415                 stateGpu->xUpdatedOnDevice());
 416
 417         integrator->setPbc(PbcType::Xyz, state->box);
 418     }
 419
 420     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 421     {
 422         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 423     }
 424     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 425     {
 426         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
 427     }
 428     if (useGpuForUpdate)
 429     {
 430         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 431     }
 432
 433     // NOTE: The global state is no longer used at this point.
 434     // But state_global is still used as temporary storage space for writing
 435     // the global state to file and potentially for replica exchange.
 436     // (Global topology should persist.)
 437
 438     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 439
 440     if (ir->bExpanded)
 441     {
 442         /* Check nstexpanded here, because the grompp check was broken */
 443         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 444         {
 445             gmx_fatal(FARGS,
 446                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 447         }
 448         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 449     }
 450
 451     if (MASTER(cr))
 452     {
 453         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 454     }
 455
 456     preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
 457                            startingBehavior != StartingBehavior::NewSimulation);
 458
 459     // TODO: Remove this by converting AWH into a ForceProvider
 460     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 461                                 startingBehavior != StartingBehavior::NewSimulation,
 462                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 463
 464     if (useReplicaExchange && MASTER(cr))
 465     {
 466         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 467     }
 468     /* PME tuning is only supported in the Verlet scheme, with PME for
 469      * Coulomb. It is not supported with only LJ PME. */
 470     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 471                 && ir->cutoff_scheme != ecutsGROUP);
 472
 473     pme_load_balancing_t* pme_loadbal = nullptr;
 474     if (bPMETune)
 475     {
 476         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 477                          fr->nbv->useGpu());
 478     }
 479
 480     if (!ir->bContinuation)
 481     {
 482         if (state->flags & (1U << estV))
 483         {
 484             auto v = makeArrayRef(state->v);
 485             /* Set the velocities of vsites, shells and frozen atoms to zero */
 486             for (i = 0; i < mdatoms->homenr; i++)
 487             {
 488                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 489                 {
 490                     clear_rvec(v[i]);
 491                 }
 492                 else if (mdatoms->cFREEZE)
 493                 {
 494                     for (m = 0; m < DIM; m++)
 495                     {
 496                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 497                         {
 498                             v[i][m] = 0;
 499                         }
 500                     }
 501                 }
 502             }
 503         }
 504
 505         if (constr)
 506         {
 507             /* Constrain the initial coordinates and velocities */
 508             do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
 509                                state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
 510                                state->box, state->lambda[efptBONDED]);
 511         }
 512         if (vsite)
 513         {
 514             /* Construct the virtual sites for the initial configuration */
 515             vsite->construct(state->x, ir->delta_t, {}, state->box);
 516         }
 517     }
 518
 519     if (ir->efep != efepNO)
 520     {
 521         /* Set free energy calculation frequency as the greatest common
 522          * denominator of nstdhdl and repl_ex_nst. */
 523         nstfep = ir->fepvals->nstdhdl;
 524         if (ir->bExpanded)
 525         {
 526             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
 527         }
 528         if (useReplicaExchange)
 529         {
 530             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
 531         }
 532     }
 533
 534     /* Be REALLY careful about what flags you set here. You CANNOT assume
 535      * this is the first step, since we might be restarting from a checkpoint,
 536      * and in that case we should not do any modifications to the state.
 537      */
 538     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 539
 540     // When restarting from a checkpoint, it can be appropriate to
 541     // initialize ekind from quantities in the checkpoint. Otherwise,
 542     // compute_globals must initialize ekind before the simulation
 543     // starts/restarts. However, only the master rank knows what was
 544     // found in the checkpoint file, so we have to communicate in
 545     // order to coordinate the restart.
 546     //
 547     // TODO Consider removing this communication if/when checkpoint
 548     // reading directly follows .tpr reading, because all ranks can
 549     // agree on hasReadEkinState at that time.
 550     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 551     if (PAR(cr))
 552     {
 553         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 554     }
 555     if (hasReadEkinState)
 556     {
 557         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 558     }
 559
 560     unsigned int cglo_flags =
 561             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 562              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 563
 564     bSumEkinhOld = FALSE;
 565
 566     t_vcm vcm(top_global->groups, *ir);
 567     reportComRemovalInfo(fplog, vcm);
 568
 569     /* To minimize communication, compute_globals computes the COM velocity
 570      * and the kinetic energy for the velocities without COM motion removed.
 571      * Thus to get the kinetic energy without the COM contribution, we need
 572      * to call compute_globals twice.
 573      */
 574     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 575     {
 576         unsigned int cglo_flags_iteration = cglo_flags;
 577         if (bStopCM && cgloIteration == 0)
 578         {
 579             cglo_flags_iteration |= CGLO_STOPCM;
 580             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 581         }
 582         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 583                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 584                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 585                         state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 586                         cglo_flags_iteration
 587                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 588                                                                          : 0));
 589         if (cglo_flags_iteration & CGLO_STOPCM)
 590         {
 591             /* At initialization, do not pass x with acceleration-correction mode
 592              * to avoid (incorrect) correction of the initial coordinates.
 593              */
 594             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 595                                                                      : makeArrayRef(state->x);
 596             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 597             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 598         }
 599     }
 600     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 601                                     makeConstArrayRef(state->x), state->box,
 602                                     &shouldCheckNumberOfBondedInteractions);
 603     if (ir->eI == eiVVAK)
 604     {
 605         /* a second call to get the half step temperature initialized as well */
 606         /* we do the same call as above, but turn the pressure off -- internally to
 607            compute_globals, this is recognized as a velocity verlet half-step
 608            kinetic energy calculation.  This minimized excess variables, but
 609            perhaps loses some logic?*/
 610
 611         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 612                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 613                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 614                         state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 615     }
 616
 617     /* Calculate the initial half step temperature, and save the ekinh_old */
 618     if (startingBehavior == StartingBehavior::NewSimulation)
 619     {
 620         for (i = 0; (i < ir->opts.ngtc); i++)
 621         {
 622             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 623         }
 624     }
 625
 626     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 627        for non-trotter temperature control */
 628     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 629
 630     if (MASTER(cr))
 631     {
 632         if (!ir->bContinuation)
 633         {
 634             if (constr && ir->eConstrAlg == econtLINCS)
 635             {
 636                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 637                         constr->rmsd());
 638             }
 639             if (EI_STATE_VELOCITY(ir->eI))
 640             {
 641                 real temp = enerd->term[F_TEMP];
 642                 if (ir->eI != eiVV)
 643                 {
 644                     /* Result of Ekin averaged over velocities of -half
 645                      * and +half step, while we only have -half step here.
 646                      */
 647                     temp *= 2;
 648                 }
 649                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 650             }
 651         }
 652
 653         char tbuf[20];
 654         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 655         if (ir->nsteps >= 0)
 656         {
 657             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 658         }
 659         else
 660         {
 661             sprintf(tbuf, "%s", "infinite");
 662         }
 663         if (ir->init_step > 0)
 664         {
 665             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 666                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 667                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 668         }
 669         else
 670         {
 671             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 672         }
 673         fprintf(fplog, "\n");
 674     }
 675
 676     walltime_accounting_start_time(walltime_accounting);
 677     wallcycle_start(wcycle, ewcRUN);
 678     print_start(fplog, cr, walltime_accounting, "mdrun");
 679
 680 #if GMX_FAHCORE
 681     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 682     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 683     if (chkpt_ret == 0)
 684     {
 685         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 686     }
 687 #endif
 688
 689     /***********************************************************
 690      *
 691      *             Loop over MD steps
 692      *
 693      ************************************************************/
 694
 695     bFirstStep = TRUE;
 696     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 697     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 698     bSumEkinhOld     = FALSE;
 699     bExchanged       = FALSE;
 700     bNeedRepartition = FALSE;
 701
 702     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 703             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 704             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 705             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 706
 707     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 708             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 709             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 710             mdrunOptions.checkpointOptions.period);
 711
 712     const bool resetCountersIsLocal = true;
 713     auto       resetHandler         = std::make_unique<ResetHandler>(
 714             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 715             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 716             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 717
 718     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 719
 720     step     = ir->init_step;
 721     step_rel = 0;
 722
 723     // TODO extract this to new multi-simulation module
 724     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 725     {
 726         if (!multisim_int_all_are_equal(ms, ir->nsteps))
 727         {
 728             GMX_LOG(mdlog.warning)
 729                     .appendText(
 730                             "Note: The number of steps is not consistent across multi "
 731                             "simulations,\n"
 732                             "but we are proceeding anyway!");
 733         }
 734         if (!multisim_int_all_are_equal(ms, ir->init_step))
 735         {
 736             if (simulationsShareState)
 737             {
 738                 if (MASTER(cr))
 739                 {
 740                     gmx_fatal(FARGS,
 741                               "The initial step is not consistent across multi simulations which "
 742                               "share the state");
 743                 }
 744                 gmx_barrier(cr->mpi_comm_mygroup);
 745             }
 746             else
 747             {
 748                 GMX_LOG(mdlog.warning)
 749                         .appendText(
 750                                 "Note: The initial step is not consistent across multi "
 751                                 "simulations,\n"
 752                                 "but we are proceeding anyway!");
 753             }
 754         }
 755     }
 756
 757     /* and stop now if we should */
 758     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 759     while (!bLastStep)
 760     {
 761
 762         /* Determine if this is a neighbor search step */
 763         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 764
 765         if (bPMETune && bNStList)
 766         {
 767             // This has to be here because PME load balancing is called so early.
 768             // TODO: Move to after all booleans are defined.
 769             if (useGpuForUpdate && !bFirstStep)
 770             {
 771                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 772                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 773             }
 774             /* PME grid + cut-off optimization with GPUs or PME nodes */
 775             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 776                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 777                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 778         }
 779
 780         wallcycle_start(wcycle, ewcSTEP);
 781
 782         bLastStep = (step_rel == ir->nsteps);
 783         t         = t0 + step * ir->delta_t;
 784
 785         // TODO Refactor this, so that nstfep does not need a default value of zero
 786         if (ir->efep != efepNO || ir->bSimTemp)
 787         {
 788             /* find and set the current lambdas */
 789             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
 790
 791             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 792             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 793             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 794                            && (!bFirstStep));
 795         }
 796
 797         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 798                      && do_per_step(step, replExParams.exchangeInterval));
 799
 800         if (doSimulatedAnnealing)
 801         {
 802             update_annealing_target_temp(ir, t, &upd);
 803         }
 804
 805         /* Stop Center of Mass motion */
 806         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 807
 808         /* Determine whether or not to do Neighbour Searching */
 809         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 810
 811         /* Note that the stopHandler will cause termination at nstglobalcomm
 812          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 813          * nstpcouple steps, we have computed the half-step kinetic energy
 814          * of the previous step and can always output energies at the last step.
 815          */
 816         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 817
 818         /* do_log triggers energy and virial calculation. Because this leads
 819          * to different code paths, forces can be different. Thus for exact
 820          * continuation we should avoid extra log output.
 821          * Note that the || bLastStep can result in non-exact continuation
 822          * beyond the last step. But we don't consider that to be an issue.
 823          */
 824         do_log     = (do_per_step(step, ir->nstlog)
 825                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 826         do_verbose = mdrunOptions.verbose
 827                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 828
 829         if (useGpuForUpdate && !bFirstStep && bNS)
 830         {
 831             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 832             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 833             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 834             // Copy coordinate from the GPU when needed at the search step.
 835             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 836             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 837             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 838             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 839         }
 840
 841         if (bNS && !(bFirstStep && ir->bContinuation))
 842         {
 843             bMasterState = FALSE;
 844             /* Correct the new box if it is too skewed */
 845             if (inputrecDynamicBox(ir))
 846             {
 847                 if (correct_box(fplog, step, state->box))
 848                 {
 849                     bMasterState = TRUE;
 850                     // If update is offloaded, it should be informed about the box size change
 851                     if (useGpuForUpdate)
 852                     {
 853                         integrator->setPbc(PbcType::Xyz, state->box);
 854                     }
 855                 }
 856             }
 857             if (DOMAINDECOMP(cr) && bMasterState)
 858             {
 859                 dd_collect_state(cr->dd, state, state_global);
 860             }
 861
 862             if (DOMAINDECOMP(cr))
 863             {
 864                 /* Repartition the domain decomposition */
 865                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 866                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 867                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 868                 shouldCheckNumberOfBondedInteractions = true;
 869                 upd.setNumAtoms(state->natoms);
 870
 871                 // Allocate or re-size GPU halo exchange object, if necessary
 872                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 873                     && useGpuForNonbonded && is1D(*cr->dd))
 874                 {
 875                     GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 876                                        "GPU device manager has to be initialized to use GPU "
 877                                        "version of halo exchange.");
 878                     // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
 879                     constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
 880                 }
 881             }
 882         }
 883
 884         if (MASTER(cr) && do_log)
 885         {
 886             gmx::EnergyOutput::printHeader(fplog, step,
 887                                            t); /* can we improve the information printed here? */
 888         }
 889
 890         if (ir->efep != efepNO)
 891         {
 892             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 893         }
 894
 895         if (bExchanged)
 896         {
 897
 898             /* We need the kinetic energy at minus the half step for determining
 899              * the full step kinetic energy and possibly for T-coupling.*/
 900             /* This may not be quite working correctly yet . . . . */
 901             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 902                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
 903                             enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
 904                             state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 905                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 906             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 907                                             &top, makeConstArrayRef(state->x), state->box,
 908                                             &shouldCheckNumberOfBondedInteractions);
 909         }
 910         clear_mat(force_vir);
 911
 912         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 913
 914         /* Determine the energy and pressure:
 915          * at nstcalcenergy steps and at energy output steps (set below).
 916          */
 917         if (EI_VV(ir->eI) && (!bInitStep))
 918         {
 919             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 920             bCalcVir      = bCalcEnerStep
 921                        || (ir->epc != epcNO
 922                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 923         }
 924         else
 925         {
 926             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 927             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 928         }
 929         bCalcEner = bCalcEnerStep;
 930
 931         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 932
 933         if (do_ene || do_log || bDoReplEx)
 934         {
 935             bCalcVir  = TRUE;
 936             bCalcEner = TRUE;
 937         }
 938
 939         /* Do we need global communication ? */
 940         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 941                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 942
 943         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 944                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 945                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 946
 947         if (shellfc)
 948         {
 949             /* Now is the time to relax the shells */
 950             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 951                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
 952                                 state->natoms, state->x.arrayRefWithPadding(),
 953                                 state->v.arrayRefWithPadding(), state->box, state->lambda, &state->hist,
 954                                 f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 955                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 956         }
 957         else
 958         {
 959             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 960                is updated (or the AWH update will be performed twice for one step when continuing).
 961                It would be best to call this update function from do_md_trajectory_writing but that
 962                would occur after do_force. One would have to divide the update_awh function into one
 963                function applying the AWH force and one doing the AWH bias update. The update AWH
 964                bias function could then be called after do_md_trajectory_writing (then containing
 965                update_awh_history). The checkpointing will in the future probably moved to the start
 966                of the md loop which will rid of this issue. */
 967             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 968             {
 969                 awh->updateHistory(state_global->awhHistory.get());
 970             }
 971
 972             /* The coordinates (x) are shifted (to get whole molecules)
 973              * in do_force.
 974              * This is parallellized as well, and does communication too.
 975              * Check comments in sim_util.c
 976              */
 977             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 978                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 979                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, state->lambda, fr,
 980                      runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 981                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 982         }
 983
 984         // VV integrators do not need the following velocity half step
 985         // if it is the first step after starting from a checkpoint.
 986         // That is, the half step is needed on all other steps, and
 987         // also the first step when starting from a .tpr file.
 988         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 989         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 990         {
 991             rvec* vbuf = nullptr;
 992
 993             wallcycle_start(wcycle, ewcUPDATE);
 994             if (ir->eI == eiVV && bInitStep)
 995             {
 996                 /* if using velocity verlet with full time step Ekin,
 997                  * take the first half step only to compute the
 998                  * virial for the first step. From there,
 999                  * revert back to the initial coordinates
1000                  * so that the input is actually the initial step.
1001                  */
1002                 snew(vbuf, state->natoms);
1003                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
1004                            state->natoms); /* should make this better for parallelizing? */
1005             }
1006             else
1007             {
1008                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1009                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1010                                trotter_seq, ettTSEQ1);
1011             }
1012
1013             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcdata, ekind, M,
1014                               etrtVELOCITY1, cr, constr != nullptr);
1015
1016             wallcycle_stop(wcycle, ewcUPDATE);
1017             constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
1018             wallcycle_start(wcycle, ewcUPDATE);
1019             /* if VV, compute the pressure and constraints */
1020             /* For VV2, we strictly only need this if using pressure
1021              * control, but we really would like to have accurate pressures
1022              * printed out.
1023              * Think about ways around this in the future?
1024              * For now, keep this choice in comments.
1025              */
1026             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1027             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1028             bPres = TRUE;
1029             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1030             if (bCalcEner && ir->eI == eiVVAK)
1031             {
1032                 bSumEkinhOld = TRUE;
1033             }
1034             /* for vv, the first half of the integration actually corresponds to the previous step.
1035                So we need information from the last step in the first half of the integration */
1036             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1037             {
1038                 wallcycle_stop(wcycle, ewcUPDATE);
1039                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1040                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1041                                 enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
1042                                 state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1043                                 (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1044                                         | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1045                                         | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1046                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1047                                                                                  : 0)
1048                                         | CGLO_SCALEEKIN);
1049                 /* explanation of above:
1050                    a) We compute Ekin at the full time step
1051                    if 1) we are using the AveVel Ekin, and it's not the
1052                    initial step, or 2) if we are using AveEkin, but need the full
1053                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1054                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1055                    EkinAveVel because it's needed for the pressure */
1056                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1057                                                 top_global, &top, makeConstArrayRef(state->x),
1058                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1059                 if (bStopCM)
1060                 {
1061                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1062                                            makeArrayRef(state->v));
1063                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1064                 }
1065                 wallcycle_start(wcycle, ewcUPDATE);
1066             }
1067             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1068             if (!bInitStep)
1069             {
1070                 if (bTrotter)
1071                 {
1072                     m_add(force_vir, shake_vir,
1073                           total_vir); /* we need the un-dispersion corrected total vir here */
1074                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1075                                    trotter_seq, ettTSEQ2);
1076
1077                     /* TODO This is only needed when we're about to write
1078                      * a checkpoint, because we use it after the restart
1079                      * (in a kludge?). But what should we be doing if
1080                      * the startingBehavior is NewSimulation or bInitStep are true? */
1081                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1082                     {
1083                         copy_mat(shake_vir, state->svir_prev);
1084                         copy_mat(force_vir, state->fvir_prev);
1085                     }
1086                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1087                     {
1088                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1089                         enerd->term[F_TEMP] =
1090                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1091                         enerd->term[F_EKIN] = trace(ekind->ekin);
1092                     }
1093                 }
1094                 else if (bExchanged)
1095                 {
1096                     wallcycle_stop(wcycle, ewcUPDATE);
1097                     /* We need the kinetic energy at minus the half step for determining
1098                      * the full step kinetic energy and possibly for T-coupling.*/
1099                     /* This may not be quite working correctly yet . . . . */
1100                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1101                                     makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1102                                     enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
1103                                     state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1104                     wallcycle_start(wcycle, ewcUPDATE);
1105                 }
1106             }
1107             /* if it's the initial step, we performed this first step just to get the constraint virial */
1108             if (ir->eI == eiVV && bInitStep)
1109             {
1110                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1111                 sfree(vbuf);
1112             }
1113             wallcycle_stop(wcycle, ewcUPDATE);
1114         }
1115
1116         /* compute the conserved quantity */
1117         if (EI_VV(ir->eI))
1118         {
1119             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1120             if (ir->eI == eiVV)
1121             {
1122                 last_ekin = enerd->term[F_EKIN];
1123             }
1124             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1125             {
1126                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1127             }
1128             /* sum up the foreign kinetic energy and dK/dl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1129             if (ir->efep != efepNO)
1130             {
1131                 accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1132             }
1133         }
1134
1135         /* ########  END FIRST UPDATE STEP  ############## */
1136         /* ########  If doing VV, we now have v(dt) ###### */
1137         if (bDoExpanded)
1138         {
1139             /* perform extended ensemble sampling in lambda - we don't
1140                actually move to the new state before outputting
1141                statistics, but if performing simulated tempering, we
1142                do update the velocities and the tau_t. */
1143
1144             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1145                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1146             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1147             if (MASTER(cr))
1148             {
1149                 copy_df_history(state_global->dfhist, state->dfhist);
1150             }
1151         }
1152
1153         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1154         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1155         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1156             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1157                 || checkpointHandler->isCheckpointingStep()))
1158         {
1159             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1160             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1161         }
1162         // Copy velocities if needed for the output/checkpointing.
1163         // NOTE: Copy on the search steps is done at the beginning of the step.
1164         if (useGpuForUpdate && !bNS
1165             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1166         {
1167             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1168             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1169         }
1170         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1171         // and update is offloaded hence forces are kept on the GPU for update and have not been
1172         // already transferred in do_force().
1173         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1174         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1175         //       prior to GPU update.
1176         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1177         //       copy call in do_force(...).
1178         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1179         //       on host after the D2H copy in do_force(...).
1180         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1181             && do_per_step(step, ir->nstfout))
1182         {
1183             stateGpu->copyForcesFromGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1184             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1185         }
1186         /* Now we have the energies and forces corresponding to the
1187          * coordinates at time t. We must output all of this before
1188          * the update.
1189          */
1190         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1191                                  observablesHistory, top_global, fr, outf, energyOutput, ekind, f,
1192                                  checkpointHandler->isCheckpointingStep(), bRerunMD, bLastStep,
1193                                  mdrunOptions.writeConfout, bSumEkinhOld);
1194         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1195         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1196
1197         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1198         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1199             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1200         {
1201             copy_mat(state->svir_prev, shake_vir);
1202             copy_mat(state->fvir_prev, force_vir);
1203         }
1204
1205         stopHandler->setSignal();
1206         resetHandler->setSignal(walltime_accounting);
1207
1208         if (bGStat || !PAR(cr))
1209         {
1210             /* In parallel we only have to check for checkpointing in steps
1211              * where we do global communication,
1212              *  otherwise the other nodes don't know.
1213              */
1214             checkpointHandler->setSignal(walltime_accounting);
1215         }
1216
1217         /* #########   START SECOND UPDATE STEP ################# */
1218
1219         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1220            controlled in preprocessing */
1221
1222         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1223         {
1224             gmx_bool bIfRandomize;
1225             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1226             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1227             if (constr && bIfRandomize)
1228             {
1229                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1230             }
1231         }
1232         /* Box is changed in update() when we do pressure coupling,
1233          * but we should still use the old box for energy corrections and when
1234          * writing it to the energy file, so it matches the trajectory files for
1235          * the same timestep above. Make a copy in a separate array.
1236          */
1237         copy_mat(state->box, lastbox);
1238
1239         dvdl_constr = 0;
1240
1241         wallcycle_start(wcycle, ewcUPDATE);
1242         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1243         if (bTrotter)
1244         {
1245             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1246             /* We can only do Berendsen coupling after we have summed
1247              * the kinetic energy or virial. Since the happens
1248              * in global_state after update, we should only do it at
1249              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1250              */
1251         }
1252         else
1253         {
1254             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1255             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1256         }
1257
1258         if (EI_VV(ir->eI))
1259         {
1260             /* velocity half-step update */
1261             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcdata, ekind, M,
1262                               etrtVELOCITY2, cr, constr != nullptr);
1263         }
1264
1265         /* Above, initialize just copies ekinh into ekin,
1266          * it doesn't copy position (for VV),
1267          * and entire integrator for MD.
1268          */
1269
1270         if (ir->eI == eiVVAK)
1271         {
1272             cbuf.resize(state->x.size());
1273             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1274         }
1275
1276         /* With leap-frog type integrators we compute the kinetic energy
1277          * at a whole time step as the average of the half-time step kinetic
1278          * energies of two subsequent steps. Therefore we need to compute the
1279          * half step kinetic energy also if we need energies at the next step.
1280          */
1281         const bool needHalfStepKineticEnergy =
1282                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1283
1284         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1285         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1286         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1287                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1288
1289         if (useGpuForUpdate)
1290         {
1291             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1292             {
1293                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1294                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1295
1296                 // Copy data to the GPU after buffers might have being reinitialized
1297                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1298                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1299             }
1300
1301             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1302             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1303             {
1304                 stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1305             }
1306
1307             const bool doTemperatureScaling =
1308                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1309
1310             // This applies Leap-Frog, LINCS and SETTLE in succession
1311             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1312                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1313                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1314                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1315
1316             // Copy velocities D2H after update if:
1317             // - Globals are computed this step (includes the energy output steps).
1318             // - Temperature is needed for the next step.
1319             if (bGStat || needHalfStepKineticEnergy)
1320             {
1321                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1322                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1323             }
1324         }
1325         else
1326         {
1327             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcdata, ekind, M,
1328                               etrtPOSITION, cr, constr != nullptr);
1329
1330             wallcycle_stop(wcycle, ewcUPDATE);
1331
1332             constrain_coordinates(constr, do_log, do_ene, step, state,
1333                                   upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
1334
1335             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
1336                                       constr, do_log, do_ene);
1337             upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1338         }
1339
1340         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1341         {
1342             updatePrevStepPullCom(pull_work, state);
1343         }
1344
1345         if (ir->eI == eiVVAK)
1346         {
1347             /* erase F_EKIN and F_TEMP here? */
1348             /* just compute the kinetic energy at the half step to perform a trotter step */
1349             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1350                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
1351                             force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
1352                             nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1353             wallcycle_start(wcycle, ewcUPDATE);
1354             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1355             /* now we know the scaling, we can compute the positions again */
1356             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1357
1358             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcdata, ekind, M,
1359                               etrtPOSITION, cr, constr != nullptr);
1360             wallcycle_stop(wcycle, ewcUPDATE);
1361
1362             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1363             /* are the small terms in the shake_vir here due
1364              * to numerical errors, or are they important
1365              * physically? I'm thinking they are just errors, but not completely sure.
1366              * For now, will call without actually constraining, constr=NULL*/
1367             upd.finish_update(*ir, mdatoms, state, wcycle, false);
1368         }
1369         if (EI_VV(ir->eI))
1370         {
1371             /* this factor or 2 correction is necessary
1372                because half of the constraint force is removed
1373                in the vv step, so we have to double it.  See
1374                the Issue #1255.  It is not yet clear
1375                if the factor of 2 is exact, or just a very
1376                good approximation, and this will be
1377                investigated.  The next step is to see if this
1378                can be done adding a dhdl contribution from the
1379                rattle step, but this is somewhat more
1380                complicated with the current code. Will be
1381                investigated, hopefully for 4.6.3. However,
1382                this current solution is much better than
1383                having it completely wrong.
1384              */
1385             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1386         }
1387         else
1388         {
1389             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1390         }
1391
1392         if (vsite != nullptr)
1393         {
1394             wallcycle_start(wcycle, ewcVSITECONSTR);
1395             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1396             wallcycle_stop(wcycle, ewcVSITECONSTR);
1397         }
1398
1399         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1400         /* With Leap-Frog we can skip compute_globals at
1401          * non-communication steps, but we need to calculate
1402          * the kinetic energy one step before communication.
1403          */
1404         {
1405             // Organize to do inter-simulation signalling on steps if
1406             // and when algorithms require it.
1407             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1408
1409             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1410             {
1411                 // Copy coordinates when needed to stop the CM motion.
1412                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1413                 {
1414                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1415                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1416                 }
1417                 // Since we're already communicating at this step, we
1418                 // can propagate intra-simulation signals. Note that
1419                 // check_nstglobalcomm has the responsibility for
1420                 // choosing the value of nstglobalcomm that is one way
1421                 // bGStat becomes true, so we can't get into a
1422                 // situation where e.g. checkpointing can't be
1423                 // signalled.
1424                 bool                doIntraSimSignal = true;
1425                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1426
1427                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1428                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
1429                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1430                                 &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1431                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1432                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1433                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1434                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1435                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1436                                                                                  : 0));
1437                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1438                                                 top_global, &top, makeConstArrayRef(state->x),
1439                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1440                 if (!EI_VV(ir->eI) && bStopCM)
1441                 {
1442                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1443                                            makeArrayRef(state->v));
1444                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1445
1446                     // TODO: The special case of removing CM motion should be dealt more gracefully
1447                     if (useGpuForUpdate)
1448                     {
1449                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1450                         // Here we block until the H2D copy completes because event sync with the
1451                         // force kernels that use the coordinates on the next steps is not implemented
1452                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1453                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1454                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1455                         if (vcm.mode != ecmNO)
1456                         {
1457                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1458                         }
1459                     }
1460                 }
1461             }
1462         }
1463
1464         /* #############  END CALC EKIN AND PRESSURE ################# */
1465
1466         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1467            the virial that should probably be addressed eventually. state->veta has better properies,
1468            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1469            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1470
1471         if (ir->efep != efepNO && !EI_VV(ir->eI))
1472         {
1473             /* Sum up the foreign energy and dK/dl terms for md and sd.
1474                Currently done every step so that dH/dl is correct in the .edr */
1475             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1476         }
1477
1478         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1479                                          pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1480
1481         const bool doBerendsenPressureCoupling =
1482                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1483         if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
1484         {
1485             integrator->scaleCoordinates(pressureCouplingMu);
1486             integrator->setPbc(PbcType::Xyz, state->box);
1487         }
1488
1489         /* ################# END UPDATE STEP 2 ################# */
1490         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1491
1492         /* The coordinates (x) were unshifted in update */
1493         if (!bGStat)
1494         {
1495             /* We will not sum ekinh_old,
1496              * so signal that we still have to do it.
1497              */
1498             bSumEkinhOld = TRUE;
1499         }
1500
1501         if (bCalcEner)
1502         {
1503             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1504
1505             /* use the directly determined last velocity, not actually the averaged half steps */
1506             if (bTrotter && ir->eI == eiVV)
1507             {
1508                 enerd->term[F_EKIN] = last_ekin;
1509             }
1510             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1511
1512             if (integratorHasConservedEnergyQuantity(ir))
1513             {
1514                 if (EI_VV(ir->eI))
1515                 {
1516                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1517                 }
1518                 else
1519                 {
1520                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1521                 }
1522             }
1523             /* #########  END PREPARING EDR OUTPUT  ###########  */
1524         }
1525
1526         /* Output stuff */
1527         if (MASTER(cr))
1528         {
1529             if (fplog && do_log && bDoExpanded)
1530             {
1531                 /* only needed if doing expanded ensemble */
1532                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1533                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1534                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1535             }
1536             if (bCalcEner)
1537             {
1538                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, state,
1539                                                  ir->fepvals, ir->expandedvals, lastbox, shake_vir,
1540                                                  force_vir, total_vir, pres, ekind, mu_tot, constr);
1541             }
1542             else
1543             {
1544                 energyOutput.recordNonEnergyStep();
1545             }
1546
1547             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1548             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1549
1550             if (doSimulatedAnnealing)
1551             {
1552                 gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
1553                                                               &(ir->opts));
1554             }
1555             if (do_log || do_ene || do_dr || do_or)
1556             {
1557                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1558                                                    do_log ? fplog : nullptr, step, t,
1559                                                    &fr->listedForces->fcdata(), awh.get());
1560             }
1561
1562             if (ir->bPull)
1563             {
1564                 pull_print_output(pull_work, step, t);
1565             }
1566
1567             if (do_per_step(step, ir->nstlog))
1568             {
1569                 if (fflush(fplog) != 0)
1570                 {
1571                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1572                 }
1573             }
1574         }
1575         if (bDoExpanded)
1576         {
1577             /* Have to do this part _after_ outputting the logfile and the edr file */
1578             /* Gets written into the state at the beginning of next loop*/
1579             state->fep_state = lamnew;
1580         }
1581         /* Print the remaining wall clock time for the run */
1582         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1583         {
1584             if (shellfc)
1585             {
1586                 fprintf(stderr, "\n");
1587             }
1588             print_time(stderr, walltime_accounting, step, ir, cr);
1589         }
1590
1591         /* Ion/water position swapping.
1592          * Not done in last step since trajectory writing happens before this call
1593          * in the MD loop and exchanges would be lost anyway. */
1594         bNeedRepartition = FALSE;
1595         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1596         {
1597             bNeedRepartition =
1598                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1599                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1600
1601             if (bNeedRepartition && DOMAINDECOMP(cr))
1602             {
1603                 dd_collect_state(cr->dd, state, state_global);
1604             }
1605         }
1606
1607         /* Replica exchange */
1608         bExchanged = FALSE;
1609         if (bDoReplEx)
1610         {
1611             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1612         }
1613
1614         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1615         {
1616             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1617                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1618                                 nrnb, wcycle, FALSE);
1619             shouldCheckNumberOfBondedInteractions = true;
1620             upd.setNumAtoms(state->natoms);
1621         }
1622
1623         bFirstStep = FALSE;
1624         bInitStep  = FALSE;
1625
1626         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1627         /* With all integrators, except VV, we need to retain the pressure
1628          * at the current step for coupling at the next step.
1629          */
1630         if ((state->flags & (1U << estPRES_PREV))
1631             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1632         {
1633             /* Store the pressure in t_state for pressure coupling
1634              * at the next MD step.
1635              */
1636             copy_mat(pres, state->pres_prev);
1637         }
1638
1639         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1640
1641         if ((membed != nullptr) && (!bLastStep))
1642         {
1643             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1644         }
1645
1646         cycles = wallcycle_stop(wcycle, ewcSTEP);
1647         if (DOMAINDECOMP(cr) && wcycle)
1648         {
1649             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1650         }
1651
1652         /* increase the MD step number */
1653         step++;
1654         step_rel++;
1655
1656         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1657                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1658
1659         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1660         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1661     }
1662     /* End of main MD loop */
1663
1664     /* Closing TNG files can include compressing data. Therefore it is good to do that
1665      * before stopping the time measurements. */
1666     mdoutf_tng_close(outf);
1667
1668     /* Stop measuring walltime */
1669     walltime_accounting_end_time(walltime_accounting);
1670
1671     if (!thisRankHasDuty(cr, DUTY_PME))
1672     {
1673         /* Tell the PME only node to finish */
1674         gmx_pme_send_finish(cr);
1675     }
1676
1677     if (MASTER(cr))
1678     {
1679         if (ir->nstcalcenergy > 0)
1680         {
1681             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1682             energyOutput.printAverages(fplog, groups);
1683         }
1684     }
1685     done_mdoutf(outf);
1686
1687     if (bPMETune)
1688     {
1689         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1690     }
1691
1692     done_shellfc(fplog, shellfc, step_rel);
1693
1694     if (useReplicaExchange && MASTER(cr))
1695     {
1696         print_replica_exchange_statistics(fplog, repl_ex);
1697     }
1698
1699     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1700
1701     global_stat_destroy(gstat);
1702 }