src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/commandline/filenm.h"
  57 #include "gromacs/domdec/collect.h"
  58 #include "gromacs/domdec/dlbtiming.h"
  59 #include "gromacs/domdec/domdec.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/domdec_struct.h"
  62 #include "gromacs/domdec/gpuhaloexchange.h"
  63 #include "gromacs/domdec/mdsetup.h"
  64 #include "gromacs/domdec/partition.h"
  65 #include "gromacs/essentialdynamics/edsam.h"
  66 #include "gromacs/ewald/pme_load_balancing.h"
  67 #include "gromacs/ewald/pme_pp.h"
  68 #include "gromacs/fileio/trxio.h"
  69 #include "gromacs/gmxlib/network.h"
  70 #include "gromacs/gmxlib/nrnb.h"
  71 #include "gromacs/gpu_utils/device_stream_manager.h"
  72 #include "gromacs/gpu_utils/gpu_utils.h"
  73 #include "gromacs/imd/imd.h"
  74 #include "gromacs/listed_forces/listed_forces.h"
  75 #include "gromacs/math/functions.h"
  76 #include "gromacs/math/invertmatrix.h"
  77 #include "gromacs/math/vec.h"
  78 #include "gromacs/math/vectypes.h"
  79 #include "gromacs/mdlib/checkpointhandler.h"
  80 #include "gromacs/mdlib/compute_io.h"
  81 #include "gromacs/mdlib/constr.h"
  82 #include "gromacs/mdlib/coupling.h"
  83 #include "gromacs/mdlib/ebin.h"
  84 #include "gromacs/mdlib/enerdata_utils.h"
  85 #include "gromacs/mdlib/energyoutput.h"
  86 #include "gromacs/mdlib/expanded.h"
  87 #include "gromacs/mdlib/force.h"
  88 #include "gromacs/mdlib/force_flags.h"
  89 #include "gromacs/mdlib/forcerec.h"
  90 #include "gromacs/mdlib/freeenergyparameters.h"
  91 #include "gromacs/mdlib/md_support.h"
  92 #include "gromacs/mdlib/mdatoms.h"
  93 #include "gromacs/mdlib/mdoutf.h"
  94 #include "gromacs/mdlib/membed.h"
  95 #include "gromacs/mdlib/resethandler.h"
  96 #include "gromacs/mdlib/sighandler.h"
  97 #include "gromacs/mdlib/simulationsignal.h"
  98 #include "gromacs/mdlib/stat.h"
  99 #include "gromacs/mdlib/stophandler.h"
 100 #include "gromacs/mdlib/tgroup.h"
 101 #include "gromacs/mdlib/trajectory_writing.h"
 102 #include "gromacs/mdlib/update.h"
 103 #include "gromacs/mdlib/update_constrain_gpu.h"
 104 #include "gromacs/mdlib/vcm.h"
 105 #include "gromacs/mdlib/vsite.h"
 106 #include "gromacs/mdrunutility/handlerestart.h"
 107 #include "gromacs/mdrunutility/multisim.h"
 108 #include "gromacs/mdrunutility/printtime.h"
 109 #include "gromacs/mdtypes/awh_history.h"
 110 #include "gromacs/mdtypes/awh_params.h"
 111 #include "gromacs/mdtypes/commrec.h"
 112 #include "gromacs/mdtypes/df_history.h"
 113 #include "gromacs/mdtypes/energyhistory.h"
 114 #include "gromacs/mdtypes/fcdata.h"
 115 #include "gromacs/mdtypes/forcebuffers.h"
 116 #include "gromacs/mdtypes/forcerec.h"
 117 #include "gromacs/mdtypes/group.h"
 118 #include "gromacs/mdtypes/inputrec.h"
 119 #include "gromacs/mdtypes/interaction_const.h"
 120 #include "gromacs/mdtypes/md_enums.h"
 121 #include "gromacs/mdtypes/mdatom.h"
 122 #include "gromacs/mdtypes/mdrunoptions.h"
 123 #include "gromacs/mdtypes/multipletimestepping.h"
 124 #include "gromacs/mdtypes/observableshistory.h"
 125 #include "gromacs/mdtypes/pullhistory.h"
 126 #include "gromacs/mdtypes/simulation_workload.h"
 127 #include "gromacs/mdtypes/state.h"
 128 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 129 #include "gromacs/modularsimulator/energydata.h"
 130 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 131 #include "gromacs/nbnxm/nbnxm.h"
 132 #include "gromacs/pbcutil/pbc.h"
 133 #include "gromacs/pulling/output.h"
 134 #include "gromacs/pulling/pull.h"
 135 #include "gromacs/swap/swapcoords.h"
 136 #include "gromacs/timing/wallcycle.h"
 137 #include "gromacs/timing/walltime_accounting.h"
 138 #include "gromacs/topology/atoms.h"
 139 #include "gromacs/topology/idef.h"
 140 #include "gromacs/topology/mtop_util.h"
 141 #include "gromacs/topology/topology.h"
 142 #include "gromacs/trajectory/trajectoryframe.h"
 143 #include "gromacs/utility/basedefinitions.h"
 144 #include "gromacs/utility/cstringutil.h"
 145 #include "gromacs/utility/fatalerror.h"
 146 #include "gromacs/utility/logger.h"
 147 #include "gromacs/utility/real.h"
 148 #include "gromacs/utility/smalloc.h"
 149
 150 #include "legacysimulator.h"
 151 #include "replicaexchange.h"
 152 #include "shellfc.h"
 153
 154 using gmx::SimulationSignaller;
 155
 156 void gmx::LegacySimulator::do_md()
 157 {
 158     // TODO Historically, the EM and MD "integrators" used different
 159     // names for the t_inputrec *parameter, but these must have the
 160     // same name, now that it's a member of a struct. We use this ir
 161     // alias to avoid a large ripple of nearly useless changes.
 162     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 163     // will go away eventually.
 164     t_inputrec*  ir = inputrec;
 165     int64_t      step, step_rel;
 166     double       t, t0 = ir->init_t;
 167     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 168     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 169     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 170     gmx_bool     do_ene, do_log, do_verbose;
 171     gmx_bool     bMasterState;
 172     unsigned int force_flags;
 173     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 174     int    i, m;
 175     rvec   mu_tot;
 176     matrix pressureCouplingMu, M;
 177     gmx_repl_ex_t     repl_ex = nullptr;
 178     gmx_global_stat_t gstat;
 179     gmx_shellfc_t*    shellfc;
 180     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 181     gmx_bool          bTemp, bPres, bTrotter;
 182     real              dvdl_constr;
 183     std::vector<RVec> cbuf;
 184     matrix            lastbox;
 185     int               lamnew = 0;
 186     /* for FEP */
 187     int       nstfep = 0;
 188     double    cycles;
 189     real      saved_conserved_quantity = 0;
 190     real      last_ekin                = 0;
 191     t_extmass MassQ;
 192     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 193
 194     /* PME load balancing data for GPU kernels */
 195     gmx_bool bPMETune         = FALSE;
 196     gmx_bool bPMETunePrinting = FALSE;
 197
 198     bool bInteractiveMDstep = false;
 199
 200     /* Domain decomposition could incorrectly miss a bonded
 201        interaction, but checking for that requires a global
 202        communication stage, which does not otherwise happen in DD
 203        code. So we do that alongside the first global energy reduction
 204        after a new DD is made. These variables handle whether the
 205        check happens, and the result it returns. */
 206     bool shouldCheckNumberOfBondedInteractions = false;
 207     int  totalNumberOfBondedInteractions       = -1;
 208
 209     SimulationSignals signals;
 210     // Most global communnication stages don't propagate mdrun
 211     // signals, and will use this object to achieve that.
 212     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 213
 214     if (!mdrunOptions.writeConfout)
 215     {
 216         // This is on by default, and the main known use case for
 217         // turning it off is for convenience in benchmarking, which is
 218         // something that should not show up in the general user
 219         // interface.
 220         GMX_LOG(mdlog.info)
 221                 .asParagraph()
 222                 .appendText(
 223                         "The -noconfout functionality is deprecated, and may be removed in a "
 224                         "future version.");
 225     }
 226
 227     /* md-vv uses averaged full step velocities for T-control
 228        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 229        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 230     bTrotter = (EI_VV(ir->eI)
 231                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 232
 233     const bool bRerunMD = false;
 234
 235     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 236     bGStatEveryStep   = (nstglobalcomm == 1);
 237
 238     const SimulationGroups* groups = &top_global->groups;
 239
 240     std::unique_ptr<EssentialDynamics> ed = nullptr;
 241     if (opt2bSet("-ei", nfile, fnm))
 242     {
 243         /* Initialize essential dynamics sampling */
 244         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 245                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 246     }
 247     else if (observablesHistory->edsamHistory)
 248     {
 249         gmx_fatal(FARGS,
 250                   "The checkpoint is from a run with essential dynamics sampling, "
 251                   "but the current run did not specify the -ei option. "
 252                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 253     }
 254
 255     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 256     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 257     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
 258     Update     upd(*ir, deform);
 259     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 260     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 261
 262     const t_fcdata& fcdata = *fr->fcdata;
 263
 264     bool simulationsShareState = false;
 265     int  nstSignalComm         = nstglobalcomm;
 266     {
 267         // TODO This implementation of ensemble orientation restraints is nasty because
 268         // a user can't just do multi-sim with single-sim orientation restraints.
 269         bool usingEnsembleRestraints =
 270                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
 271         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 272
 273         // Replica exchange, ensemble restraints and AWH need all
 274         // simulations to remain synchronized, so they need
 275         // checkpoints and stop conditions to act on the same step, so
 276         // the propagation of such signals must take place between
 277         // simulations, not just within simulations.
 278         // TODO: Make algorithm initializers set these flags.
 279         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 280
 281         if (simulationsShareState)
 282         {
 283             // Inter-simulation signal communication does not need to happen
 284             // often, so we use a minimum of 200 steps to reduce overhead.
 285             const int c_minimumInterSimulationSignallingInterval = 200;
 286             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 287                             * nstglobalcomm;
 288         }
 289     }
 290
 291     if (startingBehavior != StartingBehavior::RestartWithAppending)
 292     {
 293         pleaseCiteCouplingAlgorithms(fplog, *ir);
 294     }
 295     gmx_mdoutf* outf =
 296             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 297                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 298     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 299                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 300
 301     gstat = global_stat_init(ir);
 302
 303     const auto& simulationWork     = runScheduleWork->simulationWork;
 304     const bool  useGpuForPme       = simulationWork.useGpuPme;
 305     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 306     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 307     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 308
 309     /* Check for polarizable models and flexible constraints */
 310     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 311                                  ir->nstcalcenergy, DOMAINDECOMP(cr), useGpuForPme);
 312
 313     {
 314         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 315         if ((io > 2000) && MASTER(cr))
 316         {
 317             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 318         }
 319     }
 320
 321     // Local state only becomes valid now.
 322     std::unique_ptr<t_state> stateInstance;
 323     t_state*                 state;
 324
 325     gmx_localtop_t top(top_global->ffparams);
 326
 327     auto mdatoms = mdAtoms->mdatoms();
 328
 329     ForceBuffers f(fr->useMts, ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 330                                        ? PinningPolicy::PinnedIfSupported
 331                                        : PinningPolicy::CannotBePinned);
 332     if (DOMAINDECOMP(cr))
 333     {
 334         stateInstance = std::make_unique<t_state>();
 335         state         = stateInstance.get();
 336         dd_init_local_state(cr->dd, state_global, state);
 337
 338         /* Distribute the charge groups over the nodes from the master node */
 339         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 340                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 341                             nrnb, nullptr, FALSE);
 342         shouldCheckNumberOfBondedInteractions = true;
 343         upd.setNumAtoms(state->natoms);
 344     }
 345     else
 346     {
 347         state_change_natoms(state_global, state_global->natoms);
 348         /* Copy the pointer to the global state */
 349         state = state_global;
 350
 351         /* Generate and initialize new topology */
 352         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 353
 354         upd.setNumAtoms(state->natoms);
 355     }
 356
 357     std::unique_ptr<UpdateConstrainGpu> integrator;
 358
 359     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 360
 361     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 362     if (useGpuForUpdate)
 363     {
 364         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 365                                    || constr->numConstraintsTotal() == 0,
 366                            "Constraints in domain decomposition are only supported with update "
 367                            "groups if using GPU update.\n");
 368         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 369                                    || constr->numConstraintsTotal() == 0,
 370                            "SHAKE is not supported with GPU update.");
 371         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 372                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 373                            "the GPU to use GPU update.\n");
 374         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 375                            "Only the md integrator is supported with the GPU update.\n");
 376         GMX_RELEASE_ASSERT(
 377                 ir->etc != etcNOSEHOOVER,
 378                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 379         GMX_RELEASE_ASSERT(
 380                 ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN
 381                         || ir->epc == epcCRESCALE,
 382                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 383                 "with the GPU update.\n");
 384         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 385                            "Virtual sites are not supported with the GPU update.\n");
 386         GMX_RELEASE_ASSERT(ed == nullptr,
 387                            "Essential dynamics is not supported with the GPU update.\n");
 388         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 389                            "Constraints pulling is not supported with the GPU update.\n");
 390         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
 391                            "Orientation restraints are not supported with the GPU update.\n");
 392         GMX_RELEASE_ASSERT(
 393                 ir->efep == efepNO
 394                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 395                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 396                 "update.");
 397
 398         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 399         {
 400             GMX_LOG(mdlog.info)
 401                     .asParagraph()
 402                     .appendText("Updating coordinates and applying constraints on the GPU.");
 403         }
 404         else
 405         {
 406             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 407         }
 408         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 409                            "Device stream manager should be initialized in order to use GPU "
 410                            "update-constraints.");
 411         GMX_RELEASE_ASSERT(
 412                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 413                 "Update stream should be initialized in order to use GPU "
 414                 "update-constraints.");
 415         integrator = std::make_unique<UpdateConstrainGpu>(
 416                 *ir, *top_global, fr->deviceStreamManager->context(),
 417                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 418                 stateGpu->xUpdatedOnDevice());
 419
 420         integrator->setPbc(PbcType::Xyz, state->box);
 421     }
 422
 423     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 424     {
 425         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 426     }
 427     if (useGpuForUpdate)
 428     {
 429         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 430     }
 431
 432     // NOTE: The global state is no longer used at this point.
 433     // But state_global is still used as temporary storage space for writing
 434     // the global state to file and potentially for replica exchange.
 435     // (Global topology should persist.)
 436
 437     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 438
 439     if (ir->bExpanded)
 440     {
 441         /* Check nstexpanded here, because the grompp check was broken */
 442         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 443         {
 444             gmx_fatal(FARGS,
 445                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 446         }
 447         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 448     }
 449
 450     if (MASTER(cr))
 451     {
 452         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 453     }
 454
 455     preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
 456                            startingBehavior != StartingBehavior::NewSimulation);
 457
 458     // TODO: Remove this by converting AWH into a ForceProvider
 459     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 460                                 startingBehavior != StartingBehavior::NewSimulation,
 461                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 462
 463     if (useReplicaExchange && MASTER(cr))
 464     {
 465         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 466     }
 467     /* PME tuning is only supported in the Verlet scheme, with PME for
 468      * Coulomb. It is not supported with only LJ PME. */
 469     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 470                 && ir->cutoff_scheme != ecutsGROUP);
 471
 472     pme_load_balancing_t* pme_loadbal = nullptr;
 473     if (bPMETune)
 474     {
 475         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 476                          fr->nbv->useGpu());
 477     }
 478
 479     if (!ir->bContinuation)
 480     {
 481         if (state->flags & (1U << estV))
 482         {
 483             auto v = makeArrayRef(state->v);
 484             /* Set the velocities of vsites, shells and frozen atoms to zero */
 485             for (i = 0; i < mdatoms->homenr; i++)
 486             {
 487                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 488                 {
 489                     clear_rvec(v[i]);
 490                 }
 491                 else if (mdatoms->cFREEZE)
 492                 {
 493                     for (m = 0; m < DIM; m++)
 494                     {
 495                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 496                         {
 497                             v[i][m] = 0;
 498                         }
 499                     }
 500                 }
 501             }
 502         }
 503
 504         if (constr)
 505         {
 506             /* Constrain the initial coordinates and velocities */
 507             do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
 508                                state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
 509                                state->box, state->lambda[efptBONDED]);
 510         }
 511         if (vsite)
 512         {
 513             /* Construct the virtual sites for the initial configuration */
 514             vsite->construct(state->x, ir->delta_t, {}, state->box);
 515         }
 516     }
 517
 518     if (ir->efep != efepNO)
 519     {
 520         /* Set free energy calculation frequency as the greatest common
 521          * denominator of nstdhdl and repl_ex_nst. */
 522         nstfep = ir->fepvals->nstdhdl;
 523         if (ir->bExpanded)
 524         {
 525             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
 526         }
 527         if (useReplicaExchange)
 528         {
 529             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
 530         }
 531         if (ir->bDoAwh)
 532         {
 533             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
 534         }
 535     }
 536
 537     /* Be REALLY careful about what flags you set here. You CANNOT assume
 538      * this is the first step, since we might be restarting from a checkpoint,
 539      * and in that case we should not do any modifications to the state.
 540      */
 541     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 542
 543     // When restarting from a checkpoint, it can be appropriate to
 544     // initialize ekind from quantities in the checkpoint. Otherwise,
 545     // compute_globals must initialize ekind before the simulation
 546     // starts/restarts. However, only the master rank knows what was
 547     // found in the checkpoint file, so we have to communicate in
 548     // order to coordinate the restart.
 549     //
 550     // TODO Consider removing this communication if/when checkpoint
 551     // reading directly follows .tpr reading, because all ranks can
 552     // agree on hasReadEkinState at that time.
 553     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 554     if (PAR(cr))
 555     {
 556         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 557     }
 558     if (hasReadEkinState)
 559     {
 560         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 561     }
 562
 563     unsigned int cglo_flags =
 564             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 565              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 566
 567     bSumEkinhOld = FALSE;
 568
 569     t_vcm vcm(top_global->groups, *ir);
 570     reportComRemovalInfo(fplog, vcm);
 571
 572     /* To minimize communication, compute_globals computes the COM velocity
 573      * and the kinetic energy for the velocities without COM motion removed.
 574      * Thus to get the kinetic energy without the COM contribution, we need
 575      * to call compute_globals twice.
 576      */
 577     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 578     {
 579         unsigned int cglo_flags_iteration = cglo_flags;
 580         if (bStopCM && cgloIteration == 0)
 581         {
 582             cglo_flags_iteration |= CGLO_STOPCM;
 583             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 584         }
 585         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 586                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 587                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 588                         state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 589                         cglo_flags_iteration
 590                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 591                                                                          : 0));
 592         if (cglo_flags_iteration & CGLO_STOPCM)
 593         {
 594             /* At initialization, do not pass x with acceleration-correction mode
 595              * to avoid (incorrect) correction of the initial coordinates.
 596              */
 597             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 598                                                                      : makeArrayRef(state->x);
 599             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 600             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 601         }
 602     }
 603     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 604                                     makeConstArrayRef(state->x), state->box,
 605                                     &shouldCheckNumberOfBondedInteractions);
 606     if (ir->eI == eiVVAK)
 607     {
 608         /* a second call to get the half step temperature initialized as well */
 609         /* we do the same call as above, but turn the pressure off -- internally to
 610            compute_globals, this is recognized as a velocity verlet half-step
 611            kinetic energy calculation.  This minimized excess variables, but
 612            perhaps loses some logic?*/
 613
 614         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 615                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 616                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 617                         state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 618     }
 619
 620     /* Calculate the initial half step temperature, and save the ekinh_old */
 621     if (startingBehavior == StartingBehavior::NewSimulation)
 622     {
 623         for (i = 0; (i < ir->opts.ngtc); i++)
 624         {
 625             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 626         }
 627     }
 628
 629     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 630        for non-trotter temperature control */
 631     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 632
 633     if (MASTER(cr))
 634     {
 635         if (!ir->bContinuation)
 636         {
 637             if (constr && ir->eConstrAlg == econtLINCS)
 638             {
 639                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 640                         constr->rmsd());
 641             }
 642             if (EI_STATE_VELOCITY(ir->eI))
 643             {
 644                 real temp = enerd->term[F_TEMP];
 645                 if (ir->eI != eiVV)
 646                 {
 647                     /* Result of Ekin averaged over velocities of -half
 648                      * and +half step, while we only have -half step here.
 649                      */
 650                     temp *= 2;
 651                 }
 652                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 653             }
 654         }
 655
 656         char tbuf[20];
 657         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 658         if (ir->nsteps >= 0)
 659         {
 660             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 661         }
 662         else
 663         {
 664             sprintf(tbuf, "%s", "infinite");
 665         }
 666         if (ir->init_step > 0)
 667         {
 668             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 669                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 670                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 671         }
 672         else
 673         {
 674             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 675         }
 676         fprintf(fplog, "\n");
 677     }
 678
 679     walltime_accounting_start_time(walltime_accounting);
 680     wallcycle_start(wcycle, ewcRUN);
 681     print_start(fplog, cr, walltime_accounting, "mdrun");
 682
 683     /***********************************************************
 684      *
 685      *             Loop over MD steps
 686      *
 687      ************************************************************/
 688
 689     bFirstStep = TRUE;
 690     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 691     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 692     bSumEkinhOld     = FALSE;
 693     bExchanged       = FALSE;
 694     bNeedRepartition = FALSE;
 695
 696     step     = ir->init_step;
 697     step_rel = 0;
 698
 699     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 700             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 701             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 702             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 703
 704     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 705             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 706             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 707             mdrunOptions.checkpointOptions.period);
 708
 709     const bool resetCountersIsLocal = true;
 710     auto       resetHandler         = std::make_unique<ResetHandler>(
 711             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 712             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 713             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 714
 715     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 716
 717     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 718     {
 719         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 720     }
 721
 722     /* and stop now if we should */
 723     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 724     while (!bLastStep)
 725     {
 726
 727         /* Determine if this is a neighbor search step */
 728         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 729
 730         if (bPMETune && bNStList)
 731         {
 732             // This has to be here because PME load balancing is called so early.
 733             // TODO: Move to after all booleans are defined.
 734             if (useGpuForUpdate && !bFirstStep)
 735             {
 736                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 737                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 738             }
 739             /* PME grid + cut-off optimization with GPUs or PME nodes */
 740             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 741                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 742                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 743         }
 744
 745         wallcycle_start(wcycle, ewcSTEP);
 746
 747         bLastStep = (step_rel == ir->nsteps);
 748         t         = t0 + step * ir->delta_t;
 749
 750         // TODO Refactor this, so that nstfep does not need a default value of zero
 751         if (ir->efep != efepNO || ir->bSimTemp)
 752         {
 753             /* find and set the current lambdas */
 754             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 755
 756             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 757             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 758             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 759                            && (!bFirstStep));
 760         }
 761
 762         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 763                      && do_per_step(step, replExParams.exchangeInterval));
 764
 765         if (doSimulatedAnnealing)
 766         {
 767             update_annealing_target_temp(ir, t, &upd);
 768         }
 769
 770         /* Stop Center of Mass motion */
 771         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 772
 773         /* Determine whether or not to do Neighbour Searching */
 774         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 775
 776         /* Note that the stopHandler will cause termination at nstglobalcomm
 777          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 778          * nstpcouple steps, we have computed the half-step kinetic energy
 779          * of the previous step and can always output energies at the last step.
 780          */
 781         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 782
 783         /* do_log triggers energy and virial calculation. Because this leads
 784          * to different code paths, forces can be different. Thus for exact
 785          * continuation we should avoid extra log output.
 786          * Note that the || bLastStep can result in non-exact continuation
 787          * beyond the last step. But we don't consider that to be an issue.
 788          */
 789         do_log     = (do_per_step(step, ir->nstlog)
 790                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 791         do_verbose = mdrunOptions.verbose
 792                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 793
 794         if (useGpuForUpdate && !bFirstStep && bNS)
 795         {
 796             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 797             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 798             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 799             // Copy coordinate from the GPU when needed at the search step.
 800             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 801             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 802             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 803             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 804         }
 805
 806         if (bNS && !(bFirstStep && ir->bContinuation))
 807         {
 808             bMasterState = FALSE;
 809             /* Correct the new box if it is too skewed */
 810             if (inputrecDynamicBox(ir))
 811             {
 812                 if (correct_box(fplog, step, state->box))
 813                 {
 814                     bMasterState = TRUE;
 815                     // If update is offloaded, it should be informed about the box size change
 816                     if (useGpuForUpdate)
 817                     {
 818                         integrator->setPbc(PbcType::Xyz, state->box);
 819                     }
 820                 }
 821             }
 822             if (DOMAINDECOMP(cr) && bMasterState)
 823             {
 824                 dd_collect_state(cr->dd, state, state_global);
 825             }
 826
 827             if (DOMAINDECOMP(cr))
 828             {
 829                 /* Repartition the domain decomposition */
 830                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 831                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 832                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 833                 shouldCheckNumberOfBondedInteractions = true;
 834                 upd.setNumAtoms(state->natoms);
 835             }
 836         }
 837
 838         // Allocate or re-size GPU halo exchange object, if necessary
 839         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
 840         {
 841             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 842                                "GPU device manager has to be initialized to use GPU "
 843                                "version of halo exchange.");
 844             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
 845         }
 846
 847         if (MASTER(cr) && do_log)
 848         {
 849             gmx::EnergyOutput::printHeader(fplog, step,
 850                                            t); /* can we improve the information printed here? */
 851         }
 852
 853         if (ir->efep != efepNO)
 854         {
 855             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 856         }
 857
 858         if (bExchanged)
 859         {
 860
 861             /* We need the kinetic energy at minus the half step for determining
 862              * the full step kinetic energy and possibly for T-coupling.*/
 863             /* This may not be quite working correctly yet . . . . */
 864             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 865                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
 866                             enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
 867                             state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 868                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 869             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 870                                             &top, makeConstArrayRef(state->x), state->box,
 871                                             &shouldCheckNumberOfBondedInteractions);
 872         }
 873         clear_mat(force_vir);
 874
 875         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 876
 877         /* Determine the energy and pressure:
 878          * at nstcalcenergy steps and at energy output steps (set below).
 879          */
 880         if (EI_VV(ir->eI) && (!bInitStep))
 881         {
 882             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 883             bCalcVir      = bCalcEnerStep
 884                        || (ir->epc != epcNO
 885                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 886         }
 887         else
 888         {
 889             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 890             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 891         }
 892         bCalcEner = bCalcEnerStep;
 893
 894         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 895
 896         if (do_ene || do_log || bDoReplEx)
 897         {
 898             bCalcVir  = TRUE;
 899             bCalcEner = TRUE;
 900         }
 901
 902         /* Do we need global communication ? */
 903         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 904                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 905
 906         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 907                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 908                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 909         if (fr->useMts && !do_per_step(step, ir->nstfout))
 910         {
 911             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
 912         }
 913
 914         if (shellfc)
 915         {
 916             /* Now is the time to relax the shells */
 917             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 918                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
 919                                 state->natoms, state->x.arrayRefWithPadding(),
 920                                 state->v.arrayRefWithPadding(), state->box, state->lambda,
 921                                 &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 922                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 923         }
 924         else
 925         {
 926             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 927                is updated (or the AWH update will be performed twice for one step when continuing).
 928                It would be best to call this update function from do_md_trajectory_writing but that
 929                would occur after do_force. One would have to divide the update_awh function into one
 930                function applying the AWH force and one doing the AWH bias update. The update AWH
 931                bias function could then be called after do_md_trajectory_writing (then containing
 932                update_awh_history). The checkpointing will in the future probably moved to the start
 933                of the md loop which will rid of this issue. */
 934             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 935             {
 936                 awh->updateHistory(state_global->awhHistory.get());
 937             }
 938
 939             /* The coordinates (x) are shifted (to get whole molecules)
 940              * in do_force.
 941              * This is parallellized as well, and does communication too.
 942              * Check comments in sim_util.c
 943              */
 944             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 945                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 946                      &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
 947                      vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 948                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 949         }
 950
 951         // VV integrators do not need the following velocity half step
 952         // if it is the first step after starting from a checkpoint.
 953         // That is, the half step is needed on all other steps, and
 954         // also the first step when starting from a .tpr file.
 955         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 956         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 957         {
 958             rvec* vbuf = nullptr;
 959
 960             wallcycle_start(wcycle, ewcUPDATE);
 961             if (ir->eI == eiVV && bInitStep)
 962             {
 963                 /* if using velocity verlet with full time step Ekin,
 964                  * take the first half step only to compute the
 965                  * virial for the first step. From there,
 966                  * revert back to the initial coordinates
 967                  * so that the input is actually the initial step.
 968                  */
 969                 snew(vbuf, state->natoms);
 970                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
 971                            state->natoms); /* should make this better for parallelizing? */
 972             }
 973             else
 974             {
 975                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
 976                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
 977                                trotter_seq, ettTSEQ1);
 978             }
 979
 980             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
 981                               M, etrtVELOCITY1, cr, constr != nullptr);
 982
 983             wallcycle_stop(wcycle, ewcUPDATE);
 984             constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
 985             wallcycle_start(wcycle, ewcUPDATE);
 986             /* if VV, compute the pressure and constraints */
 987             /* For VV2, we strictly only need this if using pressure
 988              * control, but we really would like to have accurate pressures
 989              * printed out.
 990              * Think about ways around this in the future?
 991              * For now, keep this choice in comments.
 992              */
 993             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
 994             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
 995             bPres = TRUE;
 996             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
 997             if (bCalcEner && ir->eI == eiVVAK)
 998             {
 999                 bSumEkinhOld = TRUE;
1000             }
1001             /* for vv, the first half of the integration actually corresponds to the previous step.
1002                So we need information from the last step in the first half of the integration */
1003             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1004             {
1005                 wallcycle_stop(wcycle, ewcUPDATE);
1006                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1007                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1008                                 enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
1009                                 state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1010                                 (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1011                                         | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1012                                         | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1013                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1014                                                                                  : 0)
1015                                         | CGLO_SCALEEKIN);
1016                 /* explanation of above:
1017                    a) We compute Ekin at the full time step
1018                    if 1) we are using the AveVel Ekin, and it's not the
1019                    initial step, or 2) if we are using AveEkin, but need the full
1020                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1021                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1022                    EkinAveVel because it's needed for the pressure */
1023                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1024                                                 top_global, &top, makeConstArrayRef(state->x),
1025                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1026                 if (bStopCM)
1027                 {
1028                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1029                                            makeArrayRef(state->v));
1030                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1031                 }
1032                 wallcycle_start(wcycle, ewcUPDATE);
1033             }
1034             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1035             if (!bInitStep)
1036             {
1037                 if (bTrotter)
1038                 {
1039                     m_add(force_vir, shake_vir,
1040                           total_vir); /* we need the un-dispersion corrected total vir here */
1041                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1042                                    trotter_seq, ettTSEQ2);
1043
1044                     /* TODO This is only needed when we're about to write
1045                      * a checkpoint, because we use it after the restart
1046                      * (in a kludge?). But what should we be doing if
1047                      * the startingBehavior is NewSimulation or bInitStep are true? */
1048                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1049                     {
1050                         copy_mat(shake_vir, state->svir_prev);
1051                         copy_mat(force_vir, state->fvir_prev);
1052                     }
1053                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1054                     {
1055                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1056                         enerd->term[F_TEMP] =
1057                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1058                         enerd->term[F_EKIN] = trace(ekind->ekin);
1059                     }
1060                 }
1061                 else if (bExchanged)
1062                 {
1063                     wallcycle_stop(wcycle, ewcUPDATE);
1064                     /* We need the kinetic energy at minus the half step for determining
1065                      * the full step kinetic energy and possibly for T-coupling.*/
1066                     /* This may not be quite working correctly yet . . . . */
1067                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1068                                     makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1069                                     enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
1070                                     state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1071                     wallcycle_start(wcycle, ewcUPDATE);
1072                 }
1073             }
1074             /* if it's the initial step, we performed this first step just to get the constraint virial */
1075             if (ir->eI == eiVV && bInitStep)
1076             {
1077                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1078                 sfree(vbuf);
1079             }
1080             wallcycle_stop(wcycle, ewcUPDATE);
1081         }
1082
1083         /* compute the conserved quantity */
1084         if (EI_VV(ir->eI))
1085         {
1086             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1087             if (ir->eI == eiVV)
1088             {
1089                 last_ekin = enerd->term[F_EKIN];
1090             }
1091             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1092             {
1093                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1094             }
1095             /* sum up the foreign kinetic energy and dK/dl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1096             if (ir->efep != efepNO)
1097             {
1098                 accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1099             }
1100         }
1101
1102         /* ########  END FIRST UPDATE STEP  ############## */
1103         /* ########  If doing VV, we now have v(dt) ###### */
1104         if (bDoExpanded)
1105         {
1106             /* perform extended ensemble sampling in lambda - we don't
1107                actually move to the new state before outputting
1108                statistics, but if performing simulated tempering, we
1109                do update the velocities and the tau_t. */
1110
1111             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1112                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1113             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1114             if (MASTER(cr))
1115             {
1116                 copy_df_history(state_global->dfhist, state->dfhist);
1117             }
1118         }
1119
1120         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1121         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1122         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1123             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1124                 || checkpointHandler->isCheckpointingStep()))
1125         {
1126             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1127             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1128         }
1129         // Copy velocities if needed for the output/checkpointing.
1130         // NOTE: Copy on the search steps is done at the beginning of the step.
1131         if (useGpuForUpdate && !bNS
1132             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1133         {
1134             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1135             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1136         }
1137         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1138         // and update is offloaded hence forces are kept on the GPU for update and have not been
1139         // already transferred in do_force().
1140         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1141         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1142         //       prior to GPU update.
1143         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1144         //       copy call in do_force(...).
1145         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1146         //       on host after the D2H copy in do_force(...).
1147         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1148             && do_per_step(step, ir->nstfout))
1149         {
1150             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1151             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1152         }
1153         /* Now we have the energies and forces corresponding to the
1154          * coordinates at time t. We must output all of this before
1155          * the update.
1156          */
1157         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1158                                  observablesHistory, top_global, fr, outf, energyOutput, ekind,
1159                                  f.view().force(), checkpointHandler->isCheckpointingStep(),
1160                                  bRerunMD, bLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
1161         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1162         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1163
1164         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1165         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1166             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1167         {
1168             copy_mat(state->svir_prev, shake_vir);
1169             copy_mat(state->fvir_prev, force_vir);
1170         }
1171
1172         stopHandler->setSignal();
1173         resetHandler->setSignal(walltime_accounting);
1174
1175         if (bGStat || !PAR(cr))
1176         {
1177             /* In parallel we only have to check for checkpointing in steps
1178              * where we do global communication,
1179              *  otherwise the other nodes don't know.
1180              */
1181             checkpointHandler->setSignal(walltime_accounting);
1182         }
1183
1184         /* #########   START SECOND UPDATE STEP ################# */
1185
1186         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1187            controlled in preprocessing */
1188
1189         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1190         {
1191             gmx_bool bIfRandomize;
1192             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1193             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1194             if (constr && bIfRandomize)
1195             {
1196                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1197             }
1198         }
1199         /* Box is changed in update() when we do pressure coupling,
1200          * but we should still use the old box for energy corrections and when
1201          * writing it to the energy file, so it matches the trajectory files for
1202          * the same timestep above. Make a copy in a separate array.
1203          */
1204         copy_mat(state->box, lastbox);
1205
1206         dvdl_constr = 0;
1207
1208         wallcycle_start(wcycle, ewcUPDATE);
1209         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1210         if (bTrotter)
1211         {
1212             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1213             /* We can only do Berendsen coupling after we have summed
1214              * the kinetic energy or virial. Since the happens
1215              * in global_state after update, we should only do it at
1216              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1217              */
1218         }
1219         else
1220         {
1221             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1222             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1223         }
1224
1225         if (EI_VV(ir->eI))
1226         {
1227             /* velocity half-step update */
1228             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1229                               M, etrtVELOCITY2, cr, constr != nullptr);
1230         }
1231
1232         /* Above, initialize just copies ekinh into ekin,
1233          * it doesn't copy position (for VV),
1234          * and entire integrator for MD.
1235          */
1236
1237         if (ir->eI == eiVVAK)
1238         {
1239             cbuf.resize(state->x.size());
1240             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1241         }
1242
1243         /* With leap-frog type integrators we compute the kinetic energy
1244          * at a whole time step as the average of the half-time step kinetic
1245          * energies of two subsequent steps. Therefore we need to compute the
1246          * half step kinetic energy also if we need energies at the next step.
1247          */
1248         const bool needHalfStepKineticEnergy =
1249                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1250
1251         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1252         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1253         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1254                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1255
1256         if (useGpuForUpdate)
1257         {
1258             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1259             {
1260                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1261                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1262
1263                 // Copy data to the GPU after buffers might have being reinitialized
1264                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1265                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1266             }
1267
1268             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1269             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1270             {
1271                 stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1272             }
1273
1274             const bool doTemperatureScaling =
1275                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1276
1277             // This applies Leap-Frog, LINCS and SETTLE in succession
1278             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1279                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1280                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1281                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1282
1283             // Copy velocities D2H after update if:
1284             // - Globals are computed this step (includes the energy output steps).
1285             // - Temperature is needed for the next step.
1286             if (bGStat || needHalfStepKineticEnergy)
1287             {
1288                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1289                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1290             }
1291         }
1292         else
1293         {
1294             /* With multiple time stepping we need to do an additional normal
1295              * update step to obtain the virial, as the actual MTS integration
1296              * using an acceleration where the slow forces are multiplied by mtsFactor.
1297              * Using that acceleration would result in a virial with the slow
1298              * force contribution would be a factor mtsFactor too large.
1299              */
1300             if (fr->useMts && bCalcVir && constr != nullptr)
1301             {
1302                 upd.update_for_constraint_virial(*ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1303
1304                 constrain_coordinates(constr, do_log, do_ene, step, state,
1305                                       upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
1306             }
1307
1308             ArrayRefWithPadding<const RVec> forceCombined =
1309                     (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1310                             ? f.view().forceMtsCombinedWithPadding()
1311                             : f.view().forceWithPadding();
1312             upd.update_coords(*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M,
1313                               etrtPOSITION, cr, constr != nullptr);
1314
1315             wallcycle_stop(wcycle, ewcUPDATE);
1316
1317             constrain_coordinates(constr, do_log, do_ene, step, state, upd.xp()->arrayRefWithPadding(),
1318                                   &dvdl_constr, bCalcVir && !fr->useMts, shake_vir);
1319
1320             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
1321                                       constr, do_log, do_ene);
1322             upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1323         }
1324
1325         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1326         {
1327             updatePrevStepPullCom(pull_work, state);
1328         }
1329
1330         if (ir->eI == eiVVAK)
1331         {
1332             /* erase F_EKIN and F_TEMP here? */
1333             /* just compute the kinetic energy at the half step to perform a trotter step */
1334             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1335                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
1336                             force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
1337                             nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1338             wallcycle_start(wcycle, ewcUPDATE);
1339             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1340             /* now we know the scaling, we can compute the positions again */
1341             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1342
1343             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1344                               M, etrtPOSITION, cr, constr != nullptr);
1345             wallcycle_stop(wcycle, ewcUPDATE);
1346
1347             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1348             /* are the small terms in the shake_vir here due
1349              * to numerical errors, or are they important
1350              * physically? I'm thinking they are just errors, but not completely sure.
1351              * For now, will call without actually constraining, constr=NULL*/
1352             upd.finish_update(*ir, mdatoms, state, wcycle, false);
1353         }
1354         if (EI_VV(ir->eI))
1355         {
1356             /* this factor or 2 correction is necessary
1357                because half of the constraint force is removed
1358                in the vv step, so we have to double it.  See
1359                the Issue #1255.  It is not yet clear
1360                if the factor of 2 is exact, or just a very
1361                good approximation, and this will be
1362                investigated.  The next step is to see if this
1363                can be done adding a dhdl contribution from the
1364                rattle step, but this is somewhat more
1365                complicated with the current code. Will be
1366                investigated, hopefully for 4.6.3. However,
1367                this current solution is much better than
1368                having it completely wrong.
1369              */
1370             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1371         }
1372         else
1373         {
1374             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1375         }
1376
1377         if (vsite != nullptr)
1378         {
1379             wallcycle_start(wcycle, ewcVSITECONSTR);
1380             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1381             wallcycle_stop(wcycle, ewcVSITECONSTR);
1382         }
1383
1384         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1385         /* With Leap-Frog we can skip compute_globals at
1386          * non-communication steps, but we need to calculate
1387          * the kinetic energy one step before communication.
1388          */
1389         {
1390             // Organize to do inter-simulation signalling on steps if
1391             // and when algorithms require it.
1392             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1393
1394             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1395             {
1396                 // Copy coordinates when needed to stop the CM motion.
1397                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1398                 {
1399                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1400                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1401                 }
1402                 // Since we're already communicating at this step, we
1403                 // can propagate intra-simulation signals. Note that
1404                 // check_nstglobalcomm has the responsibility for
1405                 // choosing the value of nstglobalcomm that is one way
1406                 // bGStat becomes true, so we can't get into a
1407                 // situation where e.g. checkpointing can't be
1408                 // signalled.
1409                 bool                doIntraSimSignal = true;
1410                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1411
1412                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1413                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
1414                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1415                                 &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1416                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1417                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1418                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1419                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1420                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1421                                                                                  : 0));
1422                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1423                                                 top_global, &top, makeConstArrayRef(state->x),
1424                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1425                 if (!EI_VV(ir->eI) && bStopCM)
1426                 {
1427                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1428                                            makeArrayRef(state->v));
1429                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1430
1431                     // TODO: The special case of removing CM motion should be dealt more gracefully
1432                     if (useGpuForUpdate)
1433                     {
1434                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1435                         // Here we block until the H2D copy completes because event sync with the
1436                         // force kernels that use the coordinates on the next steps is not implemented
1437                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1438                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1439                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1440                         if (vcm.mode != ecmNO)
1441                         {
1442                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1443                         }
1444                     }
1445                 }
1446             }
1447         }
1448
1449         /* #############  END CALC EKIN AND PRESSURE ################# */
1450
1451         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1452            the virial that should probably be addressed eventually. state->veta has better properies,
1453            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1454            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1455
1456         if (ir->efep != efepNO && !EI_VV(ir->eI))
1457         {
1458             /* Sum up the foreign energy and dK/dl terms for md and sd.
1459                Currently done every step so that dH/dl is correct in the .edr */
1460             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1461         }
1462
1463         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1464                                          pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1465
1466         const bool doBerendsenPressureCoupling =
1467                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1468         const bool doCRescalePressureCoupling =
1469                 (inputrec->epc == epcCRESCALE && do_per_step(step, inputrec->nstpcouple));
1470         if (useGpuForUpdate
1471             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1472         {
1473             integrator->scaleCoordinates(pressureCouplingMu);
1474             if (doCRescalePressureCoupling)
1475             {
1476                 matrix pressureCouplingInvMu;
1477                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1478                 integrator->scaleVelocities(pressureCouplingInvMu);
1479             }
1480             integrator->setPbc(PbcType::Xyz, state->box);
1481         }
1482
1483         /* ################# END UPDATE STEP 2 ################# */
1484         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1485
1486         /* The coordinates (x) were unshifted in update */
1487         if (!bGStat)
1488         {
1489             /* We will not sum ekinh_old,
1490              * so signal that we still have to do it.
1491              */
1492             bSumEkinhOld = TRUE;
1493         }
1494
1495         if (bCalcEner)
1496         {
1497             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1498
1499             /* use the directly determined last velocity, not actually the averaged half steps */
1500             if (bTrotter && ir->eI == eiVV)
1501             {
1502                 enerd->term[F_EKIN] = last_ekin;
1503             }
1504             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1505
1506             if (integratorHasConservedEnergyQuantity(ir))
1507             {
1508                 if (EI_VV(ir->eI))
1509                 {
1510                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1511                 }
1512                 else
1513                 {
1514                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1515                 }
1516             }
1517             /* #########  END PREPARING EDR OUTPUT  ###########  */
1518         }
1519
1520         /* Output stuff */
1521         if (MASTER(cr))
1522         {
1523             if (fplog && do_log && bDoExpanded)
1524             {
1525                 /* only needed if doing expanded ensemble */
1526                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1527                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1528                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1529             }
1530             if (bCalcEner)
1531             {
1532                 energyOutput.addDataAtEnergyStep(
1533                         bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
1534                         ir->expandedvals, lastbox,
1535                         PTCouplingArrays{ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
1536                                           state->nhpres_xi, state->nhpres_vxi },
1537                         state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
1538             }
1539             else
1540             {
1541                 energyOutput.recordNonEnergyStep();
1542             }
1543
1544             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1545             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1546
1547             if (doSimulatedAnnealing)
1548             {
1549                 gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
1550                                                               &(ir->opts));
1551             }
1552             if (do_log || do_ene || do_dr || do_or)
1553             {
1554                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1555                                                    do_log ? fplog : nullptr, step, t,
1556                                                    fr->fcdata.get(), awh.get());
1557             }
1558             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1559             {
1560                 const bool isInitialOutput = false;
1561                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1562             }
1563
1564             if (ir->bPull)
1565             {
1566                 pull_print_output(pull_work, step, t);
1567             }
1568
1569             if (do_per_step(step, ir->nstlog))
1570             {
1571                 if (fflush(fplog) != 0)
1572                 {
1573                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1574                 }
1575             }
1576         }
1577         if (bDoExpanded)
1578         {
1579             /* Have to do this part _after_ outputting the logfile and the edr file */
1580             /* Gets written into the state at the beginning of next loop*/
1581             state->fep_state = lamnew;
1582         }
1583         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1584         {
1585             state->fep_state = awh->fepLambdaState();
1586         }
1587         /* Print the remaining wall clock time for the run */
1588         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1589         {
1590             if (shellfc)
1591             {
1592                 fprintf(stderr, "\n");
1593             }
1594             print_time(stderr, walltime_accounting, step, ir, cr);
1595         }
1596
1597         /* Ion/water position swapping.
1598          * Not done in last step since trajectory writing happens before this call
1599          * in the MD loop and exchanges would be lost anyway. */
1600         bNeedRepartition = FALSE;
1601         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1602         {
1603             bNeedRepartition =
1604                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1605                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1606
1607             if (bNeedRepartition && DOMAINDECOMP(cr))
1608             {
1609                 dd_collect_state(cr->dd, state, state_global);
1610             }
1611         }
1612
1613         /* Replica exchange */
1614         bExchanged = FALSE;
1615         if (bDoReplEx)
1616         {
1617             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1618         }
1619
1620         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1621         {
1622             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1623                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1624                                 nrnb, wcycle, FALSE);
1625             shouldCheckNumberOfBondedInteractions = true;
1626             upd.setNumAtoms(state->natoms);
1627         }
1628
1629         bFirstStep = FALSE;
1630         bInitStep  = FALSE;
1631
1632         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1633         /* With all integrators, except VV, we need to retain the pressure
1634          * at the current step for coupling at the next step.
1635          */
1636         if ((state->flags & (1U << estPRES_PREV))
1637             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1638         {
1639             /* Store the pressure in t_state for pressure coupling
1640              * at the next MD step.
1641              */
1642             copy_mat(pres, state->pres_prev);
1643         }
1644
1645         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1646
1647         if ((membed != nullptr) && (!bLastStep))
1648         {
1649             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1650         }
1651
1652         cycles = wallcycle_stop(wcycle, ewcSTEP);
1653         if (DOMAINDECOMP(cr) && wcycle)
1654         {
1655             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1656         }
1657
1658         /* increase the MD step number */
1659         step++;
1660         step_rel++;
1661
1662 #if GMX_FAHCORE
1663         if (MASTER(cr))
1664         {
1665             fcReportProgress(ir->nsteps + ir->init_step, step);
1666         }
1667 #endif
1668
1669         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1670                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1671
1672         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1673         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1674     }
1675     /* End of main MD loop */
1676
1677     /* Closing TNG files can include compressing data. Therefore it is good to do that
1678      * before stopping the time measurements. */
1679     mdoutf_tng_close(outf);
1680
1681     /* Stop measuring walltime */
1682     walltime_accounting_end_time(walltime_accounting);
1683
1684     if (!thisRankHasDuty(cr, DUTY_PME))
1685     {
1686         /* Tell the PME only node to finish */
1687         gmx_pme_send_finish(cr);
1688     }
1689
1690     if (MASTER(cr))
1691     {
1692         if (ir->nstcalcenergy > 0)
1693         {
1694             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1695             energyOutput.printAverages(fplog, groups);
1696         }
1697     }
1698     done_mdoutf(outf);
1699
1700     if (bPMETune)
1701     {
1702         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1703     }
1704
1705     done_shellfc(fplog, shellfc, step_rel);
1706
1707     if (useReplicaExchange && MASTER(cr))
1708     {
1709         print_replica_exchange_statistics(fplog, repl_ex);
1710     }
1711
1712     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1713
1714     global_stat_destroy(gstat);
1715 }