src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53
  54 #include "gromacs/awh/awh.h"
  55 #include "gromacs/commandline/filenm.h"
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/domdec_struct.h"
  61 #include "gromacs/domdec/gpuhaloexchange.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/domdec/partition.h"
  64 #include "gromacs/essentialdynamics/edsam.h"
  65 #include "gromacs/ewald/pme_load_balancing.h"
  66 #include "gromacs/ewald/pme_pp.h"
  67 #include "gromacs/fileio/trxio.h"
  68 #include "gromacs/gmxlib/network.h"
  69 #include "gromacs/gmxlib/nrnb.h"
  70 #include "gromacs/gpu_utils/gpu_utils.h"
  71 #include "gromacs/imd/imd.h"
  72 #include "gromacs/listed_forces/manage_threading.h"
  73 #include "gromacs/math/functions.h"
  74 #include "gromacs/math/utilities.h"
  75 #include "gromacs/math/vec.h"
  76 #include "gromacs/math/vectypes.h"
  77 #include "gromacs/mdlib/checkpointhandler.h"
  78 #include "gromacs/mdlib/compute_io.h"
  79 #include "gromacs/mdlib/constr.h"
  80 #include "gromacs/mdlib/ebin.h"
  81 #include "gromacs/mdlib/enerdata_utils.h"
  82 #include "gromacs/mdlib/energyoutput.h"
  83 #include "gromacs/mdlib/expanded.h"
  84 #include "gromacs/mdlib/force.h"
  85 #include "gromacs/mdlib/force_flags.h"
  86 #include "gromacs/mdlib/forcerec.h"
  87 #include "gromacs/mdlib/md_support.h"
  88 #include "gromacs/mdlib/mdatoms.h"
  89 #include "gromacs/mdlib/mdoutf.h"
  90 #include "gromacs/mdlib/membed.h"
  91 #include "gromacs/mdlib/resethandler.h"
  92 #include "gromacs/mdlib/sighandler.h"
  93 #include "gromacs/mdlib/simulationsignal.h"
  94 #include "gromacs/mdlib/stat.h"
  95 #include "gromacs/mdlib/stophandler.h"
  96 #include "gromacs/mdlib/tgroup.h"
  97 #include "gromacs/mdlib/trajectory_writing.h"
  98 #include "gromacs/mdlib/update.h"
  99 #include "gromacs/mdlib/update_constrain_gpu.h"
 100 #include "gromacs/mdlib/vcm.h"
 101 #include "gromacs/mdlib/vsite.h"
 102 #include "gromacs/mdrunutility/handlerestart.h"
 103 #include "gromacs/mdrunutility/multisim.h"
 104 #include "gromacs/mdrunutility/printtime.h"
 105 #include "gromacs/mdtypes/awh_history.h"
 106 #include "gromacs/mdtypes/awh_params.h"
 107 #include "gromacs/mdtypes/commrec.h"
 108 #include "gromacs/mdtypes/df_history.h"
 109 #include "gromacs/mdtypes/energyhistory.h"
 110 #include "gromacs/mdtypes/fcdata.h"
 111 #include "gromacs/mdtypes/forcerec.h"
 112 #include "gromacs/mdtypes/group.h"
 113 #include "gromacs/mdtypes/inputrec.h"
 114 #include "gromacs/mdtypes/interaction_const.h"
 115 #include "gromacs/mdtypes/md_enums.h"
 116 #include "gromacs/mdtypes/mdatom.h"
 117 #include "gromacs/mdtypes/mdrunoptions.h"
 118 #include "gromacs/mdtypes/observableshistory.h"
 119 #include "gromacs/mdtypes/pullhistory.h"
 120 #include "gromacs/mdtypes/simulation_workload.h"
 121 #include "gromacs/mdtypes/state.h"
 122 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 123 #include "gromacs/modularsimulator/energyelement.h"
 124 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 125 #include "gromacs/nbnxm/nbnxm.h"
 126 #include "gromacs/pbcutil/pbc.h"
 127 #include "gromacs/pulling/output.h"
 128 #include "gromacs/pulling/pull.h"
 129 #include "gromacs/swap/swapcoords.h"
 130 #include "gromacs/timing/wallcycle.h"
 131 #include "gromacs/timing/walltime_accounting.h"
 132 #include "gromacs/topology/atoms.h"
 133 #include "gromacs/topology/idef.h"
 134 #include "gromacs/topology/mtop_util.h"
 135 #include "gromacs/topology/topology.h"
 136 #include "gromacs/trajectory/trajectoryframe.h"
 137 #include "gromacs/utility/basedefinitions.h"
 138 #include "gromacs/utility/cstringutil.h"
 139 #include "gromacs/utility/fatalerror.h"
 140 #include "gromacs/utility/logger.h"
 141 #include "gromacs/utility/real.h"
 142 #include "gromacs/utility/smalloc.h"
 143
 144 #include "legacysimulator.h"
 145 #include "replicaexchange.h"
 146 #include "shellfc.h"
 147
 148 #if GMX_FAHCORE
 149 #    include "corewrap.h"
 150 #endif
 151
 152 using gmx::SimulationSignaller;
 153
 154 void gmx::LegacySimulator::do_md()
 155 {
 156     // TODO Historically, the EM and MD "integrators" used different
 157     // names for the t_inputrec *parameter, but these must have the
 158     // same name, now that it's a member of a struct. We use this ir
 159     // alias to avoid a large ripple of nearly useless changes.
 160     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 161     // will go away eventually.
 162     t_inputrec*  ir = inputrec;
 163     int64_t      step, step_rel;
 164     double       t, t0 = ir->init_t, lam0[efptNR];
 165     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 166     gmx_bool     bNS, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 167     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 168     gmx_bool     do_ene, do_log, do_verbose;
 169     gmx_bool     bMasterState;
 170     unsigned int force_flags;
 171     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, tmp_vir = { { 0 } },
 172            pres = { { 0 } };
 173     int                         i, m;
 174     rvec                        mu_tot;
 175     matrix                      pressureCouplingMu, M;
 176     gmx_repl_ex_t               repl_ex = nullptr;
 177     PaddedHostVector<gmx::RVec> f{};
 178     gmx_global_stat_t           gstat;
 179     gmx_shellfc_t*              shellfc;
 180     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 181     gmx_bool                    bTemp, bPres, bTrotter;
 182     real                        dvdl_constr;
 183     std::vector<RVec>           cbuf;
 184     matrix                      lastbox;
 185     int                         lamnew = 0;
 186     /* for FEP */
 187     int       nstfep = 0;
 188     double    cycles;
 189     real      saved_conserved_quantity = 0;
 190     real      last_ekin                = 0;
 191     t_extmass MassQ;
 192     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 193
 194     /* PME load balancing data for GPU kernels */
 195     gmx_bool bPMETune         = FALSE;
 196     gmx_bool bPMETunePrinting = FALSE;
 197
 198     bool bInteractiveMDstep = false;
 199
 200     /* Domain decomposition could incorrectly miss a bonded
 201        interaction, but checking for that requires a global
 202        communication stage, which does not otherwise happen in DD
 203        code. So we do that alongside the first global energy reduction
 204        after a new DD is made. These variables handle whether the
 205        check happens, and the result it returns. */
 206     bool shouldCheckNumberOfBondedInteractions = false;
 207     int  totalNumberOfBondedInteractions       = -1;
 208
 209     SimulationSignals signals;
 210     // Most global communnication stages don't propagate mdrun
 211     // signals, and will use this object to achieve that.
 212     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 213
 214     if (!mdrunOptions.writeConfout)
 215     {
 216         // This is on by default, and the main known use case for
 217         // turning it off is for convenience in benchmarking, which is
 218         // something that should not show up in the general user
 219         // interface.
 220         GMX_LOG(mdlog.info)
 221                 .asParagraph()
 222                 .appendText(
 223                         "The -noconfout functionality is deprecated, and may be removed in a "
 224                         "future version.");
 225     }
 226
 227     /* md-vv uses averaged full step velocities for T-control
 228        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 229        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 230     bTrotter = (EI_VV(ir->eI)
 231                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 232
 233     const bool bRerunMD = false;
 234
 235     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 236     bGStatEveryStep   = (nstglobalcomm == 1);
 237
 238     SimulationGroups* groups = &top_global->groups;
 239
 240     std::unique_ptr<EssentialDynamics> ed = nullptr;
 241     if (opt2bSet("-ei", nfile, fnm))
 242     {
 243         /* Initialize essential dynamics sampling */
 244         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 245                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 246     }
 247     else if (observablesHistory->edsamHistory)
 248     {
 249         gmx_fatal(FARGS,
 250                   "The checkpoint is from a run with essential dynamics sampling, "
 251                   "but the current run did not specify the -ei option. "
 252                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 253     }
 254
 255     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
 256     Update     upd(ir, deform);
 257     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 258     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 259
 260     bool simulationsShareState = false;
 261     int  nstSignalComm         = nstglobalcomm;
 262     {
 263         // TODO This implementation of ensemble orientation restraints is nasty because
 264         // a user can't just do multi-sim with single-sim orientation restraints.
 265         bool usingEnsembleRestraints =
 266                 (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
 267         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 268
 269         // Replica exchange, ensemble restraints and AWH need all
 270         // simulations to remain synchronized, so they need
 271         // checkpoints and stop conditions to act on the same step, so
 272         // the propagation of such signals must take place between
 273         // simulations, not just within simulations.
 274         // TODO: Make algorithm initializers set these flags.
 275         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 276
 277         if (simulationsShareState)
 278         {
 279             // Inter-simulation signal communication does not need to happen
 280             // often, so we use a minimum of 200 steps to reduce overhead.
 281             const int c_minimumInterSimulationSignallingInterval = 200;
 282             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 283                             * nstglobalcomm;
 284         }
 285     }
 286
 287     if (startingBehavior != StartingBehavior::RestartWithAppending)
 288     {
 289         pleaseCiteCouplingAlgorithms(fplog, *ir);
 290     }
 291     gmx_mdoutf* outf =
 292             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 293                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 294     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 295                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 296
 297     gstat = global_stat_init(ir);
 298
 299     /* Check for polarizable models and flexible constraints */
 300     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 301                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 302
 303     {
 304         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 305         if ((io > 2000) && MASTER(cr))
 306         {
 307             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 308         }
 309     }
 310
 311     // Local state only becomes valid now.
 312     std::unique_ptr<t_state> stateInstance;
 313     t_state*                 state;
 314
 315
 316     gmx_localtop_t top(top_global->ffparams);
 317
 318     auto mdatoms = mdAtoms->mdatoms();
 319
 320     std::unique_ptr<UpdateConstrainGpu> integrator;
 321
 322     if (DOMAINDECOMP(cr))
 323     {
 324         stateInstance = std::make_unique<t_state>();
 325         state         = stateInstance.get();
 326         dd_init_local_state(cr->dd, state_global, state);
 327
 328         /* Distribute the charge groups over the nodes from the master node */
 329         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 330                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 331                             nrnb, nullptr, FALSE);
 332         shouldCheckNumberOfBondedInteractions = true;
 333         upd.setNumAtoms(state->natoms);
 334     }
 335     else
 336     {
 337         state_change_natoms(state_global, state_global->natoms);
 338         f.resizeWithPadding(state_global->natoms);
 339         /* Copy the pointer to the global state */
 340         state = state_global;
 341
 342         /* Generate and initialize new topology */
 343         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, mdAtoms, constr, vsite, shellfc);
 344
 345         upd.setNumAtoms(state->natoms);
 346     }
 347
 348     const auto& simulationWork     = runScheduleWork->simulationWork;
 349     const bool  useGpuForPme       = simulationWork.useGpuPme;
 350     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 351     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 352     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 353
 354     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 355
 356     if (useGpuForUpdate)
 357     {
 358         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 359                                    || constr->numConstraintsTotal() == 0,
 360                            "Constraints in domain decomposition are only supported with update "
 361                            "groups if using GPU update.\n");
 362         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 363                                    || constr->numConstraintsTotal() == 0,
 364                            "SHAKE is not supported with GPU update.");
 365         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 366                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 367                            "the GPU to use GPU update.\n");
 368         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 369                            "Only the md integrator is supported with the GPU update.\n");
 370         GMX_RELEASE_ASSERT(
 371                 ir->etc != etcNOSEHOOVER,
 372                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 373         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
 374                            "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
 375                            "with the GPU update.\n");
 376         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 377                            "Virtual sites are not supported with the GPU update.\n");
 378         GMX_RELEASE_ASSERT(ed == nullptr,
 379                            "Essential dynamics is not supported with the GPU update.\n");
 380         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 381                            "Constraints pulling is not supported with the GPU update.\n");
 382         GMX_RELEASE_ASSERT(fcd->orires.nr == 0,
 383                            "Orientation restraints are not supported with the GPU update.\n");
 384         GMX_RELEASE_ASSERT(
 385                 ir->efep == efepNO
 386                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 387                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 388                 "update.");
 389
 390         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 391         {
 392             GMX_LOG(mdlog.info)
 393                     .asParagraph()
 394                     .appendText("Updating coordinates and applying constraints on the GPU.");
 395         }
 396         else
 397         {
 398             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 399         }
 400
 401         GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
 402                            "GPU device context should be initialized to use GPU update.");
 403         GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
 404                            "Update stream can not be nullptr when update is on a GPU.");
 405         integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
 406                                                           *stateGpu->getUpdateStream(),
 407                                                           stateGpu->xUpdatedOnDevice());
 408
 409         integrator->setPbc(PbcType::Xyz, state->box);
 410     }
 411
 412     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 413     {
 414         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 415     }
 416     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 417     {
 418         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
 419     }
 420     if (useGpuForUpdate)
 421     {
 422         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 423     }
 424
 425     // NOTE: The global state is no longer used at this point.
 426     // But state_global is still used as temporary storage space for writing
 427     // the global state to file and potentially for replica exchange.
 428     // (Global topology should persist.)
 429
 430     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 431
 432     if (ir->bExpanded)
 433     {
 434         /* Check nstexpanded here, because the grompp check was broken */
 435         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 436         {
 437             gmx_fatal(FARGS,
 438                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 439         }
 440         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 441     }
 442
 443     if (MASTER(cr))
 444     {
 445         EnergyElement::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 446     }
 447
 448     preparePrevStepPullCom(ir, pull_work, mdatoms, state, state_global, cr,
 449                            startingBehavior != StartingBehavior::NewSimulation);
 450
 451     // TODO: Remove this by converting AWH into a ForceProvider
 452     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 453                                 startingBehavior != StartingBehavior::NewSimulation,
 454                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 455
 456     if (useReplicaExchange && MASTER(cr))
 457     {
 458         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 459     }
 460     /* PME tuning is only supported in the Verlet scheme, with PME for
 461      * Coulomb. It is not supported with only LJ PME. */
 462     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 463                 && ir->cutoff_scheme != ecutsGROUP);
 464
 465     pme_load_balancing_t* pme_loadbal = nullptr;
 466     if (bPMETune)
 467     {
 468         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 469                          fr->nbv->useGpu());
 470     }
 471
 472     if (!ir->bContinuation)
 473     {
 474         if (state->flags & (1U << estV))
 475         {
 476             auto v = makeArrayRef(state->v);
 477             /* Set the velocities of vsites, shells and frozen atoms to zero */
 478             for (i = 0; i < mdatoms->homenr; i++)
 479             {
 480                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 481                 {
 482                     clear_rvec(v[i]);
 483                 }
 484                 else if (mdatoms->cFREEZE)
 485                 {
 486                     for (m = 0; m < DIM; m++)
 487                     {
 488                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 489                         {
 490                             v[i][m] = 0;
 491                         }
 492                     }
 493                 }
 494             }
 495         }
 496
 497         if (constr)
 498         {
 499             /* Constrain the initial coordinates and velocities */
 500             do_constrain_first(fplog, constr, ir, mdatoms, state->natoms, state->x.arrayRefWithPadding(),
 501                                state->v.arrayRefWithPadding(), state->box, state->lambda[efptBONDED]);
 502         }
 503         if (vsite)
 504         {
 505             /* Construct the virtual sites for the initial configuration */
 506             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr, top.idef.iparams,
 507                              top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
 508         }
 509     }
 510
 511     if (ir->efep != efepNO)
 512     {
 513         /* Set free energy calculation frequency as the greatest common
 514          * denominator of nstdhdl and repl_ex_nst. */
 515         nstfep = ir->fepvals->nstdhdl;
 516         if (ir->bExpanded)
 517         {
 518             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
 519         }
 520         if (useReplicaExchange)
 521         {
 522             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
 523         }
 524     }
 525
 526     /* Be REALLY careful about what flags you set here. You CANNOT assume
 527      * this is the first step, since we might be restarting from a checkpoint,
 528      * and in that case we should not do any modifications to the state.
 529      */
 530     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 531
 532     // When restarting from a checkpoint, it can be appropriate to
 533     // initialize ekind from quantities in the checkpoint. Otherwise,
 534     // compute_globals must initialize ekind before the simulation
 535     // starts/restarts. However, only the master rank knows what was
 536     // found in the checkpoint file, so we have to communicate in
 537     // order to coordinate the restart.
 538     //
 539     // TODO Consider removing this communication if/when checkpoint
 540     // reading directly follows .tpr reading, because all ranks can
 541     // agree on hasReadEkinState at that time.
 542     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 543     if (PAR(cr))
 544     {
 545         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
 546     }
 547     if (hasReadEkinState)
 548     {
 549         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 550     }
 551
 552     unsigned int cglo_flags =
 553             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 554              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 555
 556     bSumEkinhOld = FALSE;
 557
 558     t_vcm vcm(top_global->groups, *ir);
 559     reportComRemovalInfo(fplog, vcm);
 560
 561     /* To minimize communication, compute_globals computes the COM velocity
 562      * and the kinetic energy for the velocities without COM motion removed.
 563      * Thus to get the kinetic energy without the COM contribution, we need
 564      * to call compute_globals twice.
 565      */
 566     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 567     {
 568         unsigned int cglo_flags_iteration = cglo_flags;
 569         if (bStopCM && cgloIteration == 0)
 570         {
 571             cglo_flags_iteration |= CGLO_STOPCM;
 572             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 573         }
 574         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 575                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 576                         nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 577                         &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 578                         cglo_flags_iteration
 579                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 580                                                                          : 0));
 581         if (cglo_flags_iteration & CGLO_STOPCM)
 582         {
 583             /* At initialization, do not pass x with acceleration-correction mode
 584              * to avoid (incorrect) correction of the initial coordinates.
 585              */
 586             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 587                                                                      : makeArrayRef(state->x);
 588             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 589             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 590         }
 591     }
 592     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 593                                     makeConstArrayRef(state->x), state->box,
 594                                     &shouldCheckNumberOfBondedInteractions);
 595     if (ir->eI == eiVVAK)
 596     {
 597         /* a second call to get the half step temperature initialized as well */
 598         /* we do the same call as above, but turn the pressure off -- internally to
 599            compute_globals, this is recognized as a velocity verlet half-step
 600            kinetic energy calculation.  This minimized excess variables, but
 601            perhaps loses some logic?*/
 602
 603         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 604                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 605                         nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 606                         &nullSignaller, state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 607     }
 608
 609     /* Calculate the initial half step temperature, and save the ekinh_old */
 610     if (startingBehavior == StartingBehavior::NewSimulation)
 611     {
 612         for (i = 0; (i < ir->opts.ngtc); i++)
 613         {
 614             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 615         }
 616     }
 617
 618     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 619        for non-trotter temperature control */
 620     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 621
 622     if (MASTER(cr))
 623     {
 624         if (!ir->bContinuation)
 625         {
 626             if (constr && ir->eConstrAlg == econtLINCS)
 627             {
 628                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 629                         constr->rmsd());
 630             }
 631             if (EI_STATE_VELOCITY(ir->eI))
 632             {
 633                 real temp = enerd->term[F_TEMP];
 634                 if (ir->eI != eiVV)
 635                 {
 636                     /* Result of Ekin averaged over velocities of -half
 637                      * and +half step, while we only have -half step here.
 638                      */
 639                     temp *= 2;
 640                 }
 641                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 642             }
 643         }
 644
 645         char tbuf[20];
 646         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 647         if (ir->nsteps >= 0)
 648         {
 649             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 650         }
 651         else
 652         {
 653             sprintf(tbuf, "%s", "infinite");
 654         }
 655         if (ir->init_step > 0)
 656         {
 657             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 658                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 659                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 660         }
 661         else
 662         {
 663             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 664         }
 665         fprintf(fplog, "\n");
 666     }
 667
 668     walltime_accounting_start_time(walltime_accounting);
 669     wallcycle_start(wcycle, ewcRUN);
 670     print_start(fplog, cr, walltime_accounting, "mdrun");
 671
 672 #if GMX_FAHCORE
 673     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 674     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 675     if (chkpt_ret == 0)
 676     {
 677         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 678     }
 679 #endif
 680
 681     /***********************************************************
 682      *
 683      *             Loop over MD steps
 684      *
 685      ************************************************************/
 686
 687     bFirstStep = TRUE;
 688     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 689     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 690     bSumEkinhOld     = FALSE;
 691     bExchanged       = FALSE;
 692     bNeedRepartition = FALSE;
 693
 694     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 695             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 696             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 697             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 698
 699     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 700             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 701             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 702             mdrunOptions.checkpointOptions.period);
 703
 704     const bool resetCountersIsLocal = true;
 705     auto       resetHandler         = std::make_unique<ResetHandler>(
 706             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 707             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 708             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 709
 710     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 711
 712     step     = ir->init_step;
 713     step_rel = 0;
 714
 715     // TODO extract this to new multi-simulation module
 716     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 717     {
 718         if (!multisim_int_all_are_equal(ms, ir->nsteps))
 719         {
 720             GMX_LOG(mdlog.warning)
 721                     .appendText(
 722                             "Note: The number of steps is not consistent across multi "
 723                             "simulations,\n"
 724                             "but we are proceeding anyway!");
 725         }
 726         if (!multisim_int_all_are_equal(ms, ir->init_step))
 727         {
 728             if (simulationsShareState)
 729             {
 730                 if (MASTER(cr))
 731                 {
 732                     gmx_fatal(FARGS,
 733                               "The initial step is not consistent across multi simulations which "
 734                               "share the state");
 735                 }
 736                 gmx_barrier(cr);
 737             }
 738             else
 739             {
 740                 GMX_LOG(mdlog.warning)
 741                         .appendText(
 742                                 "Note: The initial step is not consistent across multi "
 743                                 "simulations,\n"
 744                                 "but we are proceeding anyway!");
 745             }
 746         }
 747     }
 748
 749     /* and stop now if we should */
 750     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 751     while (!bLastStep)
 752     {
 753
 754         /* Determine if this is a neighbor search step */
 755         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 756
 757         if (bPMETune && bNStList)
 758         {
 759             // This has to be here because PME load balancing is called so early.
 760             // TODO: Move to after all booleans are defined.
 761             if (useGpuForUpdate && !bFirstStep)
 762             {
 763                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 764                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 765             }
 766             /* PME grid + cut-off optimization with GPUs or PME nodes */
 767             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 768                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 769                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 770         }
 771
 772         wallcycle_start(wcycle, ewcSTEP);
 773
 774         bLastStep = (step_rel == ir->nsteps);
 775         t         = t0 + step * ir->delta_t;
 776
 777         // TODO Refactor this, so that nstfep does not need a default value of zero
 778         if (ir->efep != efepNO || ir->bSimTemp)
 779         {
 780             /* find and set the current lambdas */
 781             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
 782
 783             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 784             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 785             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 786                            && (step > 0) && (startingBehavior == StartingBehavior::NewSimulation));
 787         }
 788
 789         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 790                      && do_per_step(step, replExParams.exchangeInterval));
 791
 792         if (doSimulatedAnnealing)
 793         {
 794             update_annealing_target_temp(ir, t, &upd);
 795         }
 796
 797         /* Stop Center of Mass motion */
 798         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 799
 800         /* Determine whether or not to do Neighbour Searching */
 801         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 802
 803         /* Note that the stopHandler will cause termination at nstglobalcomm
 804          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 805          * nstpcouple steps, we have computed the half-step kinetic energy
 806          * of the previous step and can always output energies at the last step.
 807          */
 808         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 809
 810         /* do_log triggers energy and virial calculation. Because this leads
 811          * to different code paths, forces can be different. Thus for exact
 812          * continuation we should avoid extra log output.
 813          * Note that the || bLastStep can result in non-exact continuation
 814          * beyond the last step. But we don't consider that to be an issue.
 815          */
 816         do_log     = (do_per_step(step, ir->nstlog)
 817                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 818         do_verbose = mdrunOptions.verbose
 819                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 820
 821         if (useGpuForUpdate && !bFirstStep && bNS)
 822         {
 823             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 824             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 825             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 826             // Copy coordinate from the GPU when needed at the search step.
 827             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 828             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 829             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 830             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 831         }
 832
 833         if (bNS && !(bFirstStep && ir->bContinuation))
 834         {
 835             bMasterState = FALSE;
 836             /* Correct the new box if it is too skewed */
 837             if (inputrecDynamicBox(ir))
 838             {
 839                 if (correct_box(fplog, step, state->box))
 840                 {
 841                     bMasterState = TRUE;
 842                     // If update is offloaded, it should be informed about the box size change
 843                     if (useGpuForUpdate)
 844                     {
 845                         integrator->setPbc(PbcType::Xyz, state->box);
 846                     }
 847                 }
 848             }
 849             if (DOMAINDECOMP(cr) && bMasterState)
 850             {
 851                 dd_collect_state(cr->dd, state, state_global);
 852             }
 853
 854             if (DOMAINDECOMP(cr))
 855             {
 856                 /* Repartition the domain decomposition */
 857                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 858                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 859                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 860                 shouldCheckNumberOfBondedInteractions = true;
 861                 upd.setNumAtoms(state->natoms);
 862
 863                 // Allocate or re-size GPU halo exchange object, if necessary
 864                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 865                     && useGpuForNonbonded && is1D(*cr->dd))
 866                 {
 867                     // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
 868                     const DeviceStream* localStream =
 869                             Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
 870                     const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
 871                             fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
 872                     GMX_RELEASE_ASSERT(
 873                             fr->deviceContext != nullptr,
 874                             "GPU device context should be initialized to use GPU halo exchange.");
 875                     GMX_RELEASE_ASSERT(localStream != nullptr,
 876                                        "Local non-bonded stream can't be nullptr when using GPU "
 877                                        "halo exchange.");
 878                     GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
 879                                        "Non-local non-bonded stream can't be nullptr when using "
 880                                        "GPU halo exchange.");
 881                     constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
 882                 }
 883             }
 884         }
 885
 886         if (MASTER(cr) && do_log)
 887         {
 888             energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
 889         }
 890
 891         if (ir->efep != efepNO)
 892         {
 893             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 894         }
 895
 896         if (bExchanged)
 897         {
 898
 899             /* We need the kinetic energy at minus the half step for determining
 900              * the full step kinetic energy and possibly for T-coupling.*/
 901             /* This may not be quite working correctly yet . . . . */
 902             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 903                             makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 904                             nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr, nullptr, constr,
 905                             &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 906                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 907             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 908                                             &top, makeConstArrayRef(state->x), state->box,
 909                                             &shouldCheckNumberOfBondedInteractions);
 910         }
 911         clear_mat(force_vir);
 912
 913         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 914
 915         /* Determine the energy and pressure:
 916          * at nstcalcenergy steps and at energy output steps (set below).
 917          */
 918         if (EI_VV(ir->eI) && (!bInitStep))
 919         {
 920             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 921             bCalcVir      = bCalcEnerStep
 922                        || (ir->epc != epcNO
 923                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 924         }
 925         else
 926         {
 927             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 928             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 929         }
 930         bCalcEner = bCalcEnerStep;
 931
 932         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 933
 934         if (do_ene || do_log || bDoReplEx)
 935         {
 936             bCalcVir  = TRUE;
 937             bCalcEner = TRUE;
 938         }
 939
 940         /* Do we need global communication ? */
 941         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 942                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 943
 944         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 945                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 946                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 947
 948         if (shellfc)
 949         {
 950             /* Now is the time to relax the shells */
 951             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 952                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd, fcd,
 953                                 state->natoms, state->x.arrayRefWithPadding(),
 954                                 state->v.arrayRefWithPadding(), state->box, state->lambda, &state->hist,
 955                                 f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 956                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 957         }
 958         else
 959         {
 960             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 961                is updated (or the AWH update will be performed twice for one step when continuing).
 962                It would be best to call this update function from do_md_trajectory_writing but that
 963                would occur after do_force. One would have to divide the update_awh function into one
 964                function applying the AWH force and one doing the AWH bias update. The update AWH
 965                bias function could then be called after do_md_trajectory_writing (then containing
 966                update_awh_history). The checkpointing will in the future probably moved to the start
 967                of the md loop which will rid of this issue. */
 968             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 969             {
 970                 awh->updateHistory(state_global->awhHistory.get());
 971             }
 972
 973             /* The coordinates (x) are shifted (to get whole molecules)
 974              * in do_force.
 975              * This is parallellized as well, and does communication too.
 976              * Check comments in sim_util.c
 977              */
 978             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 979                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 980                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, state->lambda, fr,
 981                      runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 982                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 983         }
 984
 985         // VV integrators do not need the following velocity half step
 986         // if it is the first step after starting from a checkpoint.
 987         // That is, the half step is needed on all other steps, and
 988         // also the first step when starting from a .tpr file.
 989         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 990         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 991         {
 992             rvec* vbuf = nullptr;
 993
 994             wallcycle_start(wcycle, ewcUPDATE);
 995             if (ir->eI == eiVV && bInitStep)
 996             {
 997                 /* if using velocity verlet with full time step Ekin,
 998                  * take the first half step only to compute the
 999                  * virial for the first step. From there,
1000                  * revert back to the initial coordinates
1001                  * so that the input is actually the initial step.
1002                  */
1003                 snew(vbuf, state->natoms);
1004                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
1005                            state->natoms); /* should make this better for parallelizing? */
1006             }
1007             else
1008             {
1009                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1010                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1011                                trotter_seq, ettTSEQ1);
1012             }
1013
1014             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1015                           etrtVELOCITY1, cr, constr);
1016
1017             wallcycle_stop(wcycle, ewcUPDATE);
1018             constrain_velocities(step, nullptr, state, shake_vir, constr, bCalcVir, do_log, do_ene);
1019             wallcycle_start(wcycle, ewcUPDATE);
1020             /* if VV, compute the pressure and constraints */
1021             /* For VV2, we strictly only need this if using pressure
1022              * control, but we really would like to have accurate pressures
1023              * printed out.
1024              * Think about ways around this in the future?
1025              * For now, keep this choice in comments.
1026              */
1027             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1028             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1029             bPres = TRUE;
1030             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1031             if (bCalcEner && ir->eI == eiVVAK)
1032             {
1033                 bSumEkinhOld = TRUE;
1034             }
1035             /* for vv, the first half of the integration actually corresponds to the previous step.
1036                So we need information from the last step in the first half of the integration */
1037             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1038             {
1039                 wallcycle_stop(wcycle, ewcUPDATE);
1040                 compute_globals(
1041                         gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1042                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
1043                         nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1044                         &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1045                         (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1046                                 | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1047                                 | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1048                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1049                                                                          : 0)
1050                                 | CGLO_SCALEEKIN);
1051                 /* explanation of above:
1052                    a) We compute Ekin at the full time step
1053                    if 1) we are using the AveVel Ekin, and it's not the
1054                    initial step, or 2) if we are using AveEkin, but need the full
1055                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1056                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1057                    EkinAveVel because it's needed for the pressure */
1058                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1059                                                 top_global, &top, makeConstArrayRef(state->x),
1060                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1061                 if (bStopCM)
1062                 {
1063                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1064                                            makeArrayRef(state->v));
1065                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1066                 }
1067                 wallcycle_start(wcycle, ewcUPDATE);
1068             }
1069             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1070             if (!bInitStep)
1071             {
1072                 if (bTrotter)
1073                 {
1074                     m_add(force_vir, shake_vir,
1075                           total_vir); /* we need the un-dispersion corrected total vir here */
1076                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1077                                    trotter_seq, ettTSEQ2);
1078
1079                     /* TODO This is only needed when we're about to write
1080                      * a checkpoint, because we use it after the restart
1081                      * (in a kludge?). But what should we be doing if
1082                      * the startingBehavior is NewSimulation or bInitStep are true? */
1083                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1084                     {
1085                         copy_mat(shake_vir, state->svir_prev);
1086                         copy_mat(force_vir, state->fvir_prev);
1087                     }
1088                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1089                     {
1090                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1091                         enerd->term[F_TEMP] =
1092                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1093                         enerd->term[F_EKIN] = trace(ekind->ekin);
1094                     }
1095                 }
1096                 else if (bExchanged)
1097                 {
1098                     wallcycle_stop(wcycle, ewcUPDATE);
1099                     /* We need the kinetic energy at minus the half step for determining
1100                      * the full step kinetic energy and possibly for T-coupling.*/
1101                     /* This may not be quite working correctly yet . . . . */
1102                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1103                                     makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
1104                                     mdatoms, nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr,
1105                                     nullptr, constr, &nullSignaller, state->box, nullptr,
1106                                     &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1107                     wallcycle_start(wcycle, ewcUPDATE);
1108                 }
1109             }
1110             /* if it's the initial step, we performed this first step just to get the constraint virial */
1111             if (ir->eI == eiVV && bInitStep)
1112             {
1113                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1114                 sfree(vbuf);
1115             }
1116             wallcycle_stop(wcycle, ewcUPDATE);
1117         }
1118
1119         /* compute the conserved quantity */
1120         if (EI_VV(ir->eI))
1121         {
1122             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1123             if (ir->eI == eiVV)
1124             {
1125                 last_ekin = enerd->term[F_EKIN];
1126             }
1127             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1128             {
1129                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1130             }
1131             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1132             if (ir->efep != efepNO)
1133             {
1134                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1135             }
1136         }
1137
1138         /* ########  END FIRST UPDATE STEP  ############## */
1139         /* ########  If doing VV, we now have v(dt) ###### */
1140         if (bDoExpanded)
1141         {
1142             /* perform extended ensemble sampling in lambda - we don't
1143                actually move to the new state before outputting
1144                statistics, but if performing simulated tempering, we
1145                do update the velocities and the tau_t. */
1146
1147             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1148                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1149             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1150             if (MASTER(cr))
1151             {
1152                 copy_df_history(state_global->dfhist, state->dfhist);
1153             }
1154         }
1155
1156         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1157         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1158         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1159             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1160                 || checkpointHandler->isCheckpointingStep()))
1161         {
1162             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1163             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1164         }
1165         // Copy velocities if needed for the output/checkpointing.
1166         // NOTE: Copy on the search steps is done at the beginning of the step.
1167         if (useGpuForUpdate && !bNS
1168             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1169         {
1170             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1171             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1172         }
1173         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1174         // and update is offloaded hence forces are kept on the GPU for update and have not been
1175         // already transferred in do_force().
1176         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1177         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1178         //       prior to GPU update.
1179         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1180         //       copy call in do_force(...).
1181         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1182         //       on host after the D2H copy in do_force(...).
1183         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1184             && do_per_step(step, ir->nstfout))
1185         {
1186             stateGpu->copyForcesFromGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1187             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1188         }
1189         /* Now we have the energies and forces corresponding to the
1190          * coordinates at time t. We must output all of this before
1191          * the update.
1192          */
1193         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1194                                  observablesHistory, top_global, fr, outf, energyOutput, ekind, f,
1195                                  checkpointHandler->isCheckpointingStep(), bRerunMD, bLastStep,
1196                                  mdrunOptions.writeConfout, bSumEkinhOld);
1197         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1198         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1199
1200         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1201         if (startingBehavior != StartingBehavior::NewSimulation
1202             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1203         {
1204             copy_mat(state->svir_prev, shake_vir);
1205             copy_mat(state->fvir_prev, force_vir);
1206         }
1207
1208         stopHandler->setSignal();
1209         resetHandler->setSignal(walltime_accounting);
1210
1211         if (bGStat || !PAR(cr))
1212         {
1213             /* In parallel we only have to check for checkpointing in steps
1214              * where we do global communication,
1215              *  otherwise the other nodes don't know.
1216              */
1217             checkpointHandler->setSignal(walltime_accounting);
1218         }
1219
1220         /* #########   START SECOND UPDATE STEP ################# */
1221
1222         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1223            controlled in preprocessing */
1224
1225         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1226         {
1227             gmx_bool bIfRandomize;
1228             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1229             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1230             if (constr && bIfRandomize)
1231             {
1232                 constrain_velocities(step, nullptr, state, tmp_vir, constr, bCalcVir, do_log, do_ene);
1233             }
1234         }
1235         /* Box is changed in update() when we do pressure coupling,
1236          * but we should still use the old box for energy corrections and when
1237          * writing it to the energy file, so it matches the trajectory files for
1238          * the same timestep above. Make a copy in a separate array.
1239          */
1240         copy_mat(state->box, lastbox);
1241
1242         dvdl_constr = 0;
1243
1244         wallcycle_start(wcycle, ewcUPDATE);
1245         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1246         if (bTrotter)
1247         {
1248             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1249             /* We can only do Berendsen coupling after we have summed
1250              * the kinetic energy or virial. Since the happens
1251              * in global_state after update, we should only do it at
1252              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1253              */
1254         }
1255         else
1256         {
1257             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1258             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1259         }
1260
1261         if (EI_VV(ir->eI))
1262         {
1263             /* velocity half-step update */
1264             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1265                           etrtVELOCITY2, cr, constr);
1266         }
1267
1268         /* Above, initialize just copies ekinh into ekin,
1269          * it doesn't copy position (for VV),
1270          * and entire integrator for MD.
1271          */
1272
1273         if (ir->eI == eiVVAK)
1274         {
1275             cbuf.resize(state->x.size());
1276             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1277         }
1278
1279         /* With leap-frog type integrators we compute the kinetic energy
1280          * at a whole time step as the average of the half-time step kinetic
1281          * energies of two subsequent steps. Therefore we need to compute the
1282          * half step kinetic energy also if we need energies at the next step.
1283          */
1284         const bool needHalfStepKineticEnergy =
1285                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1286
1287         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1288         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1289         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1290                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1291
1292         if (useGpuForUpdate)
1293         {
1294             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1295             {
1296                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1297                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1298
1299                 // Copy data to the GPU after buffers might have being reinitialized
1300                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1301                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1302             }
1303
1304             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1305             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1306             {
1307                 stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1308             }
1309
1310             const bool doTemperatureScaling =
1311                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1312
1313             // This applies Leap-Frog, LINCS and SETTLE in succession
1314             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1315                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1316                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1317                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1318
1319             // Copy velocities D2H after update if:
1320             // - Globals are computed this step (includes the energy output steps).
1321             // - Temperature is needed for the next step.
1322             if (bGStat || needHalfStepKineticEnergy)
1323             {
1324                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1325                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1326             }
1327         }
1328         else
1329         {
1330             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1331                           etrtPOSITION, cr, constr);
1332
1333             wallcycle_stop(wcycle, ewcUPDATE);
1334
1335             constrain_coordinates(step, &dvdl_constr, state, shake_vir, &upd, constr, bCalcVir,
1336                                   do_log, do_ene);
1337
1338             update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state, cr, nrnb, wcycle, &upd,
1339                                   constr, do_log, do_ene);
1340             finish_update(ir, mdatoms, state, wcycle, &upd, constr);
1341         }
1342
1343         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1344         {
1345             updatePrevStepPullCom(pull_work, state);
1346         }
1347
1348         if (ir->eI == eiVVAK)
1349         {
1350             /* erase F_EKIN and F_TEMP here? */
1351             /* just compute the kinetic energy at the half step to perform a trotter step */
1352             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1353                             makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
1354                             mdatoms, nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir,
1355                             pres, constr, &nullSignaller, lastbox, nullptr, &bSumEkinhOld,
1356                             (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1357             wallcycle_start(wcycle, ewcUPDATE);
1358             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1359             /* now we know the scaling, we can compute the positions again */
1360             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1361
1362             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1363                           etrtPOSITION, cr, constr);
1364             wallcycle_stop(wcycle, ewcUPDATE);
1365
1366             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1367             /* are the small terms in the shake_vir here due
1368              * to numerical errors, or are they important
1369              * physically? I'm thinking they are just errors, but not completely sure.
1370              * For now, will call without actually constraining, constr=NULL*/
1371             finish_update(ir, mdatoms, state, wcycle, &upd, nullptr);
1372         }
1373         if (EI_VV(ir->eI))
1374         {
1375             /* this factor or 2 correction is necessary
1376                because half of the constraint force is removed
1377                in the vv step, so we have to double it.  See
1378                the Redmine issue #1255.  It is not yet clear
1379                if the factor of 2 is exact, or just a very
1380                good approximation, and this will be
1381                investigated.  The next step is to see if this
1382                can be done adding a dhdl contribution from the
1383                rattle step, but this is somewhat more
1384                complicated with the current code. Will be
1385                investigated, hopefully for 4.6.3. However,
1386                this current solution is much better than
1387                having it completely wrong.
1388              */
1389             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1390         }
1391         else
1392         {
1393             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1394         }
1395
1396         if (vsite != nullptr)
1397         {
1398             wallcycle_start(wcycle, ewcVSITECONSTR);
1399             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
1400                              top.idef.iparams, top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
1401             wallcycle_stop(wcycle, ewcVSITECONSTR);
1402         }
1403
1404         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1405         /* With Leap-Frog we can skip compute_globals at
1406          * non-communication steps, but we need to calculate
1407          * the kinetic energy one step before communication.
1408          */
1409         {
1410             // Organize to do inter-simulation signalling on steps if
1411             // and when algorithms require it.
1412             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1413
1414             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1415             {
1416                 // Copy coordinates when needed to stop the CM motion.
1417                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1418                 {
1419                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1420                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1421                 }
1422                 // Since we're already communicating at this step, we
1423                 // can propagate intra-simulation signals. Note that
1424                 // check_nstglobalcomm has the responsibility for
1425                 // choosing the value of nstglobalcomm that is one way
1426                 // bGStat becomes true, so we can't get into a
1427                 // situation where e.g. checkpointing can't be
1428                 // signalled.
1429                 bool                doIntraSimSignal = true;
1430                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1431
1432                 compute_globals(
1433                         gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1434                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
1435                         nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1436                         &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1437                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1438                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1439                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1440                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1441                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1442                                                                          : 0));
1443                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1444                                                 top_global, &top, makeConstArrayRef(state->x),
1445                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1446                 if (!EI_VV(ir->eI) && bStopCM)
1447                 {
1448                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1449                                            makeArrayRef(state->v));
1450                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1451
1452                     // TODO: The special case of removing CM motion should be dealt more gracefully
1453                     if (useGpuForUpdate)
1454                     {
1455                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1456                         // Here we block until the H2D copy completes because event sync with the
1457                         // force kernels that use the coordinates on the next steps is not implemented
1458                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1459                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1460                     }
1461                 }
1462             }
1463         }
1464
1465         /* #############  END CALC EKIN AND PRESSURE ################# */
1466
1467         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1468            the virial that should probably be addressed eventually. state->veta has better properies,
1469            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1470            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1471
1472         if (ir->efep != efepNO && !EI_VV(ir->eI))
1473         {
1474             /* Sum up the foreign energy and dhdl terms for md and sd.
1475                Currently done every step so that dhdl is correct in the .edr */
1476             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1477         }
1478
1479         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1480                                          pressureCouplingMu, state, nrnb, &upd, !useGpuForUpdate);
1481
1482         const bool doBerendsenPressureCoupling =
1483                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1484         if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
1485         {
1486             integrator->scaleCoordinates(pressureCouplingMu);
1487             integrator->setPbc(PbcType::Xyz, state->box);
1488         }
1489
1490         /* ################# END UPDATE STEP 2 ################# */
1491         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1492
1493         /* The coordinates (x) were unshifted in update */
1494         if (!bGStat)
1495         {
1496             /* We will not sum ekinh_old,
1497              * so signal that we still have to do it.
1498              */
1499             bSumEkinhOld = TRUE;
1500         }
1501
1502         if (bCalcEner)
1503         {
1504             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1505
1506             /* use the directly determined last velocity, not actually the averaged half steps */
1507             if (bTrotter && ir->eI == eiVV)
1508             {
1509                 enerd->term[F_EKIN] = last_ekin;
1510             }
1511             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1512
1513             if (integratorHasConservedEnergyQuantity(ir))
1514             {
1515                 if (EI_VV(ir->eI))
1516                 {
1517                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1518                 }
1519                 else
1520                 {
1521                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1522                 }
1523             }
1524             /* #########  END PREPARING EDR OUTPUT  ###########  */
1525         }
1526
1527         /* Output stuff */
1528         if (MASTER(cr))
1529         {
1530             if (fplog && do_log && bDoExpanded)
1531             {
1532                 /* only needed if doing expanded ensemble */
1533                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1534                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1535                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1536             }
1537             if (bCalcEner)
1538             {
1539                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, state,
1540                                                  ir->fepvals, ir->expandedvals, lastbox, shake_vir,
1541                                                  force_vir, total_vir, pres, ekind, mu_tot, constr);
1542             }
1543             else
1544             {
1545                 energyOutput.recordNonEnergyStep();
1546             }
1547
1548             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1549             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1550
1551             if (doSimulatedAnnealing)
1552             {
1553                 energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
1554             }
1555             if (do_log || do_ene || do_dr || do_or)
1556             {
1557                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1558                                                    do_log ? fplog : nullptr, step, t, fcd, awh.get());
1559             }
1560
1561             if (ir->bPull)
1562             {
1563                 pull_print_output(pull_work, step, t);
1564             }
1565
1566             if (do_per_step(step, ir->nstlog))
1567             {
1568                 if (fflush(fplog) != 0)
1569                 {
1570                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1571                 }
1572             }
1573         }
1574         if (bDoExpanded)
1575         {
1576             /* Have to do this part _after_ outputting the logfile and the edr file */
1577             /* Gets written into the state at the beginning of next loop*/
1578             state->fep_state = lamnew;
1579         }
1580         /* Print the remaining wall clock time for the run */
1581         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1582         {
1583             if (shellfc)
1584             {
1585                 fprintf(stderr, "\n");
1586             }
1587             print_time(stderr, walltime_accounting, step, ir, cr);
1588         }
1589
1590         /* Ion/water position swapping.
1591          * Not done in last step since trajectory writing happens before this call
1592          * in the MD loop and exchanges would be lost anyway. */
1593         bNeedRepartition = FALSE;
1594         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1595         {
1596             bNeedRepartition =
1597                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1598                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1599
1600             if (bNeedRepartition && DOMAINDECOMP(cr))
1601             {
1602                 dd_collect_state(cr->dd, state, state_global);
1603             }
1604         }
1605
1606         /* Replica exchange */
1607         bExchanged = FALSE;
1608         if (bDoReplEx)
1609         {
1610             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1611         }
1612
1613         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1614         {
1615             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1616                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1617                                 nrnb, wcycle, FALSE);
1618             shouldCheckNumberOfBondedInteractions = true;
1619             upd.setNumAtoms(state->natoms);
1620         }
1621
1622         bFirstStep = FALSE;
1623         bInitStep  = FALSE;
1624
1625         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1626         /* With all integrators, except VV, we need to retain the pressure
1627          * at the current step for coupling at the next step.
1628          */
1629         if ((state->flags & (1U << estPRES_PREV))
1630             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1631         {
1632             /* Store the pressure in t_state for pressure coupling
1633              * at the next MD step.
1634              */
1635             copy_mat(pres, state->pres_prev);
1636         }
1637
1638         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1639
1640         if ((membed != nullptr) && (!bLastStep))
1641         {
1642             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1643         }
1644
1645         cycles = wallcycle_stop(wcycle, ewcSTEP);
1646         if (DOMAINDECOMP(cr) && wcycle)
1647         {
1648             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1649         }
1650
1651         /* increase the MD step number */
1652         step++;
1653         step_rel++;
1654
1655         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1656                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1657
1658         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1659         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1660     }
1661     /* End of main MD loop */
1662
1663     /* Closing TNG files can include compressing data. Therefore it is good to do that
1664      * before stopping the time measurements. */
1665     mdoutf_tng_close(outf);
1666
1667     /* Stop measuring walltime */
1668     walltime_accounting_end_time(walltime_accounting);
1669
1670     if (!thisRankHasDuty(cr, DUTY_PME))
1671     {
1672         /* Tell the PME only node to finish */
1673         gmx_pme_send_finish(cr);
1674     }
1675
1676     if (MASTER(cr))
1677     {
1678         if (ir->nstcalcenergy > 0)
1679         {
1680             energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
1681             energyOutput.printAverages(fplog, groups);
1682         }
1683     }
1684     done_mdoutf(outf);
1685
1686     if (bPMETune)
1687     {
1688         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1689     }
1690
1691     done_shellfc(fplog, shellfc, step_rel);
1692
1693     if (useReplicaExchange && MASTER(cr))
1694     {
1695         print_replica_exchange_statistics(fplog, repl_ex);
1696     }
1697
1698     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1699
1700     global_stat_destroy(gstat);
1701 }