src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53
  54 #include "gromacs/awh/awh.h"
  55 #include "gromacs/commandline/filenm.h"
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/domdec_struct.h"
  61 #include "gromacs/domdec/gpuhaloexchange.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/domdec/partition.h"
  64 #include "gromacs/essentialdynamics/edsam.h"
  65 #include "gromacs/ewald/pme_load_balancing.h"
  66 #include "gromacs/ewald/pme_pp.h"
  67 #include "gromacs/fileio/trxio.h"
  68 #include "gromacs/gmxlib/network.h"
  69 #include "gromacs/gmxlib/nrnb.h"
  70 #include "gromacs/gpu_utils/device_stream_manager.h"
  71 #include "gromacs/gpu_utils/gpu_utils.h"
  72 #include "gromacs/imd/imd.h"
  73 #include "gromacs/listed_forces/manage_threading.h"
  74 #include "gromacs/math/functions.h"
  75 #include "gromacs/math/utilities.h"
  76 #include "gromacs/math/vec.h"
  77 #include "gromacs/math/vectypes.h"
  78 #include "gromacs/mdlib/checkpointhandler.h"
  79 #include "gromacs/mdlib/compute_io.h"
  80 #include "gromacs/mdlib/constr.h"
  81 #include "gromacs/mdlib/ebin.h"
  82 #include "gromacs/mdlib/enerdata_utils.h"
  83 #include "gromacs/mdlib/energyoutput.h"
  84 #include "gromacs/mdlib/expanded.h"
  85 #include "gromacs/mdlib/force.h"
  86 #include "gromacs/mdlib/force_flags.h"
  87 #include "gromacs/mdlib/forcerec.h"
  88 #include "gromacs/mdlib/md_support.h"
  89 #include "gromacs/mdlib/mdatoms.h"
  90 #include "gromacs/mdlib/mdoutf.h"
  91 #include "gromacs/mdlib/membed.h"
  92 #include "gromacs/mdlib/resethandler.h"
  93 #include "gromacs/mdlib/sighandler.h"
  94 #include "gromacs/mdlib/simulationsignal.h"
  95 #include "gromacs/mdlib/stat.h"
  96 #include "gromacs/mdlib/stophandler.h"
  97 #include "gromacs/mdlib/tgroup.h"
  98 #include "gromacs/mdlib/trajectory_writing.h"
  99 #include "gromacs/mdlib/update.h"
 100 #include "gromacs/mdlib/update_constrain_gpu.h"
 101 #include "gromacs/mdlib/vcm.h"
 102 #include "gromacs/mdlib/vsite.h"
 103 #include "gromacs/mdrunutility/handlerestart.h"
 104 #include "gromacs/mdrunutility/multisim.h"
 105 #include "gromacs/mdrunutility/printtime.h"
 106 #include "gromacs/mdtypes/awh_history.h"
 107 #include "gromacs/mdtypes/awh_params.h"
 108 #include "gromacs/mdtypes/commrec.h"
 109 #include "gromacs/mdtypes/df_history.h"
 110 #include "gromacs/mdtypes/energyhistory.h"
 111 #include "gromacs/mdtypes/fcdata.h"
 112 #include "gromacs/mdtypes/forcerec.h"
 113 #include "gromacs/mdtypes/group.h"
 114 #include "gromacs/mdtypes/inputrec.h"
 115 #include "gromacs/mdtypes/interaction_const.h"
 116 #include "gromacs/mdtypes/md_enums.h"
 117 #include "gromacs/mdtypes/mdatom.h"
 118 #include "gromacs/mdtypes/mdrunoptions.h"
 119 #include "gromacs/mdtypes/observableshistory.h"
 120 #include "gromacs/mdtypes/pullhistory.h"
 121 #include "gromacs/mdtypes/simulation_workload.h"
 122 #include "gromacs/mdtypes/state.h"
 123 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 124 #include "gromacs/modularsimulator/energyelement.h"
 125 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 126 #include "gromacs/nbnxm/nbnxm.h"
 127 #include "gromacs/pbcutil/pbc.h"
 128 #include "gromacs/pulling/output.h"
 129 #include "gromacs/pulling/pull.h"
 130 #include "gromacs/swap/swapcoords.h"
 131 #include "gromacs/timing/wallcycle.h"
 132 #include "gromacs/timing/walltime_accounting.h"
 133 #include "gromacs/topology/atoms.h"
 134 #include "gromacs/topology/idef.h"
 135 #include "gromacs/topology/mtop_util.h"
 136 #include "gromacs/topology/topology.h"
 137 #include "gromacs/trajectory/trajectoryframe.h"
 138 #include "gromacs/utility/basedefinitions.h"
 139 #include "gromacs/utility/cstringutil.h"
 140 #include "gromacs/utility/fatalerror.h"
 141 #include "gromacs/utility/logger.h"
 142 #include "gromacs/utility/real.h"
 143 #include "gromacs/utility/smalloc.h"
 144
 145 #include "legacysimulator.h"
 146 #include "replicaexchange.h"
 147 #include "shellfc.h"
 148
 149 #if GMX_FAHCORE
 150 #    include "corewrap.h"
 151 #endif
 152
 153 using gmx::SimulationSignaller;
 154
 155 void gmx::LegacySimulator::do_md()
 156 {
 157     // TODO Historically, the EM and MD "integrators" used different
 158     // names for the t_inputrec *parameter, but these must have the
 159     // same name, now that it's a member of a struct. We use this ir
 160     // alias to avoid a large ripple of nearly useless changes.
 161     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 162     // will go away eventually.
 163     t_inputrec*  ir = inputrec;
 164     int64_t      step, step_rel;
 165     double       t, t0 = ir->init_t, lam0[efptNR];
 166     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 167     gmx_bool     bNS, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 168     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 169     gmx_bool     do_ene, do_log, do_verbose;
 170     gmx_bool     bMasterState;
 171     unsigned int force_flags;
 172     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 173     int    i, m;
 174     rvec   mu_tot;
 175     matrix pressureCouplingMu, M;
 176     gmx_repl_ex_t               repl_ex = nullptr;
 177     PaddedHostVector<gmx::RVec> f{};
 178     gmx_global_stat_t           gstat;
 179     gmx_shellfc_t*              shellfc;
 180     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 181     gmx_bool                    bTemp, bPres, bTrotter;
 182     real                        dvdl_constr;
 183     std::vector<RVec>           cbuf;
 184     matrix                      lastbox;
 185     int                         lamnew = 0;
 186     /* for FEP */
 187     int       nstfep = 0;
 188     double    cycles;
 189     real      saved_conserved_quantity = 0;
 190     real      last_ekin                = 0;
 191     t_extmass MassQ;
 192     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 193
 194     /* PME load balancing data for GPU kernels */
 195     gmx_bool bPMETune         = FALSE;
 196     gmx_bool bPMETunePrinting = FALSE;
 197
 198     bool bInteractiveMDstep = false;
 199
 200     /* Domain decomposition could incorrectly miss a bonded
 201        interaction, but checking for that requires a global
 202        communication stage, which does not otherwise happen in DD
 203        code. So we do that alongside the first global energy reduction
 204        after a new DD is made. These variables handle whether the
 205        check happens, and the result it returns. */
 206     bool shouldCheckNumberOfBondedInteractions = false;
 207     int  totalNumberOfBondedInteractions       = -1;
 208
 209     SimulationSignals signals;
 210     // Most global communnication stages don't propagate mdrun
 211     // signals, and will use this object to achieve that.
 212     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 213
 214     if (!mdrunOptions.writeConfout)
 215     {
 216         // This is on by default, and the main known use case for
 217         // turning it off is for convenience in benchmarking, which is
 218         // something that should not show up in the general user
 219         // interface.
 220         GMX_LOG(mdlog.info)
 221                 .asParagraph()
 222                 .appendText(
 223                         "The -noconfout functionality is deprecated, and may be removed in a "
 224                         "future version.");
 225     }
 226
 227     /* md-vv uses averaged full step velocities for T-control
 228        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 229        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 230     bTrotter = (EI_VV(ir->eI)
 231                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 232
 233     const bool bRerunMD = false;
 234
 235     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 236     bGStatEveryStep   = (nstglobalcomm == 1);
 237
 238     const SimulationGroups* groups = &top_global->groups;
 239
 240     std::unique_ptr<EssentialDynamics> ed = nullptr;
 241     if (opt2bSet("-ei", nfile, fnm))
 242     {
 243         /* Initialize essential dynamics sampling */
 244         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 245                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 246     }
 247     else if (observablesHistory->edsamHistory)
 248     {
 249         gmx_fatal(FARGS,
 250                   "The checkpoint is from a run with essential dynamics sampling, "
 251                   "but the current run did not specify the -ei option. "
 252                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 253     }
 254
 255     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
 256     Update     upd(*ir, deform);
 257     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 258     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 259
 260     bool simulationsShareState = false;
 261     int  nstSignalComm         = nstglobalcomm;
 262     {
 263         // TODO This implementation of ensemble orientation restraints is nasty because
 264         // a user can't just do multi-sim with single-sim orientation restraints.
 265         bool usingEnsembleRestraints =
 266                 (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
 267         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 268
 269         // Replica exchange, ensemble restraints and AWH need all
 270         // simulations to remain synchronized, so they need
 271         // checkpoints and stop conditions to act on the same step, so
 272         // the propagation of such signals must take place between
 273         // simulations, not just within simulations.
 274         // TODO: Make algorithm initializers set these flags.
 275         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 276
 277         if (simulationsShareState)
 278         {
 279             // Inter-simulation signal communication does not need to happen
 280             // often, so we use a minimum of 200 steps to reduce overhead.
 281             const int c_minimumInterSimulationSignallingInterval = 200;
 282             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 283                             * nstglobalcomm;
 284         }
 285     }
 286
 287     if (startingBehavior != StartingBehavior::RestartWithAppending)
 288     {
 289         pleaseCiteCouplingAlgorithms(fplog, *ir);
 290     }
 291     gmx_mdoutf* outf =
 292             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 293                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 294     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 295                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 296
 297     gstat = global_stat_init(ir);
 298
 299     /* Check for polarizable models and flexible constraints */
 300     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 301                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 302
 303     {
 304         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 305         if ((io > 2000) && MASTER(cr))
 306         {
 307             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 308         }
 309     }
 310
 311     // Local state only becomes valid now.
 312     std::unique_ptr<t_state> stateInstance;
 313     t_state*                 state;
 314
 315
 316     gmx_localtop_t top(top_global->ffparams);
 317
 318     auto mdatoms = mdAtoms->mdatoms();
 319
 320     std::unique_ptr<UpdateConstrainGpu> integrator;
 321
 322     if (DOMAINDECOMP(cr))
 323     {
 324         stateInstance = std::make_unique<t_state>();
 325         state         = stateInstance.get();
 326         dd_init_local_state(cr->dd, state_global, state);
 327
 328         /* Distribute the charge groups over the nodes from the master node */
 329         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 330                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 331                             nrnb, nullptr, FALSE);
 332         shouldCheckNumberOfBondedInteractions = true;
 333         upd.setNumAtoms(state->natoms);
 334     }
 335     else
 336     {
 337         state_change_natoms(state_global, state_global->natoms);
 338         /* Copy the pointer to the global state */
 339         state = state_global;
 340
 341         /* Generate and initialize new topology */
 342         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 343
 344         upd.setNumAtoms(state->natoms);
 345     }
 346
 347     const auto& simulationWork     = runScheduleWork->simulationWork;
 348     const bool  useGpuForPme       = simulationWork.useGpuPme;
 349     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 350     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 351     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 352
 353     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 354
 355     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 356     if (useGpuForUpdate)
 357     {
 358         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 359                                    || constr->numConstraintsTotal() == 0,
 360                            "Constraints in domain decomposition are only supported with update "
 361                            "groups if using GPU update.\n");
 362         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 363                                    || constr->numConstraintsTotal() == 0,
 364                            "SHAKE is not supported with GPU update.");
 365         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 366                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 367                            "the GPU to use GPU update.\n");
 368         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 369                            "Only the md integrator is supported with the GPU update.\n");
 370         GMX_RELEASE_ASSERT(
 371                 ir->etc != etcNOSEHOOVER,
 372                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 373         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
 374                            "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
 375                            "with the GPU update.\n");
 376         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 377                            "Virtual sites are not supported with the GPU update.\n");
 378         GMX_RELEASE_ASSERT(ed == nullptr,
 379                            "Essential dynamics is not supported with the GPU update.\n");
 380         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 381                            "Constraints pulling is not supported with the GPU update.\n");
 382         GMX_RELEASE_ASSERT(fcd->orires.nr == 0,
 383                            "Orientation restraints are not supported with the GPU update.\n");
 384         GMX_RELEASE_ASSERT(
 385                 ir->efep == efepNO
 386                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 387                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 388                 "update.");
 389
 390         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 391         {
 392             GMX_LOG(mdlog.info)
 393                     .asParagraph()
 394                     .appendText("Updating coordinates and applying constraints on the GPU.");
 395         }
 396         else
 397         {
 398             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 399         }
 400         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 401                            "Device stream manager should be initialized in order to use GPU "
 402                            "update-constraints.");
 403         GMX_RELEASE_ASSERT(
 404                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 405                 "Update stream should be initialized in order to use GPU "
 406                 "update-constraints.");
 407         integrator = std::make_unique<UpdateConstrainGpu>(
 408                 *ir, *top_global, fr->deviceStreamManager->context(),
 409                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 410                 stateGpu->xUpdatedOnDevice());
 411
 412         integrator->setPbc(PbcType::Xyz, state->box);
 413     }
 414
 415     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 416     {
 417         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 418     }
 419     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 420     {
 421         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
 422     }
 423     if (useGpuForUpdate)
 424     {
 425         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 426     }
 427
 428     // NOTE: The global state is no longer used at this point.
 429     // But state_global is still used as temporary storage space for writing
 430     // the global state to file and potentially for replica exchange.
 431     // (Global topology should persist.)
 432
 433     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 434
 435     if (ir->bExpanded)
 436     {
 437         /* Check nstexpanded here, because the grompp check was broken */
 438         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 439         {
 440             gmx_fatal(FARGS,
 441                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 442         }
 443         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 444     }
 445
 446     if (MASTER(cr))
 447     {
 448         EnergyElement::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 449     }
 450
 451     preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
 452                            startingBehavior != StartingBehavior::NewSimulation);
 453
 454     // TODO: Remove this by converting AWH into a ForceProvider
 455     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 456                                 startingBehavior != StartingBehavior::NewSimulation,
 457                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 458
 459     if (useReplicaExchange && MASTER(cr))
 460     {
 461         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 462     }
 463     /* PME tuning is only supported in the Verlet scheme, with PME for
 464      * Coulomb. It is not supported with only LJ PME. */
 465     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 466                 && ir->cutoff_scheme != ecutsGROUP);
 467
 468     pme_load_balancing_t* pme_loadbal = nullptr;
 469     if (bPMETune)
 470     {
 471         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 472                          fr->nbv->useGpu());
 473     }
 474
 475     if (!ir->bContinuation)
 476     {
 477         if (state->flags & (1U << estV))
 478         {
 479             auto v = makeArrayRef(state->v);
 480             /* Set the velocities of vsites, shells and frozen atoms to zero */
 481             for (i = 0; i < mdatoms->homenr; i++)
 482             {
 483                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 484                 {
 485                     clear_rvec(v[i]);
 486                 }
 487                 else if (mdatoms->cFREEZE)
 488                 {
 489                     for (m = 0; m < DIM; m++)
 490                     {
 491                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 492                         {
 493                             v[i][m] = 0;
 494                         }
 495                     }
 496                 }
 497             }
 498         }
 499
 500         if (constr)
 501         {
 502             /* Constrain the initial coordinates and velocities */
 503             do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
 504                                state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
 505                                state->box, state->lambda[efptBONDED]);
 506         }
 507         if (vsite)
 508         {
 509             /* Construct the virtual sites for the initial configuration */
 510             vsite->construct(state->x, ir->delta_t, {}, state->box);
 511         }
 512     }
 513
 514     if (ir->efep != efepNO)
 515     {
 516         /* Set free energy calculation frequency as the greatest common
 517          * denominator of nstdhdl and repl_ex_nst. */
 518         nstfep = ir->fepvals->nstdhdl;
 519         if (ir->bExpanded)
 520         {
 521             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
 522         }
 523         if (useReplicaExchange)
 524         {
 525             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
 526         }
 527     }
 528
 529     /* Be REALLY careful about what flags you set here. You CANNOT assume
 530      * this is the first step, since we might be restarting from a checkpoint,
 531      * and in that case we should not do any modifications to the state.
 532      */
 533     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 534
 535     // When restarting from a checkpoint, it can be appropriate to
 536     // initialize ekind from quantities in the checkpoint. Otherwise,
 537     // compute_globals must initialize ekind before the simulation
 538     // starts/restarts. However, only the master rank knows what was
 539     // found in the checkpoint file, so we have to communicate in
 540     // order to coordinate the restart.
 541     //
 542     // TODO Consider removing this communication if/when checkpoint
 543     // reading directly follows .tpr reading, because all ranks can
 544     // agree on hasReadEkinState at that time.
 545     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 546     if (PAR(cr))
 547     {
 548         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 549     }
 550     if (hasReadEkinState)
 551     {
 552         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 553     }
 554
 555     unsigned int cglo_flags =
 556             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 557              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 558
 559     bSumEkinhOld = FALSE;
 560
 561     t_vcm vcm(top_global->groups, *ir);
 562     reportComRemovalInfo(fplog, vcm);
 563
 564     /* To minimize communication, compute_globals computes the COM velocity
 565      * and the kinetic energy for the velocities without COM motion removed.
 566      * Thus to get the kinetic energy without the COM contribution, we need
 567      * to call compute_globals twice.
 568      */
 569     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 570     {
 571         unsigned int cglo_flags_iteration = cglo_flags;
 572         if (bStopCM && cgloIteration == 0)
 573         {
 574             cglo_flags_iteration |= CGLO_STOPCM;
 575             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 576         }
 577         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 578                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 579                         nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 580                         &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 581                         cglo_flags_iteration
 582                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 583                                                                          : 0));
 584         if (cglo_flags_iteration & CGLO_STOPCM)
 585         {
 586             /* At initialization, do not pass x with acceleration-correction mode
 587              * to avoid (incorrect) correction of the initial coordinates.
 588              */
 589             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 590                                                                      : makeArrayRef(state->x);
 591             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 592             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 593         }
 594     }
 595     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 596                                     makeConstArrayRef(state->x), state->box,
 597                                     &shouldCheckNumberOfBondedInteractions);
 598     if (ir->eI == eiVVAK)
 599     {
 600         /* a second call to get the half step temperature initialized as well */
 601         /* we do the same call as above, but turn the pressure off -- internally to
 602            compute_globals, this is recognized as a velocity verlet half-step
 603            kinetic energy calculation.  This minimized excess variables, but
 604            perhaps loses some logic?*/
 605
 606         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 607                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 608                         nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 609                         &nullSignaller, state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 610     }
 611
 612     /* Calculate the initial half step temperature, and save the ekinh_old */
 613     if (startingBehavior == StartingBehavior::NewSimulation)
 614     {
 615         for (i = 0; (i < ir->opts.ngtc); i++)
 616         {
 617             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 618         }
 619     }
 620
 621     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 622        for non-trotter temperature control */
 623     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 624
 625     if (MASTER(cr))
 626     {
 627         if (!ir->bContinuation)
 628         {
 629             if (constr && ir->eConstrAlg == econtLINCS)
 630             {
 631                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 632                         constr->rmsd());
 633             }
 634             if (EI_STATE_VELOCITY(ir->eI))
 635             {
 636                 real temp = enerd->term[F_TEMP];
 637                 if (ir->eI != eiVV)
 638                 {
 639                     /* Result of Ekin averaged over velocities of -half
 640                      * and +half step, while we only have -half step here.
 641                      */
 642                     temp *= 2;
 643                 }
 644                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 645             }
 646         }
 647
 648         char tbuf[20];
 649         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 650         if (ir->nsteps >= 0)
 651         {
 652             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 653         }
 654         else
 655         {
 656             sprintf(tbuf, "%s", "infinite");
 657         }
 658         if (ir->init_step > 0)
 659         {
 660             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 661                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 662                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 663         }
 664         else
 665         {
 666             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 667         }
 668         fprintf(fplog, "\n");
 669     }
 670
 671     walltime_accounting_start_time(walltime_accounting);
 672     wallcycle_start(wcycle, ewcRUN);
 673     print_start(fplog, cr, walltime_accounting, "mdrun");
 674
 675 #if GMX_FAHCORE
 676     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 677     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 678     if (chkpt_ret == 0)
 679     {
 680         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 681     }
 682 #endif
 683
 684     /***********************************************************
 685      *
 686      *             Loop over MD steps
 687      *
 688      ************************************************************/
 689
 690     bFirstStep = TRUE;
 691     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 692     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 693     bSumEkinhOld     = FALSE;
 694     bExchanged       = FALSE;
 695     bNeedRepartition = FALSE;
 696
 697     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 698             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 699             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 700             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 701
 702     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 703             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 704             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 705             mdrunOptions.checkpointOptions.period);
 706
 707     const bool resetCountersIsLocal = true;
 708     auto       resetHandler         = std::make_unique<ResetHandler>(
 709             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 710             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 711             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 712
 713     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 714
 715     step     = ir->init_step;
 716     step_rel = 0;
 717
 718     // TODO extract this to new multi-simulation module
 719     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 720     {
 721         if (!multisim_int_all_are_equal(ms, ir->nsteps))
 722         {
 723             GMX_LOG(mdlog.warning)
 724                     .appendText(
 725                             "Note: The number of steps is not consistent across multi "
 726                             "simulations,\n"
 727                             "but we are proceeding anyway!");
 728         }
 729         if (!multisim_int_all_are_equal(ms, ir->init_step))
 730         {
 731             if (simulationsShareState)
 732             {
 733                 if (MASTER(cr))
 734                 {
 735                     gmx_fatal(FARGS,
 736                               "The initial step is not consistent across multi simulations which "
 737                               "share the state");
 738                 }
 739                 gmx_barrier(cr->mpi_comm_mygroup);
 740             }
 741             else
 742             {
 743                 GMX_LOG(mdlog.warning)
 744                         .appendText(
 745                                 "Note: The initial step is not consistent across multi "
 746                                 "simulations,\n"
 747                                 "but we are proceeding anyway!");
 748             }
 749         }
 750     }
 751
 752     /* and stop now if we should */
 753     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 754     while (!bLastStep)
 755     {
 756
 757         /* Determine if this is a neighbor search step */
 758         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 759
 760         if (bPMETune && bNStList)
 761         {
 762             // This has to be here because PME load balancing is called so early.
 763             // TODO: Move to after all booleans are defined.
 764             if (useGpuForUpdate && !bFirstStep)
 765             {
 766                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 767                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 768             }
 769             /* PME grid + cut-off optimization with GPUs or PME nodes */
 770             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 771                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 772                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 773         }
 774
 775         wallcycle_start(wcycle, ewcSTEP);
 776
 777         bLastStep = (step_rel == ir->nsteps);
 778         t         = t0 + step * ir->delta_t;
 779
 780         // TODO Refactor this, so that nstfep does not need a default value of zero
 781         if (ir->efep != efepNO || ir->bSimTemp)
 782         {
 783             /* find and set the current lambdas */
 784             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
 785
 786             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 787             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 788             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 789                            && (!bFirstStep));
 790         }
 791
 792         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 793                      && do_per_step(step, replExParams.exchangeInterval));
 794
 795         if (doSimulatedAnnealing)
 796         {
 797             update_annealing_target_temp(ir, t, &upd);
 798         }
 799
 800         /* Stop Center of Mass motion */
 801         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 802
 803         /* Determine whether or not to do Neighbour Searching */
 804         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 805
 806         /* Note that the stopHandler will cause termination at nstglobalcomm
 807          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 808          * nstpcouple steps, we have computed the half-step kinetic energy
 809          * of the previous step and can always output energies at the last step.
 810          */
 811         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 812
 813         /* do_log triggers energy and virial calculation. Because this leads
 814          * to different code paths, forces can be different. Thus for exact
 815          * continuation we should avoid extra log output.
 816          * Note that the || bLastStep can result in non-exact continuation
 817          * beyond the last step. But we don't consider that to be an issue.
 818          */
 819         do_log     = (do_per_step(step, ir->nstlog)
 820                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 821         do_verbose = mdrunOptions.verbose
 822                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 823
 824         if (useGpuForUpdate && !bFirstStep && bNS)
 825         {
 826             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 827             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 828             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 829             // Copy coordinate from the GPU when needed at the search step.
 830             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 831             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 832             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 833             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 834         }
 835
 836         if (bNS && !(bFirstStep && ir->bContinuation))
 837         {
 838             bMasterState = FALSE;
 839             /* Correct the new box if it is too skewed */
 840             if (inputrecDynamicBox(ir))
 841             {
 842                 if (correct_box(fplog, step, state->box))
 843                 {
 844                     bMasterState = TRUE;
 845                     // If update is offloaded, it should be informed about the box size change
 846                     if (useGpuForUpdate)
 847                     {
 848                         integrator->setPbc(PbcType::Xyz, state->box);
 849                     }
 850                 }
 851             }
 852             if (DOMAINDECOMP(cr) && bMasterState)
 853             {
 854                 dd_collect_state(cr->dd, state, state_global);
 855             }
 856
 857             if (DOMAINDECOMP(cr))
 858             {
 859                 /* Repartition the domain decomposition */
 860                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 861                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 862                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 863                 shouldCheckNumberOfBondedInteractions = true;
 864                 upd.setNumAtoms(state->natoms);
 865
 866                 // Allocate or re-size GPU halo exchange object, if necessary
 867                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 868                     && useGpuForNonbonded && is1D(*cr->dd))
 869                 {
 870                     GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 871                                        "GPU device manager has to be initialized to use GPU "
 872                                        "version of halo exchange.");
 873                     // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
 874                     constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
 875                 }
 876             }
 877         }
 878
 879         if (MASTER(cr) && do_log)
 880         {
 881             gmx::EnergyOutput::printHeader(fplog, step,
 882                                            t); /* can we improve the information printed here? */
 883         }
 884
 885         if (ir->efep != efepNO)
 886         {
 887             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 888         }
 889
 890         if (bExchanged)
 891         {
 892
 893             /* We need the kinetic energy at minus the half step for determining
 894              * the full step kinetic energy and possibly for T-coupling.*/
 895             /* This may not be quite working correctly yet . . . . */
 896             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 897                             makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 898                             nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr, nullptr, constr,
 899                             &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 900                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 901             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 902                                             &top, makeConstArrayRef(state->x), state->box,
 903                                             &shouldCheckNumberOfBondedInteractions);
 904         }
 905         clear_mat(force_vir);
 906
 907         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 908
 909         /* Determine the energy and pressure:
 910          * at nstcalcenergy steps and at energy output steps (set below).
 911          */
 912         if (EI_VV(ir->eI) && (!bInitStep))
 913         {
 914             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 915             bCalcVir      = bCalcEnerStep
 916                        || (ir->epc != epcNO
 917                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 918         }
 919         else
 920         {
 921             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 922             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 923         }
 924         bCalcEner = bCalcEnerStep;
 925
 926         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 927
 928         if (do_ene || do_log || bDoReplEx)
 929         {
 930             bCalcVir  = TRUE;
 931             bCalcEner = TRUE;
 932         }
 933
 934         /* Do we need global communication ? */
 935         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 936                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 937
 938         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 939                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 940                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 941
 942         if (shellfc)
 943         {
 944             /* Now is the time to relax the shells */
 945             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 946                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd, fcd,
 947                                 state->natoms, state->x.arrayRefWithPadding(),
 948                                 state->v.arrayRefWithPadding(), state->box, state->lambda, &state->hist,
 949                                 f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 950                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 951         }
 952         else
 953         {
 954             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 955                is updated (or the AWH update will be performed twice for one step when continuing).
 956                It would be best to call this update function from do_md_trajectory_writing but that
 957                would occur after do_force. One would have to divide the update_awh function into one
 958                function applying the AWH force and one doing the AWH bias update. The update AWH
 959                bias function could then be called after do_md_trajectory_writing (then containing
 960                update_awh_history). The checkpointing will in the future probably moved to the start
 961                of the md loop which will rid of this issue. */
 962             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 963             {
 964                 awh->updateHistory(state_global->awhHistory.get());
 965             }
 966
 967             /* The coordinates (x) are shifted (to get whole molecules)
 968              * in do_force.
 969              * This is parallellized as well, and does communication too.
 970              * Check comments in sim_util.c
 971              */
 972             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 973                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 974                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, state->lambda, fr,
 975                      runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 976                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 977         }
 978
 979         // VV integrators do not need the following velocity half step
 980         // if it is the first step after starting from a checkpoint.
 981         // That is, the half step is needed on all other steps, and
 982         // also the first step when starting from a .tpr file.
 983         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 984         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 985         {
 986             rvec* vbuf = nullptr;
 987
 988             wallcycle_start(wcycle, ewcUPDATE);
 989             if (ir->eI == eiVV && bInitStep)
 990             {
 991                 /* if using velocity verlet with full time step Ekin,
 992                  * take the first half step only to compute the
 993                  * virial for the first step. From there,
 994                  * revert back to the initial coordinates
 995                  * so that the input is actually the initial step.
 996                  */
 997                 snew(vbuf, state->natoms);
 998                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
 999                            state->natoms); /* should make this better for parallelizing? */
1000             }
1001             else
1002             {
1003                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1004                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1005                                trotter_seq, ettTSEQ1);
1006             }
1007
1008             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M,
1009                               etrtVELOCITY1, cr, constr != nullptr);
1010
1011             wallcycle_stop(wcycle, ewcUPDATE);
1012             constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
1013             wallcycle_start(wcycle, ewcUPDATE);
1014             /* if VV, compute the pressure and constraints */
1015             /* For VV2, we strictly only need this if using pressure
1016              * control, but we really would like to have accurate pressures
1017              * printed out.
1018              * Think about ways around this in the future?
1019              * For now, keep this choice in comments.
1020              */
1021             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1022             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1023             bPres = TRUE;
1024             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1025             if (bCalcEner && ir->eI == eiVVAK)
1026             {
1027                 bSumEkinhOld = TRUE;
1028             }
1029             /* for vv, the first half of the integration actually corresponds to the previous step.
1030                So we need information from the last step in the first half of the integration */
1031             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1032             {
1033                 wallcycle_stop(wcycle, ewcUPDATE);
1034                 compute_globals(
1035                         gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1036                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
1037                         nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1038                         &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1039                         (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1040                                 | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1041                                 | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1042                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1043                                                                          : 0)
1044                                 | CGLO_SCALEEKIN);
1045                 /* explanation of above:
1046                    a) We compute Ekin at the full time step
1047                    if 1) we are using the AveVel Ekin, and it's not the
1048                    initial step, or 2) if we are using AveEkin, but need the full
1049                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1050                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1051                    EkinAveVel because it's needed for the pressure */
1052                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1053                                                 top_global, &top, makeConstArrayRef(state->x),
1054                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1055                 if (bStopCM)
1056                 {
1057                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1058                                            makeArrayRef(state->v));
1059                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1060                 }
1061                 wallcycle_start(wcycle, ewcUPDATE);
1062             }
1063             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1064             if (!bInitStep)
1065             {
1066                 if (bTrotter)
1067                 {
1068                     m_add(force_vir, shake_vir,
1069                           total_vir); /* we need the un-dispersion corrected total vir here */
1070                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1071                                    trotter_seq, ettTSEQ2);
1072
1073                     /* TODO This is only needed when we're about to write
1074                      * a checkpoint, because we use it after the restart
1075                      * (in a kludge?). But what should we be doing if
1076                      * the startingBehavior is NewSimulation or bInitStep are true? */
1077                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1078                     {
1079                         copy_mat(shake_vir, state->svir_prev);
1080                         copy_mat(force_vir, state->fvir_prev);
1081                     }
1082                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1083                     {
1084                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1085                         enerd->term[F_TEMP] =
1086                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1087                         enerd->term[F_EKIN] = trace(ekind->ekin);
1088                     }
1089                 }
1090                 else if (bExchanged)
1091                 {
1092                     wallcycle_stop(wcycle, ewcUPDATE);
1093                     /* We need the kinetic energy at minus the half step for determining
1094                      * the full step kinetic energy and possibly for T-coupling.*/
1095                     /* This may not be quite working correctly yet . . . . */
1096                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1097                                     makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
1098                                     mdatoms, nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr,
1099                                     nullptr, constr, &nullSignaller, state->box, nullptr,
1100                                     &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1101                     wallcycle_start(wcycle, ewcUPDATE);
1102                 }
1103             }
1104             /* if it's the initial step, we performed this first step just to get the constraint virial */
1105             if (ir->eI == eiVV && bInitStep)
1106             {
1107                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1108                 sfree(vbuf);
1109             }
1110             wallcycle_stop(wcycle, ewcUPDATE);
1111         }
1112
1113         /* compute the conserved quantity */
1114         if (EI_VV(ir->eI))
1115         {
1116             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1117             if (ir->eI == eiVV)
1118             {
1119                 last_ekin = enerd->term[F_EKIN];
1120             }
1121             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1122             {
1123                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1124             }
1125             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1126             if (ir->efep != efepNO)
1127             {
1128                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1129             }
1130         }
1131
1132         /* ########  END FIRST UPDATE STEP  ############## */
1133         /* ########  If doing VV, we now have v(dt) ###### */
1134         if (bDoExpanded)
1135         {
1136             /* perform extended ensemble sampling in lambda - we don't
1137                actually move to the new state before outputting
1138                statistics, but if performing simulated tempering, we
1139                do update the velocities and the tau_t. */
1140
1141             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1142                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1143             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1144             if (MASTER(cr))
1145             {
1146                 copy_df_history(state_global->dfhist, state->dfhist);
1147             }
1148         }
1149
1150         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1151         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1152         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1153             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1154                 || checkpointHandler->isCheckpointingStep()))
1155         {
1156             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1157             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1158         }
1159         // Copy velocities if needed for the output/checkpointing.
1160         // NOTE: Copy on the search steps is done at the beginning of the step.
1161         if (useGpuForUpdate && !bNS
1162             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1163         {
1164             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1165             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1166         }
1167         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1168         // and update is offloaded hence forces are kept on the GPU for update and have not been
1169         // already transferred in do_force().
1170         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1171         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1172         //       prior to GPU update.
1173         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1174         //       copy call in do_force(...).
1175         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1176         //       on host after the D2H copy in do_force(...).
1177         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1178             && do_per_step(step, ir->nstfout))
1179         {
1180             stateGpu->copyForcesFromGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1181             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1182         }
1183         /* Now we have the energies and forces corresponding to the
1184          * coordinates at time t. We must output all of this before
1185          * the update.
1186          */
1187         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1188                                  observablesHistory, top_global, fr, outf, energyOutput, ekind, f,
1189                                  checkpointHandler->isCheckpointingStep(), bRerunMD, bLastStep,
1190                                  mdrunOptions.writeConfout, bSumEkinhOld);
1191         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1192         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1193
1194         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1195         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1196             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1197         {
1198             copy_mat(state->svir_prev, shake_vir);
1199             copy_mat(state->fvir_prev, force_vir);
1200         }
1201
1202         stopHandler->setSignal();
1203         resetHandler->setSignal(walltime_accounting);
1204
1205         if (bGStat || !PAR(cr))
1206         {
1207             /* In parallel we only have to check for checkpointing in steps
1208              * where we do global communication,
1209              *  otherwise the other nodes don't know.
1210              */
1211             checkpointHandler->setSignal(walltime_accounting);
1212         }
1213
1214         /* #########   START SECOND UPDATE STEP ################# */
1215
1216         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1217            controlled in preprocessing */
1218
1219         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1220         {
1221             gmx_bool bIfRandomize;
1222             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1223             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1224             if (constr && bIfRandomize)
1225             {
1226                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1227             }
1228         }
1229         /* Box is changed in update() when we do pressure coupling,
1230          * but we should still use the old box for energy corrections and when
1231          * writing it to the energy file, so it matches the trajectory files for
1232          * the same timestep above. Make a copy in a separate array.
1233          */
1234         copy_mat(state->box, lastbox);
1235
1236         dvdl_constr = 0;
1237
1238         wallcycle_start(wcycle, ewcUPDATE);
1239         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1240         if (bTrotter)
1241         {
1242             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1243             /* We can only do Berendsen coupling after we have summed
1244              * the kinetic energy or virial. Since the happens
1245              * in global_state after update, we should only do it at
1246              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1247              */
1248         }
1249         else
1250         {
1251             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1252             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1253         }
1254
1255         if (EI_VV(ir->eI))
1256         {
1257             /* velocity half-step update */
1258             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M,
1259                               etrtVELOCITY2, cr, constr != nullptr);
1260         }
1261
1262         /* Above, initialize just copies ekinh into ekin,
1263          * it doesn't copy position (for VV),
1264          * and entire integrator for MD.
1265          */
1266
1267         if (ir->eI == eiVVAK)
1268         {
1269             cbuf.resize(state->x.size());
1270             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1271         }
1272
1273         /* With leap-frog type integrators we compute the kinetic energy
1274          * at a whole time step as the average of the half-time step kinetic
1275          * energies of two subsequent steps. Therefore we need to compute the
1276          * half step kinetic energy also if we need energies at the next step.
1277          */
1278         const bool needHalfStepKineticEnergy =
1279                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1280
1281         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1282         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1283         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1284                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1285
1286         if (useGpuForUpdate)
1287         {
1288             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1289             {
1290                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1291                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1292
1293                 // Copy data to the GPU after buffers might have being reinitialized
1294                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1295                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1296             }
1297
1298             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1299             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1300             {
1301                 stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1302             }
1303
1304             const bool doTemperatureScaling =
1305                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1306
1307             // This applies Leap-Frog, LINCS and SETTLE in succession
1308             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1309                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1310                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1311                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1312
1313             // Copy velocities D2H after update if:
1314             // - Globals are computed this step (includes the energy output steps).
1315             // - Temperature is needed for the next step.
1316             if (bGStat || needHalfStepKineticEnergy)
1317             {
1318                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1319                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1320             }
1321         }
1322         else
1323         {
1324             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M,
1325                               etrtPOSITION, cr, constr != nullptr);
1326
1327             wallcycle_stop(wcycle, ewcUPDATE);
1328
1329             constrain_coordinates(constr, do_log, do_ene, step, state,
1330                                   upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
1331
1332             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
1333                                       constr, do_log, do_ene);
1334             upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1335         }
1336
1337         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1338         {
1339             updatePrevStepPullCom(pull_work, state);
1340         }
1341
1342         if (ir->eI == eiVVAK)
1343         {
1344             /* erase F_EKIN and F_TEMP here? */
1345             /* just compute the kinetic energy at the half step to perform a trotter step */
1346             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1347                             makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
1348                             mdatoms, nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir,
1349                             pres, constr, &nullSignaller, lastbox, nullptr, &bSumEkinhOld,
1350                             (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1351             wallcycle_start(wcycle, ewcUPDATE);
1352             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1353             /* now we know the scaling, we can compute the positions again */
1354             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1355
1356             upd.update_coords(*ir, step, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M,
1357                               etrtPOSITION, cr, constr != nullptr);
1358             wallcycle_stop(wcycle, ewcUPDATE);
1359
1360             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1361             /* are the small terms in the shake_vir here due
1362              * to numerical errors, or are they important
1363              * physically? I'm thinking they are just errors, but not completely sure.
1364              * For now, will call without actually constraining, constr=NULL*/
1365             upd.finish_update(*ir, mdatoms, state, wcycle, false);
1366         }
1367         if (EI_VV(ir->eI))
1368         {
1369             /* this factor or 2 correction is necessary
1370                because half of the constraint force is removed
1371                in the vv step, so we have to double it.  See
1372                the Issue #1255.  It is not yet clear
1373                if the factor of 2 is exact, or just a very
1374                good approximation, and this will be
1375                investigated.  The next step is to see if this
1376                can be done adding a dhdl contribution from the
1377                rattle step, but this is somewhat more
1378                complicated with the current code. Will be
1379                investigated, hopefully for 4.6.3. However,
1380                this current solution is much better than
1381                having it completely wrong.
1382              */
1383             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1384         }
1385         else
1386         {
1387             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1388         }
1389
1390         if (vsite != nullptr)
1391         {
1392             wallcycle_start(wcycle, ewcVSITECONSTR);
1393             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1394             wallcycle_stop(wcycle, ewcVSITECONSTR);
1395         }
1396
1397         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1398         /* With Leap-Frog we can skip compute_globals at
1399          * non-communication steps, but we need to calculate
1400          * the kinetic energy one step before communication.
1401          */
1402         {
1403             // Organize to do inter-simulation signalling on steps if
1404             // and when algorithms require it.
1405             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1406
1407             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1408             {
1409                 // Copy coordinates when needed to stop the CM motion.
1410                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1411                 {
1412                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1413                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1414                 }
1415                 // Since we're already communicating at this step, we
1416                 // can propagate intra-simulation signals. Note that
1417                 // check_nstglobalcomm has the responsibility for
1418                 // choosing the value of nstglobalcomm that is one way
1419                 // bGStat becomes true, so we can't get into a
1420                 // situation where e.g. checkpointing can't be
1421                 // signalled.
1422                 bool                doIntraSimSignal = true;
1423                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1424
1425                 compute_globals(
1426                         gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1427                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
1428                         nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1429                         &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1430                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1431                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1432                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1433                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1434                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1435                                                                          : 0));
1436                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1437                                                 top_global, &top, makeConstArrayRef(state->x),
1438                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1439                 if (!EI_VV(ir->eI) && bStopCM)
1440                 {
1441                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1442                                            makeArrayRef(state->v));
1443                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1444
1445                     // TODO: The special case of removing CM motion should be dealt more gracefully
1446                     if (useGpuForUpdate)
1447                     {
1448                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1449                         // Here we block until the H2D copy completes because event sync with the
1450                         // force kernels that use the coordinates on the next steps is not implemented
1451                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1452                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1453                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1454                         if (vcm.mode != ecmNO)
1455                         {
1456                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1457                         }
1458                     }
1459                 }
1460             }
1461         }
1462
1463         /* #############  END CALC EKIN AND PRESSURE ################# */
1464
1465         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1466            the virial that should probably be addressed eventually. state->veta has better properies,
1467            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1468            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1469
1470         if (ir->efep != efepNO && !EI_VV(ir->eI))
1471         {
1472             /* Sum up the foreign energy and dhdl terms for md and sd.
1473                Currently done every step so that dhdl is correct in the .edr */
1474             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1475         }
1476
1477         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1478                                          pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1479
1480         const bool doBerendsenPressureCoupling =
1481                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1482         if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
1483         {
1484             integrator->scaleCoordinates(pressureCouplingMu);
1485             integrator->setPbc(PbcType::Xyz, state->box);
1486         }
1487
1488         /* ################# END UPDATE STEP 2 ################# */
1489         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1490
1491         /* The coordinates (x) were unshifted in update */
1492         if (!bGStat)
1493         {
1494             /* We will not sum ekinh_old,
1495              * so signal that we still have to do it.
1496              */
1497             bSumEkinhOld = TRUE;
1498         }
1499
1500         if (bCalcEner)
1501         {
1502             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1503
1504             /* use the directly determined last velocity, not actually the averaged half steps */
1505             if (bTrotter && ir->eI == eiVV)
1506             {
1507                 enerd->term[F_EKIN] = last_ekin;
1508             }
1509             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1510
1511             if (integratorHasConservedEnergyQuantity(ir))
1512             {
1513                 if (EI_VV(ir->eI))
1514                 {
1515                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1516                 }
1517                 else
1518                 {
1519                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1520                 }
1521             }
1522             /* #########  END PREPARING EDR OUTPUT  ###########  */
1523         }
1524
1525         /* Output stuff */
1526         if (MASTER(cr))
1527         {
1528             if (fplog && do_log && bDoExpanded)
1529             {
1530                 /* only needed if doing expanded ensemble */
1531                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1532                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1533                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1534             }
1535             if (bCalcEner)
1536             {
1537                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, state,
1538                                                  ir->fepvals, ir->expandedvals, lastbox, shake_vir,
1539                                                  force_vir, total_vir, pres, ekind, mu_tot, constr);
1540             }
1541             else
1542             {
1543                 energyOutput.recordNonEnergyStep();
1544             }
1545
1546             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1547             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1548
1549             if (doSimulatedAnnealing)
1550             {
1551                 gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
1552                                                               &(ir->opts));
1553             }
1554             if (do_log || do_ene || do_dr || do_or)
1555             {
1556                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1557                                                    do_log ? fplog : nullptr, step, t, fcd, awh.get());
1558             }
1559
1560             if (ir->bPull)
1561             {
1562                 pull_print_output(pull_work, step, t);
1563             }
1564
1565             if (do_per_step(step, ir->nstlog))
1566             {
1567                 if (fflush(fplog) != 0)
1568                 {
1569                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1570                 }
1571             }
1572         }
1573         if (bDoExpanded)
1574         {
1575             /* Have to do this part _after_ outputting the logfile and the edr file */
1576             /* Gets written into the state at the beginning of next loop*/
1577             state->fep_state = lamnew;
1578         }
1579         /* Print the remaining wall clock time for the run */
1580         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1581         {
1582             if (shellfc)
1583             {
1584                 fprintf(stderr, "\n");
1585             }
1586             print_time(stderr, walltime_accounting, step, ir, cr);
1587         }
1588
1589         /* Ion/water position swapping.
1590          * Not done in last step since trajectory writing happens before this call
1591          * in the MD loop and exchanges would be lost anyway. */
1592         bNeedRepartition = FALSE;
1593         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1594         {
1595             bNeedRepartition =
1596                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1597                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1598
1599             if (bNeedRepartition && DOMAINDECOMP(cr))
1600             {
1601                 dd_collect_state(cr->dd, state, state_global);
1602             }
1603         }
1604
1605         /* Replica exchange */
1606         bExchanged = FALSE;
1607         if (bDoReplEx)
1608         {
1609             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1610         }
1611
1612         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1613         {
1614             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1615                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1616                                 nrnb, wcycle, FALSE);
1617             shouldCheckNumberOfBondedInteractions = true;
1618             upd.setNumAtoms(state->natoms);
1619         }
1620
1621         bFirstStep = FALSE;
1622         bInitStep  = FALSE;
1623
1624         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1625         /* With all integrators, except VV, we need to retain the pressure
1626          * at the current step for coupling at the next step.
1627          */
1628         if ((state->flags & (1U << estPRES_PREV))
1629             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1630         {
1631             /* Store the pressure in t_state for pressure coupling
1632              * at the next MD step.
1633              */
1634             copy_mat(pres, state->pres_prev);
1635         }
1636
1637         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1638
1639         if ((membed != nullptr) && (!bLastStep))
1640         {
1641             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1642         }
1643
1644         cycles = wallcycle_stop(wcycle, ewcSTEP);
1645         if (DOMAINDECOMP(cr) && wcycle)
1646         {
1647             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1648         }
1649
1650         /* increase the MD step number */
1651         step++;
1652         step_rel++;
1653
1654         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1655                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1656
1657         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1658         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1659     }
1660     /* End of main MD loop */
1661
1662     /* Closing TNG files can include compressing data. Therefore it is good to do that
1663      * before stopping the time measurements. */
1664     mdoutf_tng_close(outf);
1665
1666     /* Stop measuring walltime */
1667     walltime_accounting_end_time(walltime_accounting);
1668
1669     if (!thisRankHasDuty(cr, DUTY_PME))
1670     {
1671         /* Tell the PME only node to finish */
1672         gmx_pme_send_finish(cr);
1673     }
1674
1675     if (MASTER(cr))
1676     {
1677         if (ir->nstcalcenergy > 0)
1678         {
1679             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1680             energyOutput.printAverages(fplog, groups);
1681         }
1682     }
1683     done_mdoutf(outf);
1684
1685     if (bPMETune)
1686     {
1687         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1688     }
1689
1690     done_shellfc(fplog, shellfc, step_rel);
1691
1692     if (useReplicaExchange && MASTER(cr))
1693     {
1694         print_replica_exchange_statistics(fplog, repl_ex);
1695     }
1696
1697     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1698
1699     global_stat_destroy(gstat);
1700 }