src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/commandline/filenm.h"
  57 #include "gromacs/domdec/collect.h"
  58 #include "gromacs/domdec/dlbtiming.h"
  59 #include "gromacs/domdec/domdec.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/domdec_struct.h"
  62 #include "gromacs/domdec/gpuhaloexchange.h"
  63 #include "gromacs/domdec/mdsetup.h"
  64 #include "gromacs/domdec/partition.h"
  65 #include "gromacs/essentialdynamics/edsam.h"
  66 #include "gromacs/ewald/pme_load_balancing.h"
  67 #include "gromacs/ewald/pme_pp.h"
  68 #include "gromacs/fileio/trxio.h"
  69 #include "gromacs/gmxlib/network.h"
  70 #include "gromacs/gmxlib/nrnb.h"
  71 #include "gromacs/gpu_utils/device_stream_manager.h"
  72 #include "gromacs/gpu_utils/gpu_utils.h"
  73 #include "gromacs/imd/imd.h"
  74 #include "gromacs/listed_forces/listed_forces.h"
  75 #include "gromacs/math/functions.h"
  76 #include "gromacs/math/invertmatrix.h"
  77 #include "gromacs/math/vec.h"
  78 #include "gromacs/math/vectypes.h"
  79 #include "gromacs/mdlib/checkpointhandler.h"
  80 #include "gromacs/mdlib/compute_io.h"
  81 #include "gromacs/mdlib/constr.h"
  82 #include "gromacs/mdlib/coupling.h"
  83 #include "gromacs/mdlib/ebin.h"
  84 #include "gromacs/mdlib/enerdata_utils.h"
  85 #include "gromacs/mdlib/energyoutput.h"
  86 #include "gromacs/mdlib/expanded.h"
  87 #include "gromacs/mdlib/force.h"
  88 #include "gromacs/mdlib/force_flags.h"
  89 #include "gromacs/mdlib/forcerec.h"
  90 #include "gromacs/mdlib/freeenergyparameters.h"
  91 #include "gromacs/mdlib/md_support.h"
  92 #include "gromacs/mdlib/mdatoms.h"
  93 #include "gromacs/mdlib/mdoutf.h"
  94 #include "gromacs/mdlib/membed.h"
  95 #include "gromacs/mdlib/resethandler.h"
  96 #include "gromacs/mdlib/sighandler.h"
  97 #include "gromacs/mdlib/simulationsignal.h"
  98 #include "gromacs/mdlib/stat.h"
  99 #include "gromacs/mdlib/stophandler.h"
 100 #include "gromacs/mdlib/tgroup.h"
 101 #include "gromacs/mdlib/trajectory_writing.h"
 102 #include "gromacs/mdlib/update.h"
 103 #include "gromacs/mdlib/update_constrain_gpu.h"
 104 #include "gromacs/mdlib/vcm.h"
 105 #include "gromacs/mdlib/vsite.h"
 106 #include "gromacs/mdrunutility/handlerestart.h"
 107 #include "gromacs/mdrunutility/multisim.h"
 108 #include "gromacs/mdrunutility/printtime.h"
 109 #include "gromacs/mdtypes/awh_history.h"
 110 #include "gromacs/mdtypes/awh_params.h"
 111 #include "gromacs/mdtypes/commrec.h"
 112 #include "gromacs/mdtypes/df_history.h"
 113 #include "gromacs/mdtypes/energyhistory.h"
 114 #include "gromacs/mdtypes/fcdata.h"
 115 #include "gromacs/mdtypes/forcebuffers.h"
 116 #include "gromacs/mdtypes/forcerec.h"
 117 #include "gromacs/mdtypes/group.h"
 118 #include "gromacs/mdtypes/inputrec.h"
 119 #include "gromacs/mdtypes/interaction_const.h"
 120 #include "gromacs/mdtypes/md_enums.h"
 121 #include "gromacs/mdtypes/mdatom.h"
 122 #include "gromacs/mdtypes/mdrunoptions.h"
 123 #include "gromacs/mdtypes/multipletimestepping.h"
 124 #include "gromacs/mdtypes/observableshistory.h"
 125 #include "gromacs/mdtypes/pullhistory.h"
 126 #include "gromacs/mdtypes/simulation_workload.h"
 127 #include "gromacs/mdtypes/state.h"
 128 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 129 #include "gromacs/modularsimulator/energydata.h"
 130 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 131 #include "gromacs/nbnxm/nbnxm.h"
 132 #include "gromacs/pbcutil/pbc.h"
 133 #include "gromacs/pulling/output.h"
 134 #include "gromacs/pulling/pull.h"
 135 #include "gromacs/swap/swapcoords.h"
 136 #include "gromacs/timing/wallcycle.h"
 137 #include "gromacs/timing/walltime_accounting.h"
 138 #include "gromacs/topology/atoms.h"
 139 #include "gromacs/topology/idef.h"
 140 #include "gromacs/topology/mtop_util.h"
 141 #include "gromacs/topology/topology.h"
 142 #include "gromacs/trajectory/trajectoryframe.h"
 143 #include "gromacs/utility/basedefinitions.h"
 144 #include "gromacs/utility/cstringutil.h"
 145 #include "gromacs/utility/fatalerror.h"
 146 #include "gromacs/utility/logger.h"
 147 #include "gromacs/utility/real.h"
 148 #include "gromacs/utility/smalloc.h"
 149
 150 #include "legacysimulator.h"
 151 #include "replicaexchange.h"
 152 #include "shellfc.h"
 153
 154 #if GMX_FAHCORE
 155 #    include "corewrap.h"
 156 #endif
 157
 158 using gmx::SimulationSignaller;
 159
 160 void gmx::LegacySimulator::do_md()
 161 {
 162     // TODO Historically, the EM and MD "integrators" used different
 163     // names for the t_inputrec *parameter, but these must have the
 164     // same name, now that it's a member of a struct. We use this ir
 165     // alias to avoid a large ripple of nearly useless changes.
 166     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 167     // will go away eventually.
 168     t_inputrec*  ir = inputrec;
 169     int64_t      step, step_rel;
 170     double       t, t0 = ir->init_t;
 171     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 172     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 173     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 174     gmx_bool     do_ene, do_log, do_verbose;
 175     gmx_bool     bMasterState;
 176     unsigned int force_flags;
 177     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 178     int    i, m;
 179     rvec   mu_tot;
 180     matrix pressureCouplingMu, M;
 181     gmx_repl_ex_t     repl_ex = nullptr;
 182     gmx_global_stat_t gstat;
 183     gmx_shellfc_t*    shellfc;
 184     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 185     gmx_bool          bTemp, bPres, bTrotter;
 186     real              dvdl_constr;
 187     std::vector<RVec> cbuf;
 188     matrix            lastbox;
 189     int               lamnew = 0;
 190     /* for FEP */
 191     int       nstfep = 0;
 192     double    cycles;
 193     real      saved_conserved_quantity = 0;
 194     real      last_ekin                = 0;
 195     t_extmass MassQ;
 196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 197
 198     /* PME load balancing data for GPU kernels */
 199     gmx_bool bPMETune         = FALSE;
 200     gmx_bool bPMETunePrinting = FALSE;
 201
 202     bool bInteractiveMDstep = false;
 203
 204     /* Domain decomposition could incorrectly miss a bonded
 205        interaction, but checking for that requires a global
 206        communication stage, which does not otherwise happen in DD
 207        code. So we do that alongside the first global energy reduction
 208        after a new DD is made. These variables handle whether the
 209        check happens, and the result it returns. */
 210     bool shouldCheckNumberOfBondedInteractions = false;
 211     int  totalNumberOfBondedInteractions       = -1;
 212
 213     SimulationSignals signals;
 214     // Most global communnication stages don't propagate mdrun
 215     // signals, and will use this object to achieve that.
 216     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 217
 218     if (!mdrunOptions.writeConfout)
 219     {
 220         // This is on by default, and the main known use case for
 221         // turning it off is for convenience in benchmarking, which is
 222         // something that should not show up in the general user
 223         // interface.
 224         GMX_LOG(mdlog.info)
 225                 .asParagraph()
 226                 .appendText(
 227                         "The -noconfout functionality is deprecated, and may be removed in a "
 228                         "future version.");
 229     }
 230
 231     /* md-vv uses averaged full step velocities for T-control
 232        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 233        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 234     bTrotter = (EI_VV(ir->eI)
 235                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 236
 237     const bool bRerunMD = false;
 238
 239     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 240     bGStatEveryStep   = (nstglobalcomm == 1);
 241
 242     const SimulationGroups* groups = &top_global->groups;
 243
 244     std::unique_ptr<EssentialDynamics> ed = nullptr;
 245     if (opt2bSet("-ei", nfile, fnm))
 246     {
 247         /* Initialize essential dynamics sampling */
 248         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 249                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 250     }
 251     else if (observablesHistory->edsamHistory)
 252     {
 253         gmx_fatal(FARGS,
 254                   "The checkpoint is from a run with essential dynamics sampling, "
 255                   "but the current run did not specify the -ei option. "
 256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 257     }
 258
 259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 261     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
 262     Update     upd(*ir, deform);
 263     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 264     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 265
 266     const t_fcdata& fcdata = *fr->fcdata;
 267
 268     bool simulationsShareState = false;
 269     int  nstSignalComm         = nstglobalcomm;
 270     {
 271         // TODO This implementation of ensemble orientation restraints is nasty because
 272         // a user can't just do multi-sim with single-sim orientation restraints.
 273         bool usingEnsembleRestraints =
 274                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
 275         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 276
 277         // Replica exchange, ensemble restraints and AWH need all
 278         // simulations to remain synchronized, so they need
 279         // checkpoints and stop conditions to act on the same step, so
 280         // the propagation of such signals must take place between
 281         // simulations, not just within simulations.
 282         // TODO: Make algorithm initializers set these flags.
 283         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 284
 285         if (simulationsShareState)
 286         {
 287             // Inter-simulation signal communication does not need to happen
 288             // often, so we use a minimum of 200 steps to reduce overhead.
 289             const int c_minimumInterSimulationSignallingInterval = 200;
 290             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 291                             * nstglobalcomm;
 292         }
 293     }
 294
 295     if (startingBehavior != StartingBehavior::RestartWithAppending)
 296     {
 297         pleaseCiteCouplingAlgorithms(fplog, *ir);
 298     }
 299     gmx_mdoutf* outf =
 300             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 301                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 302     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 303                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 304
 305     gstat = global_stat_init(ir);
 306
 307     /* Check for polarizable models and flexible constraints */
 308     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 309                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 310
 311     {
 312         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 313         if ((io > 2000) && MASTER(cr))
 314         {
 315             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 316         }
 317     }
 318
 319     // Local state only becomes valid now.
 320     std::unique_ptr<t_state> stateInstance;
 321     t_state*                 state;
 322
 323
 324     gmx_localtop_t top(top_global->ffparams);
 325
 326     auto mdatoms = mdAtoms->mdatoms();
 327
 328     const auto& simulationWork     = runScheduleWork->simulationWork;
 329     const bool  useGpuForPme       = simulationWork.useGpuPme;
 330     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 331     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 332     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 333
 334     ForceBuffers f(fr->useMts, ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 335                                        ? PinningPolicy::PinnedIfSupported
 336                                        : PinningPolicy::CannotBePinned);
 337     if (DOMAINDECOMP(cr))
 338     {
 339         stateInstance = std::make_unique<t_state>();
 340         state         = stateInstance.get();
 341         dd_init_local_state(cr->dd, state_global, state);
 342
 343         /* Distribute the charge groups over the nodes from the master node */
 344         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 345                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 346                             nrnb, nullptr, FALSE);
 347         shouldCheckNumberOfBondedInteractions = true;
 348         upd.setNumAtoms(state->natoms);
 349     }
 350     else
 351     {
 352         state_change_natoms(state_global, state_global->natoms);
 353         /* Copy the pointer to the global state */
 354         state = state_global;
 355
 356         /* Generate and initialize new topology */
 357         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 358
 359         upd.setNumAtoms(state->natoms);
 360     }
 361
 362     std::unique_ptr<UpdateConstrainGpu> integrator;
 363
 364     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 365
 366     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 367     if (useGpuForUpdate)
 368     {
 369         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 370                                    || constr->numConstraintsTotal() == 0,
 371                            "Constraints in domain decomposition are only supported with update "
 372                            "groups if using GPU update.\n");
 373         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 374                                    || constr->numConstraintsTotal() == 0,
 375                            "SHAKE is not supported with GPU update.");
 376         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 377                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 378                            "the GPU to use GPU update.\n");
 379         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 380                            "Only the md integrator is supported with the GPU update.\n");
 381         GMX_RELEASE_ASSERT(
 382                 ir->etc != etcNOSEHOOVER,
 383                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 384         GMX_RELEASE_ASSERT(
 385                 ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN
 386                         || ir->epc == epcCRESCALE,
 387                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 388                 "with the GPU update.\n");
 389         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 390                            "Virtual sites are not supported with the GPU update.\n");
 391         GMX_RELEASE_ASSERT(ed == nullptr,
 392                            "Essential dynamics is not supported with the GPU update.\n");
 393         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 394                            "Constraints pulling is not supported with the GPU update.\n");
 395         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
 396                            "Orientation restraints are not supported with the GPU update.\n");
 397         GMX_RELEASE_ASSERT(
 398                 ir->efep == efepNO
 399                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 400                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 401                 "update.");
 402
 403         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 404         {
 405             GMX_LOG(mdlog.info)
 406                     .asParagraph()
 407                     .appendText("Updating coordinates and applying constraints on the GPU.");
 408         }
 409         else
 410         {
 411             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 412         }
 413         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 414                            "Device stream manager should be initialized in order to use GPU "
 415                            "update-constraints.");
 416         GMX_RELEASE_ASSERT(
 417                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 418                 "Update stream should be initialized in order to use GPU "
 419                 "update-constraints.");
 420         integrator = std::make_unique<UpdateConstrainGpu>(
 421                 *ir, *top_global, fr->deviceStreamManager->context(),
 422                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 423                 stateGpu->xUpdatedOnDevice());
 424
 425         integrator->setPbc(PbcType::Xyz, state->box);
 426     }
 427
 428     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 429     {
 430         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 431     }
 432     if (useGpuForUpdate)
 433     {
 434         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 435     }
 436
 437     // NOTE: The global state is no longer used at this point.
 438     // But state_global is still used as temporary storage space for writing
 439     // the global state to file and potentially for replica exchange.
 440     // (Global topology should persist.)
 441
 442     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 443
 444     if (ir->bExpanded)
 445     {
 446         /* Check nstexpanded here, because the grompp check was broken */
 447         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 448         {
 449             gmx_fatal(FARGS,
 450                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 451         }
 452         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 453     }
 454
 455     if (MASTER(cr))
 456     {
 457         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 458     }
 459
 460     preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
 461                            startingBehavior != StartingBehavior::NewSimulation);
 462
 463     // TODO: Remove this by converting AWH into a ForceProvider
 464     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 465                                 startingBehavior != StartingBehavior::NewSimulation,
 466                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 467
 468     if (useReplicaExchange && MASTER(cr))
 469     {
 470         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 471     }
 472     /* PME tuning is only supported in the Verlet scheme, with PME for
 473      * Coulomb. It is not supported with only LJ PME. */
 474     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 475                 && ir->cutoff_scheme != ecutsGROUP);
 476
 477     pme_load_balancing_t* pme_loadbal = nullptr;
 478     if (bPMETune)
 479     {
 480         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 481                          fr->nbv->useGpu());
 482     }
 483
 484     if (!ir->bContinuation)
 485     {
 486         if (state->flags & (1U << estV))
 487         {
 488             auto v = makeArrayRef(state->v);
 489             /* Set the velocities of vsites, shells and frozen atoms to zero */
 490             for (i = 0; i < mdatoms->homenr; i++)
 491             {
 492                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 493                 {
 494                     clear_rvec(v[i]);
 495                 }
 496                 else if (mdatoms->cFREEZE)
 497                 {
 498                     for (m = 0; m < DIM; m++)
 499                     {
 500                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 501                         {
 502                             v[i][m] = 0;
 503                         }
 504                     }
 505                 }
 506             }
 507         }
 508
 509         if (constr)
 510         {
 511             /* Constrain the initial coordinates and velocities */
 512             do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
 513                                state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
 514                                state->box, state->lambda[efptBONDED]);
 515         }
 516         if (vsite)
 517         {
 518             /* Construct the virtual sites for the initial configuration */
 519             vsite->construct(state->x, ir->delta_t, {}, state->box);
 520         }
 521     }
 522
 523     if (ir->efep != efepNO)
 524     {
 525         /* Set free energy calculation frequency as the greatest common
 526          * denominator of nstdhdl and repl_ex_nst. */
 527         nstfep = ir->fepvals->nstdhdl;
 528         if (ir->bExpanded)
 529         {
 530             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
 531         }
 532         if (useReplicaExchange)
 533         {
 534             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
 535         }
 536         if (ir->bDoAwh)
 537         {
 538             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
 539         }
 540     }
 541
 542     /* Be REALLY careful about what flags you set here. You CANNOT assume
 543      * this is the first step, since we might be restarting from a checkpoint,
 544      * and in that case we should not do any modifications to the state.
 545      */
 546     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 547
 548     // When restarting from a checkpoint, it can be appropriate to
 549     // initialize ekind from quantities in the checkpoint. Otherwise,
 550     // compute_globals must initialize ekind before the simulation
 551     // starts/restarts. However, only the master rank knows what was
 552     // found in the checkpoint file, so we have to communicate in
 553     // order to coordinate the restart.
 554     //
 555     // TODO Consider removing this communication if/when checkpoint
 556     // reading directly follows .tpr reading, because all ranks can
 557     // agree on hasReadEkinState at that time.
 558     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 559     if (PAR(cr))
 560     {
 561         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 562     }
 563     if (hasReadEkinState)
 564     {
 565         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 566     }
 567
 568     unsigned int cglo_flags =
 569             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 570              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 571
 572     bSumEkinhOld = FALSE;
 573
 574     t_vcm vcm(top_global->groups, *ir);
 575     reportComRemovalInfo(fplog, vcm);
 576
 577     /* To minimize communication, compute_globals computes the COM velocity
 578      * and the kinetic energy for the velocities without COM motion removed.
 579      * Thus to get the kinetic energy without the COM contribution, we need
 580      * to call compute_globals twice.
 581      */
 582     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 583     {
 584         unsigned int cglo_flags_iteration = cglo_flags;
 585         if (bStopCM && cgloIteration == 0)
 586         {
 587             cglo_flags_iteration |= CGLO_STOPCM;
 588             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 589         }
 590         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 591                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 592                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 593                         state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 594                         cglo_flags_iteration
 595                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 596                                                                          : 0));
 597         if (cglo_flags_iteration & CGLO_STOPCM)
 598         {
 599             /* At initialization, do not pass x with acceleration-correction mode
 600              * to avoid (incorrect) correction of the initial coordinates.
 601              */
 602             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 603                                                                      : makeArrayRef(state->x);
 604             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 605             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 606         }
 607     }
 608     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 609                                     makeConstArrayRef(state->x), state->box,
 610                                     &shouldCheckNumberOfBondedInteractions);
 611     if (ir->eI == eiVVAK)
 612     {
 613         /* a second call to get the half step temperature initialized as well */
 614         /* we do the same call as above, but turn the pressure off -- internally to
 615            compute_globals, this is recognized as a velocity verlet half-step
 616            kinetic energy calculation.  This minimized excess variables, but
 617            perhaps loses some logic?*/
 618
 619         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 620                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 621                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 622                         state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 623     }
 624
 625     /* Calculate the initial half step temperature, and save the ekinh_old */
 626     if (startingBehavior == StartingBehavior::NewSimulation)
 627     {
 628         for (i = 0; (i < ir->opts.ngtc); i++)
 629         {
 630             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 631         }
 632     }
 633
 634     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 635        for non-trotter temperature control */
 636     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 637
 638     if (MASTER(cr))
 639     {
 640         if (!ir->bContinuation)
 641         {
 642             if (constr && ir->eConstrAlg == econtLINCS)
 643             {
 644                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 645                         constr->rmsd());
 646             }
 647             if (EI_STATE_VELOCITY(ir->eI))
 648             {
 649                 real temp = enerd->term[F_TEMP];
 650                 if (ir->eI != eiVV)
 651                 {
 652                     /* Result of Ekin averaged over velocities of -half
 653                      * and +half step, while we only have -half step here.
 654                      */
 655                     temp *= 2;
 656                 }
 657                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 658             }
 659         }
 660
 661         char tbuf[20];
 662         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 663         if (ir->nsteps >= 0)
 664         {
 665             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 666         }
 667         else
 668         {
 669             sprintf(tbuf, "%s", "infinite");
 670         }
 671         if (ir->init_step > 0)
 672         {
 673             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 674                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 675                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 676         }
 677         else
 678         {
 679             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 680         }
 681         fprintf(fplog, "\n");
 682     }
 683
 684     walltime_accounting_start_time(walltime_accounting);
 685     wallcycle_start(wcycle, ewcRUN);
 686     print_start(fplog, cr, walltime_accounting, "mdrun");
 687
 688 #if GMX_FAHCORE
 689     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 690     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 691     if (chkpt_ret == 0)
 692     {
 693         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 694     }
 695 #endif
 696
 697     /***********************************************************
 698      *
 699      *             Loop over MD steps
 700      *
 701      ************************************************************/
 702
 703     bFirstStep = TRUE;
 704     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 705     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 706     bSumEkinhOld     = FALSE;
 707     bExchanged       = FALSE;
 708     bNeedRepartition = FALSE;
 709
 710     step     = ir->init_step;
 711     step_rel = 0;
 712
 713     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 714             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 715             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 716             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 717
 718     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 719             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 720             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 721             mdrunOptions.checkpointOptions.period);
 722
 723     const bool resetCountersIsLocal = true;
 724     auto       resetHandler         = std::make_unique<ResetHandler>(
 725             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 726             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 727             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 728
 729     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 730
 731     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 732     {
 733         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 734     }
 735
 736     /* and stop now if we should */
 737     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 738     while (!bLastStep)
 739     {
 740
 741         /* Determine if this is a neighbor search step */
 742         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 743
 744         if (bPMETune && bNStList)
 745         {
 746             // This has to be here because PME load balancing is called so early.
 747             // TODO: Move to after all booleans are defined.
 748             if (useGpuForUpdate && !bFirstStep)
 749             {
 750                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 751                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 752             }
 753             /* PME grid + cut-off optimization with GPUs or PME nodes */
 754             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 755                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 756                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 757         }
 758
 759         wallcycle_start(wcycle, ewcSTEP);
 760
 761         bLastStep = (step_rel == ir->nsteps);
 762         t         = t0 + step * ir->delta_t;
 763
 764         // TODO Refactor this, so that nstfep does not need a default value of zero
 765         if (ir->efep != efepNO || ir->bSimTemp)
 766         {
 767             /* find and set the current lambdas */
 768             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 769
 770             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 771             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 772             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 773                            && (!bFirstStep));
 774         }
 775
 776         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 777                      && do_per_step(step, replExParams.exchangeInterval));
 778
 779         if (doSimulatedAnnealing)
 780         {
 781             update_annealing_target_temp(ir, t, &upd);
 782         }
 783
 784         /* Stop Center of Mass motion */
 785         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 786
 787         /* Determine whether or not to do Neighbour Searching */
 788         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 789
 790         /* Note that the stopHandler will cause termination at nstglobalcomm
 791          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 792          * nstpcouple steps, we have computed the half-step kinetic energy
 793          * of the previous step and can always output energies at the last step.
 794          */
 795         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 796
 797         /* do_log triggers energy and virial calculation. Because this leads
 798          * to different code paths, forces can be different. Thus for exact
 799          * continuation we should avoid extra log output.
 800          * Note that the || bLastStep can result in non-exact continuation
 801          * beyond the last step. But we don't consider that to be an issue.
 802          */
 803         do_log     = (do_per_step(step, ir->nstlog)
 804                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 805         do_verbose = mdrunOptions.verbose
 806                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 807
 808         if (useGpuForUpdate && !bFirstStep && bNS)
 809         {
 810             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 811             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 812             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 813             // Copy coordinate from the GPU when needed at the search step.
 814             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 815             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 816             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 817             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 818         }
 819
 820         if (bNS && !(bFirstStep && ir->bContinuation))
 821         {
 822             bMasterState = FALSE;
 823             /* Correct the new box if it is too skewed */
 824             if (inputrecDynamicBox(ir))
 825             {
 826                 if (correct_box(fplog, step, state->box))
 827                 {
 828                     bMasterState = TRUE;
 829                     // If update is offloaded, it should be informed about the box size change
 830                     if (useGpuForUpdate)
 831                     {
 832                         integrator->setPbc(PbcType::Xyz, state->box);
 833                     }
 834                 }
 835             }
 836             if (DOMAINDECOMP(cr) && bMasterState)
 837             {
 838                 dd_collect_state(cr->dd, state, state_global);
 839             }
 840
 841             if (DOMAINDECOMP(cr))
 842             {
 843                 /* Repartition the domain decomposition */
 844                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 845                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 846                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 847                 shouldCheckNumberOfBondedInteractions = true;
 848                 upd.setNumAtoms(state->natoms);
 849             }
 850         }
 851
 852         // Allocate or re-size GPU halo exchange object, if necessary
 853         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded)
 854         {
 855             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 856                                "GPU device manager has to be initialized to use GPU "
 857                                "version of halo exchange.");
 858             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
 859         }
 860
 861         if (MASTER(cr) && do_log)
 862         {
 863             gmx::EnergyOutput::printHeader(fplog, step,
 864                                            t); /* can we improve the information printed here? */
 865         }
 866
 867         if (ir->efep != efepNO)
 868         {
 869             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 870         }
 871
 872         if (bExchanged)
 873         {
 874
 875             /* We need the kinetic energy at minus the half step for determining
 876              * the full step kinetic energy and possibly for T-coupling.*/
 877             /* This may not be quite working correctly yet . . . . */
 878             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 879                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
 880                             enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
 881                             state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 882                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 883             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 884                                             &top, makeConstArrayRef(state->x), state->box,
 885                                             &shouldCheckNumberOfBondedInteractions);
 886         }
 887         clear_mat(force_vir);
 888
 889         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 890
 891         /* Determine the energy and pressure:
 892          * at nstcalcenergy steps and at energy output steps (set below).
 893          */
 894         if (EI_VV(ir->eI) && (!bInitStep))
 895         {
 896             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 897             bCalcVir      = bCalcEnerStep
 898                        || (ir->epc != epcNO
 899                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 900         }
 901         else
 902         {
 903             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 904             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 905         }
 906         bCalcEner = bCalcEnerStep;
 907
 908         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 909
 910         if (do_ene || do_log || bDoReplEx)
 911         {
 912             bCalcVir  = TRUE;
 913             bCalcEner = TRUE;
 914         }
 915
 916         /* Do we need global communication ? */
 917         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 918                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 919
 920         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 921                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 922                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 923         if (fr->useMts && !do_per_step(step, ir->nstfout))
 924         {
 925             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
 926         }
 927
 928         if (shellfc)
 929         {
 930             /* Now is the time to relax the shells */
 931             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 932                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
 933                                 state->natoms, state->x.arrayRefWithPadding(),
 934                                 state->v.arrayRefWithPadding(), state->box, state->lambda,
 935                                 &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 936                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 937         }
 938         else
 939         {
 940             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 941                is updated (or the AWH update will be performed twice for one step when continuing).
 942                It would be best to call this update function from do_md_trajectory_writing but that
 943                would occur after do_force. One would have to divide the update_awh function into one
 944                function applying the AWH force and one doing the AWH bias update. The update AWH
 945                bias function could then be called after do_md_trajectory_writing (then containing
 946                update_awh_history). The checkpointing will in the future probably moved to the start
 947                of the md loop which will rid of this issue. */
 948             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 949             {
 950                 awh->updateHistory(state_global->awhHistory.get());
 951             }
 952
 953             /* The coordinates (x) are shifted (to get whole molecules)
 954              * in do_force.
 955              * This is parallellized as well, and does communication too.
 956              * Check comments in sim_util.c
 957              */
 958             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 959                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 960                      &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
 961                      vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 962                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 963         }
 964
 965         // VV integrators do not need the following velocity half step
 966         // if it is the first step after starting from a checkpoint.
 967         // That is, the half step is needed on all other steps, and
 968         // also the first step when starting from a .tpr file.
 969         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 970         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 971         {
 972             rvec* vbuf = nullptr;
 973
 974             wallcycle_start(wcycle, ewcUPDATE);
 975             if (ir->eI == eiVV && bInitStep)
 976             {
 977                 /* if using velocity verlet with full time step Ekin,
 978                  * take the first half step only to compute the
 979                  * virial for the first step. From there,
 980                  * revert back to the initial coordinates
 981                  * so that the input is actually the initial step.
 982                  */
 983                 snew(vbuf, state->natoms);
 984                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
 985                            state->natoms); /* should make this better for parallelizing? */
 986             }
 987             else
 988             {
 989                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
 990                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
 991                                trotter_seq, ettTSEQ1);
 992             }
 993
 994             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
 995                               M, etrtVELOCITY1, cr, constr != nullptr);
 996
 997             wallcycle_stop(wcycle, ewcUPDATE);
 998             constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
 999             wallcycle_start(wcycle, ewcUPDATE);
1000             /* if VV, compute the pressure and constraints */
1001             /* For VV2, we strictly only need this if using pressure
1002              * control, but we really would like to have accurate pressures
1003              * printed out.
1004              * Think about ways around this in the future?
1005              * For now, keep this choice in comments.
1006              */
1007             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1008             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1009             bPres = TRUE;
1010             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1011             if (bCalcEner && ir->eI == eiVVAK)
1012             {
1013                 bSumEkinhOld = TRUE;
1014             }
1015             /* for vv, the first half of the integration actually corresponds to the previous step.
1016                So we need information from the last step in the first half of the integration */
1017             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1018             {
1019                 wallcycle_stop(wcycle, ewcUPDATE);
1020                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1021                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1022                                 enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
1023                                 state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1024                                 (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1025                                         | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1026                                         | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1027                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1028                                                                                  : 0)
1029                                         | CGLO_SCALEEKIN);
1030                 /* explanation of above:
1031                    a) We compute Ekin at the full time step
1032                    if 1) we are using the AveVel Ekin, and it's not the
1033                    initial step, or 2) if we are using AveEkin, but need the full
1034                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1035                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1036                    EkinAveVel because it's needed for the pressure */
1037                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1038                                                 top_global, &top, makeConstArrayRef(state->x),
1039                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1040                 if (bStopCM)
1041                 {
1042                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1043                                            makeArrayRef(state->v));
1044                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1045                 }
1046                 wallcycle_start(wcycle, ewcUPDATE);
1047             }
1048             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1049             if (!bInitStep)
1050             {
1051                 if (bTrotter)
1052                 {
1053                     m_add(force_vir, shake_vir,
1054                           total_vir); /* we need the un-dispersion corrected total vir here */
1055                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1056                                    trotter_seq, ettTSEQ2);
1057
1058                     /* TODO This is only needed when we're about to write
1059                      * a checkpoint, because we use it after the restart
1060                      * (in a kludge?). But what should we be doing if
1061                      * the startingBehavior is NewSimulation or bInitStep are true? */
1062                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1063                     {
1064                         copy_mat(shake_vir, state->svir_prev);
1065                         copy_mat(force_vir, state->fvir_prev);
1066                     }
1067                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1068                     {
1069                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1070                         enerd->term[F_TEMP] =
1071                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1072                         enerd->term[F_EKIN] = trace(ekind->ekin);
1073                     }
1074                 }
1075                 else if (bExchanged)
1076                 {
1077                     wallcycle_stop(wcycle, ewcUPDATE);
1078                     /* We need the kinetic energy at minus the half step for determining
1079                      * the full step kinetic energy and possibly for T-coupling.*/
1080                     /* This may not be quite working correctly yet . . . . */
1081                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1082                                     makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1083                                     enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
1084                                     state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1085                     wallcycle_start(wcycle, ewcUPDATE);
1086                 }
1087             }
1088             /* if it's the initial step, we performed this first step just to get the constraint virial */
1089             if (ir->eI == eiVV && bInitStep)
1090             {
1091                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1092                 sfree(vbuf);
1093             }
1094             wallcycle_stop(wcycle, ewcUPDATE);
1095         }
1096
1097         /* compute the conserved quantity */
1098         if (EI_VV(ir->eI))
1099         {
1100             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1101             if (ir->eI == eiVV)
1102             {
1103                 last_ekin = enerd->term[F_EKIN];
1104             }
1105             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1106             {
1107                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1108             }
1109             /* sum up the foreign kinetic energy and dK/dl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1110             if (ir->efep != efepNO)
1111             {
1112                 accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1113             }
1114         }
1115
1116         /* ########  END FIRST UPDATE STEP  ############## */
1117         /* ########  If doing VV, we now have v(dt) ###### */
1118         if (bDoExpanded)
1119         {
1120             /* perform extended ensemble sampling in lambda - we don't
1121                actually move to the new state before outputting
1122                statistics, but if performing simulated tempering, we
1123                do update the velocities and the tau_t. */
1124
1125             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1126                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1127             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1128             if (MASTER(cr))
1129             {
1130                 copy_df_history(state_global->dfhist, state->dfhist);
1131             }
1132         }
1133
1134         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1135         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1136         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1137             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1138                 || checkpointHandler->isCheckpointingStep()))
1139         {
1140             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1141             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1142         }
1143         // Copy velocities if needed for the output/checkpointing.
1144         // NOTE: Copy on the search steps is done at the beginning of the step.
1145         if (useGpuForUpdate && !bNS
1146             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1147         {
1148             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1149             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1150         }
1151         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1152         // and update is offloaded hence forces are kept on the GPU for update and have not been
1153         // already transferred in do_force().
1154         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1155         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1156         //       prior to GPU update.
1157         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1158         //       copy call in do_force(...).
1159         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1160         //       on host after the D2H copy in do_force(...).
1161         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1162             && do_per_step(step, ir->nstfout))
1163         {
1164             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1165             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1166         }
1167         /* Now we have the energies and forces corresponding to the
1168          * coordinates at time t. We must output all of this before
1169          * the update.
1170          */
1171         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1172                                  observablesHistory, top_global, fr, outf, energyOutput, ekind,
1173                                  f.view().force(), checkpointHandler->isCheckpointingStep(),
1174                                  bRerunMD, bLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
1175         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1176         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1177
1178         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1179         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1180             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1181         {
1182             copy_mat(state->svir_prev, shake_vir);
1183             copy_mat(state->fvir_prev, force_vir);
1184         }
1185
1186         stopHandler->setSignal();
1187         resetHandler->setSignal(walltime_accounting);
1188
1189         if (bGStat || !PAR(cr))
1190         {
1191             /* In parallel we only have to check for checkpointing in steps
1192              * where we do global communication,
1193              *  otherwise the other nodes don't know.
1194              */
1195             checkpointHandler->setSignal(walltime_accounting);
1196         }
1197
1198         /* #########   START SECOND UPDATE STEP ################# */
1199
1200         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1201            controlled in preprocessing */
1202
1203         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1204         {
1205             gmx_bool bIfRandomize;
1206             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1207             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1208             if (constr && bIfRandomize)
1209             {
1210                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1211             }
1212         }
1213         /* Box is changed in update() when we do pressure coupling,
1214          * but we should still use the old box for energy corrections and when
1215          * writing it to the energy file, so it matches the trajectory files for
1216          * the same timestep above. Make a copy in a separate array.
1217          */
1218         copy_mat(state->box, lastbox);
1219
1220         dvdl_constr = 0;
1221
1222         wallcycle_start(wcycle, ewcUPDATE);
1223         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1224         if (bTrotter)
1225         {
1226             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1227             /* We can only do Berendsen coupling after we have summed
1228              * the kinetic energy or virial. Since the happens
1229              * in global_state after update, we should only do it at
1230              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1231              */
1232         }
1233         else
1234         {
1235             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1236             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1237         }
1238
1239         if (EI_VV(ir->eI))
1240         {
1241             /* velocity half-step update */
1242             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1243                               M, etrtVELOCITY2, cr, constr != nullptr);
1244         }
1245
1246         /* Above, initialize just copies ekinh into ekin,
1247          * it doesn't copy position (for VV),
1248          * and entire integrator for MD.
1249          */
1250
1251         if (ir->eI == eiVVAK)
1252         {
1253             cbuf.resize(state->x.size());
1254             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1255         }
1256
1257         /* With leap-frog type integrators we compute the kinetic energy
1258          * at a whole time step as the average of the half-time step kinetic
1259          * energies of two subsequent steps. Therefore we need to compute the
1260          * half step kinetic energy also if we need energies at the next step.
1261          */
1262         const bool needHalfStepKineticEnergy =
1263                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1264
1265         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1266         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1267         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1268                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1269
1270         if (useGpuForUpdate)
1271         {
1272             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1273             {
1274                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1275                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1276
1277                 // Copy data to the GPU after buffers might have being reinitialized
1278                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1279                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1280             }
1281
1282             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1283             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1284             {
1285                 stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1286             }
1287
1288             const bool doTemperatureScaling =
1289                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1290
1291             // This applies Leap-Frog, LINCS and SETTLE in succession
1292             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1293                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1294                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1295                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1296
1297             // Copy velocities D2H after update if:
1298             // - Globals are computed this step (includes the energy output steps).
1299             // - Temperature is needed for the next step.
1300             if (bGStat || needHalfStepKineticEnergy)
1301             {
1302                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1303                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1304             }
1305         }
1306         else
1307         {
1308             /* With multiple time stepping we need to do an additional normal
1309              * update step to obtain the virial, as the actual MTS integration
1310              * using an acceleration where the slow forces are multiplied by mtsFactor.
1311              * Using that acceleration would result in a virial with the slow
1312              * force contribution would be a factor mtsFactor too large.
1313              */
1314             if (fr->useMts && bCalcVir && constr != nullptr)
1315             {
1316                 upd.update_for_constraint_virial(*ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1317
1318                 constrain_coordinates(constr, do_log, do_ene, step, state,
1319                                       upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
1320             }
1321
1322             ArrayRefWithPadding<const RVec> forceCombined =
1323                     (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1324                             ? f.view().forceMtsCombinedWithPadding()
1325                             : f.view().forceWithPadding();
1326             upd.update_coords(*ir, step, mdatoms, state, forceCombined, fcdata, ekind, M,
1327                               etrtPOSITION, cr, constr != nullptr);
1328
1329             wallcycle_stop(wcycle, ewcUPDATE);
1330
1331             constrain_coordinates(constr, do_log, do_ene, step, state, upd.xp()->arrayRefWithPadding(),
1332                                   &dvdl_constr, bCalcVir && !fr->useMts, shake_vir);
1333
1334             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
1335                                       constr, do_log, do_ene);
1336             upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1337         }
1338
1339         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1340         {
1341             updatePrevStepPullCom(pull_work, state);
1342         }
1343
1344         if (ir->eI == eiVVAK)
1345         {
1346             /* erase F_EKIN and F_TEMP here? */
1347             /* just compute the kinetic energy at the half step to perform a trotter step */
1348             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1349                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
1350                             force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
1351                             nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1352             wallcycle_start(wcycle, ewcUPDATE);
1353             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1354             /* now we know the scaling, we can compute the positions again */
1355             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1356
1357             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1358                               M, etrtPOSITION, cr, constr != nullptr);
1359             wallcycle_stop(wcycle, ewcUPDATE);
1360
1361             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1362             /* are the small terms in the shake_vir here due
1363              * to numerical errors, or are they important
1364              * physically? I'm thinking they are just errors, but not completely sure.
1365              * For now, will call without actually constraining, constr=NULL*/
1366             upd.finish_update(*ir, mdatoms, state, wcycle, false);
1367         }
1368         if (EI_VV(ir->eI))
1369         {
1370             /* this factor or 2 correction is necessary
1371                because half of the constraint force is removed
1372                in the vv step, so we have to double it.  See
1373                the Issue #1255.  It is not yet clear
1374                if the factor of 2 is exact, or just a very
1375                good approximation, and this will be
1376                investigated.  The next step is to see if this
1377                can be done adding a dhdl contribution from the
1378                rattle step, but this is somewhat more
1379                complicated with the current code. Will be
1380                investigated, hopefully for 4.6.3. However,
1381                this current solution is much better than
1382                having it completely wrong.
1383              */
1384             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1385         }
1386         else
1387         {
1388             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1389         }
1390
1391         if (vsite != nullptr)
1392         {
1393             wallcycle_start(wcycle, ewcVSITECONSTR);
1394             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1395             wallcycle_stop(wcycle, ewcVSITECONSTR);
1396         }
1397
1398         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1399         /* With Leap-Frog we can skip compute_globals at
1400          * non-communication steps, but we need to calculate
1401          * the kinetic energy one step before communication.
1402          */
1403         {
1404             // Organize to do inter-simulation signalling on steps if
1405             // and when algorithms require it.
1406             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1407
1408             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1409             {
1410                 // Copy coordinates when needed to stop the CM motion.
1411                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1412                 {
1413                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1414                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1415                 }
1416                 // Since we're already communicating at this step, we
1417                 // can propagate intra-simulation signals. Note that
1418                 // check_nstglobalcomm has the responsibility for
1419                 // choosing the value of nstglobalcomm that is one way
1420                 // bGStat becomes true, so we can't get into a
1421                 // situation where e.g. checkpointing can't be
1422                 // signalled.
1423                 bool                doIntraSimSignal = true;
1424                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1425
1426                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1427                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
1428                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1429                                 &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1430                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1431                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1432                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1433                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1434                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1435                                                                                  : 0));
1436                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1437                                                 top_global, &top, makeConstArrayRef(state->x),
1438                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1439                 if (!EI_VV(ir->eI) && bStopCM)
1440                 {
1441                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1442                                            makeArrayRef(state->v));
1443                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1444
1445                     // TODO: The special case of removing CM motion should be dealt more gracefully
1446                     if (useGpuForUpdate)
1447                     {
1448                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1449                         // Here we block until the H2D copy completes because event sync with the
1450                         // force kernels that use the coordinates on the next steps is not implemented
1451                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1452                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1453                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1454                         if (vcm.mode != ecmNO)
1455                         {
1456                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1457                         }
1458                     }
1459                 }
1460             }
1461         }
1462
1463         /* #############  END CALC EKIN AND PRESSURE ################# */
1464
1465         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1466            the virial that should probably be addressed eventually. state->veta has better properies,
1467            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1468            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1469
1470         if (ir->efep != efepNO && !EI_VV(ir->eI))
1471         {
1472             /* Sum up the foreign energy and dK/dl terms for md and sd.
1473                Currently done every step so that dH/dl is correct in the .edr */
1474             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1475         }
1476
1477         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1478                                          pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1479
1480         const bool doBerendsenPressureCoupling =
1481                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1482         const bool doCRescalePressureCoupling =
1483                 (inputrec->epc == epcCRESCALE && do_per_step(step, inputrec->nstpcouple));
1484         if (useGpuForUpdate
1485             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1486         {
1487             integrator->scaleCoordinates(pressureCouplingMu);
1488             if (doCRescalePressureCoupling)
1489             {
1490                 matrix pressureCouplingInvMu;
1491                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1492                 integrator->scaleVelocities(pressureCouplingInvMu);
1493             }
1494             integrator->setPbc(PbcType::Xyz, state->box);
1495         }
1496
1497         /* ################# END UPDATE STEP 2 ################# */
1498         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1499
1500         /* The coordinates (x) were unshifted in update */
1501         if (!bGStat)
1502         {
1503             /* We will not sum ekinh_old,
1504              * so signal that we still have to do it.
1505              */
1506             bSumEkinhOld = TRUE;
1507         }
1508
1509         if (bCalcEner)
1510         {
1511             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1512
1513             /* use the directly determined last velocity, not actually the averaged half steps */
1514             if (bTrotter && ir->eI == eiVV)
1515             {
1516                 enerd->term[F_EKIN] = last_ekin;
1517             }
1518             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1519
1520             if (integratorHasConservedEnergyQuantity(ir))
1521             {
1522                 if (EI_VV(ir->eI))
1523                 {
1524                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1525                 }
1526                 else
1527                 {
1528                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1529                 }
1530             }
1531             /* #########  END PREPARING EDR OUTPUT  ###########  */
1532         }
1533
1534         /* Output stuff */
1535         if (MASTER(cr))
1536         {
1537             if (fplog && do_log && bDoExpanded)
1538             {
1539                 /* only needed if doing expanded ensemble */
1540                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1541                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1542                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1543             }
1544             if (bCalcEner)
1545             {
1546                 energyOutput.addDataAtEnergyStep(
1547                         bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
1548                         ir->expandedvals, lastbox,
1549                         PTCouplingArrays{ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
1550                                           state->nhpres_xi, state->nhpres_vxi },
1551                         state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
1552             }
1553             else
1554             {
1555                 energyOutput.recordNonEnergyStep();
1556             }
1557
1558             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1559             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1560
1561             if (doSimulatedAnnealing)
1562             {
1563                 gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
1564                                                               &(ir->opts));
1565             }
1566             if (do_log || do_ene || do_dr || do_or)
1567             {
1568                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1569                                                    do_log ? fplog : nullptr, step, t,
1570                                                    fr->fcdata.get(), awh.get());
1571             }
1572             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1573             {
1574                 const bool isInitialOutput = false;
1575                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1576             }
1577
1578             if (ir->bPull)
1579             {
1580                 pull_print_output(pull_work, step, t);
1581             }
1582
1583             if (do_per_step(step, ir->nstlog))
1584             {
1585                 if (fflush(fplog) != 0)
1586                 {
1587                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1588                 }
1589             }
1590         }
1591         if (bDoExpanded)
1592         {
1593             /* Have to do this part _after_ outputting the logfile and the edr file */
1594             /* Gets written into the state at the beginning of next loop*/
1595             state->fep_state = lamnew;
1596         }
1597         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1598         {
1599             state->fep_state = awh->fepLambdaState();
1600         }
1601         /* Print the remaining wall clock time for the run */
1602         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1603         {
1604             if (shellfc)
1605             {
1606                 fprintf(stderr, "\n");
1607             }
1608             print_time(stderr, walltime_accounting, step, ir, cr);
1609         }
1610
1611         /* Ion/water position swapping.
1612          * Not done in last step since trajectory writing happens before this call
1613          * in the MD loop and exchanges would be lost anyway. */
1614         bNeedRepartition = FALSE;
1615         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1616         {
1617             bNeedRepartition =
1618                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1619                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1620
1621             if (bNeedRepartition && DOMAINDECOMP(cr))
1622             {
1623                 dd_collect_state(cr->dd, state, state_global);
1624             }
1625         }
1626
1627         /* Replica exchange */
1628         bExchanged = FALSE;
1629         if (bDoReplEx)
1630         {
1631             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1632         }
1633
1634         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1635         {
1636             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1637                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1638                                 nrnb, wcycle, FALSE);
1639             shouldCheckNumberOfBondedInteractions = true;
1640             upd.setNumAtoms(state->natoms);
1641         }
1642
1643         bFirstStep = FALSE;
1644         bInitStep  = FALSE;
1645
1646         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1647         /* With all integrators, except VV, we need to retain the pressure
1648          * at the current step for coupling at the next step.
1649          */
1650         if ((state->flags & (1U << estPRES_PREV))
1651             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1652         {
1653             /* Store the pressure in t_state for pressure coupling
1654              * at the next MD step.
1655              */
1656             copy_mat(pres, state->pres_prev);
1657         }
1658
1659         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1660
1661         if ((membed != nullptr) && (!bLastStep))
1662         {
1663             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1664         }
1665
1666         cycles = wallcycle_stop(wcycle, ewcSTEP);
1667         if (DOMAINDECOMP(cr) && wcycle)
1668         {
1669             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1670         }
1671
1672         /* increase the MD step number */
1673         step++;
1674         step_rel++;
1675
1676         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1677                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1678
1679         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1680         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1681     }
1682     /* End of main MD loop */
1683
1684     /* Closing TNG files can include compressing data. Therefore it is good to do that
1685      * before stopping the time measurements. */
1686     mdoutf_tng_close(outf);
1687
1688     /* Stop measuring walltime */
1689     walltime_accounting_end_time(walltime_accounting);
1690
1691     if (!thisRankHasDuty(cr, DUTY_PME))
1692     {
1693         /* Tell the PME only node to finish */
1694         gmx_pme_send_finish(cr);
1695     }
1696
1697     if (MASTER(cr))
1698     {
1699         if (ir->nstcalcenergy > 0)
1700         {
1701             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1702             energyOutput.printAverages(fplog, groups);
1703         }
1704     }
1705     done_mdoutf(outf);
1706
1707     if (bPMETune)
1708     {
1709         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1710     }
1711
1712     done_shellfc(fplog, shellfc, step_rel);
1713
1714     if (useReplicaExchange && MASTER(cr))
1715     {
1716         print_replica_exchange_statistics(fplog, repl_ex);
1717     }
1718
1719     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1720
1721     global_stat_destroy(gstat);
1722 }