src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/awh/awh.h"
  56 #include "gromacs/commandline/filenm.h"
  57 #include "gromacs/domdec/collect.h"
  58 #include "gromacs/domdec/dlbtiming.h"
  59 #include "gromacs/domdec/domdec.h"
  60 #include "gromacs/domdec/domdec_network.h"
  61 #include "gromacs/domdec/domdec_struct.h"
  62 #include "gromacs/domdec/gpuhaloexchange.h"
  63 #include "gromacs/domdec/mdsetup.h"
  64 #include "gromacs/domdec/partition.h"
  65 #include "gromacs/essentialdynamics/edsam.h"
  66 #include "gromacs/ewald/pme_load_balancing.h"
  67 #include "gromacs/ewald/pme_pp.h"
  68 #include "gromacs/fileio/trxio.h"
  69 #include "gromacs/gmxlib/network.h"
  70 #include "gromacs/gmxlib/nrnb.h"
  71 #include "gromacs/gpu_utils/device_stream_manager.h"
  72 #include "gromacs/gpu_utils/gpu_utils.h"
  73 #include "gromacs/imd/imd.h"
  74 #include "gromacs/listed_forces/listed_forces.h"
  75 #include "gromacs/math/functions.h"
  76 #include "gromacs/math/vec.h"
  77 #include "gromacs/math/vectypes.h"
  78 #include "gromacs/mdlib/checkpointhandler.h"
  79 #include "gromacs/mdlib/compute_io.h"
  80 #include "gromacs/mdlib/constr.h"
  81 #include "gromacs/mdlib/coupling.h"
  82 #include "gromacs/mdlib/ebin.h"
  83 #include "gromacs/mdlib/enerdata_utils.h"
  84 #include "gromacs/mdlib/energyoutput.h"
  85 #include "gromacs/mdlib/expanded.h"
  86 #include "gromacs/mdlib/force.h"
  87 #include "gromacs/mdlib/force_flags.h"
  88 #include "gromacs/mdlib/forcerec.h"
  89 #include "gromacs/mdlib/md_support.h"
  90 #include "gromacs/mdlib/mdatoms.h"
  91 #include "gromacs/mdlib/mdoutf.h"
  92 #include "gromacs/mdlib/membed.h"
  93 #include "gromacs/mdlib/resethandler.h"
  94 #include "gromacs/mdlib/sighandler.h"
  95 #include "gromacs/mdlib/simulationsignal.h"
  96 #include "gromacs/mdlib/stat.h"
  97 #include "gromacs/mdlib/stophandler.h"
  98 #include "gromacs/mdlib/tgroup.h"
  99 #include "gromacs/mdlib/trajectory_writing.h"
 100 #include "gromacs/mdlib/update.h"
 101 #include "gromacs/mdlib/update_constrain_gpu.h"
 102 #include "gromacs/mdlib/vcm.h"
 103 #include "gromacs/mdlib/vsite.h"
 104 #include "gromacs/mdrunutility/handlerestart.h"
 105 #include "gromacs/mdrunutility/multisim.h"
 106 #include "gromacs/mdrunutility/printtime.h"
 107 #include "gromacs/mdtypes/awh_history.h"
 108 #include "gromacs/mdtypes/awh_params.h"
 109 #include "gromacs/mdtypes/commrec.h"
 110 #include "gromacs/mdtypes/df_history.h"
 111 #include "gromacs/mdtypes/energyhistory.h"
 112 #include "gromacs/mdtypes/fcdata.h"
 113 #include "gromacs/mdtypes/forcebuffers.h"
 114 #include "gromacs/mdtypes/forcerec.h"
 115 #include "gromacs/mdtypes/group.h"
 116 #include "gromacs/mdtypes/inputrec.h"
 117 #include "gromacs/mdtypes/interaction_const.h"
 118 #include "gromacs/mdtypes/md_enums.h"
 119 #include "gromacs/mdtypes/mdatom.h"
 120 #include "gromacs/mdtypes/mdrunoptions.h"
 121 #include "gromacs/mdtypes/observableshistory.h"
 122 #include "gromacs/mdtypes/pullhistory.h"
 123 #include "gromacs/mdtypes/simulation_workload.h"
 124 #include "gromacs/mdtypes/state.h"
 125 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 126 #include "gromacs/modularsimulator/energydata.h"
 127 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 128 #include "gromacs/nbnxm/nbnxm.h"
 129 #include "gromacs/pbcutil/pbc.h"
 130 #include "gromacs/pulling/output.h"
 131 #include "gromacs/pulling/pull.h"
 132 #include "gromacs/swap/swapcoords.h"
 133 #include "gromacs/timing/wallcycle.h"
 134 #include "gromacs/timing/walltime_accounting.h"
 135 #include "gromacs/topology/atoms.h"
 136 #include "gromacs/topology/idef.h"
 137 #include "gromacs/topology/mtop_util.h"
 138 #include "gromacs/topology/topology.h"
 139 #include "gromacs/trajectory/trajectoryframe.h"
 140 #include "gromacs/utility/basedefinitions.h"
 141 #include "gromacs/utility/cstringutil.h"
 142 #include "gromacs/utility/fatalerror.h"
 143 #include "gromacs/utility/logger.h"
 144 #include "gromacs/utility/real.h"
 145 #include "gromacs/utility/smalloc.h"
 146
 147 #include "legacysimulator.h"
 148 #include "replicaexchange.h"
 149 #include "shellfc.h"
 150
 151 #if GMX_FAHCORE
 152 #    include "corewrap.h"
 153 #endif
 154
 155 using gmx::SimulationSignaller;
 156
 157 void gmx::LegacySimulator::do_md()
 158 {
 159     // TODO Historically, the EM and MD "integrators" used different
 160     // names for the t_inputrec *parameter, but these must have the
 161     // same name, now that it's a member of a struct. We use this ir
 162     // alias to avoid a large ripple of nearly useless changes.
 163     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 164     // will go away eventually.
 165     t_inputrec*  ir = inputrec;
 166     int64_t      step, step_rel;
 167     double       t, t0 = ir->init_t, lam0[efptNR];
 168     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 169     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 170     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 171     gmx_bool     do_ene, do_log, do_verbose;
 172     gmx_bool     bMasterState;
 173     unsigned int force_flags;
 174     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 175     int    i, m;
 176     rvec   mu_tot;
 177     matrix pressureCouplingMu, M;
 178     gmx_repl_ex_t     repl_ex = nullptr;
 179     gmx_global_stat_t gstat;
 180     gmx_shellfc_t*    shellfc;
 181     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 182     gmx_bool          bTemp, bPres, bTrotter;
 183     real              dvdl_constr;
 184     std::vector<RVec> cbuf;
 185     matrix            lastbox;
 186     int               lamnew = 0;
 187     /* for FEP */
 188     int       nstfep = 0;
 189     double    cycles;
 190     real      saved_conserved_quantity = 0;
 191     real      last_ekin                = 0;
 192     t_extmass MassQ;
 193     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 194
 195     /* PME load balancing data for GPU kernels */
 196     gmx_bool bPMETune         = FALSE;
 197     gmx_bool bPMETunePrinting = FALSE;
 198
 199     bool bInteractiveMDstep = false;
 200
 201     /* Domain decomposition could incorrectly miss a bonded
 202        interaction, but checking for that requires a global
 203        communication stage, which does not otherwise happen in DD
 204        code. So we do that alongside the first global energy reduction
 205        after a new DD is made. These variables handle whether the
 206        check happens, and the result it returns. */
 207     bool shouldCheckNumberOfBondedInteractions = false;
 208     int  totalNumberOfBondedInteractions       = -1;
 209
 210     SimulationSignals signals;
 211     // Most global communnication stages don't propagate mdrun
 212     // signals, and will use this object to achieve that.
 213     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 214
 215     if (!mdrunOptions.writeConfout)
 216     {
 217         // This is on by default, and the main known use case for
 218         // turning it off is for convenience in benchmarking, which is
 219         // something that should not show up in the general user
 220         // interface.
 221         GMX_LOG(mdlog.info)
 222                 .asParagraph()
 223                 .appendText(
 224                         "The -noconfout functionality is deprecated, and may be removed in a "
 225                         "future version.");
 226     }
 227
 228     /* md-vv uses averaged full step velocities for T-control
 229        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 230        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 231     bTrotter = (EI_VV(ir->eI)
 232                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 233
 234     const bool bRerunMD = false;
 235
 236     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 237     bGStatEveryStep   = (nstglobalcomm == 1);
 238
 239     const SimulationGroups* groups = &top_global->groups;
 240
 241     std::unique_ptr<EssentialDynamics> ed = nullptr;
 242     if (opt2bSet("-ei", nfile, fnm))
 243     {
 244         /* Initialize essential dynamics sampling */
 245         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 246                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 247     }
 248     else if (observablesHistory->edsamHistory)
 249     {
 250         gmx_fatal(FARGS,
 251                   "The checkpoint is from a run with essential dynamics sampling, "
 252                   "but the current run did not specify the -ei option. "
 253                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 254     }
 255
 256     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 257     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 258     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda, lam0);
 259     Update     upd(*ir, deform);
 260     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 261     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 262
 263     const t_fcdata& fcdata = *fr->fcdata;
 264
 265     bool simulationsShareState = false;
 266     int  nstSignalComm         = nstglobalcomm;
 267     {
 268         // TODO This implementation of ensemble orientation restraints is nasty because
 269         // a user can't just do multi-sim with single-sim orientation restraints.
 270         bool usingEnsembleRestraints =
 271                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
 272         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 273
 274         // Replica exchange, ensemble restraints and AWH need all
 275         // simulations to remain synchronized, so they need
 276         // checkpoints and stop conditions to act on the same step, so
 277         // the propagation of such signals must take place between
 278         // simulations, not just within simulations.
 279         // TODO: Make algorithm initializers set these flags.
 280         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 281
 282         if (simulationsShareState)
 283         {
 284             // Inter-simulation signal communication does not need to happen
 285             // often, so we use a minimum of 200 steps to reduce overhead.
 286             const int c_minimumInterSimulationSignallingInterval = 200;
 287             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 288                             * nstglobalcomm;
 289         }
 290     }
 291
 292     if (startingBehavior != StartingBehavior::RestartWithAppending)
 293     {
 294         pleaseCiteCouplingAlgorithms(fplog, *ir);
 295     }
 296     gmx_mdoutf* outf =
 297             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 298                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 299     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 300                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 301
 302     gstat = global_stat_init(ir);
 303
 304     /* Check for polarizable models and flexible constraints */
 305     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 306                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 307
 308     {
 309         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 310         if ((io > 2000) && MASTER(cr))
 311         {
 312             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 313         }
 314     }
 315
 316     // Local state only becomes valid now.
 317     std::unique_ptr<t_state> stateInstance;
 318     t_state*                 state;
 319
 320
 321     gmx_localtop_t top(top_global->ffparams);
 322
 323     auto mdatoms = mdAtoms->mdatoms();
 324
 325     const auto& simulationWork     = runScheduleWork->simulationWork;
 326     const bool  useGpuForPme       = simulationWork.useGpuPme;
 327     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 328     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 329     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 330
 331     ForceBuffers f(((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 332                            ? PinningPolicy::PinnedIfSupported
 333                            : PinningPolicy::CannotBePinned);
 334     if (DOMAINDECOMP(cr))
 335     {
 336         stateInstance = std::make_unique<t_state>();
 337         state         = stateInstance.get();
 338         dd_init_local_state(cr->dd, state_global, state);
 339
 340         /* Distribute the charge groups over the nodes from the master node */
 341         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 342                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 343                             nrnb, nullptr, FALSE);
 344         shouldCheckNumberOfBondedInteractions = true;
 345         upd.setNumAtoms(state->natoms);
 346     }
 347     else
 348     {
 349         state_change_natoms(state_global, state_global->natoms);
 350         /* Copy the pointer to the global state */
 351         state = state_global;
 352
 353         /* Generate and initialize new topology */
 354         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 355
 356         upd.setNumAtoms(state->natoms);
 357     }
 358
 359     std::unique_ptr<UpdateConstrainGpu> integrator;
 360
 361     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 362
 363     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 364     if (useGpuForUpdate)
 365     {
 366         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 367                                    || constr->numConstraintsTotal() == 0,
 368                            "Constraints in domain decomposition are only supported with update "
 369                            "groups if using GPU update.\n");
 370         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 371                                    || constr->numConstraintsTotal() == 0,
 372                            "SHAKE is not supported with GPU update.");
 373         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 374                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 375                            "the GPU to use GPU update.\n");
 376         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 377                            "Only the md integrator is supported with the GPU update.\n");
 378         GMX_RELEASE_ASSERT(
 379                 ir->etc != etcNOSEHOOVER,
 380                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 381         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
 382                            "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
 383                            "with the GPU update.\n");
 384         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 385                            "Virtual sites are not supported with the GPU update.\n");
 386         GMX_RELEASE_ASSERT(ed == nullptr,
 387                            "Essential dynamics is not supported with the GPU update.\n");
 388         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 389                            "Constraints pulling is not supported with the GPU update.\n");
 390         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
 391                            "Orientation restraints are not supported with the GPU update.\n");
 392         GMX_RELEASE_ASSERT(
 393                 ir->efep == efepNO
 394                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 395                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 396                 "update.");
 397
 398         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 399         {
 400             GMX_LOG(mdlog.info)
 401                     .asParagraph()
 402                     .appendText("Updating coordinates and applying constraints on the GPU.");
 403         }
 404         else
 405         {
 406             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 407         }
 408         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 409                            "Device stream manager should be initialized in order to use GPU "
 410                            "update-constraints.");
 411         GMX_RELEASE_ASSERT(
 412                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 413                 "Update stream should be initialized in order to use GPU "
 414                 "update-constraints.");
 415         integrator = std::make_unique<UpdateConstrainGpu>(
 416                 *ir, *top_global, fr->deviceStreamManager->context(),
 417                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 418                 stateGpu->xUpdatedOnDevice());
 419
 420         integrator->setPbc(PbcType::Xyz, state->box);
 421     }
 422
 423     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 424     {
 425         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 426     }
 427     if (useGpuForUpdate)
 428     {
 429         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 430     }
 431
 432     // NOTE: The global state is no longer used at this point.
 433     // But state_global is still used as temporary storage space for writing
 434     // the global state to file and potentially for replica exchange.
 435     // (Global topology should persist.)
 436
 437     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 438
 439     if (ir->bExpanded)
 440     {
 441         /* Check nstexpanded here, because the grompp check was broken */
 442         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 443         {
 444             gmx_fatal(FARGS,
 445                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 446         }
 447         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 448     }
 449
 450     if (MASTER(cr))
 451     {
 452         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 453     }
 454
 455     preparePrevStepPullCom(ir, pull_work, mdatoms->massT, state, state_global, cr,
 456                            startingBehavior != StartingBehavior::NewSimulation);
 457
 458     // TODO: Remove this by converting AWH into a ForceProvider
 459     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 460                                 startingBehavior != StartingBehavior::NewSimulation,
 461                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 462
 463     if (useReplicaExchange && MASTER(cr))
 464     {
 465         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 466     }
 467     /* PME tuning is only supported in the Verlet scheme, with PME for
 468      * Coulomb. It is not supported with only LJ PME. */
 469     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 470                 && ir->cutoff_scheme != ecutsGROUP);
 471
 472     pme_load_balancing_t* pme_loadbal = nullptr;
 473     if (bPMETune)
 474     {
 475         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 476                          fr->nbv->useGpu());
 477     }
 478
 479     if (!ir->bContinuation)
 480     {
 481         if (state->flags & (1U << estV))
 482         {
 483             auto v = makeArrayRef(state->v);
 484             /* Set the velocities of vsites, shells and frozen atoms to zero */
 485             for (i = 0; i < mdatoms->homenr; i++)
 486             {
 487                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 488                 {
 489                     clear_rvec(v[i]);
 490                 }
 491                 else if (mdatoms->cFREEZE)
 492                 {
 493                     for (m = 0; m < DIM; m++)
 494                     {
 495                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 496                         {
 497                             v[i][m] = 0;
 498                         }
 499                     }
 500                 }
 501             }
 502         }
 503
 504         if (constr)
 505         {
 506             /* Constrain the initial coordinates and velocities */
 507             do_constrain_first(fplog, constr, ir, mdatoms->nr, mdatoms->homenr,
 508                                state->x.arrayRefWithPadding(), state->v.arrayRefWithPadding(),
 509                                state->box, state->lambda[efptBONDED]);
 510         }
 511         if (vsite)
 512         {
 513             /* Construct the virtual sites for the initial configuration */
 514             vsite->construct(state->x, ir->delta_t, {}, state->box);
 515         }
 516     }
 517
 518     if (ir->efep != efepNO)
 519     {
 520         /* Set free energy calculation frequency as the greatest common
 521          * denominator of nstdhdl and repl_ex_nst. */
 522         nstfep = ir->fepvals->nstdhdl;
 523         if (ir->bExpanded)
 524         {
 525             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
 526         }
 527         if (useReplicaExchange)
 528         {
 529             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
 530         }
 531         if (ir->bDoAwh)
 532         {
 533             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
 534         }
 535     }
 536
 537     /* Be REALLY careful about what flags you set here. You CANNOT assume
 538      * this is the first step, since we might be restarting from a checkpoint,
 539      * and in that case we should not do any modifications to the state.
 540      */
 541     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 542
 543     // When restarting from a checkpoint, it can be appropriate to
 544     // initialize ekind from quantities in the checkpoint. Otherwise,
 545     // compute_globals must initialize ekind before the simulation
 546     // starts/restarts. However, only the master rank knows what was
 547     // found in the checkpoint file, so we have to communicate in
 548     // order to coordinate the restart.
 549     //
 550     // TODO Consider removing this communication if/when checkpoint
 551     // reading directly follows .tpr reading, because all ranks can
 552     // agree on hasReadEkinState at that time.
 553     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 554     if (PAR(cr))
 555     {
 556         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 557     }
 558     if (hasReadEkinState)
 559     {
 560         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 561     }
 562
 563     unsigned int cglo_flags =
 564             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 565              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 566
 567     bSumEkinhOld = FALSE;
 568
 569     t_vcm vcm(top_global->groups, *ir);
 570     reportComRemovalInfo(fplog, vcm);
 571
 572     /* To minimize communication, compute_globals computes the COM velocity
 573      * and the kinetic energy for the velocities without COM motion removed.
 574      * Thus to get the kinetic energy without the COM contribution, we need
 575      * to call compute_globals twice.
 576      */
 577     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 578     {
 579         unsigned int cglo_flags_iteration = cglo_flags;
 580         if (bStopCM && cgloIteration == 0)
 581         {
 582             cglo_flags_iteration |= CGLO_STOPCM;
 583             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 584         }
 585         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 586                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 587                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 588                         state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 589                         cglo_flags_iteration
 590                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 591                                                                          : 0));
 592         if (cglo_flags_iteration & CGLO_STOPCM)
 593         {
 594             /* At initialization, do not pass x with acceleration-correction mode
 595              * to avoid (incorrect) correction of the initial coordinates.
 596              */
 597             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 598                                                                      : makeArrayRef(state->x);
 599             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 600             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 601         }
 602     }
 603     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 604                                     makeConstArrayRef(state->x), state->box,
 605                                     &shouldCheckNumberOfBondedInteractions);
 606     if (ir->eI == eiVVAK)
 607     {
 608         /* a second call to get the half step temperature initialized as well */
 609         /* we do the same call as above, but turn the pressure off -- internally to
 610            compute_globals, this is recognized as a velocity verlet half-step
 611            kinetic energy calculation.  This minimized excess variables, but
 612            perhaps loses some logic?*/
 613
 614         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 615                         makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, nullptr,
 616                         enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
 617                         state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 618     }
 619
 620     /* Calculate the initial half step temperature, and save the ekinh_old */
 621     if (startingBehavior == StartingBehavior::NewSimulation)
 622     {
 623         for (i = 0; (i < ir->opts.ngtc); i++)
 624         {
 625             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 626         }
 627     }
 628
 629     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 630        for non-trotter temperature control */
 631     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 632
 633     if (MASTER(cr))
 634     {
 635         if (!ir->bContinuation)
 636         {
 637             if (constr && ir->eConstrAlg == econtLINCS)
 638             {
 639                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 640                         constr->rmsd());
 641             }
 642             if (EI_STATE_VELOCITY(ir->eI))
 643             {
 644                 real temp = enerd->term[F_TEMP];
 645                 if (ir->eI != eiVV)
 646                 {
 647                     /* Result of Ekin averaged over velocities of -half
 648                      * and +half step, while we only have -half step here.
 649                      */
 650                     temp *= 2;
 651                 }
 652                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 653             }
 654         }
 655
 656         char tbuf[20];
 657         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 658         if (ir->nsteps >= 0)
 659         {
 660             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 661         }
 662         else
 663         {
 664             sprintf(tbuf, "%s", "infinite");
 665         }
 666         if (ir->init_step > 0)
 667         {
 668             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 669                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 670                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 671         }
 672         else
 673         {
 674             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 675         }
 676         fprintf(fplog, "\n");
 677     }
 678
 679     walltime_accounting_start_time(walltime_accounting);
 680     wallcycle_start(wcycle, ewcRUN);
 681     print_start(fplog, cr, walltime_accounting, "mdrun");
 682
 683 #if GMX_FAHCORE
 684     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 685     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 686     if (chkpt_ret == 0)
 687     {
 688         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 689     }
 690 #endif
 691
 692     /***********************************************************
 693      *
 694      *             Loop over MD steps
 695      *
 696      ************************************************************/
 697
 698     bFirstStep = TRUE;
 699     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 700     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 701     bSumEkinhOld     = FALSE;
 702     bExchanged       = FALSE;
 703     bNeedRepartition = FALSE;
 704
 705     step     = ir->init_step;
 706     step_rel = 0;
 707
 708     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 709             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 710             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 711             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 712
 713     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 714             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 715             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 716             mdrunOptions.checkpointOptions.period);
 717
 718     const bool resetCountersIsLocal = true;
 719     auto       resetHandler         = std::make_unique<ResetHandler>(
 720             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 721             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 722             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 723
 724     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 725
 726     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 727     {
 728         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 729     }
 730
 731     /* and stop now if we should */
 732     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 733     while (!bLastStep)
 734     {
 735
 736         /* Determine if this is a neighbor search step */
 737         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 738
 739         if (bPMETune && bNStList)
 740         {
 741             // This has to be here because PME load balancing is called so early.
 742             // TODO: Move to after all booleans are defined.
 743             if (useGpuForUpdate && !bFirstStep)
 744             {
 745                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 746                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 747             }
 748             /* PME grid + cut-off optimization with GPUs or PME nodes */
 749             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 750                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 751                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 752         }
 753
 754         wallcycle_start(wcycle, ewcSTEP);
 755
 756         bLastStep = (step_rel == ir->nsteps);
 757         t         = t0 + step * ir->delta_t;
 758
 759         // TODO Refactor this, so that nstfep does not need a default value of zero
 760         if (ir->efep != efepNO || ir->bSimTemp)
 761         {
 762             /* find and set the current lambdas */
 763             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
 764
 765             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 766             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 767             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 768                            && (!bFirstStep));
 769         }
 770
 771         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 772                      && do_per_step(step, replExParams.exchangeInterval));
 773
 774         if (doSimulatedAnnealing)
 775         {
 776             update_annealing_target_temp(ir, t, &upd);
 777         }
 778
 779         /* Stop Center of Mass motion */
 780         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 781
 782         /* Determine whether or not to do Neighbour Searching */
 783         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 784
 785         /* Note that the stopHandler will cause termination at nstglobalcomm
 786          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 787          * nstpcouple steps, we have computed the half-step kinetic energy
 788          * of the previous step and can always output energies at the last step.
 789          */
 790         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 791
 792         /* do_log triggers energy and virial calculation. Because this leads
 793          * to different code paths, forces can be different. Thus for exact
 794          * continuation we should avoid extra log output.
 795          * Note that the || bLastStep can result in non-exact continuation
 796          * beyond the last step. But we don't consider that to be an issue.
 797          */
 798         do_log     = (do_per_step(step, ir->nstlog)
 799                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 800         do_verbose = mdrunOptions.verbose
 801                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 802
 803         if (useGpuForUpdate && !bFirstStep && bNS)
 804         {
 805             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 806             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 807             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 808             // Copy coordinate from the GPU when needed at the search step.
 809             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 810             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 811             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 812             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 813         }
 814
 815         if (bNS && !(bFirstStep && ir->bContinuation))
 816         {
 817             bMasterState = FALSE;
 818             /* Correct the new box if it is too skewed */
 819             if (inputrecDynamicBox(ir))
 820             {
 821                 if (correct_box(fplog, step, state->box))
 822                 {
 823                     bMasterState = TRUE;
 824                     // If update is offloaded, it should be informed about the box size change
 825                     if (useGpuForUpdate)
 826                     {
 827                         integrator->setPbc(PbcType::Xyz, state->box);
 828                     }
 829                 }
 830             }
 831             if (DOMAINDECOMP(cr) && bMasterState)
 832             {
 833                 dd_collect_state(cr->dd, state, state_global);
 834             }
 835
 836             if (DOMAINDECOMP(cr))
 837             {
 838                 /* Repartition the domain decomposition */
 839                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 840                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 841                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 842                 shouldCheckNumberOfBondedInteractions = true;
 843                 upd.setNumAtoms(state->natoms);
 844             }
 845         }
 846
 847         // Allocate or re-size GPU halo exchange object, if necessary
 848         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 849             && useGpuForNonbonded && is1D(*cr->dd))
 850         {
 851             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 852                                "GPU device manager has to be initialized to use GPU "
 853                                "version of halo exchange.");
 854             // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
 855             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
 856         }
 857
 858         if (MASTER(cr) && do_log)
 859         {
 860             gmx::EnergyOutput::printHeader(fplog, step,
 861                                            t); /* can we improve the information printed here? */
 862         }
 863
 864         if (ir->efep != efepNO)
 865         {
 866             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 867         }
 868
 869         if (bExchanged)
 870         {
 871
 872             /* We need the kinetic energy at minus the half step for determining
 873              * the full step kinetic energy and possibly for T-coupling.*/
 874             /* This may not be quite working correctly yet . . . . */
 875             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 876                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
 877                             enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
 878                             state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 879                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 880             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 881                                             &top, makeConstArrayRef(state->x), state->box,
 882                                             &shouldCheckNumberOfBondedInteractions);
 883         }
 884         clear_mat(force_vir);
 885
 886         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 887
 888         /* Determine the energy and pressure:
 889          * at nstcalcenergy steps and at energy output steps (set below).
 890          */
 891         if (EI_VV(ir->eI) && (!bInitStep))
 892         {
 893             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 894             bCalcVir      = bCalcEnerStep
 895                        || (ir->epc != epcNO
 896                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 897         }
 898         else
 899         {
 900             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 901             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 902         }
 903         bCalcEner = bCalcEnerStep;
 904
 905         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 906
 907         if (do_ene || do_log || bDoReplEx)
 908         {
 909             bCalcVir  = TRUE;
 910             bCalcEner = TRUE;
 911         }
 912
 913         /* Do we need global communication ? */
 914         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 915                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 916
 917         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 918                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 919                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 920
 921         if (shellfc)
 922         {
 923             /* Now is the time to relax the shells */
 924             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 925                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd,
 926                                 state->natoms, state->x.arrayRefWithPadding(),
 927                                 state->v.arrayRefWithPadding(), state->box, state->lambda,
 928                                 &state->hist, &f.view(), force_vir, mdatoms, nrnb, wcycle, shellfc,
 929                                 fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 930         }
 931         else
 932         {
 933             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 934                is updated (or the AWH update will be performed twice for one step when continuing).
 935                It would be best to call this update function from do_md_trajectory_writing but that
 936                would occur after do_force. One would have to divide the update_awh function into one
 937                function applying the AWH force and one doing the AWH bias update. The update AWH
 938                bias function could then be called after do_md_trajectory_writing (then containing
 939                update_awh_history). The checkpointing will in the future probably moved to the start
 940                of the md loop which will rid of this issue. */
 941             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 942             {
 943                 awh->updateHistory(state_global->awhHistory.get());
 944             }
 945
 946             /* The coordinates (x) are shifted (to get whole molecules)
 947              * in do_force.
 948              * This is parallellized as well, and does communication too.
 949              * Check comments in sim_util.c
 950              */
 951             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 952                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 953                      &f.view(), force_vir, mdatoms, enerd, state->lambda, fr, runScheduleWork,
 954                      vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 955                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 956         }
 957
 958         // VV integrators do not need the following velocity half step
 959         // if it is the first step after starting from a checkpoint.
 960         // That is, the half step is needed on all other steps, and
 961         // also the first step when starting from a .tpr file.
 962         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 963         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 964         {
 965             rvec* vbuf = nullptr;
 966
 967             wallcycle_start(wcycle, ewcUPDATE);
 968             if (ir->eI == eiVV && bInitStep)
 969             {
 970                 /* if using velocity verlet with full time step Ekin,
 971                  * take the first half step only to compute the
 972                  * virial for the first step. From there,
 973                  * revert back to the initial coordinates
 974                  * so that the input is actually the initial step.
 975                  */
 976                 snew(vbuf, state->natoms);
 977                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
 978                            state->natoms); /* should make this better for parallelizing? */
 979             }
 980             else
 981             {
 982                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
 983                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
 984                                trotter_seq, ettTSEQ1);
 985             }
 986
 987             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
 988                               M, etrtVELOCITY1, cr, constr != nullptr);
 989
 990             wallcycle_stop(wcycle, ewcUPDATE);
 991             constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
 992             wallcycle_start(wcycle, ewcUPDATE);
 993             /* if VV, compute the pressure and constraints */
 994             /* For VV2, we strictly only need this if using pressure
 995              * control, but we really would like to have accurate pressures
 996              * printed out.
 997              * Think about ways around this in the future?
 998              * For now, keep this choice in comments.
 999              */
1000             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1001             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1002             bPres = TRUE;
1003             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1004             if (bCalcEner && ir->eI == eiVVAK)
1005             {
1006                 bSumEkinhOld = TRUE;
1007             }
1008             /* for vv, the first half of the integration actually corresponds to the previous step.
1009                So we need information from the last step in the first half of the integration */
1010             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1011             {
1012                 wallcycle_stop(wcycle, ewcUPDATE);
1013                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1014                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1015                                 enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
1016                                 state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1017                                 (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1018                                         | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1019                                         | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1020                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1021                                                                                  : 0)
1022                                         | CGLO_SCALEEKIN);
1023                 /* explanation of above:
1024                    a) We compute Ekin at the full time step
1025                    if 1) we are using the AveVel Ekin, and it's not the
1026                    initial step, or 2) if we are using AveEkin, but need the full
1027                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1028                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1029                    EkinAveVel because it's needed for the pressure */
1030                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1031                                                 top_global, &top, makeConstArrayRef(state->x),
1032                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1033                 if (bStopCM)
1034                 {
1035                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1036                                            makeArrayRef(state->v));
1037                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1038                 }
1039                 wallcycle_start(wcycle, ewcUPDATE);
1040             }
1041             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1042             if (!bInitStep)
1043             {
1044                 if (bTrotter)
1045                 {
1046                     m_add(force_vir, shake_vir,
1047                           total_vir); /* we need the un-dispersion corrected total vir here */
1048                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1049                                    trotter_seq, ettTSEQ2);
1050
1051                     /* TODO This is only needed when we're about to write
1052                      * a checkpoint, because we use it after the restart
1053                      * (in a kludge?). But what should we be doing if
1054                      * the startingBehavior is NewSimulation or bInitStep are true? */
1055                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1056                     {
1057                         copy_mat(shake_vir, state->svir_prev);
1058                         copy_mat(force_vir, state->fvir_prev);
1059                     }
1060                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1061                     {
1062                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1063                         enerd->term[F_TEMP] =
1064                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1065                         enerd->term[F_EKIN] = trace(ekind->ekin);
1066                     }
1067                 }
1068                 else if (bExchanged)
1069                 {
1070                     wallcycle_stop(wcycle, ewcUPDATE);
1071                     /* We need the kinetic energy at minus the half step for determining
1072                      * the full step kinetic energy and possibly for T-coupling.*/
1073                     /* This may not be quite working correctly yet . . . . */
1074                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1075                                     makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle,
1076                                     enerd, nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller,
1077                                     state->box, nullptr, &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1078                     wallcycle_start(wcycle, ewcUPDATE);
1079                 }
1080             }
1081             /* if it's the initial step, we performed this first step just to get the constraint virial */
1082             if (ir->eI == eiVV && bInitStep)
1083             {
1084                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1085                 sfree(vbuf);
1086             }
1087             wallcycle_stop(wcycle, ewcUPDATE);
1088         }
1089
1090         /* compute the conserved quantity */
1091         if (EI_VV(ir->eI))
1092         {
1093             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1094             if (ir->eI == eiVV)
1095             {
1096                 last_ekin = enerd->term[F_EKIN];
1097             }
1098             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1099             {
1100                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1101             }
1102             /* sum up the foreign kinetic energy and dK/dl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1103             if (ir->efep != efepNO)
1104             {
1105                 accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1106             }
1107         }
1108
1109         /* ########  END FIRST UPDATE STEP  ############## */
1110         /* ########  If doing VV, we now have v(dt) ###### */
1111         if (bDoExpanded)
1112         {
1113             /* perform extended ensemble sampling in lambda - we don't
1114                actually move to the new state before outputting
1115                statistics, but if performing simulated tempering, we
1116                do update the velocities and the tau_t. */
1117
1118             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1119                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1120             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1121             if (MASTER(cr))
1122             {
1123                 copy_df_history(state_global->dfhist, state->dfhist);
1124             }
1125         }
1126
1127         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1128         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1129         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1130             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1131                 || checkpointHandler->isCheckpointingStep()))
1132         {
1133             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1134             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1135         }
1136         // Copy velocities if needed for the output/checkpointing.
1137         // NOTE: Copy on the search steps is done at the beginning of the step.
1138         if (useGpuForUpdate && !bNS
1139             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1140         {
1141             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1142             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1143         }
1144         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1145         // and update is offloaded hence forces are kept on the GPU for update and have not been
1146         // already transferred in do_force().
1147         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1148         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1149         //       prior to GPU update.
1150         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1151         //       copy call in do_force(...).
1152         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1153         //       on host after the D2H copy in do_force(...).
1154         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1155             && do_per_step(step, ir->nstfout))
1156         {
1157             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1158             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1159         }
1160         /* Now we have the energies and forces corresponding to the
1161          * coordinates at time t. We must output all of this before
1162          * the update.
1163          */
1164         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1165                                  observablesHistory, top_global, fr, outf, energyOutput, ekind,
1166                                  f.view().force(), checkpointHandler->isCheckpointingStep(),
1167                                  bRerunMD, bLastStep, mdrunOptions.writeConfout, bSumEkinhOld);
1168         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1169         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1170
1171         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1172         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1173             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1174         {
1175             copy_mat(state->svir_prev, shake_vir);
1176             copy_mat(state->fvir_prev, force_vir);
1177         }
1178
1179         stopHandler->setSignal();
1180         resetHandler->setSignal(walltime_accounting);
1181
1182         if (bGStat || !PAR(cr))
1183         {
1184             /* In parallel we only have to check for checkpointing in steps
1185              * where we do global communication,
1186              *  otherwise the other nodes don't know.
1187              */
1188             checkpointHandler->setSignal(walltime_accounting);
1189         }
1190
1191         /* #########   START SECOND UPDATE STEP ################# */
1192
1193         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1194            controlled in preprocessing */
1195
1196         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1197         {
1198             gmx_bool bIfRandomize;
1199             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1200             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1201             if (constr && bIfRandomize)
1202             {
1203                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1204             }
1205         }
1206         /* Box is changed in update() when we do pressure coupling,
1207          * but we should still use the old box for energy corrections and when
1208          * writing it to the energy file, so it matches the trajectory files for
1209          * the same timestep above. Make a copy in a separate array.
1210          */
1211         copy_mat(state->box, lastbox);
1212
1213         dvdl_constr = 0;
1214
1215         wallcycle_start(wcycle, ewcUPDATE);
1216         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1217         if (bTrotter)
1218         {
1219             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1220             /* We can only do Berendsen coupling after we have summed
1221              * the kinetic energy or virial. Since the happens
1222              * in global_state after update, we should only do it at
1223              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1224              */
1225         }
1226         else
1227         {
1228             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1229             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1230         }
1231
1232         if (EI_VV(ir->eI))
1233         {
1234             /* velocity half-step update */
1235             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1236                               M, etrtVELOCITY2, cr, constr != nullptr);
1237         }
1238
1239         /* Above, initialize just copies ekinh into ekin,
1240          * it doesn't copy position (for VV),
1241          * and entire integrator for MD.
1242          */
1243
1244         if (ir->eI == eiVVAK)
1245         {
1246             cbuf.resize(state->x.size());
1247             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1248         }
1249
1250         /* With leap-frog type integrators we compute the kinetic energy
1251          * at a whole time step as the average of the half-time step kinetic
1252          * energies of two subsequent steps. Therefore we need to compute the
1253          * half step kinetic energy also if we need energies at the next step.
1254          */
1255         const bool needHalfStepKineticEnergy =
1256                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1257
1258         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1259         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1260         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1261                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1262
1263         if (useGpuForUpdate)
1264         {
1265             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1266             {
1267                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1268                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1269
1270                 // Copy data to the GPU after buffers might have being reinitialized
1271                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1272                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1273             }
1274
1275             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1276             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1277             {
1278                 stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1279             }
1280
1281             const bool doTemperatureScaling =
1282                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1283
1284             // This applies Leap-Frog, LINCS and SETTLE in succession
1285             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1286                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1287                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1288                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1289
1290             // Copy velocities D2H after update if:
1291             // - Globals are computed this step (includes the energy output steps).
1292             // - Temperature is needed for the next step.
1293             if (bGStat || needHalfStepKineticEnergy)
1294             {
1295                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1296                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1297             }
1298         }
1299         else
1300         {
1301             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1302                               M, etrtPOSITION, cr, constr != nullptr);
1303
1304             wallcycle_stop(wcycle, ewcUPDATE);
1305
1306             constrain_coordinates(constr, do_log, do_ene, step, state,
1307                                   upd.xp()->arrayRefWithPadding(), &dvdl_constr, bCalcVir, shake_vir);
1308
1309             upd.update_sd_second_half(*ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle,
1310                                       constr, do_log, do_ene);
1311             upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1312         }
1313
1314         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1315         {
1316             updatePrevStepPullCom(pull_work, state);
1317         }
1318
1319         if (ir->eI == eiVVAK)
1320         {
1321             /* erase F_EKIN and F_TEMP here? */
1322             /* just compute the kinetic energy at the half step to perform a trotter step */
1323             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1324                             makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm, wcycle, enerd,
1325                             force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
1326                             nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1327             wallcycle_start(wcycle, ewcUPDATE);
1328             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1329             /* now we know the scaling, we can compute the positions again */
1330             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1331
1332             upd.update_coords(*ir, step, mdatoms, state, f.view().forceWithPadding(), fcdata, ekind,
1333                               M, etrtPOSITION, cr, constr != nullptr);
1334             wallcycle_stop(wcycle, ewcUPDATE);
1335
1336             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1337             /* are the small terms in the shake_vir here due
1338              * to numerical errors, or are they important
1339              * physically? I'm thinking they are just errors, but not completely sure.
1340              * For now, will call without actually constraining, constr=NULL*/
1341             upd.finish_update(*ir, mdatoms, state, wcycle, false);
1342         }
1343         if (EI_VV(ir->eI))
1344         {
1345             /* this factor or 2 correction is necessary
1346                because half of the constraint force is removed
1347                in the vv step, so we have to double it.  See
1348                the Issue #1255.  It is not yet clear
1349                if the factor of 2 is exact, or just a very
1350                good approximation, and this will be
1351                investigated.  The next step is to see if this
1352                can be done adding a dhdl contribution from the
1353                rattle step, but this is somewhat more
1354                complicated with the current code. Will be
1355                investigated, hopefully for 4.6.3. However,
1356                this current solution is much better than
1357                having it completely wrong.
1358              */
1359             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1360         }
1361         else
1362         {
1363             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1364         }
1365
1366         if (vsite != nullptr)
1367         {
1368             wallcycle_start(wcycle, ewcVSITECONSTR);
1369             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1370             wallcycle_stop(wcycle, ewcVSITECONSTR);
1371         }
1372
1373         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1374         /* With Leap-Frog we can skip compute_globals at
1375          * non-communication steps, but we need to calculate
1376          * the kinetic energy one step before communication.
1377          */
1378         {
1379             // Organize to do inter-simulation signalling on steps if
1380             // and when algorithms require it.
1381             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1382
1383             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1384             {
1385                 // Copy coordinates when needed to stop the CM motion.
1386                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1387                 {
1388                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1389                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1390                 }
1391                 // Since we're already communicating at this step, we
1392                 // can propagate intra-simulation signals. Note that
1393                 // check_nstglobalcomm has the responsibility for
1394                 // choosing the value of nstglobalcomm that is one way
1395                 // bGStat becomes true, so we can't get into a
1396                 // situation where e.g. checkpointing can't be
1397                 // signalled.
1398                 bool                doIntraSimSignal = true;
1399                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1400
1401                 compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1402                                 makeConstArrayRef(state->v), state->box, mdatoms, nrnb, &vcm,
1403                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1404                                 &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1405                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1406                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1407                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1408                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1409                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1410                                                                                  : 0));
1411                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1412                                                 top_global, &top, makeConstArrayRef(state->x),
1413                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1414                 if (!EI_VV(ir->eI) && bStopCM)
1415                 {
1416                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1417                                            makeArrayRef(state->v));
1418                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1419
1420                     // TODO: The special case of removing CM motion should be dealt more gracefully
1421                     if (useGpuForUpdate)
1422                     {
1423                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1424                         // Here we block until the H2D copy completes because event sync with the
1425                         // force kernels that use the coordinates on the next steps is not implemented
1426                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1427                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1428                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1429                         if (vcm.mode != ecmNO)
1430                         {
1431                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1432                         }
1433                     }
1434                 }
1435             }
1436         }
1437
1438         /* #############  END CALC EKIN AND PRESSURE ################# */
1439
1440         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1441            the virial that should probably be addressed eventually. state->veta has better properies,
1442            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1443            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1444
1445         if (ir->efep != efepNO && !EI_VV(ir->eI))
1446         {
1447             /* Sum up the foreign energy and dK/dl terms for md and sd.
1448                Currently done every step so that dH/dl is correct in the .edr */
1449             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1450         }
1451
1452         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1453                                          pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1454
1455         const bool doBerendsenPressureCoupling =
1456                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1457         if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
1458         {
1459             integrator->scaleCoordinates(pressureCouplingMu);
1460             integrator->setPbc(PbcType::Xyz, state->box);
1461         }
1462
1463         /* ################# END UPDATE STEP 2 ################# */
1464         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1465
1466         /* The coordinates (x) were unshifted in update */
1467         if (!bGStat)
1468         {
1469             /* We will not sum ekinh_old,
1470              * so signal that we still have to do it.
1471              */
1472             bSumEkinhOld = TRUE;
1473         }
1474
1475         if (bCalcEner)
1476         {
1477             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1478
1479             /* use the directly determined last velocity, not actually the averaged half steps */
1480             if (bTrotter && ir->eI == eiVV)
1481             {
1482                 enerd->term[F_EKIN] = last_ekin;
1483             }
1484             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1485
1486             if (integratorHasConservedEnergyQuantity(ir))
1487             {
1488                 if (EI_VV(ir->eI))
1489                 {
1490                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1491                 }
1492                 else
1493                 {
1494                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1495                 }
1496             }
1497             /* #########  END PREPARING EDR OUTPUT  ###########  */
1498         }
1499
1500         /* Output stuff */
1501         if (MASTER(cr))
1502         {
1503             if (fplog && do_log && bDoExpanded)
1504             {
1505                 /* only needed if doing expanded ensemble */
1506                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1507                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1508                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1509             }
1510             if (bCalcEner)
1511             {
1512                 energyOutput.addDataAtEnergyStep(
1513                         bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, ir->fepvals,
1514                         ir->expandedvals, lastbox,
1515                         PTCouplingArrays{ state->boxv, state->nosehoover_xi, state->nosehoover_vxi,
1516                                           state->nhpres_xi, state->nhpres_vxi },
1517                         state->fep_state, shake_vir, force_vir, total_vir, pres, ekind, mu_tot, constr);
1518             }
1519             else
1520             {
1521                 energyOutput.recordNonEnergyStep();
1522             }
1523
1524             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1525             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1526
1527             if (doSimulatedAnnealing)
1528             {
1529                 gmx::EnergyOutput::printAnnealingTemperatures(do_log ? fplog : nullptr, groups,
1530                                                               &(ir->opts));
1531             }
1532             if (do_log || do_ene || do_dr || do_or)
1533             {
1534                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1535                                                    do_log ? fplog : nullptr, step, t,
1536                                                    fr->fcdata.get(), awh.get());
1537             }
1538             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1539             {
1540                 const bool isInitialOutput = false;
1541                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1542             }
1543
1544             if (ir->bPull)
1545             {
1546                 pull_print_output(pull_work, step, t);
1547             }
1548
1549             if (do_per_step(step, ir->nstlog))
1550             {
1551                 if (fflush(fplog) != 0)
1552                 {
1553                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1554                 }
1555             }
1556         }
1557         if (bDoExpanded)
1558         {
1559             /* Have to do this part _after_ outputting the logfile and the edr file */
1560             /* Gets written into the state at the beginning of next loop*/
1561             state->fep_state = lamnew;
1562         }
1563         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1564         {
1565             state->fep_state = awh->fepLambdaState();
1566         }
1567         /* Print the remaining wall clock time for the run */
1568         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1569         {
1570             if (shellfc)
1571             {
1572                 fprintf(stderr, "\n");
1573             }
1574             print_time(stderr, walltime_accounting, step, ir, cr);
1575         }
1576
1577         /* Ion/water position swapping.
1578          * Not done in last step since trajectory writing happens before this call
1579          * in the MD loop and exchanges would be lost anyway. */
1580         bNeedRepartition = FALSE;
1581         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1582         {
1583             bNeedRepartition =
1584                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1585                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1586
1587             if (bNeedRepartition && DOMAINDECOMP(cr))
1588             {
1589                 dd_collect_state(cr->dd, state, state_global);
1590             }
1591         }
1592
1593         /* Replica exchange */
1594         bExchanged = FALSE;
1595         if (bDoReplEx)
1596         {
1597             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1598         }
1599
1600         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1601         {
1602             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1603                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1604                                 nrnb, wcycle, FALSE);
1605             shouldCheckNumberOfBondedInteractions = true;
1606             upd.setNumAtoms(state->natoms);
1607         }
1608
1609         bFirstStep = FALSE;
1610         bInitStep  = FALSE;
1611
1612         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1613         /* With all integrators, except VV, we need to retain the pressure
1614          * at the current step for coupling at the next step.
1615          */
1616         if ((state->flags & (1U << estPRES_PREV))
1617             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1618         {
1619             /* Store the pressure in t_state for pressure coupling
1620              * at the next MD step.
1621              */
1622             copy_mat(pres, state->pres_prev);
1623         }
1624
1625         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1626
1627         if ((membed != nullptr) && (!bLastStep))
1628         {
1629             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1630         }
1631
1632         cycles = wallcycle_stop(wcycle, ewcSTEP);
1633         if (DOMAINDECOMP(cr) && wcycle)
1634         {
1635             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1636         }
1637
1638         /* increase the MD step number */
1639         step++;
1640         step_rel++;
1641
1642         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1643                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1644
1645         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1646         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1647     }
1648     /* End of main MD loop */
1649
1650     /* Closing TNG files can include compressing data. Therefore it is good to do that
1651      * before stopping the time measurements. */
1652     mdoutf_tng_close(outf);
1653
1654     /* Stop measuring walltime */
1655     walltime_accounting_end_time(walltime_accounting);
1656
1657     if (!thisRankHasDuty(cr, DUTY_PME))
1658     {
1659         /* Tell the PME only node to finish */
1660         gmx_pme_send_finish(cr);
1661     }
1662
1663     if (MASTER(cr))
1664     {
1665         if (ir->nstcalcenergy > 0)
1666         {
1667             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1668             energyOutput.printAverages(fplog, groups);
1669         }
1670     }
1671     done_mdoutf(outf);
1672
1673     if (bPMETune)
1674     {
1675         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1676     }
1677
1678     done_shellfc(fplog, shellfc, step_rel);
1679
1680     if (useReplicaExchange && MASTER(cr))
1681     {
1682         print_replica_exchange_statistics(fplog, repl_ex);
1683     }
1684
1685     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1686
1687     global_stat_destroy(gstat);
1688 }