src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53
  54 #include "gromacs/awh/awh.h"
  55 #include "gromacs/commandline/filenm.h"
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/domdec_struct.h"
  61 #include "gromacs/domdec/gpuhaloexchange.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/domdec/partition.h"
  64 #include "gromacs/essentialdynamics/edsam.h"
  65 #include "gromacs/ewald/pme_load_balancing.h"
  66 #include "gromacs/ewald/pme_pp.h"
  67 #include "gromacs/fileio/trxio.h"
  68 #include "gromacs/gmxlib/network.h"
  69 #include "gromacs/gmxlib/nrnb.h"
  70 #include "gromacs/gpu_utils/gpu_utils.h"
  71 #include "gromacs/imd/imd.h"
  72 #include "gromacs/listed_forces/manage_threading.h"
  73 #include "gromacs/math/functions.h"
  74 #include "gromacs/math/utilities.h"
  75 #include "gromacs/math/vec.h"
  76 #include "gromacs/math/vectypes.h"
  77 #include "gromacs/mdlib/checkpointhandler.h"
  78 #include "gromacs/mdlib/compute_io.h"
  79 #include "gromacs/mdlib/constr.h"
  80 #include "gromacs/mdlib/ebin.h"
  81 #include "gromacs/mdlib/enerdata_utils.h"
  82 #include "gromacs/mdlib/energyoutput.h"
  83 #include "gromacs/mdlib/expanded.h"
  84 #include "gromacs/mdlib/force.h"
  85 #include "gromacs/mdlib/force_flags.h"
  86 #include "gromacs/mdlib/forcerec.h"
  87 #include "gromacs/mdlib/md_support.h"
  88 #include "gromacs/mdlib/mdatoms.h"
  89 #include "gromacs/mdlib/mdoutf.h"
  90 #include "gromacs/mdlib/membed.h"
  91 #include "gromacs/mdlib/resethandler.h"
  92 #include "gromacs/mdlib/sighandler.h"
  93 #include "gromacs/mdlib/simulationsignal.h"
  94 #include "gromacs/mdlib/stat.h"
  95 #include "gromacs/mdlib/stophandler.h"
  96 #include "gromacs/mdlib/tgroup.h"
  97 #include "gromacs/mdlib/trajectory_writing.h"
  98 #include "gromacs/mdlib/update.h"
  99 #include "gromacs/mdlib/update_constrain_gpu.h"
 100 #include "gromacs/mdlib/vcm.h"
 101 #include "gromacs/mdlib/vsite.h"
 102 #include "gromacs/mdrunutility/handlerestart.h"
 103 #include "gromacs/mdrunutility/multisim.h"
 104 #include "gromacs/mdrunutility/printtime.h"
 105 #include "gromacs/mdtypes/awh_history.h"
 106 #include "gromacs/mdtypes/awh_params.h"
 107 #include "gromacs/mdtypes/commrec.h"
 108 #include "gromacs/mdtypes/df_history.h"
 109 #include "gromacs/mdtypes/energyhistory.h"
 110 #include "gromacs/mdtypes/fcdata.h"
 111 #include "gromacs/mdtypes/forcerec.h"
 112 #include "gromacs/mdtypes/group.h"
 113 #include "gromacs/mdtypes/inputrec.h"
 114 #include "gromacs/mdtypes/interaction_const.h"
 115 #include "gromacs/mdtypes/md_enums.h"
 116 #include "gromacs/mdtypes/mdatom.h"
 117 #include "gromacs/mdtypes/mdrunoptions.h"
 118 #include "gromacs/mdtypes/observableshistory.h"
 119 #include "gromacs/mdtypes/pullhistory.h"
 120 #include "gromacs/mdtypes/simulation_workload.h"
 121 #include "gromacs/mdtypes/state.h"
 122 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 123 #include "gromacs/modularsimulator/energyelement.h"
 124 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 125 #include "gromacs/nbnxm/nbnxm.h"
 126 #include "gromacs/pbcutil/mshift.h"
 127 #include "gromacs/pbcutil/pbc.h"
 128 #include "gromacs/pulling/output.h"
 129 #include "gromacs/pulling/pull.h"
 130 #include "gromacs/swap/swapcoords.h"
 131 #include "gromacs/timing/wallcycle.h"
 132 #include "gromacs/timing/walltime_accounting.h"
 133 #include "gromacs/topology/atoms.h"
 134 #include "gromacs/topology/idef.h"
 135 #include "gromacs/topology/mtop_util.h"
 136 #include "gromacs/topology/topology.h"
 137 #include "gromacs/trajectory/trajectoryframe.h"
 138 #include "gromacs/utility/basedefinitions.h"
 139 #include "gromacs/utility/cstringutil.h"
 140 #include "gromacs/utility/fatalerror.h"
 141 #include "gromacs/utility/logger.h"
 142 #include "gromacs/utility/real.h"
 143 #include "gromacs/utility/smalloc.h"
 144
 145 #include "legacysimulator.h"
 146 #include "replicaexchange.h"
 147 #include "shellfc.h"
 148
 149 #if GMX_FAHCORE
 150 #    include "corewrap.h"
 151 #endif
 152
 153 using gmx::SimulationSignaller;
 154
 155 void gmx::LegacySimulator::do_md()
 156 {
 157     // TODO Historically, the EM and MD "integrators" used different
 158     // names for the t_inputrec *parameter, but these must have the
 159     // same name, now that it's a member of a struct. We use this ir
 160     // alias to avoid a large ripple of nearly useless changes.
 161     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 162     // will go away eventually.
 163     t_inputrec*  ir = inputrec;
 164     int64_t      step, step_rel;
 165     double       t, t0 = ir->init_t, lam0[efptNR];
 166     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 167     gmx_bool     bNS, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 168     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 169     gmx_bool     do_ene, do_log, do_verbose;
 170     gmx_bool     bMasterState;
 171     unsigned int force_flags;
 172     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, tmp_vir = { { 0 } },
 173            pres = { { 0 } };
 174     int                         i, m;
 175     rvec                        mu_tot;
 176     matrix                      pressureCouplingMu, M;
 177     gmx_repl_ex_t               repl_ex = nullptr;
 178     gmx_localtop_t              top;
 179     PaddedHostVector<gmx::RVec> f{};
 180     gmx_global_stat_t           gstat;
 181     t_graph*                    graph = nullptr;
 182     gmx_shellfc_t*              shellfc;
 183     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 184     gmx_bool                    bTemp, bPres, bTrotter;
 185     real                        dvdl_constr;
 186     std::vector<RVec>           cbuf;
 187     matrix                      lastbox;
 188     int                         lamnew = 0;
 189     /* for FEP */
 190     int       nstfep = 0;
 191     double    cycles;
 192     real      saved_conserved_quantity = 0;
 193     real      last_ekin                = 0;
 194     t_extmass MassQ;
 195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 196
 197     /* PME load balancing data for GPU kernels */
 198     gmx_bool bPMETune         = FALSE;
 199     gmx_bool bPMETunePrinting = FALSE;
 200
 201     bool bInteractiveMDstep = false;
 202
 203     /* Domain decomposition could incorrectly miss a bonded
 204        interaction, but checking for that requires a global
 205        communication stage, which does not otherwise happen in DD
 206        code. So we do that alongside the first global energy reduction
 207        after a new DD is made. These variables handle whether the
 208        check happens, and the result it returns. */
 209     bool shouldCheckNumberOfBondedInteractions = false;
 210     int  totalNumberOfBondedInteractions       = -1;
 211
 212     SimulationSignals signals;
 213     // Most global communnication stages don't propagate mdrun
 214     // signals, and will use this object to achieve that.
 215     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 216
 217     if (!mdrunOptions.writeConfout)
 218     {
 219         // This is on by default, and the main known use case for
 220         // turning it off is for convenience in benchmarking, which is
 221         // something that should not show up in the general user
 222         // interface.
 223         GMX_LOG(mdlog.info)
 224                 .asParagraph()
 225                 .appendText(
 226                         "The -noconfout functionality is deprecated, and may be removed in a "
 227                         "future version.");
 228     }
 229
 230     /* md-vv uses averaged full step velocities for T-control
 231        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 232        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 233     bTrotter = (EI_VV(ir->eI)
 234                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 235
 236     const bool bRerunMD = false;
 237
 238     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 239     bGStatEveryStep   = (nstglobalcomm == 1);
 240
 241     SimulationGroups* groups = &top_global->groups;
 242
 243     std::unique_ptr<EssentialDynamics> ed = nullptr;
 244     if (opt2bSet("-ei", nfile, fnm))
 245     {
 246         /* Initialize essential dynamics sampling */
 247         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 248                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 249     }
 250     else if (observablesHistory->edsamHistory)
 251     {
 252         gmx_fatal(FARGS,
 253                   "The checkpoint is from a run with essential dynamics sampling, "
 254                   "but the current run did not specify the -ei option. "
 255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 256     }
 257
 258     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
 259     Update     upd(ir, deform);
 260     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 261     if (startingBehavior != StartingBehavior::RestartWithAppending)
 262     {
 263         pleaseCiteCouplingAlgorithms(fplog, *ir);
 264     }
 265     gmx_mdoutf*       outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider,
 266                                    mdModulesNotifier, ir, top_global, oenv, wcycle, startingBehavior);
 267     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 268                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 269
 270     gstat = global_stat_init(ir);
 271
 272     /* Check for polarizable models and flexible constraints */
 273     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 274                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 275
 276     {
 277         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 278         if ((io > 2000) && MASTER(cr))
 279         {
 280             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 281         }
 282     }
 283
 284     // Local state only becomes valid now.
 285     std::unique_ptr<t_state> stateInstance;
 286     t_state*                 state;
 287
 288
 289     auto mdatoms = mdAtoms->mdatoms();
 290
 291     std::unique_ptr<UpdateConstrainGpu> integrator;
 292
 293     if (DOMAINDECOMP(cr))
 294     {
 295         dd_init_local_top(*top_global, &top);
 296
 297         stateInstance = std::make_unique<t_state>();
 298         state         = stateInstance.get();
 299         dd_init_local_state(cr->dd, state_global, state);
 300
 301         /* Distribute the charge groups over the nodes from the master node */
 302         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 303                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 304                             nrnb, nullptr, FALSE);
 305         shouldCheckNumberOfBondedInteractions = true;
 306         upd.setNumAtoms(state->natoms);
 307     }
 308     else
 309     {
 310         state_change_natoms(state_global, state_global->natoms);
 311         f.resizeWithPadding(state_global->natoms);
 312         /* Copy the pointer to the global state */
 313         state = state_global;
 314
 315         /* Generate and initialize new topology */
 316         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &graph, mdAtoms, constr, vsite, shellfc);
 317
 318         upd.setNumAtoms(state->natoms);
 319     }
 320
 321     const auto& simulationWork     = runScheduleWork->simulationWork;
 322     const bool  useGpuForPme       = simulationWork.useGpuPme;
 323     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 324     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 325     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 326
 327     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 328
 329     if (useGpuForUpdate)
 330     {
 331         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 332                                    || constr->numConstraintsTotal() == 0,
 333                            "Constraints in domain decomposition are only supported with update "
 334                            "groups if using GPU update.\n");
 335         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 336                                    || constr->numConstraintsTotal() == 0,
 337                            "SHAKE is not supported with GPU update.");
 338         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 339                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 340                            "the GPU to use GPU update.\n");
 341         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 342                            "Only the md integrator is supported with the GPU update.\n");
 343         GMX_RELEASE_ASSERT(
 344                 ir->etc != etcNOSEHOOVER,
 345                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 346         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
 347                            "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
 348                            "with the GPU update.\n");
 349         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 350                            "Virtual sites are not supported with the GPU update.\n");
 351         GMX_RELEASE_ASSERT(ed == nullptr,
 352                            "Essential dynamics is not supported with the GPU update.\n");
 353         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 354                            "Constraints pulling is not supported with the GPU update.\n");
 355         GMX_RELEASE_ASSERT(fcd->orires.nr == 0,
 356                            "Orientation restraints are not supported with the GPU update.\n");
 357         GMX_RELEASE_ASSERT(
 358                 ir->efep == efepNO
 359                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 360                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 361                 "update.");
 362         GMX_RELEASE_ASSERT(graph == nullptr, "The graph is not supported with GPU update.");
 363
 364         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 365         {
 366             GMX_LOG(mdlog.info)
 367                     .asParagraph()
 368                     .appendText("Updating coordinates and applying constraints on the GPU.");
 369         }
 370         else
 371         {
 372             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 373         }
 374         integrator = std::make_unique<UpdateConstrainGpu>(
 375                 *ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
 376
 377         integrator->setPbc(PbcType::Xyz, state->box);
 378     }
 379
 380     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 381     {
 382         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 383     }
 384     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 385     {
 386         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
 387     }
 388     if (useGpuForUpdate)
 389     {
 390         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 391     }
 392
 393     // NOTE: The global state is no longer used at this point.
 394     // But state_global is still used as temporary storage space for writing
 395     // the global state to file and potentially for replica exchange.
 396     // (Global topology should persist.)
 397
 398     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 399
 400     if (ir->bExpanded)
 401     {
 402         /* Check nstexpanded here, because the grompp check was broken */
 403         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 404         {
 405             gmx_fatal(FARGS,
 406                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 407         }
 408         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 409     }
 410
 411     if (MASTER(cr))
 412     {
 413         EnergyElement::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 414     }
 415
 416     preparePrevStepPullCom(ir, pull_work, mdatoms, state, state_global, cr,
 417                            startingBehavior != StartingBehavior::NewSimulation);
 418
 419     // TODO: Remove this by converting AWH into a ForceProvider
 420     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 421                                 startingBehavior != StartingBehavior::NewSimulation,
 422                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 423
 424     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 425     if (useReplicaExchange && MASTER(cr))
 426     {
 427         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 428     }
 429     /* PME tuning is only supported in the Verlet scheme, with PME for
 430      * Coulomb. It is not supported with only LJ PME. */
 431     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 432                 && ir->cutoff_scheme != ecutsGROUP);
 433
 434     pme_load_balancing_t* pme_loadbal = nullptr;
 435     if (bPMETune)
 436     {
 437         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 438                          fr->nbv->useGpu());
 439     }
 440
 441     if (!ir->bContinuation)
 442     {
 443         if (state->flags & (1U << estV))
 444         {
 445             auto v = makeArrayRef(state->v);
 446             /* Set the velocities of vsites, shells and frozen atoms to zero */
 447             for (i = 0; i < mdatoms->homenr; i++)
 448             {
 449                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 450                 {
 451                     clear_rvec(v[i]);
 452                 }
 453                 else if (mdatoms->cFREEZE)
 454                 {
 455                     for (m = 0; m < DIM; m++)
 456                     {
 457                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 458                         {
 459                             v[i][m] = 0;
 460                         }
 461                     }
 462                 }
 463             }
 464         }
 465
 466         if (constr)
 467         {
 468             /* Constrain the initial coordinates and velocities */
 469             do_constrain_first(fplog, constr, ir, mdatoms, state->natoms, state->x.arrayRefWithPadding(),
 470                                state->v.arrayRefWithPadding(), state->box, state->lambda[efptBONDED]);
 471         }
 472         if (vsite)
 473         {
 474             /* Construct the virtual sites for the initial configuration */
 475             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr, top.idef.iparams,
 476                              top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
 477         }
 478     }
 479
 480     if (ir->efep != efepNO)
 481     {
 482         /* Set free energy calculation frequency as the greatest common
 483          * denominator of nstdhdl and repl_ex_nst. */
 484         nstfep = ir->fepvals->nstdhdl;
 485         if (ir->bExpanded)
 486         {
 487             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
 488         }
 489         if (useReplicaExchange)
 490         {
 491             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
 492         }
 493     }
 494
 495     /* Be REALLY careful about what flags you set here. You CANNOT assume
 496      * this is the first step, since we might be restarting from a checkpoint,
 497      * and in that case we should not do any modifications to the state.
 498      */
 499     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 500
 501     // When restarting from a checkpoint, it can be appropriate to
 502     // initialize ekind from quantities in the checkpoint. Otherwise,
 503     // compute_globals must initialize ekind before the simulation
 504     // starts/restarts. However, only the master rank knows what was
 505     // found in the checkpoint file, so we have to communicate in
 506     // order to coordinate the restart.
 507     //
 508     // TODO Consider removing this communication if/when checkpoint
 509     // reading directly follows .tpr reading, because all ranks can
 510     // agree on hasReadEkinState at that time.
 511     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 512     if (PAR(cr))
 513     {
 514         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
 515     }
 516     if (hasReadEkinState)
 517     {
 518         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 519     }
 520
 521     unsigned int cglo_flags =
 522             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 523              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 524
 525     bSumEkinhOld = FALSE;
 526
 527     t_vcm vcm(top_global->groups, *ir);
 528     reportComRemovalInfo(fplog, vcm);
 529
 530     /* To minimize communication, compute_globals computes the COM velocity
 531      * and the kinetic energy for the velocities without COM motion removed.
 532      * Thus to get the kinetic energy without the COM contribution, we need
 533      * to call compute_globals twice.
 534      */
 535     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 536     {
 537         unsigned int cglo_flags_iteration = cglo_flags;
 538         if (bStopCM && cgloIteration == 0)
 539         {
 540             cglo_flags_iteration |= CGLO_STOPCM;
 541             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 542         }
 543         compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 544                         state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, nullptr, enerd,
 545                         force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, state->box,
 546                         &totalNumberOfBondedInteractions, &bSumEkinhOld,
 547                         cglo_flags_iteration
 548                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 549                                                                          : 0));
 550         if (cglo_flags_iteration & CGLO_STOPCM)
 551         {
 552             /* At initialization, do not pass x with acceleration-correction mode
 553              * to avoid (incorrect) correction of the initial coordinates.
 554              */
 555             rvec* xPtr = nullptr;
 556             if (vcm.mode != ecmLINEAR_ACCELERATION_CORRECTION)
 557             {
 558                 xPtr = state->x.rvec_array();
 559             }
 560             process_and_stopcm_grp(fplog, &vcm, *mdatoms, xPtr, state->v.rvec_array());
 561             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 562         }
 563     }
 564     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 565                                     state->x.rvec_array(), state->box,
 566                                     &shouldCheckNumberOfBondedInteractions);
 567     if (ir->eI == eiVVAK)
 568     {
 569         /* a second call to get the half step temperature initialized as well */
 570         /* we do the same call as above, but turn the pressure off -- internally to
 571            compute_globals, this is recognized as a velocity verlet half-step
 572            kinetic energy calculation.  This minimized excess variables, but
 573            perhaps loses some logic?*/
 574
 575         compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 576                         state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, nullptr, enerd,
 577                         force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, state->box,
 578                         nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 579     }
 580
 581     /* Calculate the initial half step temperature, and save the ekinh_old */
 582     if (startingBehavior == StartingBehavior::NewSimulation)
 583     {
 584         for (i = 0; (i < ir->opts.ngtc); i++)
 585         {
 586             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 587         }
 588     }
 589
 590     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 591        for non-trotter temperature control */
 592     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 593
 594     if (MASTER(cr))
 595     {
 596         if (!ir->bContinuation)
 597         {
 598             if (constr && ir->eConstrAlg == econtLINCS)
 599             {
 600                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 601                         constr->rmsd());
 602             }
 603             if (EI_STATE_VELOCITY(ir->eI))
 604             {
 605                 real temp = enerd->term[F_TEMP];
 606                 if (ir->eI != eiVV)
 607                 {
 608                     /* Result of Ekin averaged over velocities of -half
 609                      * and +half step, while we only have -half step here.
 610                      */
 611                     temp *= 2;
 612                 }
 613                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 614             }
 615         }
 616
 617         char tbuf[20];
 618         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 619         if (ir->nsteps >= 0)
 620         {
 621             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 622         }
 623         else
 624         {
 625             sprintf(tbuf, "%s", "infinite");
 626         }
 627         if (ir->init_step > 0)
 628         {
 629             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 630                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 631                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 632         }
 633         else
 634         {
 635             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 636         }
 637         fprintf(fplog, "\n");
 638     }
 639
 640     walltime_accounting_start_time(walltime_accounting);
 641     wallcycle_start(wcycle, ewcRUN);
 642     print_start(fplog, cr, walltime_accounting, "mdrun");
 643
 644 #if GMX_FAHCORE
 645     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 646     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 647     if (chkpt_ret == 0)
 648     {
 649         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 650     }
 651 #endif
 652
 653     /***********************************************************
 654      *
 655      *             Loop over MD steps
 656      *
 657      ************************************************************/
 658
 659     bFirstStep = TRUE;
 660     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 661     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 662     bSumEkinhOld     = FALSE;
 663     bExchanged       = FALSE;
 664     bNeedRepartition = FALSE;
 665
 666     bool simulationsShareState = false;
 667     int  nstSignalComm         = nstglobalcomm;
 668     {
 669         // TODO This implementation of ensemble orientation restraints is nasty because
 670         // a user can't just do multi-sim with single-sim orientation restraints.
 671         bool usingEnsembleRestraints =
 672                 (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
 673         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 674
 675         // Replica exchange, ensemble restraints and AWH need all
 676         // simulations to remain synchronized, so they need
 677         // checkpoints and stop conditions to act on the same step, so
 678         // the propagation of such signals must take place between
 679         // simulations, not just within simulations.
 680         // TODO: Make algorithm initializers set these flags.
 681         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 682
 683         if (simulationsShareState)
 684         {
 685             // Inter-simulation signal communication does not need to happen
 686             // often, so we use a minimum of 200 steps to reduce overhead.
 687             const int c_minimumInterSimulationSignallingInterval = 200;
 688             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 689                             * nstglobalcomm;
 690         }
 691     }
 692
 693     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 694             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 695             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 696             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 697
 698     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 699             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 700             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 701             mdrunOptions.checkpointOptions.period);
 702
 703     const bool resetCountersIsLocal = true;
 704     auto       resetHandler         = std::make_unique<ResetHandler>(
 705             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 706             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 707             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 708
 709     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 710
 711     step     = ir->init_step;
 712     step_rel = 0;
 713
 714     // TODO extract this to new multi-simulation module
 715     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 716     {
 717         if (!multisim_int_all_are_equal(ms, ir->nsteps))
 718         {
 719             GMX_LOG(mdlog.warning)
 720                     .appendText(
 721                             "Note: The number of steps is not consistent across multi "
 722                             "simulations,\n"
 723                             "but we are proceeding anyway!");
 724         }
 725         if (!multisim_int_all_are_equal(ms, ir->init_step))
 726         {
 727             GMX_LOG(mdlog.warning)
 728                     .appendText(
 729                             "Note: The initial step is not consistent across multi simulations,\n"
 730                             "but we are proceeding anyway!");
 731         }
 732     }
 733
 734     /* and stop now if we should */
 735     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 736     while (!bLastStep)
 737     {
 738
 739         /* Determine if this is a neighbor search step */
 740         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 741
 742         if (bPMETune && bNStList)
 743         {
 744             // This has to be here because PME load balancing is called so early.
 745             // TODO: Move to after all booleans are defined.
 746             if (useGpuForUpdate && !bFirstStep)
 747             {
 748                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 749                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 750             }
 751             /* PME grid + cut-off optimization with GPUs or PME nodes */
 752             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 753                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 754                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 755         }
 756
 757         wallcycle_start(wcycle, ewcSTEP);
 758
 759         bLastStep = (step_rel == ir->nsteps);
 760         t         = t0 + step * ir->delta_t;
 761
 762         // TODO Refactor this, so that nstfep does not need a default value of zero
 763         if (ir->efep != efepNO || ir->bSimTemp)
 764         {
 765             /* find and set the current lambdas */
 766             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
 767
 768             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 769             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 770             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 771                            && (step > 0) && (startingBehavior == StartingBehavior::NewSimulation));
 772         }
 773
 774         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 775                      && do_per_step(step, replExParams.exchangeInterval));
 776
 777         if (doSimulatedAnnealing)
 778         {
 779             update_annealing_target_temp(ir, t, &upd);
 780         }
 781
 782         /* Stop Center of Mass motion */
 783         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 784
 785         /* Determine whether or not to do Neighbour Searching */
 786         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 787
 788         /* Note that the stopHandler will cause termination at nstglobalcomm
 789          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 790          * nstpcouple steps, we have computed the half-step kinetic energy
 791          * of the previous step and can always output energies at the last step.
 792          */
 793         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 794
 795         /* do_log triggers energy and virial calculation. Because this leads
 796          * to different code paths, forces can be different. Thus for exact
 797          * continuation we should avoid extra log output.
 798          * Note that the || bLastStep can result in non-exact continuation
 799          * beyond the last step. But we don't consider that to be an issue.
 800          */
 801         do_log     = (do_per_step(step, ir->nstlog)
 802                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 803         do_verbose = mdrunOptions.verbose
 804                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 805
 806         if (useGpuForUpdate && !bFirstStep && bNS)
 807         {
 808             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 809             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 810             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 811             // Copy coordinate from the GPU when needed at the search step.
 812             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 813             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 814             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 815             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 816         }
 817
 818         if (bNS && !(bFirstStep && ir->bContinuation))
 819         {
 820             bMasterState = FALSE;
 821             /* Correct the new box if it is too skewed */
 822             if (inputrecDynamicBox(ir))
 823             {
 824                 if (correct_box(fplog, step, state->box, graph))
 825                 {
 826                     bMasterState = TRUE;
 827                     // If update is offloaded, it should be informed about the box size change
 828                     if (useGpuForUpdate)
 829                     {
 830                         integrator->setPbc(PbcType::Xyz, state->box);
 831                     }
 832                 }
 833             }
 834             if (DOMAINDECOMP(cr) && bMasterState)
 835             {
 836                 dd_collect_state(cr->dd, state, state_global);
 837             }
 838
 839             if (DOMAINDECOMP(cr))
 840             {
 841                 /* Repartition the domain decomposition */
 842                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 843                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 844                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 845                 shouldCheckNumberOfBondedInteractions = true;
 846                 upd.setNumAtoms(state->natoms);
 847
 848                 // Allocate or re-size GPU halo exchange object, if necessary
 849                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 850                     && useGpuForNonbonded && is1D(*cr->dd))
 851                 {
 852                     // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
 853                     void* streamLocal =
 854                             Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
 855                     void* streamNonLocal = Nbnxm::gpu_get_command_stream(
 856                             fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
 857                     constructGpuHaloExchange(mdlog, *cr, streamLocal, streamNonLocal);
 858                 }
 859             }
 860         }
 861
 862         if (MASTER(cr) && do_log)
 863         {
 864             energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
 865         }
 866
 867         if (ir->efep != efepNO)
 868         {
 869             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 870         }
 871
 872         if (bExchanged)
 873         {
 874
 875             /* We need the kinetic energy at minus the half step for determining
 876              * the full step kinetic energy and possibly for T-coupling.*/
 877             /* This may not be quite working correctly yet . . . . */
 878             compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
 879                             state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
 880                             nullptr, nullptr, nullptr, nullptr, constr, &nullSignaller, state->box,
 881                             &totalNumberOfBondedInteractions, &bSumEkinhOld,
 882                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 883             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 884                                             &top, state->x.rvec_array(), state->box,
 885                                             &shouldCheckNumberOfBondedInteractions);
 886         }
 887         clear_mat(force_vir);
 888
 889         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 890
 891         /* Determine the energy and pressure:
 892          * at nstcalcenergy steps and at energy output steps (set below).
 893          */
 894         if (EI_VV(ir->eI) && (!bInitStep))
 895         {
 896             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 897             bCalcVir      = bCalcEnerStep
 898                        || (ir->epc != epcNO
 899                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 900         }
 901         else
 902         {
 903             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 904             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 905         }
 906         bCalcEner = bCalcEnerStep;
 907
 908         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 909
 910         if (do_ene || do_log || bDoReplEx)
 911         {
 912             bCalcVir  = TRUE;
 913             bCalcEner = TRUE;
 914         }
 915
 916         /* Do we need global communication ? */
 917         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 918                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 919
 920         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 921                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 922                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 923
 924         if (shellfc)
 925         {
 926             /* Now is the time to relax the shells */
 927             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 928                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd, fcd,
 929                                 state->natoms, state->x.arrayRefWithPadding(),
 930                                 state->v.arrayRefWithPadding(), state->box, state->lambda, &state->hist,
 931                                 f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, graph,
 932                                 shellfc, fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 933         }
 934         else
 935         {
 936             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 937                is updated (or the AWH update will be performed twice for one step when continuing).
 938                It would be best to call this update function from do_md_trajectory_writing but that
 939                would occur after do_force. One would have to divide the update_awh function into one
 940                function applying the AWH force and one doing the AWH bias update. The update AWH
 941                bias function could then be called after do_md_trajectory_writing (then containing
 942                update_awh_history). The checkpointing will in the future probably moved to the start
 943                of the md loop which will rid of this issue. */
 944             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 945             {
 946                 awh->updateHistory(state_global->awhHistory.get());
 947             }
 948
 949             /* The coordinates (x) are shifted (to get whole molecules)
 950              * in do_force.
 951              * This is parallellized as well, and does communication too.
 952              * Check comments in sim_util.c
 953              */
 954             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 955                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 956                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, state->lambda, graph,
 957                      fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 958                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 959         }
 960
 961         // VV integrators do not need the following velocity half step
 962         // if it is the first step after starting from a checkpoint.
 963         // That is, the half step is needed on all other steps, and
 964         // also the first step when starting from a .tpr file.
 965         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 966         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 967         {
 968             rvec* vbuf = nullptr;
 969
 970             wallcycle_start(wcycle, ewcUPDATE);
 971             if (ir->eI == eiVV && bInitStep)
 972             {
 973                 /* if using velocity verlet with full time step Ekin,
 974                  * take the first half step only to compute the
 975                  * virial for the first step. From there,
 976                  * revert back to the initial coordinates
 977                  * so that the input is actually the initial step.
 978                  */
 979                 snew(vbuf, state->natoms);
 980                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
 981                            state->natoms); /* should make this better for parallelizing? */
 982             }
 983             else
 984             {
 985                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
 986                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
 987                                trotter_seq, ettTSEQ1);
 988             }
 989
 990             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
 991                           etrtVELOCITY1, cr, constr);
 992
 993             wallcycle_stop(wcycle, ewcUPDATE);
 994             constrain_velocities(step, nullptr, state, shake_vir, constr, bCalcVir, do_log, do_ene);
 995             wallcycle_start(wcycle, ewcUPDATE);
 996             /* if VV, compute the pressure and constraints */
 997             /* For VV2, we strictly only need this if using pressure
 998              * control, but we really would like to have accurate pressures
 999              * printed out.
1000              * Think about ways around this in the future?
1001              * For now, keep this choice in comments.
1002              */
1003             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1004             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1005             bPres = TRUE;
1006             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1007             if (bCalcEner && ir->eI == eiVVAK)
1008             {
1009                 bSumEkinhOld = TRUE;
1010             }
1011             /* for vv, the first half of the integration actually corresponds to the previous step.
1012                So we need information from the last step in the first half of the integration */
1013             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1014             {
1015                 wallcycle_stop(wcycle, ewcUPDATE);
1016                 compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
1017                                 state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle,
1018                                 enerd, force_vir, shake_vir, total_vir, pres, constr, &nullSignaller,
1019                                 state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1020                                 (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1021                                         | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1022                                         | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1023                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1024                                                                                  : 0)
1025                                         | CGLO_SCALEEKIN);
1026                 /* explanation of above:
1027                    a) We compute Ekin at the full time step
1028                    if 1) we are using the AveVel Ekin, and it's not the
1029                    initial step, or 2) if we are using AveEkin, but need the full
1030                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1031                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1032                    EkinAveVel because it's needed for the pressure */
1033                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1034                                                 top_global, &top, state->x.rvec_array(), state->box,
1035                                                 &shouldCheckNumberOfBondedInteractions);
1036                 if (bStopCM)
1037                 {
1038                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(),
1039                                            state->v.rvec_array());
1040                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1041                 }
1042                 wallcycle_start(wcycle, ewcUPDATE);
1043             }
1044             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1045             if (!bInitStep)
1046             {
1047                 if (bTrotter)
1048                 {
1049                     m_add(force_vir, shake_vir,
1050                           total_vir); /* we need the un-dispersion corrected total vir here */
1051                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1052                                    trotter_seq, ettTSEQ2);
1053
1054                     /* TODO This is only needed when we're about to write
1055                      * a checkpoint, because we use it after the restart
1056                      * (in a kludge?). But what should we be doing if
1057                      * the startingBehavior is NewSimulation or bInitStep are true? */
1058                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1059                     {
1060                         copy_mat(shake_vir, state->svir_prev);
1061                         copy_mat(force_vir, state->fvir_prev);
1062                     }
1063                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1064                     {
1065                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1066                         enerd->term[F_TEMP] =
1067                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1068                         enerd->term[F_EKIN] = trace(ekind->ekin);
1069                     }
1070                 }
1071                 else if (bExchanged)
1072                 {
1073                     wallcycle_stop(wcycle, ewcUPDATE);
1074                     /* We need the kinetic energy at minus the half step for determining
1075                      * the full step kinetic energy and possibly for T-coupling.*/
1076                     /* This may not be quite working correctly yet . . . . */
1077                     compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(),
1078                                     state->v.rvec_array(), state->box, state->lambda[efptVDW],
1079                                     mdatoms, nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr,
1080                                     nullptr, constr, &nullSignaller, state->box, nullptr,
1081                                     &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1082                     wallcycle_start(wcycle, ewcUPDATE);
1083                 }
1084             }
1085             /* if it's the initial step, we performed this first step just to get the constraint virial */
1086             if (ir->eI == eiVV && bInitStep)
1087             {
1088                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1089                 sfree(vbuf);
1090             }
1091             wallcycle_stop(wcycle, ewcUPDATE);
1092         }
1093
1094         /* compute the conserved quantity */
1095         if (EI_VV(ir->eI))
1096         {
1097             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1098             if (ir->eI == eiVV)
1099             {
1100                 last_ekin = enerd->term[F_EKIN];
1101             }
1102             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1103             {
1104                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1105             }
1106             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1107             if (ir->efep != efepNO)
1108             {
1109                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1110             }
1111         }
1112
1113         /* ########  END FIRST UPDATE STEP  ############## */
1114         /* ########  If doing VV, we now have v(dt) ###### */
1115         if (bDoExpanded)
1116         {
1117             /* perform extended ensemble sampling in lambda - we don't
1118                actually move to the new state before outputting
1119                statistics, but if performing simulated tempering, we
1120                do update the velocities and the tau_t. */
1121
1122             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1123                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1124             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1125             if (MASTER(cr))
1126             {
1127                 copy_df_history(state_global->dfhist, state->dfhist);
1128             }
1129         }
1130
1131         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1132         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1133         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1134             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1135                 || checkpointHandler->isCheckpointingStep()))
1136         {
1137             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1138             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1139         }
1140         // Copy velocities if needed for the output/checkpointing.
1141         // NOTE: Copy on the search steps is done at the beginning of the step.
1142         if (useGpuForUpdate && !bNS
1143             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1144         {
1145             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1146             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1147         }
1148         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1149         // and update is offloaded hence forces are kept on the GPU for update and have not been
1150         // already transferred in do_force().
1151         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1152         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1153         //       prior to GPU update.
1154         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1155         //       copy call in do_force(...).
1156         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1157         //       on host after the D2H copy in do_force(...).
1158         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1159             && do_per_step(step, ir->nstfout))
1160         {
1161             stateGpu->copyForcesFromGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1162             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1163         }
1164         /* Now we have the energies and forces corresponding to the
1165          * coordinates at time t. We must output all of this before
1166          * the update.
1167          */
1168         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1169                                  observablesHistory, top_global, fr, outf, energyOutput, ekind, f,
1170                                  checkpointHandler->isCheckpointingStep(), bRerunMD, bLastStep,
1171                                  mdrunOptions.writeConfout, bSumEkinhOld);
1172         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1173         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1174
1175         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1176         if (startingBehavior != StartingBehavior::NewSimulation
1177             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1178         {
1179             copy_mat(state->svir_prev, shake_vir);
1180             copy_mat(state->fvir_prev, force_vir);
1181         }
1182
1183         stopHandler->setSignal();
1184         resetHandler->setSignal(walltime_accounting);
1185
1186         if (bGStat || !PAR(cr))
1187         {
1188             /* In parallel we only have to check for checkpointing in steps
1189              * where we do global communication,
1190              *  otherwise the other nodes don't know.
1191              */
1192             checkpointHandler->setSignal(walltime_accounting);
1193         }
1194
1195         /* #########   START SECOND UPDATE STEP ################# */
1196
1197         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1198            controlled in preprocessing */
1199
1200         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1201         {
1202             gmx_bool bIfRandomize;
1203             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1204             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1205             if (constr && bIfRandomize)
1206             {
1207                 constrain_velocities(step, nullptr, state, tmp_vir, constr, bCalcVir, do_log, do_ene);
1208             }
1209         }
1210         /* Box is changed in update() when we do pressure coupling,
1211          * but we should still use the old box for energy corrections and when
1212          * writing it to the energy file, so it matches the trajectory files for
1213          * the same timestep above. Make a copy in a separate array.
1214          */
1215         copy_mat(state->box, lastbox);
1216
1217         dvdl_constr = 0;
1218
1219         wallcycle_start(wcycle, ewcUPDATE);
1220         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1221         if (bTrotter)
1222         {
1223             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1224             /* We can only do Berendsen coupling after we have summed
1225              * the kinetic energy or virial. Since the happens
1226              * in global_state after update, we should only do it at
1227              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1228              */
1229         }
1230         else
1231         {
1232             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1233             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1234         }
1235
1236         if (EI_VV(ir->eI))
1237         {
1238             /* velocity half-step update */
1239             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1240                           etrtVELOCITY2, cr, constr);
1241         }
1242
1243         /* Above, initialize just copies ekinh into ekin,
1244          * it doesn't copy position (for VV),
1245          * and entire integrator for MD.
1246          */
1247
1248         if (ir->eI == eiVVAK)
1249         {
1250             cbuf.resize(state->x.size());
1251             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1252         }
1253
1254         /* With leap-frog type integrators we compute the kinetic energy
1255          * at a whole time step as the average of the half-time step kinetic
1256          * energies of two subsequent steps. Therefore we need to compute the
1257          * half step kinetic energy also if we need energies at the next step.
1258          */
1259         const bool needHalfStepKineticEnergy =
1260                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1261
1262         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1263         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1264         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1265                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1266
1267         if (useGpuForUpdate)
1268         {
1269             if (bNS)
1270             {
1271                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1272                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1273
1274                 // Copy data to the GPU after buffers might have being reinitialized
1275                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1276                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1277             }
1278
1279             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1280             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1281             {
1282                 stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1283             }
1284
1285             const bool doTemperatureScaling =
1286                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1287
1288             // This applies Leap-Frog, LINCS and SETTLE in succession
1289             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1290                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1291                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1292                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1293
1294             // Copy velocities D2H after update if:
1295             // - Globals are computed this step (includes the energy output steps).
1296             // - Temperature is needed for the next step.
1297             if (bGStat || needHalfStepKineticEnergy)
1298             {
1299                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1300                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1301             }
1302         }
1303         else
1304         {
1305             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1306                           etrtPOSITION, cr, constr);
1307
1308             wallcycle_stop(wcycle, ewcUPDATE);
1309
1310             constrain_coordinates(step, &dvdl_constr, state, shake_vir, &upd, constr, bCalcVir,
1311                                   do_log, do_ene);
1312
1313             update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state, cr, nrnb, wcycle, &upd,
1314                                   constr, do_log, do_ene);
1315             finish_update(ir, mdatoms, state, graph, nrnb, wcycle, &upd, constr);
1316         }
1317
1318         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1319         {
1320             updatePrevStepPullCom(pull_work, state);
1321         }
1322
1323         if (ir->eI == eiVVAK)
1324         {
1325             /* erase F_EKIN and F_TEMP here? */
1326             /* just compute the kinetic energy at the half step to perform a trotter step */
1327             compute_globals(gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
1328                             state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
1329                             force_vir, shake_vir, total_vir, pres, constr, &nullSignaller, lastbox,
1330                             nullptr, &bSumEkinhOld, (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1331             wallcycle_start(wcycle, ewcUPDATE);
1332             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1333             /* now we know the scaling, we can compute the positions again */
1334             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1335
1336             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1337                           etrtPOSITION, cr, constr);
1338             wallcycle_stop(wcycle, ewcUPDATE);
1339
1340             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1341             /* are the small terms in the shake_vir here due
1342              * to numerical errors, or are they important
1343              * physically? I'm thinking they are just errors, but not completely sure.
1344              * For now, will call without actually constraining, constr=NULL*/
1345             finish_update(ir, mdatoms, state, graph, nrnb, wcycle, &upd, nullptr);
1346         }
1347         if (EI_VV(ir->eI))
1348         {
1349             /* this factor or 2 correction is necessary
1350                because half of the constraint force is removed
1351                in the vv step, so we have to double it.  See
1352                the Redmine issue #1255.  It is not yet clear
1353                if the factor of 2 is exact, or just a very
1354                good approximation, and this will be
1355                investigated.  The next step is to see if this
1356                can be done adding a dhdl contribution from the
1357                rattle step, but this is somewhat more
1358                complicated with the current code. Will be
1359                investigated, hopefully for 4.6.3. However,
1360                this current solution is much better than
1361                having it completely wrong.
1362              */
1363             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1364         }
1365         else
1366         {
1367             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1368         }
1369
1370         if (vsite != nullptr)
1371         {
1372             wallcycle_start(wcycle, ewcVSITECONSTR);
1373             if (graph != nullptr)
1374             {
1375                 shift_self(graph, state->box, state->x.rvec_array());
1376             }
1377             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
1378                              top.idef.iparams, top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
1379
1380             if (graph != nullptr)
1381             {
1382                 unshift_self(graph, state->box, state->x.rvec_array());
1383             }
1384             wallcycle_stop(wcycle, ewcVSITECONSTR);
1385         }
1386
1387         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1388         /* With Leap-Frog we can skip compute_globals at
1389          * non-communication steps, but we need to calculate
1390          * the kinetic energy one step before communication.
1391          */
1392         {
1393             // Organize to do inter-simulation signalling on steps if
1394             // and when algorithms require it.
1395             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1396
1397             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1398             {
1399                 // Copy coordinates when needed to stop the CM motion.
1400                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1401                 {
1402                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1403                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1404                 }
1405                 // Since we're already communicating at this step, we
1406                 // can propagate intra-simulation signals. Note that
1407                 // check_nstglobalcomm has the responsibility for
1408                 // choosing the value of nstglobalcomm that is one way
1409                 // bGStat becomes true, so we can't get into a
1410                 // situation where e.g. checkpointing can't be
1411                 // signalled.
1412                 bool                doIntraSimSignal = true;
1413                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1414
1415                 compute_globals(
1416                         gstat, cr, ir, fr, ekind, state->x.rvec_array(), state->v.rvec_array(),
1417                         state->box, state->lambda[efptVDW], mdatoms, nrnb, &vcm, wcycle, enerd,
1418                         force_vir, shake_vir, total_vir, pres, constr, &signaller, lastbox,
1419                         &totalNumberOfBondedInteractions, &bSumEkinhOld,
1420                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1421                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1422                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1423                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1424                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1425                                                                          : 0));
1426                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1427                                                 top_global, &top, state->x.rvec_array(), state->box,
1428                                                 &shouldCheckNumberOfBondedInteractions);
1429                 if (!EI_VV(ir->eI) && bStopCM)
1430                 {
1431                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(),
1432                                            state->v.rvec_array());
1433                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1434
1435                     // TODO: The special case of removing CM motion should be dealt more gracefully
1436                     if (useGpuForUpdate)
1437                     {
1438                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1439                         // Here we block until the H2D copy completes because event sync with the
1440                         // force kernels that use the coordinates on the next steps is not implemented
1441                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1442                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1443                     }
1444                 }
1445             }
1446         }
1447
1448         /* #############  END CALC EKIN AND PRESSURE ################# */
1449
1450         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1451            the virial that should probably be addressed eventually. state->veta has better properies,
1452            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1453            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1454
1455         if (ir->efep != efepNO && !EI_VV(ir->eI))
1456         {
1457             /* Sum up the foreign energy and dhdl terms for md and sd.
1458                Currently done every step so that dhdl is correct in the .edr */
1459             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1460         }
1461
1462         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1463                                          pressureCouplingMu, state, nrnb, &upd, !useGpuForUpdate);
1464
1465         const bool doBerendsenPressureCoupling =
1466                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1467         if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
1468         {
1469             integrator->scaleCoordinates(pressureCouplingMu);
1470             integrator->setPbc(PbcType::Xyz, state->box);
1471         }
1472
1473         /* ################# END UPDATE STEP 2 ################# */
1474         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1475
1476         /* The coordinates (x) were unshifted in update */
1477         if (!bGStat)
1478         {
1479             /* We will not sum ekinh_old,
1480              * so signal that we still have to do it.
1481              */
1482             bSumEkinhOld = TRUE;
1483         }
1484
1485         if (bCalcEner)
1486         {
1487             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1488
1489             /* use the directly determined last velocity, not actually the averaged half steps */
1490             if (bTrotter && ir->eI == eiVV)
1491             {
1492                 enerd->term[F_EKIN] = last_ekin;
1493             }
1494             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1495
1496             if (integratorHasConservedEnergyQuantity(ir))
1497             {
1498                 if (EI_VV(ir->eI))
1499                 {
1500                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1501                 }
1502                 else
1503                 {
1504                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1505                 }
1506             }
1507             /* #########  END PREPARING EDR OUTPUT  ###########  */
1508         }
1509
1510         /* Output stuff */
1511         if (MASTER(cr))
1512         {
1513             if (fplog && do_log && bDoExpanded)
1514             {
1515                 /* only needed if doing expanded ensemble */
1516                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1517                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1518                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1519             }
1520             if (bCalcEner)
1521             {
1522                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, state,
1523                                                  ir->fepvals, ir->expandedvals, lastbox, shake_vir,
1524                                                  force_vir, total_vir, pres, ekind, mu_tot, constr);
1525             }
1526             else
1527             {
1528                 energyOutput.recordNonEnergyStep();
1529             }
1530
1531             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1532             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1533
1534             if (doSimulatedAnnealing)
1535             {
1536                 energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
1537             }
1538             if (do_log || do_ene || do_dr || do_or)
1539             {
1540                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1541                                                    do_log ? fplog : nullptr, step, t, fcd, awh.get());
1542             }
1543
1544             if (ir->bPull)
1545             {
1546                 pull_print_output(pull_work, step, t);
1547             }
1548
1549             if (do_per_step(step, ir->nstlog))
1550             {
1551                 if (fflush(fplog) != 0)
1552                 {
1553                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1554                 }
1555             }
1556         }
1557         if (bDoExpanded)
1558         {
1559             /* Have to do this part _after_ outputting the logfile and the edr file */
1560             /* Gets written into the state at the beginning of next loop*/
1561             state->fep_state = lamnew;
1562         }
1563         /* Print the remaining wall clock time for the run */
1564         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1565         {
1566             if (shellfc)
1567             {
1568                 fprintf(stderr, "\n");
1569             }
1570             print_time(stderr, walltime_accounting, step, ir, cr);
1571         }
1572
1573         /* Ion/water position swapping.
1574          * Not done in last step since trajectory writing happens before this call
1575          * in the MD loop and exchanges would be lost anyway. */
1576         bNeedRepartition = FALSE;
1577         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1578         {
1579             bNeedRepartition =
1580                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1581                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1582
1583             if (bNeedRepartition && DOMAINDECOMP(cr))
1584             {
1585                 dd_collect_state(cr->dd, state, state_global);
1586             }
1587         }
1588
1589         /* Replica exchange */
1590         bExchanged = FALSE;
1591         if (bDoReplEx)
1592         {
1593             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1594         }
1595
1596         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1597         {
1598             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1599                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1600                                 nrnb, wcycle, FALSE);
1601             shouldCheckNumberOfBondedInteractions = true;
1602             upd.setNumAtoms(state->natoms);
1603         }
1604
1605         bFirstStep = FALSE;
1606         bInitStep  = FALSE;
1607
1608         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1609         /* With all integrators, except VV, we need to retain the pressure
1610          * at the current step for coupling at the next step.
1611          */
1612         if ((state->flags & (1U << estPRES_PREV))
1613             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1614         {
1615             /* Store the pressure in t_state for pressure coupling
1616              * at the next MD step.
1617              */
1618             copy_mat(pres, state->pres_prev);
1619         }
1620
1621         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1622
1623         if ((membed != nullptr) && (!bLastStep))
1624         {
1625             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1626         }
1627
1628         cycles = wallcycle_stop(wcycle, ewcSTEP);
1629         if (DOMAINDECOMP(cr) && wcycle)
1630         {
1631             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1632         }
1633
1634         /* increase the MD step number */
1635         step++;
1636         step_rel++;
1637
1638         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1639                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1640
1641         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1642         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1643     }
1644     /* End of main MD loop */
1645
1646     /* Closing TNG files can include compressing data. Therefore it is good to do that
1647      * before stopping the time measurements. */
1648     mdoutf_tng_close(outf);
1649
1650     /* Stop measuring walltime */
1651     walltime_accounting_end_time(walltime_accounting);
1652
1653     if (!thisRankHasDuty(cr, DUTY_PME))
1654     {
1655         /* Tell the PME only node to finish */
1656         gmx_pme_send_finish(cr);
1657     }
1658
1659     if (MASTER(cr))
1660     {
1661         if (ir->nstcalcenergy > 0)
1662         {
1663             energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
1664             energyOutput.printAverages(fplog, groups);
1665         }
1666     }
1667     done_mdoutf(outf);
1668
1669     if (bPMETune)
1670     {
1671         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1672     }
1673
1674     done_shellfc(fplog, shellfc, step_rel);
1675
1676     if (useReplicaExchange && MASTER(cr))
1677     {
1678         print_replica_exchange_statistics(fplog, repl_ex);
1679     }
1680
1681     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1682
1683     global_stat_destroy(gstat);
1684 }