1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/mdsetup.h"
65 #include "gromacs/domdec/partition.h"
66 #include "gromacs/essentialdynamics/edsam.h"
67 #include "gromacs/ewald/pme_load_balancing.h"
68 #include "gromacs/ewald/pme_pp.h"
69 #include "gromacs/fileio/trxio.h"
70 #include "gromacs/gmxlib/network.h"
71 #include "gromacs/gmxlib/nrnb.h"
72 #include "gromacs/gpu_utils/device_stream_manager.h"
73 #include "gromacs/gpu_utils/gpu_utils.h"
74 #include "gromacs/imd/imd.h"
75 #include "gromacs/listed_forces/listed_forces.h"
76 #include "gromacs/math/functions.h"
77 #include "gromacs/math/invertmatrix.h"
78 #include "gromacs/math/vec.h"
79 #include "gromacs/math/vectypes.h"
80 #include "gromacs/mdlib/checkpointhandler.h"
81 #include "gromacs/mdlib/compute_io.h"
82 #include "gromacs/mdlib/constr.h"
83 #include "gromacs/mdlib/coupling.h"
84 #include "gromacs/mdlib/ebin.h"
85 #include "gromacs/mdlib/enerdata_utils.h"
86 #include "gromacs/mdlib/energyoutput.h"
87 #include "gromacs/mdlib/expanded.h"
88 #include "gromacs/mdlib/force.h"
89 #include "gromacs/mdlib/force_flags.h"
90 #include "gromacs/mdlib/forcerec.h"
91 #include "gromacs/mdlib/freeenergyparameters.h"
92 #include "gromacs/mdlib/md_support.h"
93 #include "gromacs/mdlib/mdatoms.h"
94 #include "gromacs/mdlib/mdoutf.h"
95 #include "gromacs/mdlib/membed.h"
96 #include "gromacs/mdlib/resethandler.h"
97 #include "gromacs/mdlib/sighandler.h"
98 #include "gromacs/mdlib/simulationsignal.h"
99 #include "gromacs/mdlib/stat.h"
100 #include "gromacs/mdlib/stophandler.h"
101 #include "gromacs/mdlib/tgroup.h"
102 #include "gromacs/mdlib/trajectory_writing.h"
103 #include "gromacs/mdlib/update.h"
104 #include "gromacs/mdlib/update_constrain_gpu.h"
105 #include "gromacs/mdlib/update_vv.h"
106 #include "gromacs/mdlib/vcm.h"
107 #include "gromacs/mdlib/vsite.h"
108 #include "gromacs/mdrunutility/handlerestart.h"
109 #include "gromacs/mdrunutility/multisim.h"
110 #include "gromacs/mdrunutility/printtime.h"
111 #include "gromacs/mdtypes/awh_history.h"
112 #include "gromacs/mdtypes/awh_params.h"
113 #include "gromacs/mdtypes/commrec.h"
114 #include "gromacs/mdtypes/df_history.h"
115 #include "gromacs/mdtypes/energyhistory.h"
116 #include "gromacs/mdtypes/fcdata.h"
117 #include "gromacs/mdtypes/forcebuffers.h"
118 #include "gromacs/mdtypes/forcerec.h"
119 #include "gromacs/mdtypes/group.h"
120 #include "gromacs/mdtypes/inputrec.h"
121 #include "gromacs/mdtypes/interaction_const.h"
122 #include "gromacs/mdtypes/md_enums.h"
123 #include "gromacs/mdtypes/mdatom.h"
124 #include "gromacs/mdtypes/mdrunoptions.h"
125 #include "gromacs/mdtypes/multipletimestepping.h"
126 #include "gromacs/mdtypes/observableshistory.h"
127 #include "gromacs/mdtypes/pullhistory.h"
128 #include "gromacs/mdtypes/simulation_workload.h"
129 #include "gromacs/mdtypes/state.h"
130 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
131 #include "gromacs/modularsimulator/energydata.h"
132 #include "gromacs/nbnxm/gpu_data_mgmt.h"
133 #include "gromacs/nbnxm/nbnxm.h"
134 #include "gromacs/pbcutil/pbc.h"
135 #include "gromacs/pulling/output.h"
136 #include "gromacs/pulling/pull.h"
137 #include "gromacs/swap/swapcoords.h"
138 #include "gromacs/timing/wallcycle.h"
139 #include "gromacs/timing/walltime_accounting.h"
140 #include "gromacs/topology/atoms.h"
141 #include "gromacs/topology/idef.h"
142 #include "gromacs/topology/mtop_util.h"
143 #include "gromacs/topology/topology.h"
144 #include "gromacs/trajectory/trajectoryframe.h"
145 #include "gromacs/utility/basedefinitions.h"
146 #include "gromacs/utility/cstringutil.h"
147 #include "gromacs/utility/fatalerror.h"
148 #include "gromacs/utility/logger.h"
149 #include "gromacs/utility/real.h"
150 #include "gromacs/utility/smalloc.h"
151
152 #include "legacysimulator.h"
153 #include "replicaexchange.h"
154 #include "shellfc.h"
155
156 using gmx::SimulationSignaller;
157
158 void gmx::LegacySimulator::do_md()
159 {
160     // TODO Historically, the EM and MD "integrators" used different
161     // names for the t_inputrec *parameter, but these must have the
162     // same name, now that it's a member of a struct. We use this ir
163     // alias to avoid a large ripple of nearly useless changes.
164     // t_inputrec is being replaced by IMdpOptionsProvider, so this
165     // will go away eventually.
166     const t_inputrec* ir = inputrec;
167
168     int64_t      step, step_rel;
169     double       t, t0 = ir->init_t;
170     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
171     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
172     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
173     gmx_bool     do_ene, do_log, do_verbose;
174     gmx_bool     bMasterState;
175     unsigned int force_flags;
176     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
177     int    i, m;
178     rvec   mu_tot;
179     matrix pressureCouplingMu, M;
180     gmx_repl_ex_t     repl_ex = nullptr;
181     gmx_global_stat_t gstat;
182     gmx_shellfc_t*    shellfc;
183     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
184     gmx_bool          bTrotter;
185     real              dvdl_constr;
186     std::vector<RVec> cbuf;
187     matrix            lastbox;
188     int               lamnew = 0;
189     /* for FEP */
190     int       nstfep = 0;
191     double    cycles;
192     real      saved_conserved_quantity = 0;
193     real      last_ekin                = 0;
194     t_extmass MassQ;
195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
196
197     /* PME load balancing data for GPU kernels */
198     gmx_bool bPMETune         = FALSE;
199     gmx_bool bPMETunePrinting = FALSE;
200
201     bool bInteractiveMDstep = false;
202
203     SimulationSignals signals;
204     // Most global communication stages don't propagate mdrun
205     // signals, and will use this object to achieve that.
206     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
207
208     if (!mdrunOptions.writeConfout)
209     {
210         // This is on by default, and the main known use case for
211         // turning it off is for convenience in benchmarking, which is
212         // something that should not show up in the general user
213         // interface.
214         GMX_LOG(mdlog.info)
215                 .asParagraph()
216                 .appendText(
217                         "The -noconfout functionality is deprecated, and may be removed in a "
218                         "future version.");
219     }
220
221     /* md-vv uses averaged full step velocities for T-control
222        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
223        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
224     bTrotter = (EI_VV(ir->eI)
225                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
226
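    // Reruns are handled by the separate do_rerun() simulator path; within do_md() this is always false.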
227     const bool bRerunMD = false;
228
229     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
230     bGStatEveryStep   = (nstglobalcomm == 1);
231
232     const SimulationGroups* groups = &top_global.groups;
233
234     std::unique_ptr<EssentialDynamics> ed = nullptr;
235     if (opt2bSet("-ei", nfile, fnm))
236     {
237         /* Initialize essential dynamics sampling */
238         ed = init_edsam(mdlog,
239                         opt2fn_null("-ei", nfile, fnm),
240                         opt2fn("-eo", nfile, fnm),
241                         top_global,
242                         *ir,
243                         cr,
244                         constr,
245                         state_global,
246                         observablesHistory,
247                         oenv,
248                         startingBehavior);
249     }
250     else if (observablesHistory->edsamHistory)
251     {
252         gmx_fatal(FARGS,
253                   "The checkpoint is from a run with essential dynamics sampling, "
254                   "but the current run did not specify the -ei option. "
255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
256     }
257
258     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
259     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
260     initialize_lambdas(fplog,
261                        ir->efep,
262                        ir->bSimTemp,
263                        *ir->fepvals,
264                        ir->simtempvals->temperatures,
265                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
266                        MASTER(cr),
267                        fep_state,
268                        lambda);
269     Update upd(*ir, deform);
270     bool   doSimulatedAnnealing = false;
271     {
272         // TODO: Avoid changing inputrec (#3854)
273         // Simulated annealing updates the reference temperature.
274         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
275         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
276     }
277     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
278
279     const t_fcdata& fcdata = *fr->fcdata;
280
281     bool simulationsShareState = false;
282     int  nstSignalComm         = nstglobalcomm;
283     {
284         // TODO This implementation of ensemble orientation restraints is nasty because
285         // a user can't just do multi-sim with single-sim orientation restraints.
286         bool usingEnsembleRestraints =
287                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
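            // Round the signalling interval up to the nearest multiple of nstglobalcomm,
            // so that inter-simulation signals piggyback on steps that already do global communication.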
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
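        // Estimate how much data this run will write, and warn the user when it exceeds roughly 2000 Mb.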
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t top(top_global.ffparams);
365
366     auto mdatoms = mdAtoms->mdatoms();
367
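    // Use pinned host memory for the force buffer when a GPU produces or consumes forces
    // (GPU buffer ops or GPU update), so that host-device copies can be done asynchronously.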
368     ForceBuffers     f(fr->useMts,
369                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
370                            ? PinningPolicy::PinnedIfSupported
371                            : PinningPolicy::CannotBePinned);
372     const t_mdatoms* md = mdAtoms->mdatoms();
373     if (DOMAINDECOMP(cr))
374     {
375         stateInstance = std::make_unique<t_state>();
376         state         = stateInstance.get();
377         dd_init_local_state(*cr->dd, state_global, state);
378
379         /* Distribute the charge groups over the nodes from the master node */
380         dd_partition_system(fplog,
381                             mdlog,
382                             ir->init_step,
383                             cr,
384                             TRUE,
385                             1,
386                             state_global,
387                             top_global,
388                             *ir,
389                             imdSession,
390                             pull_work,
391                             state,
392                             &f,
393                             mdAtoms,
394                             &top,
395                             fr,
396                             vsite,
397                             constr,
398                             nrnb,
399                             nullptr,
400                             FALSE);
401         upd.updateAfterPartition(state->natoms,
402                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
403                                              : gmx::ArrayRef<const unsigned short>(),
404                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
405                                          : gmx::ArrayRef<const unsigned short>());
406     }
407     else
408     {
409         state_change_natoms(state_global, state_global->natoms);
410         /* Copy the pointer to the global state */
411         state = state_global;
412
413         /* Generate and initialize new topology */
414         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
415
416         upd.updateAfterPartition(state->natoms,
417                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
418                                              : gmx::ArrayRef<const unsigned short>(),
419                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
420                                          : gmx::ArrayRef<const unsigned short>());
421     }
422
423     std::unique_ptr<UpdateConstrainGpu> integrator;
424
425     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
426
427     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
428     if (useGpuForUpdate)
429     {
430         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
431                                    || constr->numConstraintsTotal() == 0,
432                            "Constraints in domain decomposition are only supported with update "
433                            "groups if using GPU update.\n");
434         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
435                                    || constr->numConstraintsTotal() == 0,
436                            "SHAKE is not supported with GPU update.");
437         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
438                            "Either PME or short-ranged non-bonded interaction tasks must run on "
439                            "the GPU to use GPU update.\n");
440         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
441                            "Only the md integrator is supported with the GPU update.\n");
442         GMX_RELEASE_ASSERT(
443                 ir->etc != TemperatureCoupling::NoseHoover,
444                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
445         GMX_RELEASE_ASSERT(
446                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
447                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
448                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
449                 "with the GPU update.\n");
450         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
451                            "Virtual sites are not supported with the GPU update.\n");
452         GMX_RELEASE_ASSERT(ed == nullptr,
453                            "Essential dynamics is not supported with the GPU update.\n");
454         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
455                            "Constraint pulling is not supported with the GPU update.\n");
456         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
457                            "Orientation restraints are not supported with the GPU update.\n");
458         GMX_RELEASE_ASSERT(
459                 ir->efep == FreeEnergyPerturbationType::No
460                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
461                 "Free energy perturbation of masses and constraints is not supported with the GPU "
462                 "update.");
463
464         if (constr != nullptr && constr->numConstraintsTotal() > 0)
465         {
466             GMX_LOG(mdlog.info)
467                     .asParagraph()
468                     .appendText("Updating coordinates and applying constraints on the GPU.");
469         }
470         else
471         {
472             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
473         }
474         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
475                            "Device stream manager should be initialized in order to use GPU "
476                            "update-constraints.");
477         GMX_RELEASE_ASSERT(
478                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
479                 "Update stream should be initialized in order to use GPU "
480                 "update-constraints.");
481         integrator = std::make_unique<UpdateConstrainGpu>(
482                 *ir,
483                 top_global,
484                 ekind->ngtc,
485                 fr->deviceStreamManager->context(),
486                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
487                 stateGpu->xUpdatedOnDevice(),
488                 wcycle);
489
490         integrator->setPbc(PbcType::Xyz, state->box);
491     }
492
493     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
494     {
495         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
496     }
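    // With GPU update the velocities are also transferred between host and device, so pin them as well.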
497     if (useGpuForUpdate)
498     {
499         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
500     }
501
502     // NOTE: The global state is no longer used at this point.
503     // But state_global is still used as temporary storage space for writing
504     // the global state to file and potentially for replica exchange.
505     // (Global topology should persist.)
506
507     update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
508
509     if (ir->bExpanded)
510     {
511         /* Check nstexpanded here, because the grompp check was broken */
512         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
513         {
514             gmx_fatal(FARGS,
515                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
516         }
517         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
518     }
519
520     if (MASTER(cr))
521     {
522         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
523     }
524
525     preparePrevStepPullCom(ir,
526                            pull_work,
527                            gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr),
528                            state,
529                            state_global,
530                            cr,
531                            startingBehavior != StartingBehavior::NewSimulation);
532
533     // TODO: Remove this by converting AWH into a ForceProvider
534     auto awh = prepareAwhModule(fplog,
535                                 *ir,
536                                 state_global,
537                                 cr,
538                                 ms,
539                                 startingBehavior != StartingBehavior::NewSimulation,
540                                 shellfc != nullptr,
541                                 opt2fn("-awh", nfile, fnm),
542                                 pull_work);
543
544     if (useReplicaExchange && MASTER(cr))
545     {
546         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
547     }
548     /* PME tuning is only supported in the Verlet scheme, with PME for
549      * Coulomb. It is not supported with only LJ PME. */
550     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
551                 && ir->cutoff_scheme != CutoffScheme::Group);
552
553     pme_load_balancing_t* pme_loadbal = nullptr;
554     if (bPMETune)
555     {
556         pme_loadbal_init(
557                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
558     }
559
560     if (!ir->bContinuation)
561     {
562         if (state->flags & enumValueToBitMask(StateEntry::V))
563         {
564             auto v = makeArrayRef(state->v);
565             /* Set the velocities of vsites, shells and frozen atoms to zero */
566             for (i = 0; i < mdatoms->homenr; i++)
567             {
568                 if (mdatoms->ptype[i] == ParticleType::Shell)
569                 {
570                     clear_rvec(v[i]);
571                 }
572                 else if (mdatoms->cFREEZE)
573                 {
574                     for (m = 0; m < DIM; m++)
575                     {
576                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
577                         {
578                             v[i][m] = 0;
579                         }
580                     }
581                 }
582             }
583         }
584
585         if (constr)
586         {
587             /* Constrain the initial coordinates and velocities */
588             do_constrain_first(fplog,
589                                constr,
590                                ir,
591                                mdatoms->nr,
592                                mdatoms->homenr,
593                                state->x.arrayRefWithPadding(),
594                                state->v.arrayRefWithPadding(),
595                                state->box,
596                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
597         }
598     }
599
600     if (ir->efep != FreeEnergyPerturbationType::No)
601     {
602         /* Set the free energy calculation frequency as the greatest common
603          * divisor of nstdhdl, nstexpanded, the replica exchange and AWH sampling intervals. */
604         nstfep = ir->fepvals->nstdhdl;
605         if (ir->bExpanded)
606         {
607             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
608         }
609         if (useReplicaExchange)
610         {
611             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
612         }
613         if (ir->bDoAwh)
614         {
615             nstfep = std::gcd(ir->awhParams->nstSampleCoord(), nstfep);
616         }
617     }
618
619     /* Be REALLY careful about what flags you set here. You CANNOT assume
620      * this is the first step, since we might be restarting from a checkpoint,
621      * and in that case we should not do any modifications to the state.
622      */
623     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
624
625     // When restarting from a checkpoint, it can be appropriate to
626     // initialize ekind from quantities in the checkpoint. Otherwise,
627     // compute_globals must initialize ekind before the simulation
628     // starts/restarts. However, only the master rank knows what was
629     // found in the checkpoint file, so we have to communicate in
630     // order to coordinate the restart.
631     //
632     // TODO Consider removing this communication if/when checkpoint
633     // reading directly follows .tpr reading, because all ranks can
634     // agree on hasReadEkinState at that time.
635     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
636     if (PAR(cr))
637     {
638         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
639     }
640     if (hasReadEkinState)
641     {
642         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
643     }
644
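    // Flags for the initial compute_globals call(s): always compute the temperature, for
    // velocity Verlet also the pressure and constraint virial, and re-use the kinetic
    // energy read from the checkpoint when one was present.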
645     unsigned int cglo_flags =
646             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
647              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
648
649     bSumEkinhOld = FALSE;
650
651     t_vcm vcm(top_global.groups, *ir);
652     reportComRemovalInfo(fplog, vcm);
653
654     /* To minimize communication, compute_globals computes the COM velocity
655      * and the kinetic energy of the velocities before COM motion is removed.
656      * Thus, to get the kinetic energy without the COM contribution, we need
657      * to call compute_globals twice.
658      */
659     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
660     {
661         unsigned int cglo_flags_iteration = cglo_flags;
662         if (bStopCM && cgloIteration == 0)
663         {
664             cglo_flags_iteration |= CGLO_STOPCM;
665             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
666         }
667         if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd) && cgloIteration == 0)
668         {
669             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
670         }
671         compute_globals(gstat,
672                         cr,
673                         ir,
674                         fr,
675                         ekind,
676                         makeConstArrayRef(state->x),
677                         makeConstArrayRef(state->v),
678                         state->box,
679                         mdatoms,
680                         nrnb,
681                         &vcm,
682                         nullptr,
683                         enerd,
684                         force_vir,
685                         shake_vir,
686                         total_vir,
687                         pres,
688                         gmx::ArrayRef<real>{},
689                         &nullSignaller,
690                         state->box,
691                         &bSumEkinhOld,
692                         cglo_flags_iteration);
693         if (cglo_flags_iteration & CGLO_STOPCM)
694         {
695             /* At initialization, do not pass x with acceleration-correction mode
696              * to avoid (incorrect) correction of the initial coordinates.
697              */
698             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
699                              ? ArrayRef<RVec>()
700                              : makeArrayRef(state->x);
701             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
702             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
703         }
704     }
705     if (DOMAINDECOMP(cr))
706     {
707         checkNumberOfBondedInteractions(
708                 mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
709     }
710     if (ir->eI == IntegrationAlgorithm::VVAK)
711     {
712         /* a second call to get the half step temperature initialized as well */
713         /* we do the same call as above, but turn the pressure off -- internally to
714            compute_globals, this is recognized as a velocity verlet half-step
715            kinetic energy calculation.  This minimizes excess variables, but
716            perhaps loses some logic?*/
717
718         compute_globals(gstat,
719                         cr,
720                         ir,
721                         fr,
722                         ekind,
723                         makeConstArrayRef(state->x),
724                         makeConstArrayRef(state->v),
725                         state->box,
726                         mdatoms,
727                         nrnb,
728                         &vcm,
729                         nullptr,
730                         enerd,
731                         force_vir,
732                         shake_vir,
733                         total_vir,
734                         pres,
735                         gmx::ArrayRef<real>{},
736                         &nullSignaller,
737                         state->box,
738                         &bSumEkinhOld,
739                         cglo_flags & ~CGLO_PRESSURE);
740     }
741
742     /* Calculate the initial half step temperature, and save the ekinh_old */
743     if (startingBehavior == StartingBehavior::NewSimulation)
744     {
745         for (i = 0; (i < ir->opts.ngtc); i++)
746         {
747             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
748         }
749     }
750
751     /* We need to make an initialization call to get the Trotter variables set, as well as other
752        constants for non-Trotter temperature control */
753     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
754
755     if (MASTER(cr))
756     {
757         if (!ir->bContinuation)
758         {
759             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
760             {
761                 fprintf(fplog,
762                         "RMS relative constraint deviation after constraining: %.2e\n",
763                         constr->rmsd());
764             }
765             if (EI_STATE_VELOCITY(ir->eI))
766             {
767                 real temp = enerd->term[F_TEMP];
768                 if (ir->eI != IntegrationAlgorithm::VV)
769                 {
770                     /* Result of Ekin averaged over velocities of -half
771                      * and +half step, while we only have -half step here.
772                      */
773                     temp *= 2;
774                 }
775                 fprintf(fplog, "Initial temperature: %g K\n", temp);
776             }
777         }
778
779         char tbuf[20];
780         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
781         if (ir->nsteps >= 0)
782         {
783             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
784         }
785         else
786         {
787             sprintf(tbuf, "%s", "infinite");
788         }
789         if (ir->init_step > 0)
790         {
791             fprintf(stderr,
792                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
793                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
794                     tbuf,
795                     gmx_step_str(ir->init_step, sbuf2),
796                     ir->init_step * ir->delta_t);
797         }
798         else
799         {
800             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
801         }
802         fprintf(fplog, "\n");
803     }
804
805     walltime_accounting_start_time(walltime_accounting);
806     wallcycle_start(wcycle, WallCycleCounter::Run);
807     print_start(fplog, cr, walltime_accounting, "mdrun");
808
809     /***********************************************************
810      *
811      *             Loop over MD steps
812      *
813      ************************************************************/
814
815     bFirstStep = TRUE;
816     /* Skip the first Nose-Hoover integration when we get the state from tpx */
817     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
818     bSumEkinhOld     = FALSE;
819     bExchanged       = FALSE;
820     bNeedRepartition = FALSE;
821
822     step     = ir->init_step;
823     step_rel = 0;
824
825     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
826             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
827             simulationsShareState,
828             MASTER(cr),
829             ir->nstlist,
830             mdrunOptions.reproducible,
831             nstSignalComm,
832             mdrunOptions.maximumHoursToRun,
833             ir->nstlist == 0,
834             fplog,
835             step,
836             bNS,
837             walltime_accounting);
838
839     auto checkpointHandler = std::make_unique<CheckpointHandler>(
840             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
841             simulationsShareState,
842             ir->nstlist == 0,
843             MASTER(cr),
844             mdrunOptions.writeConfout,
845             mdrunOptions.checkpointOptions.period);
846
847     const bool resetCountersIsLocal = true;
848     auto       resetHandler         = std::make_unique<ResetHandler>(
849             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
850             !resetCountersIsLocal,
851             ir->nsteps,
852             MASTER(cr),
853             mdrunOptions.timingOptions.resetHalfway,
854             mdrunOptions.maximumHoursToRun,
855             mdlog,
856             wcycle,
857             walltime_accounting);
858
859     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
860
861     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
862     {
863         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
864     }
865
866     /* and stop now if we should */
867     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
868     while (!bLastStep)
869     {
870
871         /* Determine if this is a neighbor search step */
872         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
873
874         if (bPMETune && bNStList)
875         {
876             // This has to be here because PME load balancing is called so early.
877             // TODO: Move to after all booleans are defined.
878             if (useGpuForUpdate && !bFirstStep)
879             {
880                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
881                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
882             }
883             /* PME grid + cut-off optimization with GPUs or PME nodes */
884             pme_loadbal_do(pme_loadbal,
885                            cr,
886                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
887                            fplog,
888                            mdlog,
889                            *ir,
890                            fr,
891                            state->box,
892                            state->x,
893                            wcycle,
894                            step,
895                            step_rel,
896                            &bPMETunePrinting,
897                            simulationWork.useGpuPmePpCommunication);
898         }
899
900         wallcycle_start(wcycle, WallCycleCounter::Step);
901
902         bLastStep = (step_rel == ir->nsteps);
903         t         = t0 + step * ir->delta_t;
904
905         // TODO Refactor this, so that nstfep does not need a default value of zero
906         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
907         {
908             /* find and set the current lambdas */
909             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
910
911             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
912             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
913             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
914                            && (!bFirstStep));
915         }
916
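        // Replica exchange is attempted at its exchange interval, but never on the first or the last step.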
917         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
918                      && do_per_step(step, replExParams.exchangeInterval));
919
920         if (doSimulatedAnnealing)
921         {
922             // TODO: Avoid changing inputrec (#3854)
923             // Simulated annealing updates the reference temperature.
924             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
925             update_annealing_target_temp(nonConstInputrec, t, &upd);
926         }
927
928         /* Stop Center of Mass motion */
929         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
930
931         /* Determine whether or not to do Neighbour Searching */
932         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
933
934         /* Note that the stopHandler will cause termination at nstglobalcomm
935          * steps. Since this coincides with nstcalcenergy, nsttcouple and/or
936          * nstpcouple steps, we have computed the half-step kinetic energy
937          * of the previous step and can always output energies at the last step.
938          */
939         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
940
941         /* do_log triggers energy and virial calculation. Because this leads
942          * to different code paths, forces can be different. Thus for exact
943          * continuation we should avoid extra log output.
944          * Note that the || bLastStep can result in non-exact continuation
945          * beyond the last step. But we don't consider that to be an issue.
946          */
947         do_log     = (do_per_step(step, ir->nstlog)
948                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
949         do_verbose = mdrunOptions.verbose
950                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
951
952         if (useGpuForUpdate && !bFirstStep && bNS)
953         {
954             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
955             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
956             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
957             // Copy coordinates from the GPU when needed at the search step.
958             // NOTE: The cases when coordinates are needed on the CPU for force evaluation are handled in sim_utils.
959             // NOTE: If the coordinates are to be written to an output file, they are also copied separately before the output.
960             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
961             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
962         }
963
964         // We only need to calculate virtual velocities if we are writing them in the current step
965         const bool needVirtualVelocitiesThisStep =
966                 (vsite != nullptr)
967                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
968
969         if (vsite != nullptr)
970         {
971             // Virtual sites need to be updated before domain decomposition and forces are calculated
972             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
973             // md-vv calculates virtual velocities once it has full-step real velocities
974             vsite->construct(state->x,
975                              state->v,
976                              state->box,
977                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
978                                      ? VSiteOperation::PositionsAndVelocities
979                                      : VSiteOperation::Positions);
980             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
981         }
982
983         if (bNS && !(bFirstStep && ir->bContinuation))
984         {
985             bMasterState = FALSE;
986             /* Correct the new box if it is too skewed */
987             if (inputrecDynamicBox(ir))
988             {
989                 if (correct_box(fplog, step, state->box))
990                 {
991                     bMasterState = TRUE;
992                     // If update is offloaded, it should be informed about the box size change
993                     if (useGpuForUpdate)
994                     {
995                         integrator->setPbc(PbcType::Xyz, state->box);
996                     }
997                 }
998             }
999             if (DOMAINDECOMP(cr) && bMasterState)
1000             {
1001                 dd_collect_state(cr->dd, state, state_global);
1002             }
1003
1004             if (DOMAINDECOMP(cr))
1005             {
1006                 /* Repartition the domain decomposition */
1007                 dd_partition_system(fplog,
1008                                     mdlog,
1009                                     step,
1010                                     cr,
1011                                     bMasterState,
1012                                     nstglobalcomm,
1013                                     state_global,
1014                                     top_global,
1015                                     *ir,
1016                                     imdSession,
1017                                     pull_work,
1018                                     state,
1019                                     &f,
1020                                     mdAtoms,
1021                                     &top,
1022                                     fr,
1023                                     vsite,
1024                                     constr,
1025                                     nrnb,
1026                                     wcycle,
1027                                     do_verbose && !bPMETunePrinting);
1028                 upd.updateAfterPartition(state->natoms,
1029                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1030                                                      : gmx::ArrayRef<const unsigned short>(),
1031                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1032                                                  : gmx::ArrayRef<const unsigned short>());
1033             }
1034         }
1035
1036         // Allocate or re-size GPU halo exchange object, if necessary
1037         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1038         {
1039             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1040                                "GPU device manager has to be initialized to use GPU "
1041                                "version of halo exchange.");
1042             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1043         }
1044
1045         if (MASTER(cr) && do_log)
1046         {
1047             gmx::EnergyOutput::printHeader(
1048                     fplog, step, t); /* can we improve the information printed here? */
1049         }
1050
1051         if (ir->efep != FreeEnergyPerturbationType::No)
1052         {
1053             update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1054         }
1055
1056         if (bExchanged)
1057         {
1058             /* We need the kinetic energy at minus the half step for determining
1059              * the full step kinetic energy and possibly for T-coupling.*/
1060             /* This may not be quite working correctly yet . . . . */
1061             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1062             if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd))
1063             {
1064                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1065             }
1066             compute_globals(gstat,
1067                             cr,
1068                             ir,
1069                             fr,
1070                             ekind,
1071                             makeConstArrayRef(state->x),
1072                             makeConstArrayRef(state->v),
1073                             state->box,
1074                             mdatoms,
1075                             nrnb,
1076                             &vcm,
1077                             wcycle,
1078                             enerd,
1079                             nullptr,
1080                             nullptr,
1081                             nullptr,
1082                             nullptr,
1083                             gmx::ArrayRef<real>{},
1084                             &nullSignaller,
1085                             state->box,
1086                             &bSumEkinhOld,
1087                             cglo_flags);
1088             if (DOMAINDECOMP(cr))
1089             {
1090                 checkNumberOfBondedInteractions(
1091                         mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1092             }
1093         }
1094         clear_mat(force_vir);
1095
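        // Decide whether a checkpoint should be written on this step, based on whether it is a
        // neighbour-search, first, or last step.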
1096         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1097
1098         /* Determine the energy and pressure:
1099          * at nstcalcenergy steps and at energy output steps (set below).
1100          */
1101         if (EI_VV(ir->eI) && (!bInitStep))
1102         {
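            // With velocity Verlet, the virial is also needed on the step before a
            // pressure-coupling step, since the coupling acts across the step boundary.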
1103             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1104             bCalcVir      = bCalcEnerStep
1105                        || (ir->epc != PressureCoupling::No
1106                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1107         }
1108         else
1109         {
1110             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1111             bCalcVir      = bCalcEnerStep
1112                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1113         }
1114         bCalcEner = bCalcEnerStep;
1115
1116         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1117
1118         if (do_ene || do_log || bDoReplEx)
1119         {
1120             bCalcVir  = TRUE;
1121             bCalcEner = TRUE;
1122         }
1123
1124         /* Do we need global communication ? */
1125         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1126                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1127
1128         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1129                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1130                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
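        // With multiple time stepping, the normal (non-combined) force buffer is only needed
        // on steps where forces are written to the trajectory.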
1131         if (fr->useMts && !do_per_step(step, ir->nstfout))
1132         {
1133             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1134         }
1135
1136         if (shellfc)
1137         {
1138             /* Now is the time to relax the shells */
1139             relax_shell_flexcon(fplog,
1140                                 cr,
1141                                 ms,
1142                                 mdrunOptions.verbose,
1143                                 enforcedRotation,
1144                                 step,
1145                                 ir,
1146                                 imdSession,
1147                                 pull_work,
1148                                 bNS,
1149                                 force_flags,
1150                                 &top,
1151                                 constr,
1152                                 enerd,
1153                                 state->natoms,
1154                                 state->x.arrayRefWithPadding(),
1155                                 state->v.arrayRefWithPadding(),
1156                                 state->box,
1157                                 state->lambda,
1158                                 &state->hist,
1159                                 &f.view(),
1160                                 force_vir,
1161                                 *mdatoms,
1162                                 nrnb,
1163                                 wcycle,
1164                                 shellfc,
1165                                 fr,
1166                                 runScheduleWork,
1167                                 t,
1168                                 mu_tot,
1169                                 vsite,
1170                                 ddBalanceRegionHandler);
1171         }
1172         else
1173         {
1174             /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias
1175                is updated (or the AWH update will be performed twice for one step when continuing).
1176                It would be best to call this update function from do_md_trajectory_writing but that
1177                would occur after do_force. One would have to divide the update_awh function into one
1178                function applying the AWH force and one doing the AWH bias update. The update AWH
1179                bias function could then be called after do_md_trajectory_writing (then containing
1180                update_awh_history). The checkpointing will in the future probably be moved to the
1181                start of the md loop, which will get rid of this issue. */
1182             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1183             {
1184                 awh->updateHistory(state_global->awhHistory.get());
1185             }
1186
1187             /* The coordinates (x) are shifted (to get whole molecules)
1188              * in do_force.
1189              * This is parallelized as well, and does communication too.
1190              * Check comments in sim_util.c
1191              */
1192             do_force(fplog,
1193                      cr,
1194                      ms,
1195                      *ir,
1196                      awh.get(),
1197                      enforcedRotation,
1198                      imdSession,
1199                      pull_work,
1200                      step,
1201                      nrnb,
1202                      wcycle,
1203                      &top,
1204                      state->box,
1205                      state->x.arrayRefWithPadding(),
1206                      &state->hist,
1207                      &f.view(),
1208                      force_vir,
1209                      mdatoms,
1210                      enerd,
1211                      state->lambda,
1212                      fr,
1213                      runScheduleWork,
1214                      vsite,
1215                      mu_tot,
1216                      t,
1217                      ed ? ed->getLegacyED() : nullptr,
1218                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1219                      ddBalanceRegionHandler);
1220         }
1221
1222         // VV integrators do not need the following velocity half step
1223         // if it is the first step after starting from a checkpoint.
1224         // That is, the half step is needed on all other steps, and
1225         // also the first step when starting from a .tpr file.
1226         if (EI_VV(ir->eI))
1227         {
1228             integrateVVFirstStep(step,
1229                                  bFirstStep,
1230                                  bInitStep,
1231                                  startingBehavior,
1232                                  nstglobalcomm,
1233                                  ir,
1234                                  fr,
1235                                  cr,
1236                                  state,
1237                                  mdatoms,
1238                                  fcdata,
1239                                  &MassQ,
1240                                  &vcm,
1241                                  top_global,
1242                                  top,
1243                                  enerd,
1244                                  ekind,
1245                                  gstat,
1246                                  &last_ekin,
1247                                  bCalcVir,
1248                                  total_vir,
1249                                  shake_vir,
1250                                  force_vir,
1251                                  pres,
1252                                  M,
1253                                  do_log,
1254                                  do_ene,
1255                                  bCalcEner,
1256                                  bGStat,
1257                                  bStopCM,
1258                                  bTrotter,
1259                                  bExchanged,
1260                                  &bSumEkinhOld,
1261                                  &saved_conserved_quantity,
1262                                  &f,
1263                                  &upd,
1264                                  constr,
1265                                  &nullSignaller,
1266                                  trotter_seq,
1267                                  nrnb,
1268                                  mdlog,
1269                                  fplog,
1270                                  wcycle);
1271             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1272             {
1273                 // Positions were calculated earlier
1274                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1275                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1276                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1277             }
1278         }
1279
1280         /* ########  END FIRST UPDATE STEP  ############## */
1281         /* ########  If doing VV, we now have v(dt) ###### */
1282         if (bDoExpanded)
1283         {
1284             /* perform extended ensemble sampling in lambda - we don't
1285                actually move to the new state before outputting
1286                statistics, but if performing simulated tempering, we
1287                do update the velocities and the tau_t. */
1288             // TODO: Avoid changing inputrec (#3854)
1289             // Simulated tempering updates the reference temperature.
1290             // Expanded ensemble without simulated tempering does not change the inputrec.
1291             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1292             lamnew                 = ExpandedEnsembleDynamics(fplog,
1293                                               nonConstInputrec,
1294                                               enerd,
1295                                               state,
1296                                               &MassQ,
1297                                               state->fep_state,
1298                                               state->dfhist,
1299                                               step,
1300                                               state->v.rvec_array(),
1301                                               mdatoms);
1302             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1303             if (MASTER(cr))
1304             {
1305                 copy_df_history(state_global->dfhist, state->dfhist);
1306             }
1307         }
1308
1309         // Copy the coordinates from the GPU for output/checkpointing if the update is offloaded and
1310         // the coordinates have not already been copied for i) search or ii) CPU force tasks.
1311         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1312             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1313                 || checkpointHandler->isCheckpointingStep()))
1314         {
1315             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1316             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1317         }
1318         // Copy velocities if needed for the output/checkpointing.
1319         // NOTE: Copy on the search steps is done at the beginning of the step.
1320         if (useGpuForUpdate && !bNS
1321             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1322         {
1323             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1324             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1325         }
1326         // Copy forces for output if the forces were reduced on the GPU (not the case on virial steps)
1327         // and the update is offloaded, in which case the forces are kept on the GPU for the update
1328         // and have not already been transferred in do_force().
1329         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1330         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1331         //       prior to GPU update.
1332         // TODO: When the output flags are included in the step workload, this copy can be combined with the
1333         //       copy call in do_force(...).
1334         // NOTE: The forces should not be copied here if vsites are present, since they were modified
1335         //       on the host after the D2H copy in do_force(...).
1336         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1337             && do_per_step(step, ir->nstfout))
1338         {
1339             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1340             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1341         }
1342         /* Now we have the energies and forces corresponding to the
1343          * coordinates at time t. We must output all of this before
1344          * the update.
1345          */
1346         do_md_trajectory_writing(fplog,
1347                                  cr,
1348                                  nfile,
1349                                  fnm,
1350                                  step,
1351                                  step_rel,
1352                                  t,
1353                                  ir,
1354                                  state,
1355                                  state_global,
1356                                  observablesHistory,
1357                                  top_global,
1358                                  fr,
1359                                  outf,
1360                                  energyOutput,
1361                                  ekind,
1362                                  f.view().force(),
1363                                  checkpointHandler->isCheckpointingStep(),
1364                                  bRerunMD,
1365                                  bLastStep,
1366                                  mdrunOptions.writeConfout,
1367                                  bSumEkinhOld);
1368         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1369         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1370
1371         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1372         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1373             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1374         {
1375             copy_mat(state->svir_prev, shake_vir);
1376             copy_mat(state->fvir_prev, force_vir);
1377         }
1378
1379         stopHandler->setSignal();
1380         resetHandler->setSignal(walltime_accounting);
1381
1382         if (bGStat || !PAR(cr))
1383         {
1384             /* In parallel we only have to check for checkpointing in steps
1385              * where we do global communication,
1386              * otherwise the other nodes don't know.
1387              */
1388             checkpointHandler->setSignal(walltime_accounting);
1389         }
1390
1391         /* #########   START SECOND UPDATE STEP ################# */
1392
1393         /* At the start of the step, randomize or scale the velocities (if doing VV); the
1394            restrictions on Andersen coupling are checked during preprocessing */
1395
1396         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra information that would need to be passed */
1397         {
1398             gmx_bool bIfRandomize;
1399             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1400             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1401             if (constr && bIfRandomize)
1402             {
1403                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1404             }
1405         }
1406         /* Box is changed in update() when we do pressure coupling,
1407          * but we should still use the old box for energy corrections and when
1408          * writing it to the energy file, so it matches the trajectory files for
1409          * the same timestep above. Make a copy in a separate array.
1410          */
1411         copy_mat(state->box, lastbox);
1412
1413         dvdl_constr = 0;
1414
1415         if (!useGpuForUpdate)
1416         {
1417             wallcycle_start(wcycle, WallCycleCounter::Update);
1418         }
1419         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1420         if (bTrotter)
1421         {
1422             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1423             /* We can only do Berendsen coupling after we have summed
1424              * the kinetic energy or virial. Since this happens
1425              * in global_state after update, we should only do it at
1426              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1427              */
1428         }
1429         else
1430         {
1431             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1432             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1433         }
1434
1435         /* With leap-frog type integrators we compute the kinetic energy
1436          * at a whole time step as the average of the half-time step kinetic
1437          * energies of two subsequent steps. Therefore we also need to compute the
1438          * half-step kinetic energy when energies are needed at the next step.
1439          */
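             // i.e. Ekin(t) is taken as [Ekin(t - dt/2) + Ekin(t + dt/2)] / 2, which is why the
             // half-step kinetic energy is also computed one step before energies are required.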
1440         const bool needHalfStepKineticEnergy =
1441                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1442
1443         // Parrinello-Rahman requires the pressure to be available before the update to compute
1444         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
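             // For example, assuming do_per_step(step, n) means "step % n == 0", with nstpcouple = 10
             // this condition is met on steps 1, 11, 21, ..., one step after steps 0, 10, 20, ...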
1445         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1446                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1447
1448         if (EI_VV(ir->eI))
1449         {
1450             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1451
1452             integrateVVSecondStep(step,
1453                                   ir,
1454                                   fr,
1455                                   cr,
1456                                   state,
1457                                   mdatoms,
1458                                   fcdata,
1459                                   &MassQ,
1460                                   &vcm,
1461                                   pull_work,
1462                                   enerd,
1463                                   ekind,
1464                                   gstat,
1465                                   &dvdl_constr,
1466                                   bCalcVir,
1467                                   total_vir,
1468                                   shake_vir,
1469                                   force_vir,
1470                                   pres,
1471                                   M,
1472                                   lastbox,
1473                                   do_log,
1474                                   do_ene,
1475                                   bGStat,
1476                                   &bSumEkinhOld,
1477                                   &f,
1478                                   &cbuf,
1479                                   &upd,
1480                                   constr,
1481                                   &nullSignaller,
1482                                   trotter_seq,
1483                                   nrnb,
1484                                   wcycle);
1485         }
1486         else
1487         {
1488             if (useGpuForUpdate)
1489             {
1490                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1491                 {
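                         // (Re)initialize the GPU integrator: on search steps with domain
                         // decomposition (and on the first step) the local atom set and the
                         // interaction definitions may have changed, so they are set again here.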
1492                     integrator->set(stateGpu->getCoordinates(),
1493                                     stateGpu->getVelocities(),
1494                                     stateGpu->getForces(),
1495                                     top.idef,
1496                                     *mdatoms);
1497
1498                     // Copy data to the GPU after the buffers might have been reinitialized
1499                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1500                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1501                 }
1502
1503                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1504                     && !thisRankHasDuty(cr, DUTY_PME))
1505                 {
1506                     // The PME forces were received on the host, so they have to be copied
1507                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1508                 }
1509                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1510                 {
1511                     // The buffer ops were not offloaded this step, so the forces are on the
1512                     // host and have to be copied
1513                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1514                 }
1515
1516                 const bool doTemperatureScaling =
1517                         (ir->etc != TemperatureCoupling::No
1518                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
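                     // Note: this uses the same one-step offset with respect to nsttcouple as the
                     // doParrinelloRahman condition above uses with respect to nstpcouple.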
1519
1520                 // This applies Leap-Frog, LINCS and SETTLE in succession
1521                 integrator->integrate(
1522                         stateGpu->getForcesReadyOnDeviceEvent(
1523                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1524                         ir->delta_t,
1525                         true,
1526                         bCalcVir,
1527                         shake_vir,
1528                         doTemperatureScaling,
1529                         ekind->tcstat,
1530                         doParrinelloRahman,
1531                         ir->nstpcouple * ir->delta_t,
1532                         M);
1533
1534                 // Copy velocities D2H after update if:
1535                 // - Globals are computed this step (includes the energy output steps).
1536                 // - Temperature is needed for the next step.
1537                 if (bGStat || needHalfStepKineticEnergy)
1538                 {
1539                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1540                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1541                 }
1542             }
1543             else
1544             {
1545                 /* With multiple time stepping we need to do an additional normal
1546                  * update step to obtain the virial, as the actual MTS integration
1547                  * uses an acceleration in which the slow forces are multiplied by mtsFactor.
1548                  * Using that acceleration would result in a virial whose slow-force
1549                  * contribution is a factor mtsFactor too large.
1550                  */
1551                 if (fr->useMts && bCalcVir && constr != nullptr)
1552                 {
1553                     upd.update_for_constraint_virial(
1554                             *ir,
1555                             mdatoms->homenr,
1556                             mdatoms->havePartiallyFrozenAtoms,
1557                             gmx::arrayRefFromArray(mdatoms->invmass, mdatoms->nr),
1558                             gmx::arrayRefFromArray(mdatoms->invMassPerDim, mdatoms->nr),
1559                             *state,
1560                             f.view().forceWithPadding(),
1561                             *ekind);
1562
1563                     constrain_coordinates(constr,
1564                                           do_log,
1565                                           do_ene,
1566                                           step,
1567                                           state,
1568                                           upd.xp()->arrayRefWithPadding(),
1569                                           &dvdl_constr,
1570                                           bCalcVir,
1571                                           shake_vir);
1572                 }
1573
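                     // On MTS steps where the slow forces were computed (step divisible by the
                     // level-1 stepFactor), the combined fast+slow force buffer is used for the
                     // update; on all other steps only the fast forces are used.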
1574                 ArrayRefWithPadding<const RVec> forceCombined =
1575                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1576                                 ? f.view().forceMtsCombinedWithPadding()
1577                                 : f.view().forceWithPadding();
1578                 upd.update_coords(*ir,
1579                                   step,
1580                                   mdatoms->homenr,
1581                                   mdatoms->havePartiallyFrozenAtoms,
1582                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1583                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1584                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1585                                   state,
1586                                   forceCombined,
1587                                   fcdata,
1588                                   ekind,
1589                                   M,
1590                                   etrtPOSITION,
1591                                   cr,
1592                                   constr != nullptr);
1593
1594                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1595
1596                 constrain_coordinates(constr,
1597                                       do_log,
1598                                       do_ene,
1599                                       step,
1600                                       state,
1601                                       upd.xp()->arrayRefWithPadding(),
1602                                       &dvdl_constr,
1603                                       bCalcVir && !fr->useMts,
1604                                       shake_vir);
1605
1606                 upd.update_sd_second_half(*ir,
1607                                           step,
1608                                           &dvdl_constr,
1609                                           mdatoms->homenr,
1610                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1611                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1612                                           state,
1613                                           cr,
1614                                           nrnb,
1615                                           wcycle,
1616                                           constr,
1617                                           do_log,
1618                                           do_ene);
1619                 upd.finish_update(
1620                         *ir, mdatoms->havePartiallyFrozenAtoms, mdatoms->homenr, state, wcycle, constr != nullptr);
1621             }
1622
1623             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1624             {
1625                 updatePrevStepPullCom(pull_work, state);
1626             }
1627
1628             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1629         }
1630
1631         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1632         /* With Leap-Frog we can skip compute_globals at
1633          * non-communication steps, but we need to calculate
1634          * the kinetic energy one step before communication.
1635          */
1636         {
1637             // Organize to do inter-simulation signalling on steps if
1638             // and when algorithms require it.
1639             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1640
1641             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1642             {
1643                 // Copy coordinates when needed to stop the CM motion.
1644                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1645                 {
1646                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1647                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1648                 }
1649                 // Since we're already communicating at this step, we
1650                 // can propagate intra-simulation signals. Note that
1651                 // check_nstglobalcomm is responsible for choosing a
1652                 // value of nstglobalcomm that is one of the ways in which
1653                 // bGStat becomes true, so we can't get into a
1654                 // situation where e.g. checkpointing can't be
1655                 // signalled.
1656                 bool                doIntraSimSignal = true;
1657                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1658
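                     // The CGLO_* flags below select what compute_globals() reduces this step;
                     // energy, COM-stopping, temperature and pressure terms are requested here
                     // only for non-VV integrators (cf. the VV first/second step calls above).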
1659                 compute_globals(
1660                         gstat,
1661                         cr,
1662                         ir,
1663                         fr,
1664                         ekind,
1665                         makeConstArrayRef(state->x),
1666                         makeConstArrayRef(state->v),
1667                         state->box,
1668                         mdatoms,
1669                         nrnb,
1670                         &vcm,
1671                         wcycle,
1672                         enerd,
1673                         force_vir,
1674                         shake_vir,
1675                         total_vir,
1676                         pres,
1677                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1678                                                                            : gmx::ArrayRef<real>{},
1679                         &signaller,
1680                         lastbox,
1681                         &bSumEkinhOld,
1682                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1683                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1684                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1685                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1686                                 | (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd)
1687                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1688                                            : 0));
1689                 if (DOMAINDECOMP(cr))
1690                 {
1691                     checkNumberOfBondedInteractions(
1692                             mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1693                 }
1694                 if (!EI_VV(ir->eI) && bStopCM)
1695                 {
1696                     process_and_stopcm_grp(
1697                             fplog, &vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
1698                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1699
1700                     // TODO: The special case of removing CM motion should be dealt with more gracefully
1701                     if (useGpuForUpdate)
1702                     {
1703                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1704                         // Here we block until the H2D copy completes because event sync with the
1705                         // force kernels that use the coordinates on the next steps is not implemented
1706                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1707                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1708                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1709                         if (vcm.mode != ComRemovalAlgorithm::No)
1710                         {
1711                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1712                         }
1713                     }
1714                 }
1715             }
1716         }
1717
1718         /* #############  END CALC EKIN AND PRESSURE ################# */
1719
1720         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1721            the virial that should probably be addressed eventually. state->veta has better properties,
1722            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1723            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1724
1725         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1726         {
1727             /* Sum up the foreign energy and dK/dl terms for md and sd.
1728                Currently done every step so that dH/dl is correct in the .edr */
1729             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1730         }
1731
1732         update_pcouple_after_coordinates(
1733                 fplog, step, ir, mdatoms, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1734
1735         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1736                                                   && do_per_step(step, inputrec->nstpcouple));
1737         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1738                                                  && do_per_step(step, inputrec->nstpcouple));
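             // When the update runs on the GPU, the scaling matrix from the pressure coupling above
             // is applied to the device-side coordinates (and, for C-rescale, its inverse to the
             // velocities), after which the PBC information is reset for the scaled box.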
1739         if (useGpuForUpdate
1740             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1741         {
1742             integrator->scaleCoordinates(pressureCouplingMu);
1743             if (doCRescalePressureCoupling)
1744             {
1745                 matrix pressureCouplingInvMu;
1746                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1747                 integrator->scaleVelocities(pressureCouplingInvMu);
1748             }
1749             integrator->setPbc(PbcType::Xyz, state->box);
1750         }
1751
1752         /* ################# END UPDATE STEP 2 ################# */
1753         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1754
1755         /* The coordinates (x) were unshifted in update */
1756         if (!bGStat)
1757         {
1758             /* We will not sum ekinh_old,
1759              * so signal that we still have to do it.
1760              */
1761             bSumEkinhOld = TRUE;
1762         }
1763
1764         if (bCalcEner)
1765         {
1766             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1767
1768             /* use the directly determined full-step kinetic energy, not the average of the half-step values */
1769             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1770             {
1771                 enerd->term[F_EKIN] = last_ekin;
1772             }
1773             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1774
1775             if (integratorHasConservedEnergyQuantity(ir))
1776             {
1777                 if (EI_VV(ir->eI))
1778                 {
1779                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1780                 }
1781                 else
1782                 {
1783                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1784                 }
1785             }
1786             /* #########  END PREPARING EDR OUTPUT  ###########  */
1787         }
1788
1789         /* Output stuff */
1790         if (MASTER(cr))
1791         {
1792             if (fplog && do_log && bDoExpanded)
1793             {
1794                 /* only needed if doing expanded ensemble */
1795                 PrintFreeEnergyInfoToFile(fplog,
1796                                           ir->fepvals.get(),
1797                                           ir->expandedvals.get(),
1798                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1799                                           state_global->dfhist,
1800                                           state->fep_state,
1801                                           ir->nstlog,
1802                                           step);
1803             }
1804             if (bCalcEner)
1805             {
1806                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1807                                                  bCalcEnerStep,
1808                                                  t,
1809                                                  mdatoms->tmass,
1810                                                  enerd,
1811                                                  ir->fepvals.get(),
1812                                                  ir->expandedvals.get(),
1813                                                  lastbox,
1814                                                  PTCouplingArrays{ state->boxv,
1815                                                                    state->nosehoover_xi,
1816                                                                    state->nosehoover_vxi,
1817                                                                    state->nhpres_xi,
1818                                                                    state->nhpres_vxi },
1819                                                  state->fep_state,
1820                                                  shake_vir,
1821                                                  force_vir,
1822                                                  total_vir,
1823                                                  pres,
1824                                                  ekind,
1825                                                  mu_tot,
1826                                                  constr);
1827             }
1828             else
1829             {
1830                 energyOutput.recordNonEnergyStep();
1831             }
1832
1833             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1834             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1835
1836             if (doSimulatedAnnealing)
1837             {
1838                 gmx::EnergyOutput::printAnnealingTemperatures(
1839                         do_log ? fplog : nullptr, groups, &(ir->opts));
1840             }
1841             if (do_log || do_ene || do_dr || do_or)
1842             {
1843                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1844                                                    do_ene,
1845                                                    do_dr,
1846                                                    do_or,
1847                                                    do_log ? fplog : nullptr,
1848                                                    step,
1849                                                    t,
1850                                                    fr->fcdata.get(),
1851                                                    awh.get());
1852             }
1853             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1854             {
1855                 const bool isInitialOutput = false;
1856                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1857             }
1858
1859             if (ir->bPull)
1860             {
1861                 pull_print_output(pull_work, step, t);
1862             }
1863
1864             if (do_per_step(step, ir->nstlog))
1865             {
1866                 if (fflush(fplog) != 0)
1867                 {
1868                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1869                 }
1870             }
1871         }
1872         if (bDoExpanded)
1873         {
1874             /* Have to do this part _after_ outputting the logfile and the edr file */
1875             /* Gets written into the state at the beginning of the next loop */
1876             state->fep_state = lamnew;
1877         }
1878         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1879         {
1880             state->fep_state = awh->fepLambdaState();
1881         }
1882         /* Print the remaining wall clock time for the run */
1883         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1884         {
1885             if (shellfc)
1886             {
1887                 fprintf(stderr, "\n");
1888             }
1889             print_time(stderr, walltime_accounting, step, ir, cr);
1890         }
1891
1892         /* Ion/water position swapping.
1893          * Not done in the last step since trajectory writing happens before this call
1894          * in the MD loop and exchanges would be lost anyway. */
1895         bNeedRepartition = FALSE;
1896         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1897             && do_per_step(step, ir->swap->nstswap))
1898         {
1899             bNeedRepartition = do_swapcoords(cr,
1900                                              step,
1901                                              t,
1902                                              ir,
1903                                              swap,
1904                                              wcycle,
1905                                              as_rvec_array(state->x.data()),
1906                                              state->box,
1907                                              MASTER(cr) && mdrunOptions.verbose,
1908                                              bRerunMD);
1909
1910             if (bNeedRepartition && DOMAINDECOMP(cr))
1911             {
1912                 dd_collect_state(cr->dd, state, state_global);
1913             }
1914         }
1915
1916         /* Replica exchange */
1917         bExchanged = FALSE;
1918         if (bDoReplEx)
1919         {
1920             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1921         }
1922
1923         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1924         {
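                 // The global state was modified by replica exchange or position swapping above,
                 // so redo the domain decomposition and refresh the updater's freeze-group and
                 // temperature-coupling group references for the new local atom set.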
1925             dd_partition_system(fplog,
1926                                 mdlog,
1927                                 step,
1928                                 cr,
1929                                 TRUE,
1930                                 1,
1931                                 state_global,
1932                                 top_global,
1933                                 *ir,
1934                                 imdSession,
1935                                 pull_work,
1936                                 state,
1937                                 &f,
1938                                 mdAtoms,
1939                                 &top,
1940                                 fr,
1941                                 vsite,
1942                                 constr,
1943                                 nrnb,
1944                                 wcycle,
1945                                 FALSE);
1946             upd.updateAfterPartition(state->natoms,
1947                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1948                                                  : gmx::ArrayRef<const unsigned short>(),
1949                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1950                                              : gmx::ArrayRef<const unsigned short>());
1951         }
1952
1953         bFirstStep = FALSE;
1954         bInitStep  = FALSE;
1955
1956         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1957         /* With all integrators, except VV, we need to retain the pressure
1958          * at the current step for coupling at the next step.
1959          */
1960         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1961             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1962         {
1963             /* Store the pressure in t_state for pressure coupling
1964              * at the next MD step.
1965              */
1966             copy_mat(pres, state->pres_prev);
1967         }
1968
1969         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1970
1971         if ((membed != nullptr) && (!bLastStep))
1972         {
1973             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1974         }
1975
1976         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1977         if (DOMAINDECOMP(cr) && wcycle)
1978         {
1979             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1980         }
1981
1982         /* increase the MD step number */
1983         step++;
1984         step_rel++;
1985
1986 #if GMX_FAHCORE
1987         if (MASTER(cr))
1988         {
1989             fcReportProgress(ir->nsteps + ir->init_step, step);
1990         }
1991 #endif
1992
1993         resetHandler->resetCounters(
1994                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1995
1996         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to the VMD client */
1997         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1998     }
1999     /* End of main MD loop */
2000
2001     /* Closing TNG files can include compressing data. Therefore it is good to do that
2002      * before stopping the time measurements. */
2003     mdoutf_tng_close(outf);
2004
2005     /* Stop measuring walltime */
2006     walltime_accounting_end_time(walltime_accounting);
2007
2008     if (!thisRankHasDuty(cr, DUTY_PME))
2009     {
2010         /* Tell the PME only node to finish */
2011         gmx_pme_send_finish(cr);
2012     }
2013
2014     if (MASTER(cr))
2015     {
2016         if (ir->nstcalcenergy > 0)
2017         {
2018             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2019
2020             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2021             energyOutput.printAverages(fplog, groups);
2022         }
2023     }
2024     done_mdoutf(outf);
2025
2026     if (bPMETune)
2027     {
2028         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2029     }
2030
2031     done_shellfc(fplog, shellfc, step_rel);
2032
2033     if (useReplicaExchange && MASTER(cr))
2034     {
2035         print_replica_exchange_statistics(fplog, repl_ex);
2036     }
2037
2038     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2039
2040     global_stat_destroy(gstat);
2041 }