src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/applied_forces/awh/read_params.h"
  57 #include "gromacs/commandline/filenm.h"
  58 #include "gromacs/domdec/collect.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localtopologychecker.h"
  65 #include "gromacs/domdec/mdsetup.h"
  66 #include "gromacs/domdec/partition.h"
  67 #include "gromacs/essentialdynamics/edsam.h"
  68 #include "gromacs/ewald/pme_load_balancing.h"
  69 #include "gromacs/ewald/pme_pp.h"
  70 #include "gromacs/fileio/trxio.h"
  71 #include "gromacs/gmxlib/network.h"
  72 #include "gromacs/gmxlib/nrnb.h"
  73 #include "gromacs/gpu_utils/device_stream_manager.h"
  74 #include "gromacs/gpu_utils/gpu_utils.h"
  75 #include "gromacs/imd/imd.h"
  76 #include "gromacs/listed_forces/listed_forces.h"
  77 #include "gromacs/math/functions.h"
  78 #include "gromacs/math/invertmatrix.h"
  79 #include "gromacs/math/vec.h"
  80 #include "gromacs/math/vectypes.h"
  81 #include "gromacs/mdlib/checkpointhandler.h"
  82 #include "gromacs/mdlib/compute_io.h"
  83 #include "gromacs/mdlib/constr.h"
  84 #include "gromacs/mdlib/coupling.h"
  85 #include "gromacs/mdlib/ebin.h"
  86 #include "gromacs/mdlib/enerdata_utils.h"
  87 #include "gromacs/mdlib/energyoutput.h"
  88 #include "gromacs/mdlib/expanded.h"
  89 #include "gromacs/mdlib/force.h"
  90 #include "gromacs/mdlib/force_flags.h"
  91 #include "gromacs/mdlib/forcerec.h"
  92 #include "gromacs/mdlib/freeenergyparameters.h"
  93 #include "gromacs/mdlib/md_support.h"
  94 #include "gromacs/mdlib/mdatoms.h"
  95 #include "gromacs/mdlib/mdoutf.h"
  96 #include "gromacs/mdlib/membed.h"
  97 #include "gromacs/mdlib/resethandler.h"
  98 #include "gromacs/mdlib/sighandler.h"
  99 #include "gromacs/mdlib/simulationsignal.h"
 100 #include "gromacs/mdlib/stat.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdlib/tgroup.h"
 103 #include "gromacs/mdlib/trajectory_writing.h"
 104 #include "gromacs/mdlib/update.h"
 105 #include "gromacs/mdlib/update_constrain_gpu.h"
 106 #include "gromacs/mdlib/update_vv.h"
 107 #include "gromacs/mdlib/vcm.h"
 108 #include "gromacs/mdlib/vsite.h"
 109 #include "gromacs/mdrunutility/freeenergy.h"
 110 #include "gromacs/mdrunutility/handlerestart.h"
 111 #include "gromacs/mdrunutility/multisim.h"
 112 #include "gromacs/mdrunutility/printtime.h"
 113 #include "gromacs/mdtypes/awh_history.h"
 114 #include "gromacs/mdtypes/awh_params.h"
 115 #include "gromacs/mdtypes/commrec.h"
 116 #include "gromacs/mdtypes/df_history.h"
 117 #include "gromacs/mdtypes/energyhistory.h"
 118 #include "gromacs/mdtypes/fcdata.h"
 119 #include "gromacs/mdtypes/forcebuffers.h"
 120 #include "gromacs/mdtypes/forcerec.h"
 121 #include "gromacs/mdtypes/group.h"
 122 #include "gromacs/mdtypes/inputrec.h"
 123 #include "gromacs/mdtypes/interaction_const.h"
 124 #include "gromacs/mdtypes/md_enums.h"
 125 #include "gromacs/mdtypes/mdatom.h"
 126 #include "gromacs/mdtypes/mdrunoptions.h"
 127 #include "gromacs/mdtypes/multipletimestepping.h"
 128 #include "gromacs/mdtypes/observableshistory.h"
 129 #include "gromacs/mdtypes/observablesreducer.h"
 130 #include "gromacs/mdtypes/pullhistory.h"
 131 #include "gromacs/mdtypes/simulation_workload.h"
 132 #include "gromacs/mdtypes/state.h"
 133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 134 #include "gromacs/modularsimulator/energydata.h"
 135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 136 #include "gromacs/nbnxm/nbnxm.h"
 137 #include "gromacs/pbcutil/pbc.h"
 138 #include "gromacs/pulling/output.h"
 139 #include "gromacs/pulling/pull.h"
 140 #include "gromacs/swap/swapcoords.h"
 141 #include "gromacs/timing/wallcycle.h"
 142 #include "gromacs/timing/walltime_accounting.h"
 143 #include "gromacs/topology/atoms.h"
 144 #include "gromacs/topology/idef.h"
 145 #include "gromacs/topology/mtop_util.h"
 146 #include "gromacs/topology/topology.h"
 147 #include "gromacs/trajectory/trajectoryframe.h"
 148 #include "gromacs/utility/basedefinitions.h"
 149 #include "gromacs/utility/cstringutil.h"
 150 #include "gromacs/utility/fatalerror.h"
 151 #include "gromacs/utility/logger.h"
 152 #include "gromacs/utility/real.h"
 153 #include "gromacs/utility/smalloc.h"
 154
 155 #include "legacysimulator.h"
 156 #include "replicaexchange.h"
 157 #include "shellfc.h"
 158
 159 using gmx::SimulationSignaller;
 160
 161 void gmx::LegacySimulator::do_md()
 162 {
 163     // TODO Historically, the EM and MD "integrators" used different
 164     // names for the t_inputrec *parameter, but these must have the
 165     // same name, now that it's a member of a struct. We use this ir
 166     // alias to avoid a large ripple of nearly useless changes.
 167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 168     // will go away eventually.
 169     const t_inputrec* ir = inputrec;
 170
 171     double       t, t0 = ir->init_t;
 172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 174     gmx_bool     bDoExpanded = FALSE;
 175     gmx_bool     do_ene, do_log, do_verbose;
 176     gmx_bool     bMasterState;
 177     unsigned int force_flags;
 178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 179     int    i, m;
 180     rvec   mu_tot;
 181     matrix pressureCouplingMu, M;
 182     gmx_repl_ex_t     repl_ex = nullptr;
 183     gmx_global_stat_t gstat;
 184     gmx_shellfc_t*    shellfc;
 185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 186     gmx_bool          bTrotter;
 187     real              dvdl_constr;
 188     std::vector<RVec> cbuf;
 189     matrix            lastbox;
 190     int               lamnew = 0;
 191     /* for FEP */
 192     double    cycles;
 193     real      saved_conserved_quantity = 0;
 194     real      last_ekin                = 0;
 195     t_extmass MassQ;
 196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 197
 198     /* PME load balancing data for GPU kernels */
 199     gmx_bool bPMETune         = FALSE;
 200     gmx_bool bPMETunePrinting = FALSE;
 201
 202     bool bInteractiveMDstep = false;
 203
 204     SimulationSignals signals;
 205     // Most global communication stages don't propagate mdrun
 206     // signals, and will use this object to achieve that.
 207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 208
 209     if (!mdrunOptions.writeConfout)
 210     {
 211         // This is on by default, and the main known use case for
 212         // turning it off is for convenience in benchmarking, which is
 213         // something that should not show up in the general user
 214         // interface.
 215         GMX_LOG(mdlog.info)
 216                 .asParagraph()
 217                 .appendText(
 218                         "The -noconfout functionality is deprecated, and may be removed in a "
 219                         "future version.");
 220     }
 221
 222     /* md-vv uses averaged full step velocities for T-control
 223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 225     bTrotter = (EI_VV(ir->eI)
 226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 227
 228     const bool bRerunMD = false;
 229
 230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 231     bGStatEveryStep   = (nstglobalcomm == 1);
 232
 233     const SimulationGroups* groups = &top_global.groups;
 234
 235     std::unique_ptr<EssentialDynamics> ed = nullptr;
 236     if (opt2bSet("-ei", nfile, fnm))
 237     {
 238         /* Initialize essential dynamics sampling */
 239         ed = init_edsam(mdlog,
 240                         opt2fn_null("-ei", nfile, fnm),
 241                         opt2fn("-eo", nfile, fnm),
 242                         top_global,
 243                         *ir,
 244                         cr,
 245                         constr,
 246                         state_global,
 247                         observablesHistory,
 248                         oenv,
 249                         startingBehavior);
 250     }
 251     else if (observablesHistory->edsamHistory)
 252     {
 253         gmx_fatal(FARGS,
 254                   "The checkpoint is from a run with essential dynamics sampling, "
 255                   "but the current run did not specify the -ei option. "
 256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 257     }
 258
 259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 261     initialize_lambdas(fplog,
 262                        ir->efep,
 263                        ir->bSimTemp,
 264                        *ir->fepvals,
 265                        ir->simtempvals->temperatures,
 266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
 267                        MASTER(cr),
 268                        fep_state,
 269                        lambda);
 270     Update upd(*ir, deform);
 271     bool   doSimulatedAnnealing = false;
 272     {
 273         // TODO: Avoid changing inputrec (#3854)
 274         // Simulated annealing updates the reference temperature.
 275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
 277     }
 278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 279
 280     t_fcdata& fcdata = *fr->fcdata;
 281
 282     bool simulationsShareState = false;
 283     int  nstSignalComm         = nstglobalcomm;
 284     {
 285         // TODO This implementation of ensemble orientation restraints is nasty because
 286         // a user can't just do multi-sim with single-sim orientation restraints.
 287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
 288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
 289
 290         // Replica exchange, ensemble restraints and AWH need all
 291         // simulations to remain synchronized, so they need
 292         // checkpoints and stop conditions to act on the same step, so
 293         // the propagation of such signals must take place between
 294         // simulations, not just within simulations.
 295         // TODO: Make algorithm initializers set these flags.
 296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 297
 298         if (simulationsShareState)
 299         {
 300             // Inter-simulation signal communication does not need to happen
 301             // often, so we use a minimum of 200 steps to reduce overhead.
 302             const int c_minimumInterSimulationSignallingInterval = 200;
 303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 304                             * nstglobalcomm;
 305         }
 306     }
 307
 308     if (startingBehavior != StartingBehavior::RestartWithAppending)
 309     {
 310         pleaseCiteCouplingAlgorithms(fplog, *ir);
 311     }
 312     gmx_mdoutf*       outf = init_mdoutf(fplog,
 313                                    nfile,
 314                                    fnm,
 315                                    mdrunOptions,
 316                                    cr,
 317                                    outputProvider,
 318                                    mdModulesNotifiers,
 319                                    ir,
 320                                    top_global,
 321                                    oenv,
 322                                    wcycle,
 323                                    startingBehavior,
 324                                    simulationsShareState,
 325                                    ms);
 326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
 327                                    top_global,
 328                                    *ir,
 329                                    pull_work,
 330                                    mdoutf_get_fp_dhdl(outf),
 331                                    false,
 332                                    startingBehavior,
 333                                    simulationsShareState,
 334                                    mdModulesNotifiers);
 335
 336     gstat = global_stat_init(ir);
 337
 338     const auto& simulationWork     = runScheduleWork->simulationWork;
 339     const bool  useGpuForPme       = simulationWork.useGpuPme;
 340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 343
 344     /* Check for polarizable models and flexible constraints */
 345     shellfc = init_shell_flexcon(fplog,
 346                                  top_global,
 347                                  constr ? constr->numFlexibleConstraints() : 0,
 348                                  ir->nstcalcenergy,
 349                                  DOMAINDECOMP(cr),
 350                                  useGpuForPme);
 351
 352     {
 353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
 354         if ((io > 2000) && MASTER(cr))
 355         {
 356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 357         }
 358     }
 359
 360     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
 361
 362     ForceBuffers     f(simulationWork.useMts,
 363                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 364                                ? PinningPolicy::PinnedIfSupported
 365                                : PinningPolicy::CannotBePinned);
 366     const t_mdatoms* md = mdAtoms->mdatoms();
 367     if (DOMAINDECOMP(cr))
 368     {
 369         // Local state only becomes valid now.
 370         dd_init_local_state(*cr->dd, state_global, state);
 371
 372         /* Distribute the charge groups over the nodes from the master node */
 373         dd_partition_system(fplog,
 374                             mdlog,
 375                             ir->init_step,
 376                             cr,
 377                             TRUE,
 378                             1,
 379                             state_global,
 380                             top_global,
 381                             *ir,
 382                             imdSession,
 383                             pull_work,
 384                             state,
 385                             &f,
 386                             mdAtoms,
 387                             top,
 388                             fr,
 389                             vsite,
 390                             constr,
 391                             nrnb,
 392                             nullptr,
 393                             FALSE);
 394         upd.updateAfterPartition(state->natoms,
 395                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 396                                              : gmx::ArrayRef<const unsigned short>(),
 397                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 398                                          : gmx::ArrayRef<const unsigned short>());
 399         fr->longRangeNonbondeds->updateAfterPartition(*md);
 400     }
 401     else
 402     {
 403         state_change_natoms(state_global, state_global->natoms);
 404
 405         /* Generate and initialize new topology */
 406         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
 407
 408         upd.updateAfterPartition(state->natoms,
 409                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 410                                              : gmx::ArrayRef<const unsigned short>(),
 411                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 412                                          : gmx::ArrayRef<const unsigned short>());
 413         fr->longRangeNonbondeds->updateAfterPartition(*md);
 414     }
 415
 416     std::unique_ptr<UpdateConstrainGpu> integrator;
 417
 418     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 419
 420     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 421     if (useGpuForUpdate)
 422     {
 423         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 424                                    || constr->numConstraintsTotal() == 0,
 425                            "Constraints in domain decomposition are only supported with update "
 426                            "groups if using GPU update.\n");
 427         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
 428                                    || constr->numConstraintsTotal() == 0,
 429                            "SHAKE is not supported with GPU update.");
 430         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 431                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 432                            "the GPU to use GPU update.\n");
 433         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
 434                            "Only the md integrator is supported with the GPU update.\n");
 435         GMX_RELEASE_ASSERT(
 436                 ir->etc != TemperatureCoupling::NoseHoover,
 437                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 438         GMX_RELEASE_ASSERT(
 439                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
 440                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
 441                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 442                 "with the GPU update.\n");
 443         GMX_RELEASE_ASSERT(!md->haveVsites,
 444                            "Virtual sites are not supported with the GPU update.\n");
 445         GMX_RELEASE_ASSERT(ed == nullptr,
 446                            "Essential dynamics is not supported with the GPU update.\n");
 447         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 448                            "Constraints pulling is not supported with the GPU update.\n");
 449         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
 450                            "Orientation restraints are not supported with the GPU update.\n");
 451         GMX_RELEASE_ASSERT(
 452                 ir->efep == FreeEnergyPerturbationType::No
 453                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
 454                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 455                 "update.");
 456
 457         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 458         {
 459             GMX_LOG(mdlog.info)
 460                     .asParagraph()
 461                     .appendText("Updating coordinates and applying constraints on the GPU.");
 462         }
 463         else
 464         {
 465             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 466         }
 467         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 468                            "Device stream manager should be initialized in order to use GPU "
 469                            "update-constraints.");
 470         GMX_RELEASE_ASSERT(
 471                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 472                 "Update stream should be initialized in order to use GPU "
 473                 "update-constraints.");
 474         integrator = std::make_unique<UpdateConstrainGpu>(
 475                 *ir,
 476                 top_global,
 477                 ekind->ngtc,
 478                 fr->deviceStreamManager->context(),
 479                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 480                 wcycle);
 481
 482         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
 483
 484         integrator->setPbc(PbcType::Xyz, state->box);
 485     }
 486
 487     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 488     {
 489         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 490     }
 491     if (useGpuForUpdate)
 492     {
 493         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 494     }
 495
 496     // NOTE: The global state is no longer used at this point.
 497     // But state_global is still used as temporary storage space for writing
 498     // the global state to file and potentially for replica exchange.
 499     // (Global topology should persist.)
 500
 501     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
 502
 503     if (ir->bExpanded)
 504     {
 505         /* Check nstexpanded here, because the grompp check was broken */
 506         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 507         {
 508             gmx_fatal(FARGS,
 509                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 510         }
 511         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 512     }
 513
 514     if (MASTER(cr))
 515     {
 516         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 517     }
 518
 519     preparePrevStepPullCom(ir,
 520                            pull_work,
 521                            gmx::arrayRefFromArray(md->massT, md->nr),
 522                            state,
 523                            state_global,
 524                            cr,
 525                            startingBehavior != StartingBehavior::NewSimulation);
 526
 527     // TODO: Remove this by converting AWH into a ForceProvider
 528     auto awh = prepareAwhModule(fplog,
 529                                 *ir,
 530                                 state_global,
 531                                 cr,
 532                                 ms,
 533                                 startingBehavior != StartingBehavior::NewSimulation,
 534                                 shellfc != nullptr,
 535                                 opt2fn("-awh", nfile, fnm),
 536                                 pull_work);
 537
 538     if (useReplicaExchange && MASTER(cr))
 539     {
 540         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
 541     }
 542     /* PME tuning is only supported in the Verlet scheme, with PME for
 543      * Coulomb. It is not supported with only LJ PME. */
 544     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 545                 && ir->cutoff_scheme != CutoffScheme::Group);
 546
 547     pme_load_balancing_t* pme_loadbal = nullptr;
 548     if (bPMETune)
 549     {
 550         pme_loadbal_init(
 551                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
 552     }
 553
 554     if (!ir->bContinuation)
 555     {
 556         if (state->flags & enumValueToBitMask(StateEntry::V))
 557         {
 558             auto v = makeArrayRef(state->v);
 559             /* Set the velocities of vsites, shells and frozen atoms to zero */
 560             for (i = 0; i < md->homenr; i++)
 561             {
 562                 if (md->ptype[i] == ParticleType::Shell)
 563                 {
 564                     clear_rvec(v[i]);
 565                 }
 566                 else if (md->cFREEZE)
 567                 {
 568                     for (m = 0; m < DIM; m++)
 569                     {
 570                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
 571                         {
 572                             v[i][m] = 0;
 573                         }
 574                     }
 575                 }
 576             }
 577         }
 578
 579         if (constr)
 580         {
 581             /* Constrain the initial coordinates and velocities */
 582             do_constrain_first(fplog,
 583                                constr,
 584                                ir,
 585                                md->nr,
 586                                md->homenr,
 587                                state->x.arrayRefWithPadding(),
 588                                state->v.arrayRefWithPadding(),
 589                                state->box,
 590                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
 591         }
 592     }
 593
 594     const int nstfep = computeFepPeriod(*ir, replExParams);
 595
 596     /* Be REALLY careful about what flags you set here. You CANNOT assume
 597      * this is the first step, since we might be restarting from a checkpoint,
 598      * and in that case we should not do any modifications to the state.
 599      */
 600     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
 601
 602     // When restarting from a checkpoint, it can be appropriate to
 603     // initialize ekind from quantities in the checkpoint. Otherwise,
 604     // compute_globals must initialize ekind before the simulation
 605     // starts/restarts. However, only the master rank knows what was
 606     // found in the checkpoint file, so we have to communicate in
 607     // order to coordinate the restart.
 608     //
 609     // TODO Consider removing this communication if/when checkpoint
 610     // reading directly follows .tpr reading, because all ranks can
 611     // agree on hasReadEkinState at that time.
 612     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 613     if (PAR(cr))
 614     {
 615         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 616     }
 617     if (hasReadEkinState)
 618     {
 619         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 620     }
 621
 622     unsigned int cglo_flags =
 623             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 624              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 625
 626     bSumEkinhOld = FALSE;
 627
 628     t_vcm vcm(top_global.groups, *ir);
 629     reportComRemovalInfo(fplog, vcm);
 630
 631     int64_t step     = ir->init_step;
 632     int64_t step_rel = 0;
 633
 634     /* To minimize communication, compute_globals computes the COM velocity
 635      * and the kinetic energy for the velocities without COM motion removed.
 636      * Thus to get the kinetic energy without the COM contribution, we need
 637      * to call compute_globals twice.
 638      */
 639     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 640     {
 641         unsigned int cglo_flags_iteration = cglo_flags;
 642         if (bStopCM && cgloIteration == 0)
 643         {
 644             cglo_flags_iteration |= CGLO_STOPCM;
 645             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 646         }
 647         compute_globals(gstat,
 648                         cr,
 649                         ir,
 650                         fr,
 651                         ekind,
 652                         makeConstArrayRef(state->x),
 653                         makeConstArrayRef(state->v),
 654                         state->box,
 655                         md,
 656                         nrnb,
 657                         &vcm,
 658                         nullptr,
 659                         enerd,
 660                         force_vir,
 661                         shake_vir,
 662                         total_vir,
 663                         pres,
 664                         gmx::ArrayRef<real>{},
 665                         &nullSignaller,
 666                         state->box,
 667                         &bSumEkinhOld,
 668                         cglo_flags_iteration,
 669                         step,
 670                         &observablesReducer);
 671         // Clean up after pre-step use of compute_globals()
 672         observablesReducer.markAsReadyToReduce();
 673
 674         if (cglo_flags_iteration & CGLO_STOPCM)
 675         {
 676             /* At initialization, do not pass x with acceleration-correction mode
 677              * to avoid (incorrect) correction of the initial coordinates.
 678              */
 679             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
 680                              ? ArrayRef<RVec>()
 681                              : makeArrayRef(state->x);
 682             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
 683             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
 684         }
 685     }
 686     if (ir->eI == IntegrationAlgorithm::VVAK)
 687     {
 688         /* a second call to get the half step temperature initialized as well */
 689         /* we do the same call as above, but turn the pressure off -- internally to
 690            compute_globals, this is recognized as a velocity verlet half-step
 691            kinetic energy calculation.  This minimizes excess variables, but
 692            perhaps loses some logic?*/
 693
 694         compute_globals(gstat,
 695                         cr,
 696                         ir,
 697                         fr,
 698                         ekind,
 699                         makeConstArrayRef(state->x),
 700                         makeConstArrayRef(state->v),
 701                         state->box,
 702                         md,
 703                         nrnb,
 704                         &vcm,
 705                         nullptr,
 706                         enerd,
 707                         force_vir,
 708                         shake_vir,
 709                         total_vir,
 710                         pres,
 711                         gmx::ArrayRef<real>{},
 712                         &nullSignaller,
 713                         state->box,
 714                         &bSumEkinhOld,
 715                         cglo_flags & ~CGLO_PRESSURE,
 716                         step,
 717                         &observablesReducer);
 718         // Clean up after pre-step use of compute_globals()
 719         observablesReducer.markAsReadyToReduce();
 720     }
 721
 722     /* Calculate the initial half step temperature, and save the ekinh_old */
 723     if (startingBehavior == StartingBehavior::NewSimulation)
 724     {
 725         for (i = 0; (i < ir->opts.ngtc); i++)
 726         {
 727             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 728         }
 729     }
 730
 731     /* need to make an initialization call to get the Trotter variables set, as well as other constants
 732        for non-trotter temperature control */
 733     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 734
 735     if (MASTER(cr))
 736     {
 737         if (!ir->bContinuation)
 738         {
 739             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
 740             {
 741                 fprintf(fplog,
 742                         "RMS relative constraint deviation after constraining: %.2e\n",
 743                         constr->rmsd());
 744             }
 745             if (EI_STATE_VELOCITY(ir->eI))
 746             {
 747                 real temp = enerd->term[F_TEMP];
 748                 if (ir->eI != IntegrationAlgorithm::VV)
 749                 {
 750                     /* Result of Ekin averaged over velocities of -half
 751                      * and +half step, while we only have -half step here.
 752                      */
 753                     temp *= 2;
 754                 }
 755                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 756             }
 757         }
 758
 759         char tbuf[20];
 760         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
 761         if (ir->nsteps >= 0)
 762         {
 763             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 764         }
 765         else
 766         {
 767             sprintf(tbuf, "%s", "infinite");
 768         }
 769         if (ir->init_step > 0)
 770         {
 771             fprintf(stderr,
 772                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 773                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
 774                     tbuf,
 775                     gmx_step_str(ir->init_step, sbuf2),
 776                     ir->init_step * ir->delta_t);
 777         }
 778         else
 779         {
 780             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 781         }
 782         fprintf(fplog, "\n");
 783     }
 784
 785     walltime_accounting_start_time(walltime_accounting);
 786     wallcycle_start(wcycle, WallCycleCounter::Run);
 787     print_start(fplog, cr, walltime_accounting, "mdrun");
 788
 789     /***********************************************************
 790      *
 791      *             Loop over MD steps
 792      *
 793      ************************************************************/
 794
 795     bFirstStep = TRUE;
 796     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 797     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 798     bSumEkinhOld     = FALSE;
 799     bExchanged       = FALSE;
 800     bNeedRepartition = FALSE;
 801
 802     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 803             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
 804             simulationsShareState,
 805             MASTER(cr),
 806             ir->nstlist,
 807             mdrunOptions.reproducible,
 808             nstSignalComm,
 809             mdrunOptions.maximumHoursToRun,
 810             ir->nstlist == 0,
 811             fplog,
 812             step,
 813             bNS,
 814             walltime_accounting);
 815
 816     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 817             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
 818             simulationsShareState,
 819             ir->nstlist == 0,
 820             MASTER(cr),
 821             mdrunOptions.writeConfout,
 822             mdrunOptions.checkpointOptions.period);
 823
 824     const bool resetCountersIsLocal = true;
 825     auto       resetHandler         = std::make_unique<ResetHandler>(
 826             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 827             !resetCountersIsLocal,
 828             ir->nsteps,
 829             MASTER(cr),
 830             mdrunOptions.timingOptions.resetHalfway,
 831             mdrunOptions.maximumHoursToRun,
 832             mdlog,
 833             wcycle,
 834             walltime_accounting);
 835
 836     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 837
 838     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 839     {
 840         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 841     }
 842
 843     /* and stop now if we should */
 844     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 845     while (!bLastStep)
 846     {
 847
 848         /* Determine if this is a neighbor search step */
 849         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 850
 851         if (bPMETune && bNStList)
 852         {
 853             // This has to be here because PME load balancing is called so early.
 854             // TODO: Move to after all booleans are defined.
 855             if (useGpuForUpdate && !bFirstStep)
 856             {
 857                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 858                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 859             }
 860             /* PME grid + cut-off optimization with GPUs or PME nodes */
 861             pme_loadbal_do(pme_loadbal,
 862                            cr,
 863                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 864                            fplog,
 865                            mdlog,
 866                            *ir,
 867                            fr,
 868                            state->box,
 869                            state->x,
 870                            wcycle,
 871                            step,
 872                            step_rel,
 873                            &bPMETunePrinting,
 874                            simulationWork.useGpuPmePpCommunication);
 875         }
 876
 877         wallcycle_start(wcycle, WallCycleCounter::Step);
 878
 879         bLastStep = (step_rel == ir->nsteps);
 880         t         = t0 + step * ir->delta_t;
 881
 882         // TODO Refactor this, so that nstfep does not need a default value of zero
 883         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
 884         {
 885             /* find and set the current lambdas */
 886             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 887
 888             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 889                            && (!bFirstStep));
 890         }
 891
 892         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 893                      && do_per_step(step, replExParams.exchangeInterval));
 894
 895         if (doSimulatedAnnealing)
 896         {
 897             // TODO: Avoid changing inputrec (#3854)
 898             // Simulated annealing updates the reference temperature.
 899             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 900             update_annealing_target_temp(nonConstInputrec, t, &upd);
 901         }
 902
 903         /* Stop Center of Mass motion */
 904         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
 905
 906         /* Determine whether or not to do Neighbour Searching */
 907         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 908
 909         /* Note that the stopHandler will cause termination at nstglobalcomm
 910          * steps. Since this coincides with nstcalcenergy, nsttcouple and/or
 911          * nstpcouple steps, we have computed the half-step kinetic energy
 912          * of the previous step and can always output energies at the last step.
 913          */
 914         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 915
 916         /* do_log triggers energy and virial calculation. Because this leads
 917          * to different code paths, forces can be different. Thus for exact
 918          * continuation we should avoid extra log output.
 919          * Note that the || bLastStep can result in non-exact continuation
 920          * beyond the last step. But we don't consider that to be an issue.
 921          */
 922         do_log     = (do_per_step(step, ir->nstlog)
 923                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 924         do_verbose = mdrunOptions.verbose
 925                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 926
 927         if (useGpuForUpdate && !bFirstStep && bNS)
 928         {
 929             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 930             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 931             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 932             // Copy coordinates from the GPU when needed at the search step.
 933             // NOTE: The cases when coordinates are needed on the CPU for force evaluation are handled in sim_utils.
 934             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 935             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 936             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 937         }
 938
 939         // We only need to calculate virtual velocities if we are writing them in the current step
 940         const bool needVirtualVelocitiesThisStep =
 941                 (vsite != nullptr)
 942                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
 943
 944         if (vsite != nullptr)
 945         {
 946             // Virtual sites need to be updated before domain decomposition and forces are calculated
 947             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
 948             // md-vv calculates virtual velocities once it has full-step real velocities
 949             vsite->construct(state->x,
 950                              state->v,
 951                              state->box,
 952                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
 953                                      ? VSiteOperation::PositionsAndVelocities
 954                                      : VSiteOperation::Positions);
 955             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
 956         }
 957
 958         if (bNS && !(bFirstStep && ir->bContinuation))
 959         {
 960             bMasterState = FALSE;
 961             /* Correct the new box if it is too skewed */
 962             if (inputrecDynamicBox(ir))
 963             {
 964                 if (correct_box(fplog, step, state->box))
 965                 {
 966                     bMasterState = TRUE;
 967                     // If update is offloaded, it should be informed about the box size change
 968                     if (useGpuForUpdate)
 969                     {
 970                         integrator->setPbc(PbcType::Xyz, state->box);
 971                     }
 972                 }
 973             }
 974             if (DOMAINDECOMP(cr) && bMasterState)
 975             {
 976                 dd_collect_state(cr->dd, state, state_global);
 977             }
 978
 979             if (DOMAINDECOMP(cr))
 980             {
 981                 /* Repartition the domain decomposition */
 982                 dd_partition_system(fplog,
 983                                     mdlog,
 984                                     step,
 985                                     cr,
 986                                     bMasterState,
 987                                     nstglobalcomm,
 988                                     state_global,
 989                                     top_global,
 990                                     *ir,
 991                                     imdSession,
 992                                     pull_work,
 993                                     state,
 994                                     &f,
 995                                     mdAtoms,
 996                                     top,
 997                                     fr,
 998                                     vsite,
 999                                     constr,
1000                                     nrnb,
1001                                     wcycle,
1002                                     do_verbose && !bPMETunePrinting);
1003                 upd.updateAfterPartition(state->natoms,
1004                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1005                                                      : gmx::ArrayRef<const unsigned short>(),
1006                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1007                                                  : gmx::ArrayRef<const unsigned short>());
1008                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1009             }
1010         }
1011
1012         // Allocate or re-size GPU halo exchange object, if necessary
1013         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1014         {
1015             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1016                                "GPU device manager has to be initialized to use GPU "
1017                                "version of halo exchange.");
1018             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1019         }
1020
1021         if (MASTER(cr) && do_log)
1022         {
1023             gmx::EnergyOutput::printHeader(
1024                     fplog, step, t); /* can we improve the information printed here? */
1025         }
1026
1027         if (ir->efep != FreeEnergyPerturbationType::No)
1028         {
1029             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1030         }
1031
1032         if (bExchanged)
1033         {
1034             /* We need the kinetic energy at minus the half step for determining
1035              * the full step kinetic energy and possibly for T-coupling.*/
1036             /* This may not be quite working correctly yet . . . . */
1037             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1038             compute_globals(gstat,
1039                             cr,
1040                             ir,
1041                             fr,
1042                             ekind,
1043                             makeConstArrayRef(state->x),
1044                             makeConstArrayRef(state->v),
1045                             state->box,
1046                             md,
1047                             nrnb,
1048                             &vcm,
1049                             wcycle,
1050                             enerd,
1051                             nullptr,
1052                             nullptr,
1053                             nullptr,
1054                             nullptr,
1055                             gmx::ArrayRef<real>{},
1056                             &nullSignaller,
1057                             state->box,
1058                             &bSumEkinhOld,
1059                             cglo_flags,
1060                             step,
1061                             &observablesReducer);
1062         }
1063         clear_mat(force_vir);
1064
1065         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1066
1067         /* Determine the energy and pressure:
1068          * at nstcalcenergy steps and at energy output steps (set below).
1069          */
1070         if (EI_VV(ir->eI) && (!bInitStep))
1071         {
1072             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1073             bCalcVir      = bCalcEnerStep
1074                        || (ir->epc != PressureCoupling::No
1075                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1076         }
1077         else
1078         {
1079             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1080             bCalcVir      = bCalcEnerStep
1081                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1082         }
1083         bCalcEner = bCalcEnerStep;
1084
1085         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1086
1087         if (do_ene || do_log || bDoReplEx)
1088         {
1089             bCalcVir  = TRUE;
1090             bCalcEner = TRUE;
1091         }
1092
1093         // bCalcEner is only here for when the last step is not a multiple of nstfep
1094         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1095                                   && (do_per_step(step, nstfep) || bCalcEner));
1096
1097         /* Do we need global communication ? */
1098         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1099                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1100
1101         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1102                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1103                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1104         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1105         {
1106             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1107             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1108         }
1109
1110         if (shellfc)
1111         {
1112             /* Now is the time to relax the shells */
1113             relax_shell_flexcon(fplog,
1114                                 cr,
1115                                 ms,
1116                                 mdrunOptions.verbose,
1117                                 enforcedRotation,
1118                                 step,
1119                                 ir,
1120                                 imdSession,
1121                                 pull_work,
1122                                 bNS,
1123                                 force_flags,
1124                                 top,
1125                                 constr,
1126                                 enerd,
1127                                 state->natoms,
1128                                 state->x.arrayRefWithPadding(),
1129                                 state->v.arrayRefWithPadding(),
1130                                 state->box,
1131                                 state->lambda,
1132                                 &state->hist,
1133                                 &f.view(),
1134                                 force_vir,
1135                                 *md,
1136                                 fr->longRangeNonbondeds.get(),
1137                                 nrnb,
1138                                 wcycle,
1139                                 shellfc,
1140                                 fr,
1141                                 runScheduleWork,
1142                                 t,
1143                                 mu_tot,
1144                                 vsite,
1145                                 ddBalanceRegionHandler);
1146         }
1147         else
1148         {
1149             /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias
1150                is updated (or the AWH update will be performed twice for one step when continuing).
1151                It would be best to call this update function from do_md_trajectory_writing but that
1152                would occur after do_force. One would have to divide the update_awh function into one
1153                function applying the AWH force and one doing the AWH bias update. The update AWH
1154                bias function could then be called after do_md_trajectory_writing (then containing
1155                update_awh_history). The checkpointing will in the future probably be moved to the start
1156                of the md loop which will get rid of this issue. */
1157             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1158             {
1159                 awh->updateHistory(state_global->awhHistory.get());
1160             }
1161
1162             /* The coordinates (x) are shifted (to get whole molecules)
1163              * in do_force.
1164              * This is parallelized as well, and does communication too.
1165              * Check comments in sim_util.c
1166              */
1167             do_force(fplog,
1168                      cr,
1169                      ms,
1170                      *ir,
1171                      awh.get(),
1172                      enforcedRotation,
1173                      imdSession,
1174                      pull_work,
1175                      step,
1176                      nrnb,
1177                      wcycle,
1178                      top,
1179                      state->box,
1180                      state->x.arrayRefWithPadding(),
1181                      &state->hist,
1182                      &f.view(),
1183                      force_vir,
1184                      md,
1185                      enerd,
1186                      state->lambda,
1187                      fr,
1188                      runScheduleWork,
1189                      vsite,
1190                      mu_tot,
1191                      t,
1192                      ed ? ed->getLegacyED() : nullptr,
1193                      fr->longRangeNonbondeds.get(),
1194                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1195                      ddBalanceRegionHandler);
1196         }
1197
1198         // VV integrators do not need the following velocity half step
1199         // if it is the first step after starting from a checkpoint.
1200         // That is, the half step is needed on all other steps, and
1201         // also the first step when starting from a .tpr file.
1202         if (EI_VV(ir->eI))
1203         {
1204             integrateVVFirstStep(step,
1205                                  bFirstStep,
1206                                  bInitStep,
1207                                  startingBehavior,
1208                                  nstglobalcomm,
1209                                  ir,
1210                                  fr,
1211                                  cr,
1212                                  state,
1213                                  mdAtoms->mdatoms(),
1214                                  &fcdata,
1215                                  &MassQ,
1216                                  &vcm,
1217                                  enerd,
1218                                  &observablesReducer,
1219                                  ekind,
1220                                  gstat,
1221                                  &last_ekin,
1222                                  bCalcVir,
1223                                  total_vir,
1224                                  shake_vir,
1225                                  force_vir,
1226                                  pres,
1227                                  M,
1228                                  do_log,
1229                                  do_ene,
1230                                  bCalcEner,
1231                                  bGStat,
1232                                  bStopCM,
1233                                  bTrotter,
1234                                  bExchanged,
1235                                  &bSumEkinhOld,
1236                                  &saved_conserved_quantity,
1237                                  &f,
1238                                  &upd,
1239                                  constr,
1240                                  &nullSignaller,
1241                                  trotter_seq,
1242                                  nrnb,
1243                                  fplog,
1244                                  wcycle);
1245             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1246             {
1247                 // Positions were calculated earlier
1248                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1249                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1250                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1251             }
1252         }
1253
1254         /* ########  END FIRST UPDATE STEP  ############## */
1255         /* ########  If doing VV, we now have v(dt) ###### */
1256         if (bDoExpanded)
1257         {
1258             /* perform extended ensemble sampling in lambda - we don't
1259                actually move to the new state before outputting
1260                statistics, but if performing simulated tempering, we
1261                do update the velocities and the tau_t. */
1262             // TODO: Avoid changing inputrec (#3854)
1263             // Simulated tempering updates the reference temperature.
1264             // Expanded ensemble without simulated tempering does not change the inputrec.
1265             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1266             lamnew                 = ExpandedEnsembleDynamics(fplog,
1267                                               nonConstInputrec,
1268                                               enerd,
1269                                               state,
1270                                               &MassQ,
1271                                               state->fep_state,
1272                                               state->dfhist,
1273                                               step,
1274                                               state->v.rvec_array(),
1275                                               md->homenr,
1276                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1277                                                                       : gmx::ArrayRef<const unsigned short>());
1278             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1279             if (MASTER(cr))
1280             {
1281                 copy_df_history(state_global->dfhist, state->dfhist);
1282             }
1283         }
1284
1285         // Copy coordinates from the GPU for the output/checkpointing if the update is offloaded and
1286         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1287         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1288             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1289                 || checkpointHandler->isCheckpointingStep()))
1290         {
1291             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1292             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1293         }
1294         // Copy velocities if needed for the output/checkpointing.
1295         // NOTE: Copy on the search steps is done at the beginning of the step.
1296         if (useGpuForUpdate && !bNS
1297             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1298         {
1299             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1300             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1301         }
1302         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1303         // and update is offloaded hence forces are kept on the GPU for update and have not been
1304         // already transferred in do_force().
1305         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1306         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1307         //       prior to GPU update.
1308         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1309         //       copy call in do_force(...).
1310         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1311         //       on host after the D2H copy in do_force(...).
1312         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1313             && do_per_step(step, ir->nstfout))
1314         {
1315             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1316             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1317         }
1318         /* Now we have the energies and forces corresponding to the
1319          * coordinates at time t. We must output all of this before
1320          * the update.
1321          */
1322         do_md_trajectory_writing(fplog,
1323                                  cr,
1324                                  nfile,
1325                                  fnm,
1326                                  step,
1327                                  step_rel,
1328                                  t,
1329                                  ir,
1330                                  state,
1331                                  state_global,
1332                                  observablesHistory,
1333                                  top_global,
1334                                  fr,
1335                                  outf,
1336                                  energyOutput,
1337                                  ekind,
1338                                  f.view().force(),
1339                                  checkpointHandler->isCheckpointingStep(),
1340                                  bRerunMD,
1341                                  bLastStep,
1342                                  mdrunOptions.writeConfout,
1343                                  bSumEkinhOld);
1344         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1345         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1346
1347         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1348         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1349             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1350         {
1351             copy_mat(state->svir_prev, shake_vir);
1352             copy_mat(state->fvir_prev, force_vir);
1353         }
1354
1355         stopHandler->setSignal();
1356         resetHandler->setSignal(walltime_accounting);
1357
1358         if (bGStat || !PAR(cr))
1359         {
1360             /* In parallel we only have to check for checkpointing in steps
1361              * where we do global communication,
1362              *  otherwise the other nodes don't know.
1363              */
1364             checkpointHandler->setSignal(walltime_accounting);
1365         }
1366
1367         /* #########   START SECOND UPDATE STEP ################# */
1368
1369         /* at the start of step, randomize or scale the velocities (if vv). Restriction of Andersen
1370            is controlled in preprocessing */
1371
1372         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1373         {
1374             gmx_bool bIfRandomize;
1375             bIfRandomize = update_randomize_velocities(ir,
1376                                                        step,
1377                                                        cr,
1378                                                        md->homenr,
1379                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1380                                                                : gmx::ArrayRef<const unsigned short>(),
1381                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1382                                                        state->v,
1383                                                        &upd,
1384                                                        constr);
1385             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1386             if (constr && bIfRandomize)
1387             {
1388                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1389             }
1390         }
1391         /* Box is changed in update() when we do pressure coupling,
1392          * but we should still use the old box for energy corrections and when
1393          * writing it to the energy file, so it matches the trajectory files for
1394          * the same timestep above. Make a copy in a separate array.
1395          */
1396         copy_mat(state->box, lastbox);
1397
1398         dvdl_constr = 0;
1399
1400         if (!useGpuForUpdate)
1401         {
1402             wallcycle_start(wcycle, WallCycleCounter::Update);
1403         }
1404         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1405         if (bTrotter)
1406         {
1407             trotter_update(ir,
1408                            step,
1409                            ekind,
1410                            enerd,
1411                            state,
1412                            total_vir,
1413                            md->homenr,
1414                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1415                                    : gmx::ArrayRef<const unsigned short>(),
1416                            gmx::arrayRefFromArray(md->invmass, md->nr),
1417                            &MassQ,
1418                            trotter_seq,
1419                            TrotterSequence::Three);
1420             /* We can only do Berendsen coupling after we have summed
1421              * the kinetic energy or virial. Since this happens
1422              * in global_state after update, we should only do it at
1423              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1424              */
1425         }
1426         else
1427         {
1428             update_tcouple(step,
1429                            ir,
1430                            state,
1431                            ekind,
1432                            &MassQ,
1433                            md->homenr,
1434                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1435                                    : gmx::ArrayRef<const unsigned short>());
1436             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1437         }
1438
1439         /* With leap-frog type integrators we compute the kinetic energy
1440          * at a whole time step as the average of the half-time step kinetic
1441          * energies of two subsequent steps. Therefore we need to compute the
1442          * half step kinetic energy also if we need energies at the next step.
1443          */
1444         const bool needHalfStepKineticEnergy =
1445                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1446
1447         // Parrinello-Rahman requires the pressure to be available before the update to compute
1448         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1449         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1450                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1451
1452         if (EI_VV(ir->eI))
1453         {
1454             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1455
1456             integrateVVSecondStep(step,
1457                                   ir,
1458                                   fr,
1459                                   cr,
1460                                   state,
1461                                   mdAtoms->mdatoms(),
1462                                   &fcdata,
1463                                   &MassQ,
1464                                   &vcm,
1465                                   pull_work,
1466                                   enerd,
1467                                   &observablesReducer,
1468                                   ekind,
1469                                   gstat,
1470                                   &dvdl_constr,
1471                                   bCalcVir,
1472                                   total_vir,
1473                                   shake_vir,
1474                                   force_vir,
1475                                   pres,
1476                                   M,
1477                                   lastbox,
1478                                   do_log,
1479                                   do_ene,
1480                                   bGStat,
1481                                   &bSumEkinhOld,
1482                                   &f,
1483                                   &cbuf,
1484                                   &upd,
1485                                   constr,
1486                                   &nullSignaller,
1487                                   trotter_seq,
1488                                   nrnb,
1489                                   wcycle);
1490         }
1491         else
1492         {
1493             if (useGpuForUpdate)
1494             {
1495                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1496                 {
1497                     integrator->set(stateGpu->getCoordinates(),
1498                                     stateGpu->getVelocities(),
1499                                     stateGpu->getForces(),
1500                                     top->idef,
1501                                     *md);
1502
1503                     // Copy data to the GPU after buffers might have been reinitialized
1504                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1505                      * the previous step. We don't check that now. */
1506                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1507                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1508                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1509                     {
1510                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1511                     }
1512                 }
1513
1514                 if ((simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1515                     || (!runScheduleWork->stepWork.useGpuFBufferOps))
1516                 {
1517                     // The PME forces were received on the host, and reduced on the CPU with the
1518                     // rest of the forces computed on the GPU, so the final forces have to be copied
1519                     // back to the GPU. Or the buffer ops were not offloaded this step, so the
1520                     // forces are on the host and have to be copied
1521                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1522                 }
1523                 const bool doTemperatureScaling =
1524                         (ir->etc != TemperatureCoupling::No
1525                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1526
1527                 // This applies Leap-Frog, LINCS and SETTLE in succession
1528                 integrator->integrate(
1529                         stateGpu->getForcesReadyOnDeviceEvent(
1530                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1531                         ir->delta_t,
1532                         true,
1533                         bCalcVir,
1534                         shake_vir,
1535                         doTemperatureScaling,
1536                         ekind->tcstat,
1537                         doParrinelloRahman,
1538                         ir->nstpcouple * ir->delta_t,
1539                         M);
1540
1541                 // Copy velocities D2H after update if:
1542                 // - Globals are computed this step (includes the energy output steps).
1543                 // - Temperature is needed for the next step.
1544                 if (bGStat || needHalfStepKineticEnergy)
1545                 {
1546                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1547                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1548                 }
1549             }
1550             else
1551             {
1552                 /* With multiple time stepping we need to do an additional normal
1553                  * update step to obtain the virial, as the actual MTS integration
1554                  * uses an acceleration where the slow forces are multiplied by mtsFactor.
1555                  * Using that acceleration would result in a virial in which the slow
1556                  * force contribution is a factor mtsFactor too large.
1557                  */
1558                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1559                 {
1560                     upd.update_for_constraint_virial(*ir,
1561                                                      md->homenr,
1562                                                      md->havePartiallyFrozenAtoms,
1563                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1564                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1565                                                      *state,
1566                                                      f.view().forceWithPadding(),
1567                                                      *ekind);
1568
1569                     constrain_coordinates(constr,
1570                                           do_log,
1571                                           do_ene,
1572                                           step,
1573                                           state,
1574                                           upd.xp()->arrayRefWithPadding(),
1575                                           &dvdl_constr,
1576                                           bCalcVir,
1577                                           shake_vir);
1578                 }
1579
1580                 ArrayRefWithPadding<const RVec> forceCombined =
1581                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1582                                 ? f.view().forceMtsCombinedWithPadding()
1583                                 : f.view().forceWithPadding();
1584                 upd.update_coords(*ir,
1585                                   step,
1586                                   md->homenr,
1587                                   md->havePartiallyFrozenAtoms,
1588                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1589                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1590                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1591                                   state,
1592                                   forceCombined,
1593                                   &fcdata,
1594                                   ekind,
1595                                   M,
1596                                   etrtPOSITION,
1597                                   cr,
1598                                   constr != nullptr);
1599
1600                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1601
1602                 constrain_coordinates(constr,
1603                                       do_log,
1604                                       do_ene,
1605                                       step,
1606                                       state,
1607                                       upd.xp()->arrayRefWithPadding(),
1608                                       &dvdl_constr,
1609                                       bCalcVir && !simulationWork.useMts,
1610                                       shake_vir);
1611
1612                 upd.update_sd_second_half(*ir,
1613                                           step,
1614                                           &dvdl_constr,
1615                                           md->homenr,
1616                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1617                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1618                                           state,
1619                                           cr,
1620                                           nrnb,
1621                                           wcycle,
1622                                           constr,
1623                                           do_log,
1624                                           do_ene);
1625                 upd.finish_update(
1626                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1627             }
1628
1629             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1630             {
1631                 updatePrevStepPullCom(pull_work, state);
1632             }
1633
1634             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1635         }
1636
1637         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1638         /* With Leap-Frog we can skip compute_globals at
1639          * non-communication steps, but we need to calculate
1640          * the kinetic energy one step before communication.
1641          */
1642         {
1643             // Organize to do inter-simulation signalling on steps if
1644             // and when algorithms require it.
1645             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1646
1647             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1648             {
1649                 // Copy coordinates when needed to stop the CM motion.
1650                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1651                 {
1652                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1653                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1654                 }
1655                 // Since we're already communicating at this step, we
1656                 // can propagate intra-simulation signals. Note that
1657                 // check_nstglobalcomm has the responsibility for
1658                 // choosing the value of nstglobalcomm that is one way
1659                 // bGStat becomes true, so we can't get into a
1660                 // situation where e.g. checkpointing can't be
1661                 // signalled.
1662                 bool                doIntraSimSignal = true;
1663                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1664
1665                 compute_globals(
1666                         gstat,
1667                         cr,
1668                         ir,
1669                         fr,
1670                         ekind,
1671                         makeConstArrayRef(state->x),
1672                         makeConstArrayRef(state->v),
1673                         state->box,
1674                         md,
1675                         nrnb,
1676                         &vcm,
1677                         wcycle,
1678                         enerd,
1679                         force_vir,
1680                         shake_vir,
1681                         total_vir,
1682                         pres,
1683                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1684                                                                            : gmx::ArrayRef<real>{},
1685                         &signaller,
1686                         lastbox,
1687                         &bSumEkinhOld,
1688                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1689                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1690                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1691                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1692                         step,
1693                         &observablesReducer);
1694                 if (!EI_VV(ir->eI) && bStopCM)
1695                 {
1696                     process_and_stopcm_grp(
1697                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1698                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1699
1700                     // TODO: The special case of removing CM motion should be dealt more gracefully
1701                     if (useGpuForUpdate)
1702                     {
1703                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1704                         // Here we block until the H2D copy completes because event sync with the
1705                         // force kernels that use the coordinates on the next steps is not implemented
1706                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1707                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1708                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1709                         if (vcm.mode != ComRemovalAlgorithm::No)
1710                         {
1711                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1712                         }
1713                     }
1714                 }
1715             }
1716         }
1717
1718         /* #############  END CALC EKIN AND PRESSURE ################# */
1719
1720         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1721            the virial that should probably be addressed eventually. state->veta has better properties,
1722            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1723            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1724
1725         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1726         {
1727             /* Sum up the foreign energy and dK/dl terms for md and sd.
1728                Currently done every step so that dH/dl is correct in the .edr */
1729             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1730         }
1731
1732         update_pcouple_after_coordinates(fplog,
1733                                          step,
1734                                          ir,
1735                                          md->homenr,
1736                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1737                                                      : gmx::ArrayRef<const unsigned short>(),
1738                                          pres,
1739                                          force_vir,
1740                                          shake_vir,
1741                                          pressureCouplingMu,
1742                                          state,
1743                                          nrnb,
1744                                          upd.deform(),
1745                                          !useGpuForUpdate);
1746
1747         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1748                                                   && do_per_step(step, inputrec->nstpcouple));
1749         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1750                                                  && do_per_step(step, inputrec->nstpcouple));
1751         if (useGpuForUpdate
1752             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1753         {
1754             integrator->scaleCoordinates(pressureCouplingMu);
1755             if (doCRescalePressureCoupling)
1756             {
1757                 matrix pressureCouplingInvMu;
1758                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1759                 integrator->scaleVelocities(pressureCouplingInvMu);
1760             }
1761             integrator->setPbc(PbcType::Xyz, state->box);
1762         }
1763
1764         /* ################# END UPDATE STEP 2 ################# */
1765         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1766
1767         /* The coordinates (x) were unshifted in update */
1768         if (!bGStat)
1769         {
1770             /* We will not sum ekinh_old,
1771              * so signal that we still have to do it.
1772              */
1773             bSumEkinhOld = TRUE;
1774         }
1775
1776         if (bCalcEner)
1777         {
1778             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1779
1780             /* use the directly determined last velocity, not actually the averaged half steps */
1781             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1782             {
1783                 enerd->term[F_EKIN] = last_ekin;
1784             }
1785             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1786
1787             if (integratorHasConservedEnergyQuantity(ir))
1788             {
1789                 if (EI_VV(ir->eI))
1790                 {
1791                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1792                 }
1793                 else
1794                 {
1795                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1796                 }
1797             }
1798             /* #########  END PREPARING EDR OUTPUT  ###########  */
1799         }
1800
1801         /* Output stuff */
1802         if (MASTER(cr))
1803         {
1804             if (fplog && do_log && bDoExpanded)
1805             {
1806                 /* only needed if doing expanded ensemble */
1807                 PrintFreeEnergyInfoToFile(fplog,
1808                                           ir->fepvals.get(),
1809                                           ir->expandedvals.get(),
1810                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1811                                           state_global->dfhist,
1812                                           state->fep_state,
1813                                           ir->nstlog,
1814                                           step);
1815             }
1816             if (bCalcEner)
1817             {
1818                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1819
1820                 energyOutput.addDataAtEnergyStep(outputDHDL,
1821                                                  bCalcEnerStep,
1822                                                  t,
1823                                                  md->tmass,
1824                                                  enerd,
1825                                                  ir->fepvals.get(),
1826                                                  ir->expandedvals.get(),
1827                                                  lastbox,
1828                                                  PTCouplingArrays{ state->boxv,
1829                                                                    state->nosehoover_xi,
1830                                                                    state->nosehoover_vxi,
1831                                                                    state->nhpres_xi,
1832                                                                    state->nhpres_vxi },
1833                                                  state->fep_state,
1834                                                  total_vir,
1835                                                  pres,
1836                                                  ekind,
1837                                                  mu_tot,
1838                                                  constr);
1839             }
1840             else
1841             {
1842                 energyOutput.recordNonEnergyStep();
1843             }
1844
1845             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1846             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1847
1848             if (doSimulatedAnnealing)
1849             {
1850                 gmx::EnergyOutput::printAnnealingTemperatures(
1851                         do_log ? fplog : nullptr, groups, &(ir->opts));
1852             }
1853             if (do_log || do_ene || do_dr || do_or)
1854             {
1855                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1856                                                    do_ene,
1857                                                    do_dr,
1858                                                    do_or,
1859                                                    do_log ? fplog : nullptr,
1860                                                    step,
1861                                                    t,
1862                                                    fr->fcdata.get(),
1863                                                    awh.get());
1864             }
1865             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1866             {
1867                 const bool isInitialOutput = false;
1868                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1869             }
1870
1871             if (ir->bPull)
1872             {
1873                 pull_print_output(pull_work, step, t);
1874             }
1875
1876             if (do_per_step(step, ir->nstlog))
1877             {
1878                 if (fflush(fplog) != 0)
1879                 {
1880                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1881                 }
1882             }
1883         }
1884         if (bDoExpanded)
1885         {
1886             /* Have to do this part _after_ outputting the logfile and the edr file */
1887             /* Gets written into the state at the beginning of next loop*/
1888             state->fep_state = lamnew;
1889         }
1890         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1891         {
1892             state->fep_state = awh->fepLambdaState();
1893         }
1894         /* Print the remaining wall clock time for the run */
1895         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1896         {
1897             if (shellfc)
1898             {
1899                 fprintf(stderr, "\n");
1900             }
1901             print_time(stderr, walltime_accounting, step, ir, cr);
1902         }
1903
1904         /* Ion/water position swapping.
1905          * Not done in last step since trajectory writing happens before this call
1906          * in the MD loop and exchanges would be lost anyway. */
1907         bNeedRepartition = FALSE;
1908         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1909             && do_per_step(step, ir->swap->nstswap))
1910         {
1911             bNeedRepartition = do_swapcoords(cr,
1912                                              step,
1913                                              t,
1914                                              ir,
1915                                              swap,
1916                                              wcycle,
1917                                              as_rvec_array(state->x.data()),
1918                                              state->box,
1919                                              MASTER(cr) && mdrunOptions.verbose,
1920                                              bRerunMD);
1921
1922             if (bNeedRepartition && DOMAINDECOMP(cr))
1923             {
1924                 dd_collect_state(cr->dd, state, state_global);
1925             }
1926         }
1927
1928         /* Replica exchange */
1929         bExchanged = FALSE;
1930         if (bDoReplEx)
1931         {
1932             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1933         }
1934
1935         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1936         {
1937             dd_partition_system(fplog,
1938                                 mdlog,
1939                                 step,
1940                                 cr,
1941                                 TRUE,
1942                                 1,
1943                                 state_global,
1944                                 top_global,
1945                                 *ir,
1946                                 imdSession,
1947                                 pull_work,
1948                                 state,
1949                                 &f,
1950                                 mdAtoms,
1951                                 top,
1952                                 fr,
1953                                 vsite,
1954                                 constr,
1955                                 nrnb,
1956                                 wcycle,
1957                                 FALSE);
1958             upd.updateAfterPartition(state->natoms,
1959                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1960                                                  : gmx::ArrayRef<const unsigned short>(),
1961                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1962                                              : gmx::ArrayRef<const unsigned short>());
1963             fr->longRangeNonbondeds->updateAfterPartition(*md);
1964         }
1965
1966         bFirstStep = FALSE;
1967         bInitStep  = FALSE;
1968
1969         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1970         /* With all integrators, except VV, we need to retain the pressure
1971          * at the current step for coupling at the next step.
1972          */
1973         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1974             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1975         {
1976             /* Store the pressure in t_state for pressure coupling
1977              * at the next MD step.
1978              */
1979             copy_mat(pres, state->pres_prev);
1980         }
1981
1982         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1983
1984         if ((membed != nullptr) && (!bLastStep))
1985         {
1986             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1987         }
1988
1989         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1990         if (DOMAINDECOMP(cr) && wcycle)
1991         {
1992             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1993         }
1994
1995         /* increase the MD step number */
1996         step++;
1997         step_rel++;
1998         observablesReducer.markAsReadyToReduce();
1999
2000 #if GMX_FAHCORE
2001         if (MASTER(cr))
2002         {
2003             fcReportProgress(ir->nsteps + ir->init_step, step);
2004         }
2005 #endif
2006
2007         resetHandler->resetCounters(
2008                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2009
2010         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2011         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2012     }
2013     /* End of main MD loop */
2014
2015     /* Closing TNG files can include compressing data. Therefore it is good to do that
2016      * before stopping the time measurements. */
2017     mdoutf_tng_close(outf);
2018
2019     /* Stop measuring walltime */
2020     walltime_accounting_end_time(walltime_accounting);
2021
2022     if (simulationWork.haveSeparatePmeRank)
2023     {
2024         /* Tell the PME only node to finish */
2025         gmx_pme_send_finish(cr);
2026     }
2027
2028     if (MASTER(cr))
2029     {
2030         if (ir->nstcalcenergy > 0)
2031         {
2032             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2033
2034             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2035             energyOutput.printAverages(fplog, groups);
2036         }
2037     }
2038     done_mdoutf(outf);
2039
2040     if (bPMETune)
2041     {
2042         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2043     }
2044
2045     done_shellfc(fplog, shellfc, step_rel);
2046
2047     if (useReplicaExchange && MASTER(cr))
2048     {
2049         print_replica_exchange_statistics(fplog, repl_ex);
2050     }
2051
2052     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2053
2054     global_stat_destroy(gstat);
2055 }