src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/applied_forces/awh/read_params.h"
  57 #include "gromacs/commandline/filenm.h"
  58 #include "gromacs/domdec/collect.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localtopologychecker.h"
  65 #include "gromacs/domdec/mdsetup.h"
  66 #include "gromacs/domdec/partition.h"
  67 #include "gromacs/essentialdynamics/edsam.h"
  68 #include "gromacs/ewald/pme_load_balancing.h"
  69 #include "gromacs/ewald/pme_pp.h"
  70 #include "gromacs/fileio/trxio.h"
  71 #include "gromacs/gmxlib/network.h"
  72 #include "gromacs/gmxlib/nrnb.h"
  73 #include "gromacs/gpu_utils/device_stream_manager.h"
  74 #include "gromacs/gpu_utils/gpu_utils.h"
  75 #include "gromacs/imd/imd.h"
  76 #include "gromacs/listed_forces/listed_forces.h"
  77 #include "gromacs/math/functions.h"
  78 #include "gromacs/math/invertmatrix.h"
  79 #include "gromacs/math/vec.h"
  80 #include "gromacs/math/vectypes.h"
  81 #include "gromacs/mdlib/checkpointhandler.h"
  82 #include "gromacs/mdlib/compute_io.h"
  83 #include "gromacs/mdlib/constr.h"
  84 #include "gromacs/mdlib/coupling.h"
  85 #include "gromacs/mdlib/ebin.h"
  86 #include "gromacs/mdlib/enerdata_utils.h"
  87 #include "gromacs/mdlib/energyoutput.h"
  88 #include "gromacs/mdlib/expanded.h"
  89 #include "gromacs/mdlib/force.h"
  90 #include "gromacs/mdlib/force_flags.h"
  91 #include "gromacs/mdlib/forcerec.h"
  92 #include "gromacs/mdlib/freeenergyparameters.h"
  93 #include "gromacs/mdlib/md_support.h"
  94 #include "gromacs/mdlib/mdatoms.h"
  95 #include "gromacs/mdlib/mdoutf.h"
  96 #include "gromacs/mdlib/membed.h"
  97 #include "gromacs/mdlib/resethandler.h"
  98 #include "gromacs/mdlib/sighandler.h"
  99 #include "gromacs/mdlib/simulationsignal.h"
 100 #include "gromacs/mdlib/stat.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdlib/tgroup.h"
 103 #include "gromacs/mdlib/trajectory_writing.h"
 104 #include "gromacs/mdlib/update.h"
 105 #include "gromacs/mdlib/update_constrain_gpu.h"
 106 #include "gromacs/mdlib/update_vv.h"
 107 #include "gromacs/mdlib/vcm.h"
 108 #include "gromacs/mdlib/vsite.h"
 109 #include "gromacs/mdrunutility/freeenergy.h"
 110 #include "gromacs/mdrunutility/handlerestart.h"
 111 #include "gromacs/mdrunutility/multisim.h"
 112 #include "gromacs/mdrunutility/printtime.h"
 113 #include "gromacs/mdtypes/awh_history.h"
 114 #include "gromacs/mdtypes/awh_params.h"
 115 #include "gromacs/mdtypes/commrec.h"
 116 #include "gromacs/mdtypes/df_history.h"
 117 #include "gromacs/mdtypes/energyhistory.h"
 118 #include "gromacs/mdtypes/fcdata.h"
 119 #include "gromacs/mdtypes/forcebuffers.h"
 120 #include "gromacs/mdtypes/forcerec.h"
 121 #include "gromacs/mdtypes/group.h"
 122 #include "gromacs/mdtypes/inputrec.h"
 123 #include "gromacs/mdtypes/interaction_const.h"
 124 #include "gromacs/mdtypes/md_enums.h"
 125 #include "gromacs/mdtypes/mdatom.h"
 126 #include "gromacs/mdtypes/mdrunoptions.h"
 127 #include "gromacs/mdtypes/multipletimestepping.h"
 128 #include "gromacs/mdtypes/observableshistory.h"
 129 #include "gromacs/mdtypes/pullhistory.h"
 130 #include "gromacs/mdtypes/simulation_workload.h"
 131 #include "gromacs/mdtypes/state.h"
 132 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 133 #include "gromacs/modularsimulator/energydata.h"
 134 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 135 #include "gromacs/nbnxm/nbnxm.h"
 136 #include "gromacs/pbcutil/pbc.h"
 137 #include "gromacs/pulling/output.h"
 138 #include "gromacs/pulling/pull.h"
 139 #include "gromacs/swap/swapcoords.h"
 140 #include "gromacs/timing/wallcycle.h"
 141 #include "gromacs/timing/walltime_accounting.h"
 142 #include "gromacs/topology/atoms.h"
 143 #include "gromacs/topology/idef.h"
 144 #include "gromacs/topology/mtop_util.h"
 145 #include "gromacs/topology/topology.h"
 146 #include "gromacs/trajectory/trajectoryframe.h"
 147 #include "gromacs/utility/basedefinitions.h"
 148 #include "gromacs/utility/cstringutil.h"
 149 #include "gromacs/utility/fatalerror.h"
 150 #include "gromacs/utility/logger.h"
 151 #include "gromacs/utility/real.h"
 152 #include "gromacs/utility/smalloc.h"
 153
 154 #include "legacysimulator.h"
 155 #include "replicaexchange.h"
 156 #include "shellfc.h"
 157
 158 using gmx::SimulationSignaller;
 159
 160 void gmx::LegacySimulator::do_md()
 161 {
 162     // TODO Historically, the EM and MD "integrators" used different
 163     // names for the t_inputrec *parameter, but these must have the
 164     // same name, now that it's a member of a struct. We use this ir
 165     // alias to avoid a large ripple of nearly useless changes.
 166     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 167     // will go away eventually.
 168     const t_inputrec* ir = inputrec;
 169
 170     double       t, t0 = ir->init_t;
 171     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 172     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 173     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 174     gmx_bool     do_ene, do_log, do_verbose;
 175     gmx_bool     bMasterState;
 176     unsigned int force_flags;
 177     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 178     int    i, m;
 179     rvec   mu_tot;
 180     matrix pressureCouplingMu, M;
 181     gmx_repl_ex_t     repl_ex = nullptr;
 182     gmx_global_stat_t gstat;
 183     gmx_shellfc_t*    shellfc;
 184     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 185     gmx_bool          bTrotter;
 186     real              dvdl_constr;
 187     std::vector<RVec> cbuf;
 188     matrix            lastbox;
 189     int               lamnew = 0;
 190     /* for FEP */
 191     double    cycles;
 192     real      saved_conserved_quantity = 0;
 193     real      last_ekin                = 0;
 194     t_extmass MassQ;
 195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 196
 197     /* PME load balancing data for GPU kernels */
 198     gmx_bool bPMETune         = FALSE;
 199     gmx_bool bPMETunePrinting = FALSE;
 200
 201     bool bInteractiveMDstep = false;
 202
 203     SimulationSignals signals;
 204     // Most global communnication stages don't propagate mdrun
 205     // signals, and will use this object to achieve that.
 206     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 207
 208     if (!mdrunOptions.writeConfout)
 209     {
 210         // This is on by default, and the main known use case for
 211         // turning it off is for convenience in benchmarking, which is
 212         // something that should not show up in the general user
 213         // interface.
 214         GMX_LOG(mdlog.info)
 215                 .asParagraph()
 216                 .appendText(
 217                         "The -noconfout functionality is deprecated, and may be removed in a "
 218                         "future version.");
 219     }
 220
 221     /* md-vv uses averaged full step velocities for T-control
 222        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 223        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 224     bTrotter = (EI_VV(ir->eI)
 225                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 226
 227     const bool bRerunMD = false;
 228
 229     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 230     bGStatEveryStep   = (nstglobalcomm == 1);
 231
 232     const SimulationGroups* groups = &top_global.groups;
 233
 234     std::unique_ptr<EssentialDynamics> ed = nullptr;
 235     if (opt2bSet("-ei", nfile, fnm))
 236     {
 237         /* Initialize essential dynamics sampling */
 238         ed = init_edsam(mdlog,
 239                         opt2fn_null("-ei", nfile, fnm),
 240                         opt2fn("-eo", nfile, fnm),
 241                         top_global,
 242                         *ir,
 243                         cr,
 244                         constr,
 245                         state_global,
 246                         observablesHistory,
 247                         oenv,
 248                         startingBehavior);
 249     }
 250     else if (observablesHistory->edsamHistory)
 251     {
 252         gmx_fatal(FARGS,
 253                   "The checkpoint is from a run with essential dynamics sampling, "
 254                   "but the current run did not specify the -ei option. "
 255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 256     }
 257
 258     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 259     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 260     initialize_lambdas(fplog,
 261                        ir->efep,
 262                        ir->bSimTemp,
 263                        *ir->fepvals,
 264                        ir->simtempvals->temperatures,
 265                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
 266                        MASTER(cr),
 267                        fep_state,
 268                        lambda);
 269     Update upd(*ir, deform);
 270     bool   doSimulatedAnnealing = false;
 271     {
 272         // TODO: Avoid changing inputrec (#3854)
 273         // Simulated annealing updates the reference temperature.
 274         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 275         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
 276     }
 277     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 278
 279     const t_fcdata& fcdata = *fr->fcdata;
 280
 281     bool simulationsShareState = false;
 282     int  nstSignalComm         = nstglobalcomm;
 283     {
 284         // TODO This implementation of ensemble orientation restraints is nasty because
 285         // a user can't just do multi-sim with single-sim orientation restraints.
 286         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
 287         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
 288
 289         // Replica exchange, ensemble restraints and AWH need all
 290         // simulations to remain synchronized, so they need
 291         // checkpoints and stop conditions to act on the same step, so
 292         // the propagation of such signals must take place between
 293         // simulations, not just within simulations.
 294         // TODO: Make algorithm initializers set these flags.
 295         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 296
 297         if (simulationsShareState)
 298         {
 299             // Inter-simulation signal communication does not need to happen
 300             // often, so we use a minimum of 200 steps to reduce overhead.
 301             const int c_minimumInterSimulationSignallingInterval = 200;
 302             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 303                             * nstglobalcomm;
 304         }
 305     }
 306
 307     if (startingBehavior != StartingBehavior::RestartWithAppending)
 308     {
 309         pleaseCiteCouplingAlgorithms(fplog, *ir);
 310     }
 311     gmx_mdoutf*       outf = init_mdoutf(fplog,
 312                                    nfile,
 313                                    fnm,
 314                                    mdrunOptions,
 315                                    cr,
 316                                    outputProvider,
 317                                    mdModulesNotifiers,
 318                                    ir,
 319                                    top_global,
 320                                    oenv,
 321                                    wcycle,
 322                                    startingBehavior,
 323                                    simulationsShareState,
 324                                    ms);
 325     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
 326                                    top_global,
 327                                    *ir,
 328                                    pull_work,
 329                                    mdoutf_get_fp_dhdl(outf),
 330                                    false,
 331                                    startingBehavior,
 332                                    simulationsShareState,
 333                                    mdModulesNotifiers);
 334
 335     gstat = global_stat_init(ir);
 336
 337     const auto& simulationWork     = runScheduleWork->simulationWork;
 338     const bool  useGpuForPme       = simulationWork.useGpuPme;
 339     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 340     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 341     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 342
 343     /* Check for polarizable models and flexible constraints */
 344     shellfc = init_shell_flexcon(fplog,
 345                                  top_global,
 346                                  constr ? constr->numFlexibleConstraints() : 0,
 347                                  ir->nstcalcenergy,
 348                                  DOMAINDECOMP(cr),
 349                                  useGpuForPme);
 350
 351     {
 352         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
 353         if ((io > 2000) && MASTER(cr))
 354         {
 355             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 356         }
 357     }
 358
 359     // Local state only becomes valid now.
 360     std::unique_ptr<t_state> stateInstance;
 361     t_state*                 state;
 362
 363     gmx_localtop_t top(top_global.ffparams);
 364
 365     ForceBuffers     f(simulationWork.useMts,
 366                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 367                                ? PinningPolicy::PinnedIfSupported
 368                                : PinningPolicy::CannotBePinned);
 369     const t_mdatoms* md = mdAtoms->mdatoms();
 370     if (DOMAINDECOMP(cr))
 371     {
 372         stateInstance = std::make_unique<t_state>();
 373         state         = stateInstance.get();
 374         dd_init_local_state(*cr->dd, state_global, state);
 375
 376         /* Distribute the charge groups over the nodes from the master node */
 377         dd_partition_system(fplog,
 378                             mdlog,
 379                             ir->init_step,
 380                             cr,
 381                             TRUE,
 382                             1,
 383                             state_global,
 384                             top_global,
 385                             *ir,
 386                             imdSession,
 387                             pull_work,
 388                             state,
 389                             &f,
 390                             mdAtoms,
 391                             &top,
 392                             fr,
 393                             vsite,
 394                             constr,
 395                             nrnb,
 396                             nullptr,
 397                             FALSE);
 398         upd.updateAfterPartition(state->natoms,
 399                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 400                                              : gmx::ArrayRef<const unsigned short>(),
 401                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 402                                          : gmx::ArrayRef<const unsigned short>());
 403     }
 404     else
 405     {
 406         state_change_natoms(state_global, state_global->natoms);
 407         /* Copy the pointer to the global state */
 408         state = state_global;
 409
 410         /* Generate and initialize new topology */
 411         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 412
 413         upd.updateAfterPartition(state->natoms,
 414                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 415                                              : gmx::ArrayRef<const unsigned short>(),
 416                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 417                                          : gmx::ArrayRef<const unsigned short>());
 418     }
 419
 420     std::unique_ptr<UpdateConstrainGpu> integrator;
 421
 422     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 423
 424     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 425     if (useGpuForUpdate)
 426     {
 427         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 428                                    || constr->numConstraintsTotal() == 0,
 429                            "Constraints in domain decomposition are only supported with update "
 430                            "groups if using GPU update.\n");
 431         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
 432                                    || constr->numConstraintsTotal() == 0,
 433                            "SHAKE is not supported with GPU update.");
 434         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 435                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 436                            "the GPU to use GPU update.\n");
 437         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
 438                            "Only the md integrator is supported with the GPU update.\n");
 439         GMX_RELEASE_ASSERT(
 440                 ir->etc != TemperatureCoupling::NoseHoover,
 441                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 442         GMX_RELEASE_ASSERT(
 443                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
 444                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
 445                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 446                 "with the GPU update.\n");
 447         GMX_RELEASE_ASSERT(!md->haveVsites,
 448                            "Virtual sites are not supported with the GPU update.\n");
 449         GMX_RELEASE_ASSERT(ed == nullptr,
 450                            "Essential dynamics is not supported with the GPU update.\n");
 451         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 452                            "Constraints pulling is not supported with the GPU update.\n");
 453         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
 454                            "Orientation restraints are not supported with the GPU update.\n");
 455         GMX_RELEASE_ASSERT(
 456                 ir->efep == FreeEnergyPerturbationType::No
 457                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
 458                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 459                 "update.");
 460
 461         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 462         {
 463             GMX_LOG(mdlog.info)
 464                     .asParagraph()
 465                     .appendText("Updating coordinates and applying constraints on the GPU.");
 466         }
 467         else
 468         {
 469             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 470         }
 471         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 472                            "Device stream manager should be initialized in order to use GPU "
 473                            "update-constraints.");
 474         GMX_RELEASE_ASSERT(
 475                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 476                 "Update stream should be initialized in order to use GPU "
 477                 "update-constraints.");
 478         integrator = std::make_unique<UpdateConstrainGpu>(
 479                 *ir,
 480                 top_global,
 481                 ekind->ngtc,
 482                 fr->deviceStreamManager->context(),
 483                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 484                 wcycle);
 485
 486         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
 487
 488         integrator->setPbc(PbcType::Xyz, state->box);
 489     }
 490
 491     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 492     {
 493         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 494     }
 495     if (useGpuForUpdate)
 496     {
 497         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 498     }
 499
 500     // NOTE: The global state is no longer used at this point.
 501     // But state_global is still used as temporary storage space for writing
 502     // the global state to file and potentially for replica exchange.
 503     // (Global topology should persist.)
 504
 505     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
 506
 507     if (ir->bExpanded)
 508     {
 509         /* Check nstexpanded here, because the grompp check was broken */
 510         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 511         {
 512             gmx_fatal(FARGS,
 513                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 514         }
 515         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 516     }
 517
 518     if (MASTER(cr))
 519     {
 520         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 521     }
 522
 523     preparePrevStepPullCom(ir,
 524                            pull_work,
 525                            gmx::arrayRefFromArray(md->massT, md->nr),
 526                            state,
 527                            state_global,
 528                            cr,
 529                            startingBehavior != StartingBehavior::NewSimulation);
 530
 531     // TODO: Remove this by converting AWH into a ForceProvider
 532     auto awh = prepareAwhModule(fplog,
 533                                 *ir,
 534                                 state_global,
 535                                 cr,
 536                                 ms,
 537                                 startingBehavior != StartingBehavior::NewSimulation,
 538                                 shellfc != nullptr,
 539                                 opt2fn("-awh", nfile, fnm),
 540                                 pull_work);
 541
 542     if (useReplicaExchange && MASTER(cr))
 543     {
 544         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
 545     }
 546     /* PME tuning is only supported in the Verlet scheme, with PME for
 547      * Coulomb. It is not supported with only LJ PME. */
 548     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 549                 && ir->cutoff_scheme != CutoffScheme::Group);
 550
 551     pme_load_balancing_t* pme_loadbal = nullptr;
 552     if (bPMETune)
 553     {
 554         pme_loadbal_init(
 555                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
 556     }
 557
 558     if (!ir->bContinuation)
 559     {
 560         if (state->flags & enumValueToBitMask(StateEntry::V))
 561         {
 562             auto v = makeArrayRef(state->v);
 563             /* Set the velocities of vsites, shells and frozen atoms to zero */
 564             for (i = 0; i < md->homenr; i++)
 565             {
 566                 if (md->ptype[i] == ParticleType::Shell)
 567                 {
 568                     clear_rvec(v[i]);
 569                 }
 570                 else if (md->cFREEZE)
 571                 {
 572                     for (m = 0; m < DIM; m++)
 573                     {
 574                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
 575                         {
 576                             v[i][m] = 0;
 577                         }
 578                     }
 579                 }
 580             }
 581         }
 582
 583         if (constr)
 584         {
 585             /* Constrain the initial coordinates and velocities */
 586             do_constrain_first(fplog,
 587                                constr,
 588                                ir,
 589                                md->nr,
 590                                md->homenr,
 591                                state->x.arrayRefWithPadding(),
 592                                state->v.arrayRefWithPadding(),
 593                                state->box,
 594                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
 595         }
 596     }
 597
 598     const int nstfep = computeFepPeriod(*ir, replExParams);
 599
 600     /* Be REALLY careful about what flags you set here. You CANNOT assume
 601      * this is the first step, since we might be restarting from a checkpoint,
 602      * and in that case we should not do any modifications to the state.
 603      */
 604     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
 605
 606     // When restarting from a checkpoint, it can be appropriate to
 607     // initialize ekind from quantities in the checkpoint. Otherwise,
 608     // compute_globals must initialize ekind before the simulation
 609     // starts/restarts. However, only the master rank knows what was
 610     // found in the checkpoint file, so we have to communicate in
 611     // order to coordinate the restart.
 612     //
 613     // TODO Consider removing this communication if/when checkpoint
 614     // reading directly follows .tpr reading, because all ranks can
 615     // agree on hasReadEkinState at that time.
 616     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 617     if (PAR(cr))
 618     {
 619         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 620     }
 621     if (hasReadEkinState)
 622     {
 623         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 624     }
 625
 626     unsigned int cglo_flags =
 627             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 628              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 629
 630     bSumEkinhOld = FALSE;
 631
 632     t_vcm vcm(top_global.groups, *ir);
 633     reportComRemovalInfo(fplog, vcm);
 634
 635     /* To minimize communication, compute_globals computes the COM velocity
 636      * and the kinetic energy for the velocities without COM motion removed.
 637      * Thus to get the kinetic energy without the COM contribution, we need
 638      * to call compute_globals twice.
 639      */
 640     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 641     {
 642         unsigned int cglo_flags_iteration = cglo_flags;
 643         if (bStopCM && cgloIteration == 0)
 644         {
 645             cglo_flags_iteration |= CGLO_STOPCM;
 646             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 647         }
 648         if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
 649             && cgloIteration == 0)
 650         {
 651             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
 652         }
 653         compute_globals(gstat,
 654                         cr,
 655                         ir,
 656                         fr,
 657                         ekind,
 658                         makeConstArrayRef(state->x),
 659                         makeConstArrayRef(state->v),
 660                         state->box,
 661                         md,
 662                         nrnb,
 663                         &vcm,
 664                         nullptr,
 665                         enerd,
 666                         force_vir,
 667                         shake_vir,
 668                         total_vir,
 669                         pres,
 670                         gmx::ArrayRef<real>{},
 671                         &nullSignaller,
 672                         state->box,
 673                         &bSumEkinhOld,
 674                         cglo_flags_iteration);
 675         if (cglo_flags_iteration & CGLO_STOPCM)
 676         {
 677             /* At initialization, do not pass x with acceleration-correction mode
 678              * to avoid (incorrect) correction of the initial coordinates.
 679              */
 680             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
 681                              ? ArrayRef<RVec>()
 682                              : makeArrayRef(state->x);
 683             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
 684             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
 685         }
 686     }
 687     if (DOMAINDECOMP(cr))
 688     {
 689         dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
 690                 &top, makeConstArrayRef(state->x), state->box);
 691     }
 692     if (ir->eI == IntegrationAlgorithm::VVAK)
 693     {
 694         /* a second call to get the half step temperature initialized as well */
 695         /* we do the same call as above, but turn the pressure off -- internally to
 696            compute_globals, this is recognized as a velocity verlet half-step
 697            kinetic energy calculation.  This minimized excess variables, but
 698            perhaps loses some logic?*/
 699
 700         compute_globals(gstat,
 701                         cr,
 702                         ir,
 703                         fr,
 704                         ekind,
 705                         makeConstArrayRef(state->x),
 706                         makeConstArrayRef(state->v),
 707                         state->box,
 708                         md,
 709                         nrnb,
 710                         &vcm,
 711                         nullptr,
 712                         enerd,
 713                         force_vir,
 714                         shake_vir,
 715                         total_vir,
 716                         pres,
 717                         gmx::ArrayRef<real>{},
 718                         &nullSignaller,
 719                         state->box,
 720                         &bSumEkinhOld,
 721                         cglo_flags & ~CGLO_PRESSURE);
 722     }
 723
 724     /* Calculate the initial half step temperature, and save the ekinh_old */
 725     if (startingBehavior == StartingBehavior::NewSimulation)
 726     {
 727         for (i = 0; (i < ir->opts.ngtc); i++)
 728         {
 729             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 730         }
 731     }
 732
 733     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 734        for non-trotter temperature control */
 735     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 736
 737     if (MASTER(cr))
 738     {
 739         if (!ir->bContinuation)
 740         {
 741             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
 742             {
 743                 fprintf(fplog,
 744                         "RMS relative constraint deviation after constraining: %.2e\n",
 745                         constr->rmsd());
 746             }
 747             if (EI_STATE_VELOCITY(ir->eI))
 748             {
 749                 real temp = enerd->term[F_TEMP];
 750                 if (ir->eI != IntegrationAlgorithm::VV)
 751                 {
 752                     /* Result of Ekin averaged over velocities of -half
 753                      * and +half step, while we only have -half step here.
 754                      */
 755                     temp *= 2;
 756                 }
 757                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 758             }
 759         }
 760
 761         char tbuf[20];
 762         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
 763         if (ir->nsteps >= 0)
 764         {
 765             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 766         }
 767         else
 768         {
 769             sprintf(tbuf, "%s", "infinite");
 770         }
 771         if (ir->init_step > 0)
 772         {
 773             fprintf(stderr,
 774                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 775                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
 776                     tbuf,
 777                     gmx_step_str(ir->init_step, sbuf2),
 778                     ir->init_step * ir->delta_t);
 779         }
 780         else
 781         {
 782             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 783         }
 784         fprintf(fplog, "\n");
 785     }
 786
 787     walltime_accounting_start_time(walltime_accounting);
 788     wallcycle_start(wcycle, WallCycleCounter::Run);
 789     print_start(fplog, cr, walltime_accounting, "mdrun");
 790
 791     /***********************************************************
 792      *
 793      *             Loop over MD steps
 794      *
 795      ************************************************************/
 796
 797     bFirstStep = TRUE;
 798     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 799     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 800     bSumEkinhOld     = FALSE;
 801     bExchanged       = FALSE;
 802     bNeedRepartition = FALSE;
 803
 804     int64_t step     = ir->init_step;
 805     int64_t step_rel = 0;
 806
 807     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 808             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
 809             simulationsShareState,
 810             MASTER(cr),
 811             ir->nstlist,
 812             mdrunOptions.reproducible,
 813             nstSignalComm,
 814             mdrunOptions.maximumHoursToRun,
 815             ir->nstlist == 0,
 816             fplog,
 817             step,
 818             bNS,
 819             walltime_accounting);
 820
 821     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 822             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
 823             simulationsShareState,
 824             ir->nstlist == 0,
 825             MASTER(cr),
 826             mdrunOptions.writeConfout,
 827             mdrunOptions.checkpointOptions.period);
 828
 829     const bool resetCountersIsLocal = true;
 830     auto       resetHandler         = std::make_unique<ResetHandler>(
 831             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 832             !resetCountersIsLocal,
 833             ir->nsteps,
 834             MASTER(cr),
 835             mdrunOptions.timingOptions.resetHalfway,
 836             mdrunOptions.maximumHoursToRun,
 837             mdlog,
 838             wcycle,
 839             walltime_accounting);
 840
 841     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 842
 843     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 844     {
 845         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 846     }
 847
 848     /* and stop now if we should */
 849     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 850     while (!bLastStep)
 851     {
 852
 853         /* Determine if this is a neighbor search step */
 854         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 855
 856         if (bPMETune && bNStList)
 857         {
 858             // This has to be here because PME load balancing is called so early.
 859             // TODO: Move to after all booleans are defined.
 860             if (useGpuForUpdate && !bFirstStep)
 861             {
 862                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 863                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 864             }
 865             /* PME grid + cut-off optimization with GPUs or PME nodes */
 866             pme_loadbal_do(pme_loadbal,
 867                            cr,
 868                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 869                            fplog,
 870                            mdlog,
 871                            *ir,
 872                            fr,
 873                            state->box,
 874                            state->x,
 875                            wcycle,
 876                            step,
 877                            step_rel,
 878                            &bPMETunePrinting,
 879                            simulationWork.useGpuPmePpCommunication);
 880         }
 881
 882         wallcycle_start(wcycle, WallCycleCounter::Step);
 883
 884         bLastStep = (step_rel == ir->nsteps);
 885         t         = t0 + step * ir->delta_t;
 886
 887         // TODO Refactor this, so that nstfep does not need a default value of zero
 888         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
 889         {
 890             /* find and set the current lambdas */
 891             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 892
 893             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
 894             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
 895             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 896                            && (!bFirstStep));
 897         }
 898
 899         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 900                      && do_per_step(step, replExParams.exchangeInterval));
 901
 902         if (doSimulatedAnnealing)
 903         {
 904             // TODO: Avoid changing inputrec (#3854)
 905             // Simulated annealing updates the reference temperature.
 906             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 907             update_annealing_target_temp(nonConstInputrec, t, &upd);
 908         }
 909
 910         /* Stop Center of Mass motion */
 911         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
 912
 913         /* Determine whether or not to do Neighbour Searching */
 914         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 915
 916         /* Note that the stopHandler will cause termination at nstglobalcomm
 917          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 918          * nstpcouple steps, we have computed the half-step kinetic energy
 919          * of the previous step and can always output energies at the last step.
 920          */
 921         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 922
 923         /* do_log triggers energy and virial calculation. Because this leads
 924          * to different code paths, forces can be different. Thus for exact
 925          * continuation we should avoid extra log output.
 926          * Note that the || bLastStep can result in non-exact continuation
 927          * beyond the last step. But we don't consider that to be an issue.
 928          */
 929         do_log     = (do_per_step(step, ir->nstlog)
 930                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 931         do_verbose = mdrunOptions.verbose
 932                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 933
 934         if (useGpuForUpdate && !bFirstStep && bNS)
 935         {
 936             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 937             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 938             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 939             // Copy coordinate from the GPU when needed at the search step.
 940             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 941             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 942             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 943             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 944         }
 945
 946         // We only need to calculate virtual velocities if we are writing them in the current step
 947         const bool needVirtualVelocitiesThisStep =
 948                 (vsite != nullptr)
 949                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
 950
 951         if (vsite != nullptr)
 952         {
 953             // Virtual sites need to be updated before domain decomposition and forces are calculated
 954             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
 955             // md-vv calculates virtual velocities once it has full-step real velocities
 956             vsite->construct(state->x,
 957                              state->v,
 958                              state->box,
 959                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
 960                                      ? VSiteOperation::PositionsAndVelocities
 961                                      : VSiteOperation::Positions);
 962             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
 963         }
 964
 965         if (bNS && !(bFirstStep && ir->bContinuation))
 966         {
 967             bMasterState = FALSE;
 968             /* Correct the new box if it is too skewed */
 969             if (inputrecDynamicBox(ir))
 970             {
 971                 if (correct_box(fplog, step, state->box))
 972                 {
 973                     bMasterState = TRUE;
 974                     // If update is offloaded, it should be informed about the box size change
 975                     if (useGpuForUpdate)
 976                     {
 977                         integrator->setPbc(PbcType::Xyz, state->box);
 978                     }
 979                 }
 980             }
 981             if (DOMAINDECOMP(cr) && bMasterState)
 982             {
 983                 dd_collect_state(cr->dd, state, state_global);
 984             }
 985
 986             if (DOMAINDECOMP(cr))
 987             {
 988                 /* Repartition the domain decomposition */
 989                 dd_partition_system(fplog,
 990                                     mdlog,
 991                                     step,
 992                                     cr,
 993                                     bMasterState,
 994                                     nstglobalcomm,
 995                                     state_global,
 996                                     top_global,
 997                                     *ir,
 998                                     imdSession,
 999                                     pull_work,
1000                                     state,
1001                                     &f,
1002                                     mdAtoms,
1003                                     &top,
1004                                     fr,
1005                                     vsite,
1006                                     constr,
1007                                     nrnb,
1008                                     wcycle,
1009                                     do_verbose && !bPMETunePrinting);
1010                 upd.updateAfterPartition(state->natoms,
1011                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1012                                                      : gmx::ArrayRef<const unsigned short>(),
1013                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1014                                                  : gmx::ArrayRef<const unsigned short>());
1015             }
1016         }
1017
1018         // Allocate or re-size GPU halo exchange object, if necessary
1019         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1020         {
1021             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1022                                "GPU device manager has to be initialized to use GPU "
1023                                "version of halo exchange.");
1024             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1025         }
1026
1027         if (MASTER(cr) && do_log)
1028         {
1029             gmx::EnergyOutput::printHeader(
1030                     fplog, step, t); /* can we improve the information printed here? */
1031         }
1032
1033         if (ir->efep != FreeEnergyPerturbationType::No)
1034         {
1035             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1036         }
1037
1038         if (bExchanged)
1039         {
1040             /* We need the kinetic energy at minus the half step for determining
1041              * the full step kinetic energy and possibly for T-coupling.*/
1042             /* This may not be quite working correctly yet . . . . */
1043             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1044             if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions())
1045             {
1046                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1047             }
1048             compute_globals(gstat,
1049                             cr,
1050                             ir,
1051                             fr,
1052                             ekind,
1053                             makeConstArrayRef(state->x),
1054                             makeConstArrayRef(state->v),
1055                             state->box,
1056                             md,
1057                             nrnb,
1058                             &vcm,
1059                             wcycle,
1060                             enerd,
1061                             nullptr,
1062                             nullptr,
1063                             nullptr,
1064                             nullptr,
1065                             gmx::ArrayRef<real>{},
1066                             &nullSignaller,
1067                             state->box,
1068                             &bSumEkinhOld,
1069                             cglo_flags);
1070             if (DOMAINDECOMP(cr))
1071             {
1072                 dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1073                         &top, makeConstArrayRef(state->x), state->box);
1074             }
1075         }
1076         clear_mat(force_vir);
1077
1078         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1079
1080         /* Determine the energy and pressure:
1081          * at nstcalcenergy steps and at energy output steps (set below).
1082          */
1083         if (EI_VV(ir->eI) && (!bInitStep))
1084         {
1085             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1086             bCalcVir      = bCalcEnerStep
1087                        || (ir->epc != PressureCoupling::No
1088                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1089         }
1090         else
1091         {
1092             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1093             bCalcVir      = bCalcEnerStep
1094                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1095         }
1096         bCalcEner = bCalcEnerStep;
1097
1098         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1099
1100         if (do_ene || do_log || bDoReplEx)
1101         {
1102             bCalcVir  = TRUE;
1103             bCalcEner = TRUE;
1104         }
1105
1106         /* Do we need global communication ? */
1107         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1108                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1109
1110         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1111                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1112                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1113         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1114         {
1115             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1116             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1117         }
1118
1119         if (shellfc)
1120         {
1121             /* Now is the time to relax the shells */
1122             relax_shell_flexcon(fplog,
1123                                 cr,
1124                                 ms,
1125                                 mdrunOptions.verbose,
1126                                 enforcedRotation,
1127                                 step,
1128                                 ir,
1129                                 imdSession,
1130                                 pull_work,
1131                                 bNS,
1132                                 force_flags,
1133                                 &top,
1134                                 constr,
1135                                 enerd,
1136                                 state->natoms,
1137                                 state->x.arrayRefWithPadding(),
1138                                 state->v.arrayRefWithPadding(),
1139                                 state->box,
1140                                 state->lambda,
1141                                 &state->hist,
1142                                 &f.view(),
1143                                 force_vir,
1144                                 *md,
1145                                 nrnb,
1146                                 wcycle,
1147                                 shellfc,
1148                                 fr,
1149                                 runScheduleWork,
1150                                 t,
1151                                 mu_tot,
1152                                 vsite,
1153                                 ddBalanceRegionHandler);
1154         }
1155         else
1156         {
1157             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1158                is updated (or the AWH update will be performed twice for one step when continuing).
1159                It would be best to call this update function from do_md_trajectory_writing but that
1160                would occur after do_force. One would have to divide the update_awh function into one
1161                function applying the AWH force and one doing the AWH bias update. The update AWH
1162                bias function could then be called after do_md_trajectory_writing (then containing
1163                update_awh_history). The checkpointing will in the future probably moved to the start
1164                of the md loop which will rid of this issue. */
1165             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1166             {
1167                 awh->updateHistory(state_global->awhHistory.get());
1168             }
1169
1170             /* The coordinates (x) are shifted (to get whole molecules)
1171              * in do_force.
1172              * This is parallellized as well, and does communication too.
1173              * Check comments in sim_util.c
1174              */
1175             do_force(fplog,
1176                      cr,
1177                      ms,
1178                      *ir,
1179                      awh.get(),
1180                      enforcedRotation,
1181                      imdSession,
1182                      pull_work,
1183                      step,
1184                      nrnb,
1185                      wcycle,
1186                      &top,
1187                      state->box,
1188                      state->x.arrayRefWithPadding(),
1189                      &state->hist,
1190                      &f.view(),
1191                      force_vir,
1192                      md,
1193                      enerd,
1194                      state->lambda,
1195                      fr,
1196                      runScheduleWork,
1197                      vsite,
1198                      mu_tot,
1199                      t,
1200                      ed ? ed->getLegacyED() : nullptr,
1201                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1202                      ddBalanceRegionHandler);
1203         }
1204
1205         // VV integrators do not need the following velocity half step
1206         // if it is the first step after starting from a checkpoint.
1207         // That is, the half step is needed on all other steps, and
1208         // also the first step when starting from a .tpr file.
1209         if (EI_VV(ir->eI))
1210         {
1211             integrateVVFirstStep(step,
1212                                  bFirstStep,
1213                                  bInitStep,
1214                                  startingBehavior,
1215                                  nstglobalcomm,
1216                                  ir,
1217                                  fr,
1218                                  cr,
1219                                  state,
1220                                  mdAtoms->mdatoms(),
1221                                  fcdata,
1222                                  &MassQ,
1223                                  &vcm,
1224                                  top,
1225                                  enerd,
1226                                  ekind,
1227                                  gstat,
1228                                  &last_ekin,
1229                                  bCalcVir,
1230                                  total_vir,
1231                                  shake_vir,
1232                                  force_vir,
1233                                  pres,
1234                                  M,
1235                                  do_log,
1236                                  do_ene,
1237                                  bCalcEner,
1238                                  bGStat,
1239                                  bStopCM,
1240                                  bTrotter,
1241                                  bExchanged,
1242                                  &bSumEkinhOld,
1243                                  &saved_conserved_quantity,
1244                                  &f,
1245                                  &upd,
1246                                  constr,
1247                                  &nullSignaller,
1248                                  trotter_seq,
1249                                  nrnb,
1250                                  fplog,
1251                                  wcycle);
1252             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1253             {
1254                 // Positions were calculated earlier
1255                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1256                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1257                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1258             }
1259         }
1260
1261         /* ########  END FIRST UPDATE STEP  ############## */
1262         /* ########  If doing VV, we now have v(dt) ###### */
1263         if (bDoExpanded)
1264         {
1265             /* perform extended ensemble sampling in lambda - we don't
1266                actually move to the new state before outputting
1267                statistics, but if performing simulated tempering, we
1268                do update the velocities and the tau_t. */
1269             // TODO: Avoid changing inputrec (#3854)
1270             // Simulated tempering updates the reference temperature.
1271             // Expanded ensemble without simulated tempering does not change the inputrec.
1272             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1273             lamnew                 = ExpandedEnsembleDynamics(fplog,
1274                                               nonConstInputrec,
1275                                               enerd,
1276                                               state,
1277                                               &MassQ,
1278                                               state->fep_state,
1279                                               state->dfhist,
1280                                               step,
1281                                               state->v.rvec_array(),
1282                                               md->homenr,
1283                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1284                                                                       : gmx::ArrayRef<const unsigned short>());
1285             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1286             if (MASTER(cr))
1287             {
1288                 copy_df_history(state_global->dfhist, state->dfhist);
1289             }
1290         }
1291
1292         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1293         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1294         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1295             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1296                 || checkpointHandler->isCheckpointingStep()))
1297         {
1298             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1299             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1300         }
1301         // Copy velocities if needed for the output/checkpointing.
1302         // NOTE: Copy on the search steps is done at the beginning of the step.
1303         if (useGpuForUpdate && !bNS
1304             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1305         {
1306             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1307             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1308         }
1309         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1310         // and update is offloaded hence forces are kept on the GPU for update and have not been
1311         // already transferred in do_force().
1312         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1313         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1314         //       prior to GPU update.
1315         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1316         //       copy call in do_force(...).
1317         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1318         //       on host after the D2H copy in do_force(...).
1319         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1320             && do_per_step(step, ir->nstfout))
1321         {
1322             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1323             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1324         }
1325         /* Now we have the energies and forces corresponding to the
1326          * coordinates at time t. We must output all of this before
1327          * the update.
1328          */
1329         do_md_trajectory_writing(fplog,
1330                                  cr,
1331                                  nfile,
1332                                  fnm,
1333                                  step,
1334                                  step_rel,
1335                                  t,
1336                                  ir,
1337                                  state,
1338                                  state_global,
1339                                  observablesHistory,
1340                                  top_global,
1341                                  fr,
1342                                  outf,
1343                                  energyOutput,
1344                                  ekind,
1345                                  f.view().force(),
1346                                  checkpointHandler->isCheckpointingStep(),
1347                                  bRerunMD,
1348                                  bLastStep,
1349                                  mdrunOptions.writeConfout,
1350                                  bSumEkinhOld);
1351         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1352         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1353
1354         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1355         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1356             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1357         {
1358             copy_mat(state->svir_prev, shake_vir);
1359             copy_mat(state->fvir_prev, force_vir);
1360         }
1361
1362         stopHandler->setSignal();
1363         resetHandler->setSignal(walltime_accounting);
1364
1365         if (bGStat || !PAR(cr))
1366         {
1367             /* In parallel we only have to check for checkpointing in steps
1368              * where we do global communication,
1369              *  otherwise the other nodes don't know.
1370              */
1371             checkpointHandler->setSignal(walltime_accounting);
1372         }
1373
1374         /* #########   START SECOND UPDATE STEP ################# */
1375
1376         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1377            controlled in preprocessing */
1378
1379         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1380         {
1381             gmx_bool bIfRandomize;
1382             bIfRandomize = update_randomize_velocities(ir,
1383                                                        step,
1384                                                        cr,
1385                                                        md->homenr,
1386                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1387                                                                : gmx::ArrayRef<const unsigned short>(),
1388                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1389                                                        state->v,
1390                                                        &upd,
1391                                                        constr);
1392             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1393             if (constr && bIfRandomize)
1394             {
1395                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1396             }
1397         }
1398         /* Box is changed in update() when we do pressure coupling,
1399          * but we should still use the old box for energy corrections and when
1400          * writing it to the energy file, so it matches the trajectory files for
1401          * the same timestep above. Make a copy in a separate array.
1402          */
1403         copy_mat(state->box, lastbox);
1404
1405         dvdl_constr = 0;
1406
1407         if (!useGpuForUpdate)
1408         {
1409             wallcycle_start(wcycle, WallCycleCounter::Update);
1410         }
1411         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1412         if (bTrotter)
1413         {
1414             trotter_update(ir,
1415                            step,
1416                            ekind,
1417                            enerd,
1418                            state,
1419                            total_vir,
1420                            md->homenr,
1421                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1422                                    : gmx::ArrayRef<const unsigned short>(),
1423                            gmx::arrayRefFromArray(md->invmass, md->nr),
1424                            &MassQ,
1425                            trotter_seq,
1426                            TrotterSequence::Three);
1427             /* We can only do Berendsen coupling after we have summed
1428              * the kinetic energy or virial. Since the happens
1429              * in global_state after update, we should only do it at
1430              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1431              */
1432         }
1433         else
1434         {
1435             update_tcouple(step,
1436                            ir,
1437                            state,
1438                            ekind,
1439                            &MassQ,
1440                            md->homenr,
1441                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1442                                    : gmx::ArrayRef<const unsigned short>());
1443             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1444         }
1445
1446         /* With leap-frog type integrators we compute the kinetic energy
1447          * at a whole time step as the average of the half-time step kinetic
1448          * energies of two subsequent steps. Therefore we need to compute the
1449          * half step kinetic energy also if we need energies at the next step.
1450          */
1451         const bool needHalfStepKineticEnergy =
1452                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1453
1454         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1455         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1456         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1457                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1458
1459         if (EI_VV(ir->eI))
1460         {
1461             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1462
1463             integrateVVSecondStep(step,
1464                                   ir,
1465                                   fr,
1466                                   cr,
1467                                   state,
1468                                   mdAtoms->mdatoms(),
1469                                   fcdata,
1470                                   &MassQ,
1471                                   &vcm,
1472                                   pull_work,
1473                                   enerd,
1474                                   ekind,
1475                                   gstat,
1476                                   &dvdl_constr,
1477                                   bCalcVir,
1478                                   total_vir,
1479                                   shake_vir,
1480                                   force_vir,
1481                                   pres,
1482                                   M,
1483                                   lastbox,
1484                                   do_log,
1485                                   do_ene,
1486                                   bGStat,
1487                                   &bSumEkinhOld,
1488                                   &f,
1489                                   &cbuf,
1490                                   &upd,
1491                                   constr,
1492                                   &nullSignaller,
1493                                   trotter_seq,
1494                                   nrnb,
1495                                   wcycle);
1496         }
1497         else
1498         {
1499             if (useGpuForUpdate)
1500             {
1501                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1502                 {
1503                     integrator->set(stateGpu->getCoordinates(),
1504                                     stateGpu->getVelocities(),
1505                                     stateGpu->getForces(),
1506                                     top.idef,
1507                                     *md);
1508
1509                     // Copy data to the GPU after buffers might have being reinitialized
1510                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1511                      * the previous step. We don't check that now. */
1512                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1513                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1514                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1515                     {
1516                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1517                     }
1518                 }
1519
1520                 if (simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1521                 {
1522                     // The PME forces were recieved to the host, so have to be copied
1523                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1524                 }
1525                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1526                 {
1527                     // The buffer ops were not offloaded this step, so the forces are on the
1528                     // host and have to be copied
1529                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1530                 }
1531
1532                 const bool doTemperatureScaling =
1533                         (ir->etc != TemperatureCoupling::No
1534                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1535
1536                 // This applies Leap-Frog, LINCS and SETTLE in succession
1537                 integrator->integrate(
1538                         stateGpu->getForcesReadyOnDeviceEvent(
1539                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1540                         ir->delta_t,
1541                         true,
1542                         bCalcVir,
1543                         shake_vir,
1544                         doTemperatureScaling,
1545                         ekind->tcstat,
1546                         doParrinelloRahman,
1547                         ir->nstpcouple * ir->delta_t,
1548                         M);
1549
1550                 // Copy velocities D2H after update if:
1551                 // - Globals are computed this step (includes the energy output steps).
1552                 // - Temperature is needed for the next step.
1553                 if (bGStat || needHalfStepKineticEnergy)
1554                 {
1555                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1556                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1557                 }
1558             }
1559             else
1560             {
1561                 /* With multiple time stepping we need to do an additional normal
1562                  * update step to obtain the virial, as the actual MTS integration
1563                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1564                  * Using that acceleration would result in a virial with the slow
1565                  * force contribution would be a factor mtsFactor too large.
1566                  */
1567                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1568                 {
1569                     upd.update_for_constraint_virial(*ir,
1570                                                      md->homenr,
1571                                                      md->havePartiallyFrozenAtoms,
1572                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1573                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1574                                                      *state,
1575                                                      f.view().forceWithPadding(),
1576                                                      *ekind);
1577
1578                     constrain_coordinates(constr,
1579                                           do_log,
1580                                           do_ene,
1581                                           step,
1582                                           state,
1583                                           upd.xp()->arrayRefWithPadding(),
1584                                           &dvdl_constr,
1585                                           bCalcVir,
1586                                           shake_vir);
1587                 }
1588
1589                 ArrayRefWithPadding<const RVec> forceCombined =
1590                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1591                                 ? f.view().forceMtsCombinedWithPadding()
1592                                 : f.view().forceWithPadding();
1593                 upd.update_coords(*ir,
1594                                   step,
1595                                   md->homenr,
1596                                   md->havePartiallyFrozenAtoms,
1597                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1598                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1599                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1600                                   state,
1601                                   forceCombined,
1602                                   fcdata,
1603                                   ekind,
1604                                   M,
1605                                   etrtPOSITION,
1606                                   cr,
1607                                   constr != nullptr);
1608
1609                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1610
1611                 constrain_coordinates(constr,
1612                                       do_log,
1613                                       do_ene,
1614                                       step,
1615                                       state,
1616                                       upd.xp()->arrayRefWithPadding(),
1617                                       &dvdl_constr,
1618                                       bCalcVir && !simulationWork.useMts,
1619                                       shake_vir);
1620
1621                 upd.update_sd_second_half(*ir,
1622                                           step,
1623                                           &dvdl_constr,
1624                                           md->homenr,
1625                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1626                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1627                                           state,
1628                                           cr,
1629                                           nrnb,
1630                                           wcycle,
1631                                           constr,
1632                                           do_log,
1633                                           do_ene);
1634                 upd.finish_update(
1635                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1636             }
1637
1638             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1639             {
1640                 updatePrevStepPullCom(pull_work, state);
1641             }
1642
1643             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1644         }
1645
1646         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1647         /* With Leap-Frog we can skip compute_globals at
1648          * non-communication steps, but we need to calculate
1649          * the kinetic energy one step before communication.
1650          */
1651         {
1652             // Organize to do inter-simulation signalling on steps if
1653             // and when algorithms require it.
1654             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1655
1656             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1657             {
1658                 // Copy coordinates when needed to stop the CM motion.
1659                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1660                 {
1661                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1662                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1663                 }
1664                 // Since we're already communicating at this step, we
1665                 // can propagate intra-simulation signals. Note that
1666                 // check_nstglobalcomm has the responsibility for
1667                 // choosing the value of nstglobalcomm that is one way
1668                 // bGStat becomes true, so we can't get into a
1669                 // situation where e.g. checkpointing can't be
1670                 // signalled.
1671                 bool                doIntraSimSignal = true;
1672                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1673
1674                 compute_globals(
1675                         gstat,
1676                         cr,
1677                         ir,
1678                         fr,
1679                         ekind,
1680                         makeConstArrayRef(state->x),
1681                         makeConstArrayRef(state->v),
1682                         state->box,
1683                         md,
1684                         nrnb,
1685                         &vcm,
1686                         wcycle,
1687                         enerd,
1688                         force_vir,
1689                         shake_vir,
1690                         total_vir,
1691                         pres,
1692                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1693                                                                            : gmx::ArrayRef<real>{},
1694                         &signaller,
1695                         lastbox,
1696                         &bSumEkinhOld,
1697                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1698                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1699                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1700                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1701                                 | (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
1702                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1703                                            : 0));
1704                 if (DOMAINDECOMP(cr))
1705                 {
1706                     dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1707                             &top, makeConstArrayRef(state->x), state->box);
1708                 }
1709                 if (!EI_VV(ir->eI) && bStopCM)
1710                 {
1711                     process_and_stopcm_grp(
1712                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1713                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1714
1715                     // TODO: The special case of removing CM motion should be dealt more gracefully
1716                     if (useGpuForUpdate)
1717                     {
1718                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1719                         // Here we block until the H2D copy completes because event sync with the
1720                         // force kernels that use the coordinates on the next steps is not implemented
1721                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1722                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1723                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1724                         if (vcm.mode != ComRemovalAlgorithm::No)
1725                         {
1726                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1727                         }
1728                     }
1729                 }
1730             }
1731         }
1732
1733         /* #############  END CALC EKIN AND PRESSURE ################# */
1734
1735         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1736            the virial that should probably be addressed eventually. state->veta has better properies,
1737            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1738            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1739
1740         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1741         {
1742             /* Sum up the foreign energy and dK/dl terms for md and sd.
1743                Currently done every step so that dH/dl is correct in the .edr */
1744             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1745         }
1746
1747         update_pcouple_after_coordinates(fplog,
1748                                          step,
1749                                          ir,
1750                                          md->homenr,
1751                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1752                                                      : gmx::ArrayRef<const unsigned short>(),
1753                                          pres,
1754                                          force_vir,
1755                                          shake_vir,
1756                                          pressureCouplingMu,
1757                                          state,
1758                                          nrnb,
1759                                          upd.deform(),
1760                                          !useGpuForUpdate);
1761
1762         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1763                                                   && do_per_step(step, inputrec->nstpcouple));
1764         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1765                                                  && do_per_step(step, inputrec->nstpcouple));
1766         if (useGpuForUpdate
1767             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1768         {
1769             integrator->scaleCoordinates(pressureCouplingMu);
1770             if (doCRescalePressureCoupling)
1771             {
1772                 matrix pressureCouplingInvMu;
1773                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1774                 integrator->scaleVelocities(pressureCouplingInvMu);
1775             }
1776             integrator->setPbc(PbcType::Xyz, state->box);
1777         }
1778
1779         /* ################# END UPDATE STEP 2 ################# */
1780         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1781
1782         /* The coordinates (x) were unshifted in update */
1783         if (!bGStat)
1784         {
1785             /* We will not sum ekinh_old,
1786              * so signal that we still have to do it.
1787              */
1788             bSumEkinhOld = TRUE;
1789         }
1790
1791         if (bCalcEner)
1792         {
1793             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1794
1795             /* use the directly determined last velocity, not actually the averaged half steps */
1796             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1797             {
1798                 enerd->term[F_EKIN] = last_ekin;
1799             }
1800             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1801
1802             if (integratorHasConservedEnergyQuantity(ir))
1803             {
1804                 if (EI_VV(ir->eI))
1805                 {
1806                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1807                 }
1808                 else
1809                 {
1810                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1811                 }
1812             }
1813             /* #########  END PREPARING EDR OUTPUT  ###########  */
1814         }
1815
1816         /* Output stuff */
1817         if (MASTER(cr))
1818         {
1819             if (fplog && do_log && bDoExpanded)
1820             {
1821                 /* only needed if doing expanded ensemble */
1822                 PrintFreeEnergyInfoToFile(fplog,
1823                                           ir->fepvals.get(),
1824                                           ir->expandedvals.get(),
1825                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1826                                           state_global->dfhist,
1827                                           state->fep_state,
1828                                           ir->nstlog,
1829                                           step);
1830             }
1831             if (bCalcEner)
1832             {
1833                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1834                                                  bCalcEnerStep,
1835                                                  t,
1836                                                  md->tmass,
1837                                                  enerd,
1838                                                  ir->fepvals.get(),
1839                                                  ir->expandedvals.get(),
1840                                                  lastbox,
1841                                                  PTCouplingArrays{ state->boxv,
1842                                                                    state->nosehoover_xi,
1843                                                                    state->nosehoover_vxi,
1844                                                                    state->nhpres_xi,
1845                                                                    state->nhpres_vxi },
1846                                                  state->fep_state,
1847                                                  total_vir,
1848                                                  pres,
1849                                                  ekind,
1850                                                  mu_tot,
1851                                                  constr);
1852             }
1853             else
1854             {
1855                 energyOutput.recordNonEnergyStep();
1856             }
1857
1858             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1859             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1860
1861             if (doSimulatedAnnealing)
1862             {
1863                 gmx::EnergyOutput::printAnnealingTemperatures(
1864                         do_log ? fplog : nullptr, groups, &(ir->opts));
1865             }
1866             if (do_log || do_ene || do_dr || do_or)
1867             {
1868                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1869                                                    do_ene,
1870                                                    do_dr,
1871                                                    do_or,
1872                                                    do_log ? fplog : nullptr,
1873                                                    step,
1874                                                    t,
1875                                                    fr->fcdata.get(),
1876                                                    awh.get());
1877             }
1878             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1879             {
1880                 const bool isInitialOutput = false;
1881                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1882             }
1883
1884             if (ir->bPull)
1885             {
1886                 pull_print_output(pull_work, step, t);
1887             }
1888
1889             if (do_per_step(step, ir->nstlog))
1890             {
1891                 if (fflush(fplog) != 0)
1892                 {
1893                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1894                 }
1895             }
1896         }
1897         if (bDoExpanded)
1898         {
1899             /* Have to do this part _after_ outputting the logfile and the edr file */
1900             /* Gets written into the state at the beginning of next loop*/
1901             state->fep_state = lamnew;
1902         }
1903         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1904         {
1905             state->fep_state = awh->fepLambdaState();
1906         }
1907         /* Print the remaining wall clock time for the run */
1908         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1909         {
1910             if (shellfc)
1911             {
1912                 fprintf(stderr, "\n");
1913             }
1914             print_time(stderr, walltime_accounting, step, ir, cr);
1915         }
1916
1917         /* Ion/water position swapping.
1918          * Not done in last step since trajectory writing happens before this call
1919          * in the MD loop and exchanges would be lost anyway. */
1920         bNeedRepartition = FALSE;
1921         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1922             && do_per_step(step, ir->swap->nstswap))
1923         {
1924             bNeedRepartition = do_swapcoords(cr,
1925                                              step,
1926                                              t,
1927                                              ir,
1928                                              swap,
1929                                              wcycle,
1930                                              as_rvec_array(state->x.data()),
1931                                              state->box,
1932                                              MASTER(cr) && mdrunOptions.verbose,
1933                                              bRerunMD);
1934
1935             if (bNeedRepartition && DOMAINDECOMP(cr))
1936             {
1937                 dd_collect_state(cr->dd, state, state_global);
1938             }
1939         }
1940
1941         /* Replica exchange */
1942         bExchanged = FALSE;
1943         if (bDoReplEx)
1944         {
1945             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1946         }
1947
1948         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1949         {
1950             dd_partition_system(fplog,
1951                                 mdlog,
1952                                 step,
1953                                 cr,
1954                                 TRUE,
1955                                 1,
1956                                 state_global,
1957                                 top_global,
1958                                 *ir,
1959                                 imdSession,
1960                                 pull_work,
1961                                 state,
1962                                 &f,
1963                                 mdAtoms,
1964                                 &top,
1965                                 fr,
1966                                 vsite,
1967                                 constr,
1968                                 nrnb,
1969                                 wcycle,
1970                                 FALSE);
1971             upd.updateAfterPartition(state->natoms,
1972                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1973                                                  : gmx::ArrayRef<const unsigned short>(),
1974                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1975                                              : gmx::ArrayRef<const unsigned short>());
1976         }
1977
1978         bFirstStep = FALSE;
1979         bInitStep  = FALSE;
1980
1981         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1982         /* With all integrators, except VV, we need to retain the pressure
1983          * at the current step for coupling at the next step.
1984          */
1985         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1986             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1987         {
1988             /* Store the pressure in t_state for pressure coupling
1989              * at the next MD step.
1990              */
1991             copy_mat(pres, state->pres_prev);
1992         }
1993
1994         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1995
1996         if ((membed != nullptr) && (!bLastStep))
1997         {
1998             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1999         }
2000
2001         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2002         if (DOMAINDECOMP(cr) && wcycle)
2003         {
2004             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2005         }
2006
2007         /* increase the MD step number */
2008         step++;
2009         step_rel++;
2010
2011 #if GMX_FAHCORE
2012         if (MASTER(cr))
2013         {
2014             fcReportProgress(ir->nsteps + ir->init_step, step);
2015         }
2016 #endif
2017
2018         resetHandler->resetCounters(
2019                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2020
2021         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2022         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2023     }
2024     /* End of main MD loop */
2025
2026     /* Closing TNG files can include compressing data. Therefore it is good to do that
2027      * before stopping the time measurements. */
2028     mdoutf_tng_close(outf);
2029
2030     /* Stop measuring walltime */
2031     walltime_accounting_end_time(walltime_accounting);
2032
2033     if (simulationWork.haveSeparatePmeRank)
2034     {
2035         /* Tell the PME only node to finish */
2036         gmx_pme_send_finish(cr);
2037     }
2038
2039     if (MASTER(cr))
2040     {
2041         if (ir->nstcalcenergy > 0)
2042         {
2043             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2044
2045             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2046             energyOutput.printAverages(fplog, groups);
2047         }
2048     }
2049     done_mdoutf(outf);
2050
2051     if (bPMETune)
2052     {
2053         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2054     }
2055
2056     done_shellfc(fplog, shellfc, step_rel);
2057
2058     if (useReplicaExchange && MASTER(cr))
2059     {
2060         print_replica_exchange_statistics(fplog, repl_ex);
2061     }
2062
2063     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2064
2065     global_stat_destroy(gstat);
2066 }