src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/applied_forces/awh/read_params.h"
  57 #include "gromacs/commandline/filenm.h"
  58 #include "gromacs/domdec/collect.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localtopologychecker.h"
  65 #include "gromacs/domdec/mdsetup.h"
  66 #include "gromacs/domdec/partition.h"
  67 #include "gromacs/essentialdynamics/edsam.h"
  68 #include "gromacs/ewald/pme_load_balancing.h"
  69 #include "gromacs/ewald/pme_pp.h"
  70 #include "gromacs/fileio/trxio.h"
  71 #include "gromacs/gmxlib/network.h"
  72 #include "gromacs/gmxlib/nrnb.h"
  73 #include "gromacs/gpu_utils/device_stream_manager.h"
  74 #include "gromacs/gpu_utils/gpu_utils.h"
  75 #include "gromacs/imd/imd.h"
  76 #include "gromacs/listed_forces/listed_forces.h"
  77 #include "gromacs/math/functions.h"
  78 #include "gromacs/math/invertmatrix.h"
  79 #include "gromacs/math/vec.h"
  80 #include "gromacs/math/vectypes.h"
  81 #include "gromacs/mdlib/checkpointhandler.h"
  82 #include "gromacs/mdlib/compute_io.h"
  83 #include "gromacs/mdlib/constr.h"
  84 #include "gromacs/mdlib/coupling.h"
  85 #include "gromacs/mdlib/ebin.h"
  86 #include "gromacs/mdlib/enerdata_utils.h"
  87 #include "gromacs/mdlib/energyoutput.h"
  88 #include "gromacs/mdlib/expanded.h"
  89 #include "gromacs/mdlib/force.h"
  90 #include "gromacs/mdlib/force_flags.h"
  91 #include "gromacs/mdlib/forcerec.h"
  92 #include "gromacs/mdlib/freeenergyparameters.h"
  93 #include "gromacs/mdlib/md_support.h"
  94 #include "gromacs/mdlib/mdatoms.h"
  95 #include "gromacs/mdlib/mdoutf.h"
  96 #include "gromacs/mdlib/membed.h"
  97 #include "gromacs/mdlib/resethandler.h"
  98 #include "gromacs/mdlib/sighandler.h"
  99 #include "gromacs/mdlib/simulationsignal.h"
 100 #include "gromacs/mdlib/stat.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdlib/tgroup.h"
 103 #include "gromacs/mdlib/trajectory_writing.h"
 104 #include "gromacs/mdlib/update.h"
 105 #include "gromacs/mdlib/update_constrain_gpu.h"
 106 #include "gromacs/mdlib/update_vv.h"
 107 #include "gromacs/mdlib/vcm.h"
 108 #include "gromacs/mdlib/vsite.h"
 109 #include "gromacs/mdrunutility/freeenergy.h"
 110 #include "gromacs/mdrunutility/handlerestart.h"
 111 #include "gromacs/mdrunutility/multisim.h"
 112 #include "gromacs/mdrunutility/printtime.h"
 113 #include "gromacs/mdtypes/awh_history.h"
 114 #include "gromacs/mdtypes/awh_params.h"
 115 #include "gromacs/mdtypes/commrec.h"
 116 #include "gromacs/mdtypes/df_history.h"
 117 #include "gromacs/mdtypes/energyhistory.h"
 118 #include "gromacs/mdtypes/fcdata.h"
 119 #include "gromacs/mdtypes/forcebuffers.h"
 120 #include "gromacs/mdtypes/forcerec.h"
 121 #include "gromacs/mdtypes/group.h"
 122 #include "gromacs/mdtypes/inputrec.h"
 123 #include "gromacs/mdtypes/interaction_const.h"
 124 #include "gromacs/mdtypes/md_enums.h"
 125 #include "gromacs/mdtypes/mdatom.h"
 126 #include "gromacs/mdtypes/mdrunoptions.h"
 127 #include "gromacs/mdtypes/multipletimestepping.h"
 128 #include "gromacs/mdtypes/observableshistory.h"
 129 #include "gromacs/mdtypes/observablesreducer.h"
 130 #include "gromacs/mdtypes/pullhistory.h"
 131 #include "gromacs/mdtypes/simulation_workload.h"
 132 #include "gromacs/mdtypes/state.h"
 133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 134 #include "gromacs/modularsimulator/energydata.h"
 135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 136 #include "gromacs/nbnxm/nbnxm.h"
 137 #include "gromacs/pbcutil/pbc.h"
 138 #include "gromacs/pulling/output.h"
 139 #include "gromacs/pulling/pull.h"
 140 #include "gromacs/swap/swapcoords.h"
 141 #include "gromacs/timing/wallcycle.h"
 142 #include "gromacs/timing/walltime_accounting.h"
 143 #include "gromacs/topology/atoms.h"
 144 #include "gromacs/topology/idef.h"
 145 #include "gromacs/topology/mtop_util.h"
 146 #include "gromacs/topology/topology.h"
 147 #include "gromacs/trajectory/trajectoryframe.h"
 148 #include "gromacs/utility/basedefinitions.h"
 149 #include "gromacs/utility/cstringutil.h"
 150 #include "gromacs/utility/fatalerror.h"
 151 #include "gromacs/utility/logger.h"
 152 #include "gromacs/utility/real.h"
 153 #include "gromacs/utility/smalloc.h"
 154
 155 #include "legacysimulator.h"
 156 #include "replicaexchange.h"
 157 #include "shellfc.h"
 158
 159 using gmx::SimulationSignaller;
 160
 161 void gmx::LegacySimulator::do_md()
 162 {
 163     // TODO Historically, the EM and MD "integrators" used different
 164     // names for the t_inputrec *parameter, but these must have the
 165     // same name, now that it's a member of a struct. We use this ir
 166     // alias to avoid a large ripple of nearly useless changes.
 167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 168     // will go away eventually.
 169     const t_inputrec* ir = inputrec;
 170
 171     double       t, t0 = ir->init_t;
 172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 174     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 175     gmx_bool     do_ene, do_log, do_verbose;
 176     gmx_bool     bMasterState;
 177     unsigned int force_flags;
 178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 179     int    i, m;
 180     rvec   mu_tot;
 181     matrix pressureCouplingMu, M;
 182     gmx_repl_ex_t     repl_ex = nullptr;
 183     gmx_global_stat_t gstat;
 184     gmx_shellfc_t*    shellfc;
 185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 186     gmx_bool          bTrotter;
 187     real              dvdl_constr;
 188     std::vector<RVec> cbuf;
 189     matrix            lastbox;
 190     int               lamnew = 0;
 191     /* for FEP */
 192     double    cycles;
 193     real      saved_conserved_quantity = 0;
 194     real      last_ekin                = 0;
 195     t_extmass MassQ;
 196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 197
 198     /* PME load balancing data for GPU kernels */
 199     gmx_bool bPMETune         = FALSE;
 200     gmx_bool bPMETunePrinting = FALSE;
 201
 202     bool bInteractiveMDstep = false;
 203
 204     SimulationSignals signals;
 205     // Most global communnication stages don't propagate mdrun
 206     // signals, and will use this object to achieve that.
 207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 208
 209     if (!mdrunOptions.writeConfout)
 210     {
 211         // This is on by default, and the main known use case for
 212         // turning it off is for convenience in benchmarking, which is
 213         // something that should not show up in the general user
 214         // interface.
 215         GMX_LOG(mdlog.info)
 216                 .asParagraph()
 217                 .appendText(
 218                         "The -noconfout functionality is deprecated, and may be removed in a "
 219                         "future version.");
 220     }
 221
 222     /* md-vv uses averaged full step velocities for T-control
 223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 225     bTrotter = (EI_VV(ir->eI)
 226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 227
 228     const bool bRerunMD = false;
 229
 230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 231     bGStatEveryStep   = (nstglobalcomm == 1);
 232
 233     const SimulationGroups* groups = &top_global.groups;
 234
 235     std::unique_ptr<EssentialDynamics> ed = nullptr;
 236     if (opt2bSet("-ei", nfile, fnm))
 237     {
 238         /* Initialize essential dynamics sampling */
 239         ed = init_edsam(mdlog,
 240                         opt2fn_null("-ei", nfile, fnm),
 241                         opt2fn("-eo", nfile, fnm),
 242                         top_global,
 243                         *ir,
 244                         cr,
 245                         constr,
 246                         state_global,
 247                         observablesHistory,
 248                         oenv,
 249                         startingBehavior);
 250     }
 251     else if (observablesHistory->edsamHistory)
 252     {
 253         gmx_fatal(FARGS,
 254                   "The checkpoint is from a run with essential dynamics sampling, "
 255                   "but the current run did not specify the -ei option. "
 256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 257     }
 258
 259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 261     initialize_lambdas(fplog,
 262                        ir->efep,
 263                        ir->bSimTemp,
 264                        *ir->fepvals,
 265                        ir->simtempvals->temperatures,
 266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
 267                        MASTER(cr),
 268                        fep_state,
 269                        lambda);
 270     Update upd(*ir, deform);
 271     bool   doSimulatedAnnealing = false;
 272     {
 273         // TODO: Avoid changing inputrec (#3854)
 274         // Simulated annealing updates the reference temperature.
 275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
 277     }
 278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 279
 280     t_fcdata& fcdata = *fr->fcdata;
 281
 282     bool simulationsShareState = false;
 283     int  nstSignalComm         = nstglobalcomm;
 284     {
 285         // TODO This implementation of ensemble orientation restraints is nasty because
 286         // a user can't just do multi-sim with single-sim orientation restraints.
 287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
 288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
 289
 290         // Replica exchange, ensemble restraints and AWH need all
 291         // simulations to remain synchronized, so they need
 292         // checkpoints and stop conditions to act on the same step, so
 293         // the propagation of such signals must take place between
 294         // simulations, not just within simulations.
 295         // TODO: Make algorithm initializers set these flags.
 296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 297
 298         if (simulationsShareState)
 299         {
 300             // Inter-simulation signal communication does not need to happen
 301             // often, so we use a minimum of 200 steps to reduce overhead.
 302             const int c_minimumInterSimulationSignallingInterval = 200;
 303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 304                             * nstglobalcomm;
 305         }
 306     }
 307
 308     if (startingBehavior != StartingBehavior::RestartWithAppending)
 309     {
 310         pleaseCiteCouplingAlgorithms(fplog, *ir);
 311     }
 312     gmx_mdoutf*       outf = init_mdoutf(fplog,
 313                                    nfile,
 314                                    fnm,
 315                                    mdrunOptions,
 316                                    cr,
 317                                    outputProvider,
 318                                    mdModulesNotifiers,
 319                                    ir,
 320                                    top_global,
 321                                    oenv,
 322                                    wcycle,
 323                                    startingBehavior,
 324                                    simulationsShareState,
 325                                    ms);
 326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
 327                                    top_global,
 328                                    *ir,
 329                                    pull_work,
 330                                    mdoutf_get_fp_dhdl(outf),
 331                                    false,
 332                                    startingBehavior,
 333                                    simulationsShareState,
 334                                    mdModulesNotifiers);
 335
 336     gstat = global_stat_init(ir);
 337
 338     const auto& simulationWork     = runScheduleWork->simulationWork;
 339     const bool  useGpuForPme       = simulationWork.useGpuPme;
 340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 343
 344     /* Check for polarizable models and flexible constraints */
 345     shellfc = init_shell_flexcon(fplog,
 346                                  top_global,
 347                                  constr ? constr->numFlexibleConstraints() : 0,
 348                                  ir->nstcalcenergy,
 349                                  DOMAINDECOMP(cr),
 350                                  useGpuForPme);
 351
 352     {
 353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
 354         if ((io > 2000) && MASTER(cr))
 355         {
 356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 357         }
 358     }
 359
 360     // Local state only becomes valid now.
 361     std::unique_ptr<t_state> stateInstance;
 362     t_state*                 state;
 363
 364     gmx_localtop_t     top(top_global.ffparams);
 365     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
 366
 367     ForceBuffers     f(simulationWork.useMts,
 368                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 369                                ? PinningPolicy::PinnedIfSupported
 370                                : PinningPolicy::CannotBePinned);
 371     const t_mdatoms* md = mdAtoms->mdatoms();
 372     if (DOMAINDECOMP(cr))
 373     {
 374         stateInstance = std::make_unique<t_state>();
 375         state         = stateInstance.get();
 376         dd_init_local_state(*cr->dd, state_global, state);
 377
 378         /* Distribute the charge groups over the nodes from the master node */
 379         dd_partition_system(fplog,
 380                             mdlog,
 381                             ir->init_step,
 382                             cr,
 383                             TRUE,
 384                             1,
 385                             state_global,
 386                             top_global,
 387                             *ir,
 388                             imdSession,
 389                             pull_work,
 390                             state,
 391                             &f,
 392                             mdAtoms,
 393                             &top,
 394                             fr,
 395                             vsite,
 396                             constr,
 397                             nrnb,
 398                             nullptr,
 399                             FALSE);
 400         upd.updateAfterPartition(state->natoms,
 401                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 402                                              : gmx::ArrayRef<const unsigned short>(),
 403                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 404                                          : gmx::ArrayRef<const unsigned short>());
 405     }
 406     else
 407     {
 408         state_change_natoms(state_global, state_global->natoms);
 409         /* Copy the pointer to the global state */
 410         state = state_global;
 411
 412         /* Generate and initialize new topology */
 413         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 414
 415         upd.updateAfterPartition(state->natoms,
 416                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 417                                              : gmx::ArrayRef<const unsigned short>(),
 418                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 419                                          : gmx::ArrayRef<const unsigned short>());
 420     }
 421
 422     std::unique_ptr<UpdateConstrainGpu> integrator;
 423
 424     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 425
 426     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 427     if (useGpuForUpdate)
 428     {
 429         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 430                                    || constr->numConstraintsTotal() == 0,
 431                            "Constraints in domain decomposition are only supported with update "
 432                            "groups if using GPU update.\n");
 433         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
 434                                    || constr->numConstraintsTotal() == 0,
 435                            "SHAKE is not supported with GPU update.");
 436         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 437                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 438                            "the GPU to use GPU update.\n");
 439         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
 440                            "Only the md integrator is supported with the GPU update.\n");
 441         GMX_RELEASE_ASSERT(
 442                 ir->etc != TemperatureCoupling::NoseHoover,
 443                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 444         GMX_RELEASE_ASSERT(
 445                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
 446                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
 447                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 448                 "with the GPU update.\n");
 449         GMX_RELEASE_ASSERT(!md->haveVsites,
 450                            "Virtual sites are not supported with the GPU update.\n");
 451         GMX_RELEASE_ASSERT(ed == nullptr,
 452                            "Essential dynamics is not supported with the GPU update.\n");
 453         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 454                            "Constraints pulling is not supported with the GPU update.\n");
 455         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
 456                            "Orientation restraints are not supported with the GPU update.\n");
 457         GMX_RELEASE_ASSERT(
 458                 ir->efep == FreeEnergyPerturbationType::No
 459                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
 460                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 461                 "update.");
 462
 463         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 464         {
 465             GMX_LOG(mdlog.info)
 466                     .asParagraph()
 467                     .appendText("Updating coordinates and applying constraints on the GPU.");
 468         }
 469         else
 470         {
 471             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 472         }
 473         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 474                            "Device stream manager should be initialized in order to use GPU "
 475                            "update-constraints.");
 476         GMX_RELEASE_ASSERT(
 477                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 478                 "Update stream should be initialized in order to use GPU "
 479                 "update-constraints.");
 480         integrator = std::make_unique<UpdateConstrainGpu>(
 481                 *ir,
 482                 top_global,
 483                 ekind->ngtc,
 484                 fr->deviceStreamManager->context(),
 485                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 486                 wcycle);
 487
 488         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
 489
 490         integrator->setPbc(PbcType::Xyz, state->box);
 491     }
 492
 493     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 494     {
 495         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 496     }
 497     if (useGpuForUpdate)
 498     {
 499         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 500     }
 501
 502     // NOTE: The global state is no longer used at this point.
 503     // But state_global is still used as temporary storage space for writing
 504     // the global state to file and potentially for replica exchange.
 505     // (Global topology should persist.)
 506
 507     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
 508
 509     if (ir->bExpanded)
 510     {
 511         /* Check nstexpanded here, because the grompp check was broken */
 512         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 513         {
 514             gmx_fatal(FARGS,
 515                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 516         }
 517         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 518     }
 519
 520     if (MASTER(cr))
 521     {
 522         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 523     }
 524
 525     preparePrevStepPullCom(ir,
 526                            pull_work,
 527                            gmx::arrayRefFromArray(md->massT, md->nr),
 528                            state,
 529                            state_global,
 530                            cr,
 531                            startingBehavior != StartingBehavior::NewSimulation);
 532
 533     // TODO: Remove this by converting AWH into a ForceProvider
 534     auto awh = prepareAwhModule(fplog,
 535                                 *ir,
 536                                 state_global,
 537                                 cr,
 538                                 ms,
 539                                 startingBehavior != StartingBehavior::NewSimulation,
 540                                 shellfc != nullptr,
 541                                 opt2fn("-awh", nfile, fnm),
 542                                 pull_work);
 543
 544     if (useReplicaExchange && MASTER(cr))
 545     {
 546         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
 547     }
 548     /* PME tuning is only supported in the Verlet scheme, with PME for
 549      * Coulomb. It is not supported with only LJ PME. */
 550     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 551                 && ir->cutoff_scheme != CutoffScheme::Group);
 552
 553     pme_load_balancing_t* pme_loadbal = nullptr;
 554     if (bPMETune)
 555     {
 556         pme_loadbal_init(
 557                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
 558     }
 559
 560     if (!ir->bContinuation)
 561     {
 562         if (state->flags & enumValueToBitMask(StateEntry::V))
 563         {
 564             auto v = makeArrayRef(state->v);
 565             /* Set the velocities of vsites, shells and frozen atoms to zero */
 566             for (i = 0; i < md->homenr; i++)
 567             {
 568                 if (md->ptype[i] == ParticleType::Shell)
 569                 {
 570                     clear_rvec(v[i]);
 571                 }
 572                 else if (md->cFREEZE)
 573                 {
 574                     for (m = 0; m < DIM; m++)
 575                     {
 576                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
 577                         {
 578                             v[i][m] = 0;
 579                         }
 580                     }
 581                 }
 582             }
 583         }
 584
 585         if (constr)
 586         {
 587             /* Constrain the initial coordinates and velocities */
 588             do_constrain_first(fplog,
 589                                constr,
 590                                ir,
 591                                md->nr,
 592                                md->homenr,
 593                                state->x.arrayRefWithPadding(),
 594                                state->v.arrayRefWithPadding(),
 595                                state->box,
 596                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
 597         }
 598     }
 599
 600     const int nstfep = computeFepPeriod(*ir, replExParams);
 601
 602     /* Be REALLY careful about what flags you set here. You CANNOT assume
 603      * this is the first step, since we might be restarting from a checkpoint,
 604      * and in that case we should not do any modifications to the state.
 605      */
 606     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
 607
 608     // When restarting from a checkpoint, it can be appropriate to
 609     // initialize ekind from quantities in the checkpoint. Otherwise,
 610     // compute_globals must initialize ekind before the simulation
 611     // starts/restarts. However, only the master rank knows what was
 612     // found in the checkpoint file, so we have to communicate in
 613     // order to coordinate the restart.
 614     //
 615     // TODO Consider removing this communication if/when checkpoint
 616     // reading directly follows .tpr reading, because all ranks can
 617     // agree on hasReadEkinState at that time.
 618     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 619     if (PAR(cr))
 620     {
 621         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 622     }
 623     if (hasReadEkinState)
 624     {
 625         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 626     }
 627
 628     unsigned int cglo_flags =
 629             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 630              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 631
 632     bSumEkinhOld = FALSE;
 633
 634     t_vcm vcm(top_global.groups, *ir);
 635     reportComRemovalInfo(fplog, vcm);
 636
 637     int64_t step     = ir->init_step;
 638     int64_t step_rel = 0;
 639
 640     /* To minimize communication, compute_globals computes the COM velocity
 641      * and the kinetic energy for the velocities without COM motion removed.
 642      * Thus to get the kinetic energy without the COM contribution, we need
 643      * to call compute_globals twice.
 644      */
 645     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 646     {
 647         unsigned int cglo_flags_iteration = cglo_flags;
 648         if (bStopCM && cgloIteration == 0)
 649         {
 650             cglo_flags_iteration |= CGLO_STOPCM;
 651             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 652         }
 653         if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
 654             && cgloIteration == 0)
 655         {
 656             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
 657         }
 658         compute_globals(gstat,
 659                         cr,
 660                         ir,
 661                         fr,
 662                         ekind,
 663                         makeConstArrayRef(state->x),
 664                         makeConstArrayRef(state->v),
 665                         state->box,
 666                         md,
 667                         nrnb,
 668                         &vcm,
 669                         nullptr,
 670                         enerd,
 671                         force_vir,
 672                         shake_vir,
 673                         total_vir,
 674                         pres,
 675                         gmx::ArrayRef<real>{},
 676                         &nullSignaller,
 677                         state->box,
 678                         &bSumEkinhOld,
 679                         cglo_flags_iteration,
 680                         step,
 681                         &observablesReducer);
 682         if (cglo_flags_iteration & CGLO_STOPCM)
 683         {
 684             /* At initialization, do not pass x with acceleration-correction mode
 685              * to avoid (incorrect) correction of the initial coordinates.
 686              */
 687             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
 688                              ? ArrayRef<RVec>()
 689                              : makeArrayRef(state->x);
 690             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
 691             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
 692         }
 693     }
 694     if (DOMAINDECOMP(cr))
 695     {
 696         dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
 697                 &top, makeConstArrayRef(state->x), state->box);
 698     }
 699     if (ir->eI == IntegrationAlgorithm::VVAK)
 700     {
 701         /* a second call to get the half step temperature initialized as well */
 702         /* we do the same call as above, but turn the pressure off -- internally to
 703            compute_globals, this is recognized as a velocity verlet half-step
 704            kinetic energy calculation.  This minimized excess variables, but
 705            perhaps loses some logic?*/
 706
 707         compute_globals(gstat,
 708                         cr,
 709                         ir,
 710                         fr,
 711                         ekind,
 712                         makeConstArrayRef(state->x),
 713                         makeConstArrayRef(state->v),
 714                         state->box,
 715                         md,
 716                         nrnb,
 717                         &vcm,
 718                         nullptr,
 719                         enerd,
 720                         force_vir,
 721                         shake_vir,
 722                         total_vir,
 723                         pres,
 724                         gmx::ArrayRef<real>{},
 725                         &nullSignaller,
 726                         state->box,
 727                         &bSumEkinhOld,
 728                         cglo_flags & ~CGLO_PRESSURE,
 729                         step,
 730                         &observablesReducer);
 731     }
 732
 733     /* Calculate the initial half step temperature, and save the ekinh_old */
 734     if (startingBehavior == StartingBehavior::NewSimulation)
 735     {
 736         for (i = 0; (i < ir->opts.ngtc); i++)
 737         {
 738             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 739         }
 740     }
 741
 742     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 743        for non-trotter temperature control */
 744     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 745
 746     if (MASTER(cr))
 747     {
 748         if (!ir->bContinuation)
 749         {
 750             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
 751             {
 752                 fprintf(fplog,
 753                         "RMS relative constraint deviation after constraining: %.2e\n",
 754                         constr->rmsd());
 755             }
 756             if (EI_STATE_VELOCITY(ir->eI))
 757             {
 758                 real temp = enerd->term[F_TEMP];
 759                 if (ir->eI != IntegrationAlgorithm::VV)
 760                 {
 761                     /* Result of Ekin averaged over velocities of -half
 762                      * and +half step, while we only have -half step here.
 763                      */
 764                     temp *= 2;
 765                 }
 766                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 767             }
 768         }
 769
 770         char tbuf[20];
 771         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
 772         if (ir->nsteps >= 0)
 773         {
 774             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 775         }
 776         else
 777         {
 778             sprintf(tbuf, "%s", "infinite");
 779         }
 780         if (ir->init_step > 0)
 781         {
 782             fprintf(stderr,
 783                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 784                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
 785                     tbuf,
 786                     gmx_step_str(ir->init_step, sbuf2),
 787                     ir->init_step * ir->delta_t);
 788         }
 789         else
 790         {
 791             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 792         }
 793         fprintf(fplog, "\n");
 794     }
 795
 796     walltime_accounting_start_time(walltime_accounting);
 797     wallcycle_start(wcycle, WallCycleCounter::Run);
 798     print_start(fplog, cr, walltime_accounting, "mdrun");
 799
 800     /***********************************************************
 801      *
 802      *             Loop over MD steps
 803      *
 804      ************************************************************/
 805
 806     bFirstStep = TRUE;
 807     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 808     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 809     bSumEkinhOld     = FALSE;
 810     bExchanged       = FALSE;
 811     bNeedRepartition = FALSE;
 812
 813     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 814             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
 815             simulationsShareState,
 816             MASTER(cr),
 817             ir->nstlist,
 818             mdrunOptions.reproducible,
 819             nstSignalComm,
 820             mdrunOptions.maximumHoursToRun,
 821             ir->nstlist == 0,
 822             fplog,
 823             step,
 824             bNS,
 825             walltime_accounting);
 826
 827     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 828             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
 829             simulationsShareState,
 830             ir->nstlist == 0,
 831             MASTER(cr),
 832             mdrunOptions.writeConfout,
 833             mdrunOptions.checkpointOptions.period);
 834
 835     const bool resetCountersIsLocal = true;
 836     auto       resetHandler         = std::make_unique<ResetHandler>(
 837             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 838             !resetCountersIsLocal,
 839             ir->nsteps,
 840             MASTER(cr),
 841             mdrunOptions.timingOptions.resetHalfway,
 842             mdrunOptions.maximumHoursToRun,
 843             mdlog,
 844             wcycle,
 845             walltime_accounting);
 846
 847     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 848
 849     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 850     {
 851         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 852     }
 853
 854     /* and stop now if we should */
 855     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 856     while (!bLastStep)
 857     {
 858
 859         /* Determine if this is a neighbor search step */
 860         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 861
 862         if (bPMETune && bNStList)
 863         {
 864             // This has to be here because PME load balancing is called so early.
 865             // TODO: Move to after all booleans are defined.
 866             if (useGpuForUpdate && !bFirstStep)
 867             {
 868                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 869                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 870             }
 871             /* PME grid + cut-off optimization with GPUs or PME nodes */
 872             pme_loadbal_do(pme_loadbal,
 873                            cr,
 874                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 875                            fplog,
 876                            mdlog,
 877                            *ir,
 878                            fr,
 879                            state->box,
 880                            state->x,
 881                            wcycle,
 882                            step,
 883                            step_rel,
 884                            &bPMETunePrinting,
 885                            simulationWork.useGpuPmePpCommunication);
 886         }
 887
 888         wallcycle_start(wcycle, WallCycleCounter::Step);
 889
 890         bLastStep = (step_rel == ir->nsteps);
 891         t         = t0 + step * ir->delta_t;
 892
 893         // TODO Refactor this, so that nstfep does not need a default value of zero
 894         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
 895         {
 896             /* find and set the current lambdas */
 897             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 898
 899             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
 900             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
 901             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 902                            && (!bFirstStep));
 903         }
 904
 905         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 906                      && do_per_step(step, replExParams.exchangeInterval));
 907
 908         if (doSimulatedAnnealing)
 909         {
 910             // TODO: Avoid changing inputrec (#3854)
 911             // Simulated annealing updates the reference temperature.
 912             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 913             update_annealing_target_temp(nonConstInputrec, t, &upd);
 914         }
 915
 916         /* Stop Center of Mass motion */
 917         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
 918
 919         /* Determine whether or not to do Neighbour Searching */
 920         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 921
 922         /* Note that the stopHandler will cause termination at nstglobalcomm
 923          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 924          * nstpcouple steps, we have computed the half-step kinetic energy
 925          * of the previous step and can always output energies at the last step.
 926          */
 927         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 928
 929         /* do_log triggers energy and virial calculation. Because this leads
 930          * to different code paths, forces can be different. Thus for exact
 931          * continuation we should avoid extra log output.
 932          * Note that the || bLastStep can result in non-exact continuation
 933          * beyond the last step. But we don't consider that to be an issue.
 934          */
 935         do_log     = (do_per_step(step, ir->nstlog)
 936                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 937         do_verbose = mdrunOptions.verbose
 938                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 939
 940         if (useGpuForUpdate && !bFirstStep && bNS)
 941         {
 942             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 943             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 944             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 945             // Copy coordinate from the GPU when needed at the search step.
 946             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 947             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 948             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 949             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 950         }
 951
 952         // We only need to calculate virtual velocities if we are writing them in the current step
 953         const bool needVirtualVelocitiesThisStep =
 954                 (vsite != nullptr)
 955                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
 956
 957         if (vsite != nullptr)
 958         {
 959             // Virtual sites need to be updated before domain decomposition and forces are calculated
 960             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
 961             // md-vv calculates virtual velocities once it has full-step real velocities
 962             vsite->construct(state->x,
 963                              state->v,
 964                              state->box,
 965                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
 966                                      ? VSiteOperation::PositionsAndVelocities
 967                                      : VSiteOperation::Positions);
 968             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
 969         }
 970
 971         if (bNS && !(bFirstStep && ir->bContinuation))
 972         {
 973             bMasterState = FALSE;
 974             /* Correct the new box if it is too skewed */
 975             if (inputrecDynamicBox(ir))
 976             {
 977                 if (correct_box(fplog, step, state->box))
 978                 {
 979                     bMasterState = TRUE;
 980                     // If update is offloaded, it should be informed about the box size change
 981                     if (useGpuForUpdate)
 982                     {
 983                         integrator->setPbc(PbcType::Xyz, state->box);
 984                     }
 985                 }
 986             }
 987             if (DOMAINDECOMP(cr) && bMasterState)
 988             {
 989                 dd_collect_state(cr->dd, state, state_global);
 990             }
 991
 992             if (DOMAINDECOMP(cr))
 993             {
 994                 /* Repartition the domain decomposition */
 995                 dd_partition_system(fplog,
 996                                     mdlog,
 997                                     step,
 998                                     cr,
 999                                     bMasterState,
1000                                     nstglobalcomm,
1001                                     state_global,
1002                                     top_global,
1003                                     *ir,
1004                                     imdSession,
1005                                     pull_work,
1006                                     state,
1007                                     &f,
1008                                     mdAtoms,
1009                                     &top,
1010                                     fr,
1011                                     vsite,
1012                                     constr,
1013                                     nrnb,
1014                                     wcycle,
1015                                     do_verbose && !bPMETunePrinting);
1016                 upd.updateAfterPartition(state->natoms,
1017                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1018                                                      : gmx::ArrayRef<const unsigned short>(),
1019                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1020                                                  : gmx::ArrayRef<const unsigned short>());
1021             }
1022         }
1023
1024         // Allocate or re-size GPU halo exchange object, if necessary
1025         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1026         {
1027             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1028                                "GPU device manager has to be initialized to use GPU "
1029                                "version of halo exchange.");
1030             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1031         }
1032
1033         if (MASTER(cr) && do_log)
1034         {
1035             gmx::EnergyOutput::printHeader(
1036                     fplog, step, t); /* can we improve the information printed here? */
1037         }
1038
1039         if (ir->efep != FreeEnergyPerturbationType::No)
1040         {
1041             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1042         }
1043
1044         if (bExchanged)
1045         {
1046             /* We need the kinetic energy at minus the half step for determining
1047              * the full step kinetic energy and possibly for T-coupling.*/
1048             /* This may not be quite working correctly yet . . . . */
1049             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1050             if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions())
1051             {
1052                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1053             }
1054             compute_globals(gstat,
1055                             cr,
1056                             ir,
1057                             fr,
1058                             ekind,
1059                             makeConstArrayRef(state->x),
1060                             makeConstArrayRef(state->v),
1061                             state->box,
1062                             md,
1063                             nrnb,
1064                             &vcm,
1065                             wcycle,
1066                             enerd,
1067                             nullptr,
1068                             nullptr,
1069                             nullptr,
1070                             nullptr,
1071                             gmx::ArrayRef<real>{},
1072                             &nullSignaller,
1073                             state->box,
1074                             &bSumEkinhOld,
1075                             cglo_flags,
1076                             step,
1077                             &observablesReducer);
1078             if (DOMAINDECOMP(cr))
1079             {
1080                 dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1081                         &top, makeConstArrayRef(state->x), state->box);
1082             }
1083         }
1084         clear_mat(force_vir);
1085
1086         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1087
1088         /* Determine the energy and pressure:
1089          * at nstcalcenergy steps and at energy output steps (set below).
1090          */
1091         if (EI_VV(ir->eI) && (!bInitStep))
1092         {
1093             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1094             bCalcVir      = bCalcEnerStep
1095                        || (ir->epc != PressureCoupling::No
1096                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1097         }
1098         else
1099         {
1100             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1101             bCalcVir      = bCalcEnerStep
1102                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1103         }
1104         bCalcEner = bCalcEnerStep;
1105
1106         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1107
1108         if (do_ene || do_log || bDoReplEx)
1109         {
1110             bCalcVir  = TRUE;
1111             bCalcEner = TRUE;
1112         }
1113
1114         /* Do we need global communication ? */
1115         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1116                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1117
1118         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1119                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1120                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1121         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1122         {
1123             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1124             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1125         }
1126
1127         if (shellfc)
1128         {
1129             /* Now is the time to relax the shells */
1130             relax_shell_flexcon(fplog,
1131                                 cr,
1132                                 ms,
1133                                 mdrunOptions.verbose,
1134                                 enforcedRotation,
1135                                 step,
1136                                 ir,
1137                                 imdSession,
1138                                 pull_work,
1139                                 bNS,
1140                                 force_flags,
1141                                 &top,
1142                                 constr,
1143                                 enerd,
1144                                 state->natoms,
1145                                 state->x.arrayRefWithPadding(),
1146                                 state->v.arrayRefWithPadding(),
1147                                 state->box,
1148                                 state->lambda,
1149                                 &state->hist,
1150                                 &f.view(),
1151                                 force_vir,
1152                                 *md,
1153                                 nrnb,
1154                                 wcycle,
1155                                 shellfc,
1156                                 fr,
1157                                 runScheduleWork,
1158                                 t,
1159                                 mu_tot,
1160                                 vsite,
1161                                 ddBalanceRegionHandler);
1162         }
1163         else
1164         {
1165             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1166                is updated (or the AWH update will be performed twice for one step when continuing).
1167                It would be best to call this update function from do_md_trajectory_writing but that
1168                would occur after do_force. One would have to divide the update_awh function into one
1169                function applying the AWH force and one doing the AWH bias update. The update AWH
1170                bias function could then be called after do_md_trajectory_writing (then containing
1171                update_awh_history). The checkpointing will in the future probably moved to the start
1172                of the md loop which will rid of this issue. */
1173             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1174             {
1175                 awh->updateHistory(state_global->awhHistory.get());
1176             }
1177
1178             /* The coordinates (x) are shifted (to get whole molecules)
1179              * in do_force.
1180              * This is parallellized as well, and does communication too.
1181              * Check comments in sim_util.c
1182              */
1183             do_force(fplog,
1184                      cr,
1185                      ms,
1186                      *ir,
1187                      awh.get(),
1188                      enforcedRotation,
1189                      imdSession,
1190                      pull_work,
1191                      step,
1192                      nrnb,
1193                      wcycle,
1194                      &top,
1195                      state->box,
1196                      state->x.arrayRefWithPadding(),
1197                      &state->hist,
1198                      &f.view(),
1199                      force_vir,
1200                      md,
1201                      enerd,
1202                      state->lambda,
1203                      fr,
1204                      runScheduleWork,
1205                      vsite,
1206                      mu_tot,
1207                      t,
1208                      ed ? ed->getLegacyED() : nullptr,
1209                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1210                      ddBalanceRegionHandler);
1211         }
1212
1213         // VV integrators do not need the following velocity half step
1214         // if it is the first step after starting from a checkpoint.
1215         // That is, the half step is needed on all other steps, and
1216         // also the first step when starting from a .tpr file.
1217         if (EI_VV(ir->eI))
1218         {
1219             integrateVVFirstStep(step,
1220                                  bFirstStep,
1221                                  bInitStep,
1222                                  startingBehavior,
1223                                  nstglobalcomm,
1224                                  ir,
1225                                  fr,
1226                                  cr,
1227                                  state,
1228                                  mdAtoms->mdatoms(),
1229                                  &fcdata,
1230                                  &MassQ,
1231                                  &vcm,
1232                                  top,
1233                                  enerd,
1234                                  &observablesReducer,
1235                                  ekind,
1236                                  gstat,
1237                                  &last_ekin,
1238                                  bCalcVir,
1239                                  total_vir,
1240                                  shake_vir,
1241                                  force_vir,
1242                                  pres,
1243                                  M,
1244                                  do_log,
1245                                  do_ene,
1246                                  bCalcEner,
1247                                  bGStat,
1248                                  bStopCM,
1249                                  bTrotter,
1250                                  bExchanged,
1251                                  &bSumEkinhOld,
1252                                  &saved_conserved_quantity,
1253                                  &f,
1254                                  &upd,
1255                                  constr,
1256                                  &nullSignaller,
1257                                  trotter_seq,
1258                                  nrnb,
1259                                  fplog,
1260                                  wcycle);
1261             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1262             {
1263                 // Positions were calculated earlier
1264                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1265                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1266                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1267             }
1268         }
1269
1270         /* ########  END FIRST UPDATE STEP  ############## */
1271         /* ########  If doing VV, we now have v(dt) ###### */
1272         if (bDoExpanded)
1273         {
1274             /* perform extended ensemble sampling in lambda - we don't
1275                actually move to the new state before outputting
1276                statistics, but if performing simulated tempering, we
1277                do update the velocities and the tau_t. */
1278             // TODO: Avoid changing inputrec (#3854)
1279             // Simulated tempering updates the reference temperature.
1280             // Expanded ensemble without simulated tempering does not change the inputrec.
1281             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1282             lamnew                 = ExpandedEnsembleDynamics(fplog,
1283                                               nonConstInputrec,
1284                                               enerd,
1285                                               state,
1286                                               &MassQ,
1287                                               state->fep_state,
1288                                               state->dfhist,
1289                                               step,
1290                                               state->v.rvec_array(),
1291                                               md->homenr,
1292                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1293                                                                       : gmx::ArrayRef<const unsigned short>());
1294             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1295             if (MASTER(cr))
1296             {
1297                 copy_df_history(state_global->dfhist, state->dfhist);
1298             }
1299         }
1300
1301         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1302         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1303         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1304             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1305                 || checkpointHandler->isCheckpointingStep()))
1306         {
1307             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1308             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1309         }
1310         // Copy velocities if needed for the output/checkpointing.
1311         // NOTE: Copy on the search steps is done at the beginning of the step.
1312         if (useGpuForUpdate && !bNS
1313             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1314         {
1315             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1316             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1317         }
1318         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1319         // and update is offloaded hence forces are kept on the GPU for update and have not been
1320         // already transferred in do_force().
1321         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1322         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1323         //       prior to GPU update.
1324         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1325         //       copy call in do_force(...).
1326         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1327         //       on host after the D2H copy in do_force(...).
1328         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1329             && do_per_step(step, ir->nstfout))
1330         {
1331             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1332             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1333         }
1334         /* Now we have the energies and forces corresponding to the
1335          * coordinates at time t. We must output all of this before
1336          * the update.
1337          */
1338         do_md_trajectory_writing(fplog,
1339                                  cr,
1340                                  nfile,
1341                                  fnm,
1342                                  step,
1343                                  step_rel,
1344                                  t,
1345                                  ir,
1346                                  state,
1347                                  state_global,
1348                                  observablesHistory,
1349                                  top_global,
1350                                  fr,
1351                                  outf,
1352                                  energyOutput,
1353                                  ekind,
1354                                  f.view().force(),
1355                                  checkpointHandler->isCheckpointingStep(),
1356                                  bRerunMD,
1357                                  bLastStep,
1358                                  mdrunOptions.writeConfout,
1359                                  bSumEkinhOld);
1360         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1361         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1362
1363         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1364         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1365             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1366         {
1367             copy_mat(state->svir_prev, shake_vir);
1368             copy_mat(state->fvir_prev, force_vir);
1369         }
1370
1371         stopHandler->setSignal();
1372         resetHandler->setSignal(walltime_accounting);
1373
1374         if (bGStat || !PAR(cr))
1375         {
1376             /* In parallel we only have to check for checkpointing in steps
1377              * where we do global communication,
1378              *  otherwise the other nodes don't know.
1379              */
1380             checkpointHandler->setSignal(walltime_accounting);
1381         }
1382
1383         /* #########   START SECOND UPDATE STEP ################# */
1384
1385         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1386            controlled in preprocessing */
1387
1388         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1389         {
1390             gmx_bool bIfRandomize;
1391             bIfRandomize = update_randomize_velocities(ir,
1392                                                        step,
1393                                                        cr,
1394                                                        md->homenr,
1395                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1396                                                                : gmx::ArrayRef<const unsigned short>(),
1397                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1398                                                        state->v,
1399                                                        &upd,
1400                                                        constr);
1401             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1402             if (constr && bIfRandomize)
1403             {
1404                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1405             }
1406         }
1407         /* Box is changed in update() when we do pressure coupling,
1408          * but we should still use the old box for energy corrections and when
1409          * writing it to the energy file, so it matches the trajectory files for
1410          * the same timestep above. Make a copy in a separate array.
1411          */
1412         copy_mat(state->box, lastbox);
1413
1414         dvdl_constr = 0;
1415
1416         if (!useGpuForUpdate)
1417         {
1418             wallcycle_start(wcycle, WallCycleCounter::Update);
1419         }
1420         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1421         if (bTrotter)
1422         {
1423             trotter_update(ir,
1424                            step,
1425                            ekind,
1426                            enerd,
1427                            state,
1428                            total_vir,
1429                            md->homenr,
1430                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1431                                    : gmx::ArrayRef<const unsigned short>(),
1432                            gmx::arrayRefFromArray(md->invmass, md->nr),
1433                            &MassQ,
1434                            trotter_seq,
1435                            TrotterSequence::Three);
1436             /* We can only do Berendsen coupling after we have summed
1437              * the kinetic energy or virial. Since the happens
1438              * in global_state after update, we should only do it at
1439              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1440              */
1441         }
1442         else
1443         {
1444             update_tcouple(step,
1445                            ir,
1446                            state,
1447                            ekind,
1448                            &MassQ,
1449                            md->homenr,
1450                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1451                                    : gmx::ArrayRef<const unsigned short>());
1452             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1453         }
1454
1455         /* With leap-frog type integrators we compute the kinetic energy
1456          * at a whole time step as the average of the half-time step kinetic
1457          * energies of two subsequent steps. Therefore we need to compute the
1458          * half step kinetic energy also if we need energies at the next step.
1459          */
1460         const bool needHalfStepKineticEnergy =
1461                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1462
1463         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1464         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1465         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1466                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1467
1468         if (EI_VV(ir->eI))
1469         {
1470             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1471
1472             integrateVVSecondStep(step,
1473                                   ir,
1474                                   fr,
1475                                   cr,
1476                                   state,
1477                                   mdAtoms->mdatoms(),
1478                                   &fcdata,
1479                                   &MassQ,
1480                                   &vcm,
1481                                   pull_work,
1482                                   enerd,
1483                                   &observablesReducer,
1484                                   ekind,
1485                                   gstat,
1486                                   &dvdl_constr,
1487                                   bCalcVir,
1488                                   total_vir,
1489                                   shake_vir,
1490                                   force_vir,
1491                                   pres,
1492                                   M,
1493                                   lastbox,
1494                                   do_log,
1495                                   do_ene,
1496                                   bGStat,
1497                                   &bSumEkinhOld,
1498                                   &f,
1499                                   &cbuf,
1500                                   &upd,
1501                                   constr,
1502                                   &nullSignaller,
1503                                   trotter_seq,
1504                                   nrnb,
1505                                   wcycle);
1506         }
1507         else
1508         {
1509             if (useGpuForUpdate)
1510             {
1511                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1512                 {
1513                     integrator->set(stateGpu->getCoordinates(),
1514                                     stateGpu->getVelocities(),
1515                                     stateGpu->getForces(),
1516                                     top.idef,
1517                                     *md);
1518
1519                     // Copy data to the GPU after buffers might have being reinitialized
1520                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1521                      * the previous step. We don't check that now. */
1522                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1523                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1524                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1525                     {
1526                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1527                     }
1528                 }
1529
1530                 if (simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1531                 {
1532                     // The PME forces were recieved to the host, so have to be copied
1533                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1534                 }
1535                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1536                 {
1537                     // The buffer ops were not offloaded this step, so the forces are on the
1538                     // host and have to be copied
1539                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1540                 }
1541
1542                 const bool doTemperatureScaling =
1543                         (ir->etc != TemperatureCoupling::No
1544                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1545
1546                 // This applies Leap-Frog, LINCS and SETTLE in succession
1547                 integrator->integrate(
1548                         stateGpu->getForcesReadyOnDeviceEvent(
1549                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1550                         ir->delta_t,
1551                         true,
1552                         bCalcVir,
1553                         shake_vir,
1554                         doTemperatureScaling,
1555                         ekind->tcstat,
1556                         doParrinelloRahman,
1557                         ir->nstpcouple * ir->delta_t,
1558                         M);
1559
1560                 // Copy velocities D2H after update if:
1561                 // - Globals are computed this step (includes the energy output steps).
1562                 // - Temperature is needed for the next step.
1563                 if (bGStat || needHalfStepKineticEnergy)
1564                 {
1565                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1566                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1567                 }
1568             }
1569             else
1570             {
1571                 /* With multiple time stepping we need to do an additional normal
1572                  * update step to obtain the virial, as the actual MTS integration
1573                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1574                  * Using that acceleration would result in a virial with the slow
1575                  * force contribution would be a factor mtsFactor too large.
1576                  */
1577                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1578                 {
1579                     upd.update_for_constraint_virial(*ir,
1580                                                      md->homenr,
1581                                                      md->havePartiallyFrozenAtoms,
1582                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1583                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1584                                                      *state,
1585                                                      f.view().forceWithPadding(),
1586                                                      *ekind);
1587
1588                     constrain_coordinates(constr,
1589                                           do_log,
1590                                           do_ene,
1591                                           step,
1592                                           state,
1593                                           upd.xp()->arrayRefWithPadding(),
1594                                           &dvdl_constr,
1595                                           bCalcVir,
1596                                           shake_vir);
1597                 }
1598
1599                 ArrayRefWithPadding<const RVec> forceCombined =
1600                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1601                                 ? f.view().forceMtsCombinedWithPadding()
1602                                 : f.view().forceWithPadding();
1603                 upd.update_coords(*ir,
1604                                   step,
1605                                   md->homenr,
1606                                   md->havePartiallyFrozenAtoms,
1607                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1608                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1609                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1610                                   state,
1611                                   forceCombined,
1612                                   &fcdata,
1613                                   ekind,
1614                                   M,
1615                                   etrtPOSITION,
1616                                   cr,
1617                                   constr != nullptr);
1618
1619                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1620
1621                 constrain_coordinates(constr,
1622                                       do_log,
1623                                       do_ene,
1624                                       step,
1625                                       state,
1626                                       upd.xp()->arrayRefWithPadding(),
1627                                       &dvdl_constr,
1628                                       bCalcVir && !simulationWork.useMts,
1629                                       shake_vir);
1630
1631                 upd.update_sd_second_half(*ir,
1632                                           step,
1633                                           &dvdl_constr,
1634                                           md->homenr,
1635                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1636                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1637                                           state,
1638                                           cr,
1639                                           nrnb,
1640                                           wcycle,
1641                                           constr,
1642                                           do_log,
1643                                           do_ene);
1644                 upd.finish_update(
1645                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1646             }
1647
1648             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1649             {
1650                 updatePrevStepPullCom(pull_work, state);
1651             }
1652
1653             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1654         }
1655
1656         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1657         /* With Leap-Frog we can skip compute_globals at
1658          * non-communication steps, but we need to calculate
1659          * the kinetic energy one step before communication.
1660          */
1661         {
1662             // Organize to do inter-simulation signalling on steps if
1663             // and when algorithms require it.
1664             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1665
1666             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1667             {
1668                 // Copy coordinates when needed to stop the CM motion.
1669                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1670                 {
1671                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1672                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1673                 }
1674                 // Since we're already communicating at this step, we
1675                 // can propagate intra-simulation signals. Note that
1676                 // check_nstglobalcomm has the responsibility for
1677                 // choosing the value of nstglobalcomm that is one way
1678                 // bGStat becomes true, so we can't get into a
1679                 // situation where e.g. checkpointing can't be
1680                 // signalled.
1681                 bool                doIntraSimSignal = true;
1682                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1683
1684                 compute_globals(
1685                         gstat,
1686                         cr,
1687                         ir,
1688                         fr,
1689                         ekind,
1690                         makeConstArrayRef(state->x),
1691                         makeConstArrayRef(state->v),
1692                         state->box,
1693                         md,
1694                         nrnb,
1695                         &vcm,
1696                         wcycle,
1697                         enerd,
1698                         force_vir,
1699                         shake_vir,
1700                         total_vir,
1701                         pres,
1702                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1703                                                                            : gmx::ArrayRef<real>{},
1704                         &signaller,
1705                         lastbox,
1706                         &bSumEkinhOld,
1707                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1708                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1709                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1710                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1711                                 | (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
1712                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1713                                            : 0),
1714                         step,
1715                         &observablesReducer);
1716                 if (DOMAINDECOMP(cr))
1717                 {
1718                     dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1719                             &top, makeConstArrayRef(state->x), state->box);
1720                 }
1721                 if (!EI_VV(ir->eI) && bStopCM)
1722                 {
1723                     process_and_stopcm_grp(
1724                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1725                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1726
1727                     // TODO: The special case of removing CM motion should be dealt more gracefully
1728                     if (useGpuForUpdate)
1729                     {
1730                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1731                         // Here we block until the H2D copy completes because event sync with the
1732                         // force kernels that use the coordinates on the next steps is not implemented
1733                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1734                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1735                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1736                         if (vcm.mode != ComRemovalAlgorithm::No)
1737                         {
1738                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1739                         }
1740                     }
1741                 }
1742             }
1743         }
1744
1745         /* #############  END CALC EKIN AND PRESSURE ################# */
1746
1747         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1748            the virial that should probably be addressed eventually. state->veta has better properies,
1749            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1750            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1751
1752         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1753         {
1754             /* Sum up the foreign energy and dK/dl terms for md and sd.
1755                Currently done every step so that dH/dl is correct in the .edr */
1756             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1757         }
1758
1759         update_pcouple_after_coordinates(fplog,
1760                                          step,
1761                                          ir,
1762                                          md->homenr,
1763                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1764                                                      : gmx::ArrayRef<const unsigned short>(),
1765                                          pres,
1766                                          force_vir,
1767                                          shake_vir,
1768                                          pressureCouplingMu,
1769                                          state,
1770                                          nrnb,
1771                                          upd.deform(),
1772                                          !useGpuForUpdate);
1773
1774         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1775                                                   && do_per_step(step, inputrec->nstpcouple));
1776         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1777                                                  && do_per_step(step, inputrec->nstpcouple));
1778         if (useGpuForUpdate
1779             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1780         {
1781             integrator->scaleCoordinates(pressureCouplingMu);
1782             if (doCRescalePressureCoupling)
1783             {
1784                 matrix pressureCouplingInvMu;
1785                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1786                 integrator->scaleVelocities(pressureCouplingInvMu);
1787             }
1788             integrator->setPbc(PbcType::Xyz, state->box);
1789         }
1790
1791         /* ################# END UPDATE STEP 2 ################# */
1792         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1793
1794         /* The coordinates (x) were unshifted in update */
1795         if (!bGStat)
1796         {
1797             /* We will not sum ekinh_old,
1798              * so signal that we still have to do it.
1799              */
1800             bSumEkinhOld = TRUE;
1801         }
1802
1803         if (bCalcEner)
1804         {
1805             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1806
1807             /* use the directly determined last velocity, not actually the averaged half steps */
1808             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1809             {
1810                 enerd->term[F_EKIN] = last_ekin;
1811             }
1812             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1813
1814             if (integratorHasConservedEnergyQuantity(ir))
1815             {
1816                 if (EI_VV(ir->eI))
1817                 {
1818                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1819                 }
1820                 else
1821                 {
1822                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1823                 }
1824             }
1825             /* #########  END PREPARING EDR OUTPUT  ###########  */
1826         }
1827
1828         /* Output stuff */
1829         if (MASTER(cr))
1830         {
1831             if (fplog && do_log && bDoExpanded)
1832             {
1833                 /* only needed if doing expanded ensemble */
1834                 PrintFreeEnergyInfoToFile(fplog,
1835                                           ir->fepvals.get(),
1836                                           ir->expandedvals.get(),
1837                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1838                                           state_global->dfhist,
1839                                           state->fep_state,
1840                                           ir->nstlog,
1841                                           step);
1842             }
1843             if (bCalcEner)
1844             {
1845                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1846                                                  bCalcEnerStep,
1847                                                  t,
1848                                                  md->tmass,
1849                                                  enerd,
1850                                                  ir->fepvals.get(),
1851                                                  ir->expandedvals.get(),
1852                                                  lastbox,
1853                                                  PTCouplingArrays{ state->boxv,
1854                                                                    state->nosehoover_xi,
1855                                                                    state->nosehoover_vxi,
1856                                                                    state->nhpres_xi,
1857                                                                    state->nhpres_vxi },
1858                                                  state->fep_state,
1859                                                  total_vir,
1860                                                  pres,
1861                                                  ekind,
1862                                                  mu_tot,
1863                                                  constr);
1864             }
1865             else
1866             {
1867                 energyOutput.recordNonEnergyStep();
1868             }
1869
1870             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1871             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1872
1873             if (doSimulatedAnnealing)
1874             {
1875                 gmx::EnergyOutput::printAnnealingTemperatures(
1876                         do_log ? fplog : nullptr, groups, &(ir->opts));
1877             }
1878             if (do_log || do_ene || do_dr || do_or)
1879             {
1880                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1881                                                    do_ene,
1882                                                    do_dr,
1883                                                    do_or,
1884                                                    do_log ? fplog : nullptr,
1885                                                    step,
1886                                                    t,
1887                                                    fr->fcdata.get(),
1888                                                    awh.get());
1889             }
1890             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1891             {
1892                 const bool isInitialOutput = false;
1893                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1894             }
1895
1896             if (ir->bPull)
1897             {
1898                 pull_print_output(pull_work, step, t);
1899             }
1900
1901             if (do_per_step(step, ir->nstlog))
1902             {
1903                 if (fflush(fplog) != 0)
1904                 {
1905                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1906                 }
1907             }
1908         }
1909         if (bDoExpanded)
1910         {
1911             /* Have to do this part _after_ outputting the logfile and the edr file */
1912             /* Gets written into the state at the beginning of next loop*/
1913             state->fep_state = lamnew;
1914         }
1915         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1916         {
1917             state->fep_state = awh->fepLambdaState();
1918         }
1919         /* Print the remaining wall clock time for the run */
1920         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1921         {
1922             if (shellfc)
1923             {
1924                 fprintf(stderr, "\n");
1925             }
1926             print_time(stderr, walltime_accounting, step, ir, cr);
1927         }
1928
1929         /* Ion/water position swapping.
1930          * Not done in last step since trajectory writing happens before this call
1931          * in the MD loop and exchanges would be lost anyway. */
1932         bNeedRepartition = FALSE;
1933         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1934             && do_per_step(step, ir->swap->nstswap))
1935         {
1936             bNeedRepartition = do_swapcoords(cr,
1937                                              step,
1938                                              t,
1939                                              ir,
1940                                              swap,
1941                                              wcycle,
1942                                              as_rvec_array(state->x.data()),
1943                                              state->box,
1944                                              MASTER(cr) && mdrunOptions.verbose,
1945                                              bRerunMD);
1946
1947             if (bNeedRepartition && DOMAINDECOMP(cr))
1948             {
1949                 dd_collect_state(cr->dd, state, state_global);
1950             }
1951         }
1952
1953         /* Replica exchange */
1954         bExchanged = FALSE;
1955         if (bDoReplEx)
1956         {
1957             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1958         }
1959
1960         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1961         {
1962             dd_partition_system(fplog,
1963                                 mdlog,
1964                                 step,
1965                                 cr,
1966                                 TRUE,
1967                                 1,
1968                                 state_global,
1969                                 top_global,
1970                                 *ir,
1971                                 imdSession,
1972                                 pull_work,
1973                                 state,
1974                                 &f,
1975                                 mdAtoms,
1976                                 &top,
1977                                 fr,
1978                                 vsite,
1979                                 constr,
1980                                 nrnb,
1981                                 wcycle,
1982                                 FALSE);
1983             upd.updateAfterPartition(state->natoms,
1984                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1985                                                  : gmx::ArrayRef<const unsigned short>(),
1986                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1987                                              : gmx::ArrayRef<const unsigned short>());
1988         }
1989
1990         bFirstStep = FALSE;
1991         bInitStep  = FALSE;
1992
1993         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1994         /* With all integrators, except VV, we need to retain the pressure
1995          * at the current step for coupling at the next step.
1996          */
1997         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1998             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1999         {
2000             /* Store the pressure in t_state for pressure coupling
2001              * at the next MD step.
2002              */
2003             copy_mat(pres, state->pres_prev);
2004         }
2005
2006         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2007
2008         if ((membed != nullptr) && (!bLastStep))
2009         {
2010             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2011         }
2012
2013         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2014         if (DOMAINDECOMP(cr) && wcycle)
2015         {
2016             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2017         }
2018
2019         /* increase the MD step number */
2020         step++;
2021         step_rel++;
2022
2023 #if GMX_FAHCORE
2024         if (MASTER(cr))
2025         {
2026             fcReportProgress(ir->nsteps + ir->init_step, step);
2027         }
2028 #endif
2029
2030         resetHandler->resetCounters(
2031                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2032
2033         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2034         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2035     }
2036     /* End of main MD loop */
2037
2038     /* Closing TNG files can include compressing data. Therefore it is good to do that
2039      * before stopping the time measurements. */
2040     mdoutf_tng_close(outf);
2041
2042     /* Stop measuring walltime */
2043     walltime_accounting_end_time(walltime_accounting);
2044
2045     if (simulationWork.haveSeparatePmeRank)
2046     {
2047         /* Tell the PME only node to finish */
2048         gmx_pme_send_finish(cr);
2049     }
2050
2051     if (MASTER(cr))
2052     {
2053         if (ir->nstcalcenergy > 0)
2054         {
2055             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2056
2057             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2058             energyOutput.printAverages(fplog, groups);
2059         }
2060     }
2061     done_mdoutf(outf);
2062
2063     if (bPMETune)
2064     {
2065         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2066     }
2067
2068     done_shellfc(fplog, shellfc, step_rel);
2069
2070     if (useReplicaExchange && MASTER(cr))
2071     {
2072         print_replica_exchange_statistics(fplog, repl_ex);
2073     }
2074
2075     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2076
2077     global_stat_destroy(gstat);
2078 }