src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/applied_forces/awh/read_params.h"
  57 #include "gromacs/commandline/filenm.h"
  58 #include "gromacs/domdec/collect.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localtopologychecker.h"
  65 #include "gromacs/domdec/mdsetup.h"
  66 #include "gromacs/domdec/partition.h"
  67 #include "gromacs/essentialdynamics/edsam.h"
  68 #include "gromacs/ewald/pme_load_balancing.h"
  69 #include "gromacs/ewald/pme_pp.h"
  70 #include "gromacs/fileio/trxio.h"
  71 #include "gromacs/gmxlib/network.h"
  72 #include "gromacs/gmxlib/nrnb.h"
  73 #include "gromacs/gpu_utils/device_stream_manager.h"
  74 #include "gromacs/gpu_utils/gpu_utils.h"
  75 #include "gromacs/imd/imd.h"
  76 #include "gromacs/listed_forces/listed_forces.h"
  77 #include "gromacs/math/functions.h"
  78 #include "gromacs/math/invertmatrix.h"
  79 #include "gromacs/math/vec.h"
  80 #include "gromacs/math/vectypes.h"
  81 #include "gromacs/mdlib/checkpointhandler.h"
  82 #include "gromacs/mdlib/compute_io.h"
  83 #include "gromacs/mdlib/constr.h"
  84 #include "gromacs/mdlib/coupling.h"
  85 #include "gromacs/mdlib/ebin.h"
  86 #include "gromacs/mdlib/enerdata_utils.h"
  87 #include "gromacs/mdlib/energyoutput.h"
  88 #include "gromacs/mdlib/expanded.h"
  89 #include "gromacs/mdlib/force.h"
  90 #include "gromacs/mdlib/force_flags.h"
  91 #include "gromacs/mdlib/forcerec.h"
  92 #include "gromacs/mdlib/freeenergyparameters.h"
  93 #include "gromacs/mdlib/md_support.h"
  94 #include "gromacs/mdlib/mdatoms.h"
  95 #include "gromacs/mdlib/mdoutf.h"
  96 #include "gromacs/mdlib/membed.h"
  97 #include "gromacs/mdlib/resethandler.h"
  98 #include "gromacs/mdlib/sighandler.h"
  99 #include "gromacs/mdlib/simulationsignal.h"
 100 #include "gromacs/mdlib/stat.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdlib/tgroup.h"
 103 #include "gromacs/mdlib/trajectory_writing.h"
 104 #include "gromacs/mdlib/update.h"
 105 #include "gromacs/mdlib/update_constrain_gpu.h"
 106 #include "gromacs/mdlib/update_vv.h"
 107 #include "gromacs/mdlib/vcm.h"
 108 #include "gromacs/mdlib/vsite.h"
 109 #include "gromacs/mdrunutility/freeenergy.h"
 110 #include "gromacs/mdrunutility/handlerestart.h"
 111 #include "gromacs/mdrunutility/multisim.h"
 112 #include "gromacs/mdrunutility/printtime.h"
 113 #include "gromacs/mdtypes/awh_history.h"
 114 #include "gromacs/mdtypes/awh_params.h"
 115 #include "gromacs/mdtypes/commrec.h"
 116 #include "gromacs/mdtypes/df_history.h"
 117 #include "gromacs/mdtypes/energyhistory.h"
 118 #include "gromacs/mdtypes/fcdata.h"
 119 #include "gromacs/mdtypes/forcebuffers.h"
 120 #include "gromacs/mdtypes/forcerec.h"
 121 #include "gromacs/mdtypes/group.h"
 122 #include "gromacs/mdtypes/inputrec.h"
 123 #include "gromacs/mdtypes/interaction_const.h"
 124 #include "gromacs/mdtypes/md_enums.h"
 125 #include "gromacs/mdtypes/mdatom.h"
 126 #include "gromacs/mdtypes/mdrunoptions.h"
 127 #include "gromacs/mdtypes/multipletimestepping.h"
 128 #include "gromacs/mdtypes/observableshistory.h"
 129 #include "gromacs/mdtypes/observablesreducer.h"
 130 #include "gromacs/mdtypes/pullhistory.h"
 131 #include "gromacs/mdtypes/simulation_workload.h"
 132 #include "gromacs/mdtypes/state.h"
 133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 134 #include "gromacs/modularsimulator/energydata.h"
 135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 136 #include "gromacs/nbnxm/nbnxm.h"
 137 #include "gromacs/pbcutil/pbc.h"
 138 #include "gromacs/pulling/output.h"
 139 #include "gromacs/pulling/pull.h"
 140 #include "gromacs/swap/swapcoords.h"
 141 #include "gromacs/timing/wallcycle.h"
 142 #include "gromacs/timing/walltime_accounting.h"
 143 #include "gromacs/topology/atoms.h"
 144 #include "gromacs/topology/idef.h"
 145 #include "gromacs/topology/mtop_util.h"
 146 #include "gromacs/topology/topology.h"
 147 #include "gromacs/trajectory/trajectoryframe.h"
 148 #include "gromacs/utility/basedefinitions.h"
 149 #include "gromacs/utility/cstringutil.h"
 150 #include "gromacs/utility/fatalerror.h"
 151 #include "gromacs/utility/logger.h"
 152 #include "gromacs/utility/real.h"
 153 #include "gromacs/utility/smalloc.h"
 154
 155 #include "legacysimulator.h"
 156 #include "replicaexchange.h"
 157 #include "shellfc.h"
 158
 159 using gmx::SimulationSignaller;
 160
 161 void gmx::LegacySimulator::do_md()
 162 {
 163     // TODO Historically, the EM and MD "integrators" used different
 164     // names for the t_inputrec *parameter, but these must have the
 165     // same name, now that it's a member of a struct. We use this ir
 166     // alias to avoid a large ripple of nearly useless changes.
 167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 168     // will go away eventually.
 169     const t_inputrec* ir = inputrec;
 170
 171     double       t, t0 = ir->init_t;
 172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 174     gmx_bool     bDoExpanded = FALSE;
 175     gmx_bool     do_ene, do_log, do_verbose;
 176     gmx_bool     bMasterState;
 177     unsigned int force_flags;
 178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 179     int    i, m;
 180     rvec   mu_tot;
 181     matrix pressureCouplingMu, M;
 182     gmx_repl_ex_t     repl_ex = nullptr;
 183     gmx_global_stat_t gstat;
 184     gmx_shellfc_t*    shellfc;
 185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 186     gmx_bool          bTrotter;
 187     real              dvdl_constr;
 188     std::vector<RVec> cbuf;
 189     matrix            lastbox;
 190     int               lamnew = 0;
 191     /* for FEP */
 192     double    cycles;
 193     real      saved_conserved_quantity = 0;
 194     real      last_ekin                = 0;
 195     t_extmass MassQ;
 196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 197
 198     /* PME load balancing data for GPU kernels */
 199     gmx_bool bPMETune         = FALSE;
 200     gmx_bool bPMETunePrinting = FALSE;
 201
 202     bool bInteractiveMDstep = false;
 203
 204     SimulationSignals signals;
 205     // Most global communnication stages don't propagate mdrun
 206     // signals, and will use this object to achieve that.
 207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 208
 209     if (!mdrunOptions.writeConfout)
 210     {
 211         // This is on by default, and the main known use case for
 212         // turning it off is for convenience in benchmarking, which is
 213         // something that should not show up in the general user
 214         // interface.
 215         GMX_LOG(mdlog.info)
 216                 .asParagraph()
 217                 .appendText(
 218                         "The -noconfout functionality is deprecated, and may be removed in a "
 219                         "future version.");
 220     }
 221
 222     /* md-vv uses averaged full step velocities for T-control
 223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 225     bTrotter = (EI_VV(ir->eI)
 226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 227
 228     const bool bRerunMD = false;
 229
 230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 231     bGStatEveryStep   = (nstglobalcomm == 1);
 232
 233     const SimulationGroups* groups = &top_global.groups;
 234
 235     std::unique_ptr<EssentialDynamics> ed = nullptr;
 236     if (opt2bSet("-ei", nfile, fnm))
 237     {
 238         /* Initialize essential dynamics sampling */
 239         ed = init_edsam(mdlog,
 240                         opt2fn_null("-ei", nfile, fnm),
 241                         opt2fn("-eo", nfile, fnm),
 242                         top_global,
 243                         *ir,
 244                         cr,
 245                         constr,
 246                         state_global,
 247                         observablesHistory,
 248                         oenv,
 249                         startingBehavior);
 250     }
 251     else if (observablesHistory->edsamHistory)
 252     {
 253         gmx_fatal(FARGS,
 254                   "The checkpoint is from a run with essential dynamics sampling, "
 255                   "but the current run did not specify the -ei option. "
 256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 257     }
 258
 259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 261     initialize_lambdas(fplog,
 262                        ir->efep,
 263                        ir->bSimTemp,
 264                        *ir->fepvals,
 265                        ir->simtempvals->temperatures,
 266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
 267                        MASTER(cr),
 268                        fep_state,
 269                        lambda);
 270     Update upd(*ir, deform);
 271     bool   doSimulatedAnnealing = false;
 272     {
 273         // TODO: Avoid changing inputrec (#3854)
 274         // Simulated annealing updates the reference temperature.
 275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
 277     }
 278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 279
 280     t_fcdata& fcdata = *fr->fcdata;
 281
 282     bool simulationsShareState = false;
 283     int  nstSignalComm         = nstglobalcomm;
 284     {
 285         // TODO This implementation of ensemble orientation restraints is nasty because
 286         // a user can't just do multi-sim with single-sim orientation restraints.
 287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
 288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
 289
 290         // Replica exchange, ensemble restraints and AWH need all
 291         // simulations to remain synchronized, so they need
 292         // checkpoints and stop conditions to act on the same step, so
 293         // the propagation of such signals must take place between
 294         // simulations, not just within simulations.
 295         // TODO: Make algorithm initializers set these flags.
 296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 297
 298         if (simulationsShareState)
 299         {
 300             // Inter-simulation signal communication does not need to happen
 301             // often, so we use a minimum of 200 steps to reduce overhead.
 302             const int c_minimumInterSimulationSignallingInterval = 200;
 303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 304                             * nstglobalcomm;
 305         }
 306     }
 307
 308     if (startingBehavior != StartingBehavior::RestartWithAppending)
 309     {
 310         pleaseCiteCouplingAlgorithms(fplog, *ir);
 311     }
 312     gmx_mdoutf*       outf = init_mdoutf(fplog,
 313                                    nfile,
 314                                    fnm,
 315                                    mdrunOptions,
 316                                    cr,
 317                                    outputProvider,
 318                                    mdModulesNotifiers,
 319                                    ir,
 320                                    top_global,
 321                                    oenv,
 322                                    wcycle,
 323                                    startingBehavior,
 324                                    simulationsShareState,
 325                                    ms);
 326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
 327                                    top_global,
 328                                    *ir,
 329                                    pull_work,
 330                                    mdoutf_get_fp_dhdl(outf),
 331                                    false,
 332                                    startingBehavior,
 333                                    simulationsShareState,
 334                                    mdModulesNotifiers);
 335
 336     gstat = global_stat_init(ir);
 337
 338     const auto& simulationWork     = runScheduleWork->simulationWork;
 339     const bool  useGpuForPme       = simulationWork.useGpuPme;
 340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 343
 344     /* Check for polarizable models and flexible constraints */
 345     shellfc = init_shell_flexcon(fplog,
 346                                  top_global,
 347                                  constr ? constr->numFlexibleConstraints() : 0,
 348                                  ir->nstcalcenergy,
 349                                  haveDDAtomOrdering(*cr),
 350                                  useGpuForPme);
 351
 352     {
 353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
 354         if ((io > 2000) && MASTER(cr))
 355         {
 356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 357         }
 358     }
 359
 360     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
 361
 362     ForceBuffers     f(simulationWork.useMts,
 363                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 364                                ? PinningPolicy::PinnedIfSupported
 365                                : PinningPolicy::CannotBePinned);
 366     const t_mdatoms* md = mdAtoms->mdatoms();
 367     if (haveDDAtomOrdering(*cr))
 368     {
 369         // Local state only becomes valid now.
 370         dd_init_local_state(*cr->dd, state_global, state);
 371
 372         /* Distribute the charge groups over the nodes from the master node */
 373         dd_partition_system(fplog,
 374                             mdlog,
 375                             ir->init_step,
 376                             cr,
 377                             TRUE,
 378                             1,
 379                             state_global,
 380                             top_global,
 381                             *ir,
 382                             imdSession,
 383                             pull_work,
 384                             state,
 385                             &f,
 386                             mdAtoms,
 387                             top,
 388                             fr,
 389                             vsite,
 390                             constr,
 391                             nrnb,
 392                             nullptr,
 393                             FALSE);
 394         upd.updateAfterPartition(state->natoms,
 395                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 396                                              : gmx::ArrayRef<const unsigned short>(),
 397                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 398                                          : gmx::ArrayRef<const unsigned short>(),
 399                                  md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
 400                                           : gmx::ArrayRef<const unsigned short>());
 401         fr->longRangeNonbondeds->updateAfterPartition(*md);
 402     }
 403     else
 404     {
 405         state_change_natoms(state_global, state_global->natoms);
 406
 407         /* Generate and initialize new topology */
 408         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
 409
 410         upd.updateAfterPartition(state->natoms,
 411                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 412                                              : gmx::ArrayRef<const unsigned short>(),
 413                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 414                                          : gmx::ArrayRef<const unsigned short>(),
 415                                  md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
 416                                           : gmx::ArrayRef<const unsigned short>());
 417         fr->longRangeNonbondeds->updateAfterPartition(*md);
 418     }
 419
 420     std::unique_ptr<UpdateConstrainGpu> integrator;
 421
 422     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 423
 424     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 425     if (useGpuForUpdate)
 426     {
 427         GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
 428                                    || constr == nullptr || constr->numConstraintsTotal() == 0,
 429                            "Constraints in domain decomposition are only supported with update "
 430                            "groups if using GPU update.\n");
 431         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
 432                                    || constr->numConstraintsTotal() == 0,
 433                            "SHAKE is not supported with GPU update.");
 434         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 435                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 436                            "the GPU to use GPU update.\n");
 437         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
 438                            "Only the md integrator is supported with the GPU update.\n");
 439         GMX_RELEASE_ASSERT(
 440                 ir->etc != TemperatureCoupling::NoseHoover,
 441                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 442         GMX_RELEASE_ASSERT(
 443                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
 444                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
 445                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 446                 "with the GPU update.\n");
 447         GMX_RELEASE_ASSERT(!md->haveVsites,
 448                            "Virtual sites are not supported with the GPU update.\n");
 449         GMX_RELEASE_ASSERT(ed == nullptr,
 450                            "Essential dynamics is not supported with the GPU update.\n");
 451         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 452                            "Constraints pulling is not supported with the GPU update.\n");
 453         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
 454                            "Orientation restraints are not supported with the GPU update.\n");
 455         GMX_RELEASE_ASSERT(
 456                 ir->efep == FreeEnergyPerturbationType::No
 457                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
 458                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 459                 "update.");
 460
 461         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 462         {
 463             GMX_LOG(mdlog.info)
 464                     .asParagraph()
 465                     .appendText("Updating coordinates and applying constraints on the GPU.");
 466         }
 467         else
 468         {
 469             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 470         }
 471         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 472                            "Device stream manager should be initialized in order to use GPU "
 473                            "update-constraints.");
 474         GMX_RELEASE_ASSERT(
 475                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 476                 "Update stream should be initialized in order to use GPU "
 477                 "update-constraints.");
 478         integrator = std::make_unique<UpdateConstrainGpu>(
 479                 *ir,
 480                 top_global,
 481                 ekind->ngtc,
 482                 fr->deviceStreamManager->context(),
 483                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 484                 wcycle);
 485
 486         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
 487
 488         integrator->setPbc(PbcType::Xyz, state->box);
 489     }
 490
 491     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 492     {
 493         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 494     }
 495     if (useGpuForUpdate)
 496     {
 497         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 498     }
 499
 500     // NOTE: The global state is no longer used at this point.
 501     // But state_global is still used as temporary storage space for writing
 502     // the global state to file and potentially for replica exchange.
 503     // (Global topology should persist.)
 504
 505     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
 506
 507     if (ir->bExpanded)
 508     {
 509         /* Check nstexpanded here, because the grompp check was broken */
 510         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 511         {
 512             gmx_fatal(FARGS,
 513                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 514         }
 515         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 516     }
 517
 518     if (MASTER(cr))
 519     {
 520         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 521     }
 522
 523     preparePrevStepPullCom(ir,
 524                            pull_work,
 525                            gmx::arrayRefFromArray(md->massT, md->nr),
 526                            state,
 527                            state_global,
 528                            cr,
 529                            startingBehavior != StartingBehavior::NewSimulation);
 530
 531     // TODO: Remove this by converting AWH into a ForceProvider
 532     auto awh = prepareAwhModule(fplog,
 533                                 *ir,
 534                                 state_global,
 535                                 cr,
 536                                 ms,
 537                                 startingBehavior != StartingBehavior::NewSimulation,
 538                                 shellfc != nullptr,
 539                                 opt2fn("-awh", nfile, fnm),
 540                                 pull_work);
 541
 542     if (useReplicaExchange && MASTER(cr))
 543     {
 544         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
 545     }
 546     /* PME tuning is only supported in the Verlet scheme, with PME for
 547      * Coulomb. It is not supported with only LJ PME. */
 548     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 549                 && ir->cutoff_scheme != CutoffScheme::Group);
 550
 551     pme_load_balancing_t* pme_loadbal = nullptr;
 552     if (bPMETune)
 553     {
 554         pme_loadbal_init(
 555                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
 556     }
 557
 558     if (!ir->bContinuation)
 559     {
 560         if (state->flags & enumValueToBitMask(StateEntry::V))
 561         {
 562             auto v = makeArrayRef(state->v);
 563             /* Set the velocities of vsites, shells and frozen atoms to zero */
 564             for (i = 0; i < md->homenr; i++)
 565             {
 566                 if (md->ptype[i] == ParticleType::Shell)
 567                 {
 568                     clear_rvec(v[i]);
 569                 }
 570                 else if (md->cFREEZE)
 571                 {
 572                     for (m = 0; m < DIM; m++)
 573                     {
 574                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
 575                         {
 576                             v[i][m] = 0;
 577                         }
 578                     }
 579                 }
 580             }
 581         }
 582
 583         if (constr)
 584         {
 585             /* Constrain the initial coordinates and velocities */
 586             do_constrain_first(fplog,
 587                                constr,
 588                                ir,
 589                                md->nr,
 590                                md->homenr,
 591                                state->x.arrayRefWithPadding(),
 592                                state->v.arrayRefWithPadding(),
 593                                state->box,
 594                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
 595         }
 596     }
 597
 598     const int nstfep = computeFepPeriod(*ir, replExParams);
 599
 600     /* Be REALLY careful about what flags you set here. You CANNOT assume
 601      * this is the first step, since we might be restarting from a checkpoint,
 602      * and in that case we should not do any modifications to the state.
 603      */
 604     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
 605
 606     // When restarting from a checkpoint, it can be appropriate to
 607     // initialize ekind from quantities in the checkpoint. Otherwise,
 608     // compute_globals must initialize ekind before the simulation
 609     // starts/restarts. However, only the master rank knows what was
 610     // found in the checkpoint file, so we have to communicate in
 611     // order to coordinate the restart.
 612     //
 613     // TODO Consider removing this communication if/when checkpoint
 614     // reading directly follows .tpr reading, because all ranks can
 615     // agree on hasReadEkinState at that time.
 616     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 617     if (PAR(cr))
 618     {
 619         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 620     }
 621     if (hasReadEkinState)
 622     {
 623         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 624     }
 625
 626     unsigned int cglo_flags =
 627             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 628              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 629
 630     bSumEkinhOld = FALSE;
 631
 632     t_vcm vcm(top_global.groups, *ir);
 633     reportComRemovalInfo(fplog, vcm);
 634
 635     int64_t step     = ir->init_step;
 636     int64_t step_rel = 0;
 637
 638     /* To minimize communication, compute_globals computes the COM velocity
 639      * and the kinetic energy for the velocities without COM motion removed.
 640      * Thus to get the kinetic energy without the COM contribution, we need
 641      * to call compute_globals twice.
 642      */
 643     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 644     {
 645         unsigned int cglo_flags_iteration = cglo_flags;
 646         if (bStopCM && cgloIteration == 0)
 647         {
 648             cglo_flags_iteration |= CGLO_STOPCM;
 649             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 650         }
 651         compute_globals(gstat,
 652                         cr,
 653                         ir,
 654                         fr,
 655                         ekind,
 656                         makeConstArrayRef(state->x),
 657                         makeConstArrayRef(state->v),
 658                         state->box,
 659                         md,
 660                         nrnb,
 661                         &vcm,
 662                         nullptr,
 663                         enerd,
 664                         force_vir,
 665                         shake_vir,
 666                         total_vir,
 667                         pres,
 668                         &nullSignaller,
 669                         state->box,
 670                         &bSumEkinhOld,
 671                         cglo_flags_iteration,
 672                         step,
 673                         &observablesReducer);
 674         // Clean up after pre-step use of compute_globals()
 675         observablesReducer.markAsReadyToReduce();
 676
 677         if (cglo_flags_iteration & CGLO_STOPCM)
 678         {
 679             /* At initialization, do not pass x with acceleration-correction mode
 680              * to avoid (incorrect) correction of the initial coordinates.
 681              */
 682             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
 683                              ? ArrayRef<RVec>()
 684                              : makeArrayRef(state->x);
 685             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
 686             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
 687         }
 688     }
 689     if (ir->eI == IntegrationAlgorithm::VVAK)
 690     {
 691         /* a second call to get the half step temperature initialized as well */
 692         /* we do the same call as above, but turn the pressure off -- internally to
 693            compute_globals, this is recognized as a velocity verlet half-step
 694            kinetic energy calculation.  This minimized excess variables, but
 695            perhaps loses some logic?*/
 696
 697         compute_globals(gstat,
 698                         cr,
 699                         ir,
 700                         fr,
 701                         ekind,
 702                         makeConstArrayRef(state->x),
 703                         makeConstArrayRef(state->v),
 704                         state->box,
 705                         md,
 706                         nrnb,
 707                         &vcm,
 708                         nullptr,
 709                         enerd,
 710                         force_vir,
 711                         shake_vir,
 712                         total_vir,
 713                         pres,
 714                         &nullSignaller,
 715                         state->box,
 716                         &bSumEkinhOld,
 717                         cglo_flags & ~CGLO_PRESSURE,
 718                         step,
 719                         &observablesReducer);
 720         // Clean up after pre-step use of compute_globals()
 721         observablesReducer.markAsReadyToReduce();
 722     }
 723
 724     /* Calculate the initial half step temperature, and save the ekinh_old */
 725     if (startingBehavior == StartingBehavior::NewSimulation)
 726     {
 727         for (i = 0; (i < ir->opts.ngtc); i++)
 728         {
 729             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 730         }
 731     }
 732
 733     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 734        for non-trotter temperature control */
 735     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 736
 737     if (MASTER(cr))
 738     {
 739         if (!ir->bContinuation)
 740         {
 741             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
 742             {
 743                 fprintf(fplog,
 744                         "RMS relative constraint deviation after constraining: %.2e\n",
 745                         constr->rmsd());
 746             }
 747             if (EI_STATE_VELOCITY(ir->eI))
 748             {
 749                 real temp = enerd->term[F_TEMP];
 750                 if (ir->eI != IntegrationAlgorithm::VV)
 751                 {
 752                     /* Result of Ekin averaged over velocities of -half
 753                      * and +half step, while we only have -half step here.
 754                      */
 755                     temp *= 2;
 756                 }
 757                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 758             }
 759         }
 760
 761         char tbuf[20];
 762         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
 763         if (ir->nsteps >= 0)
 764         {
 765             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 766         }
 767         else
 768         {
 769             sprintf(tbuf, "%s", "infinite");
 770         }
 771         if (ir->init_step > 0)
 772         {
 773             fprintf(stderr,
 774                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 775                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
 776                     tbuf,
 777                     gmx_step_str(ir->init_step, sbuf2),
 778                     ir->init_step * ir->delta_t);
 779         }
 780         else
 781         {
 782             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 783         }
 784         fprintf(fplog, "\n");
 785     }
 786
 787     walltime_accounting_start_time(walltime_accounting);
 788     wallcycle_start(wcycle, WallCycleCounter::Run);
 789     print_start(fplog, cr, walltime_accounting, "mdrun");
 790
 791     /***********************************************************
 792      *
 793      *             Loop over MD steps
 794      *
 795      ************************************************************/
 796
 797     bFirstStep = TRUE;
 798     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 799     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 800     bSumEkinhOld     = FALSE;
 801     bExchanged       = FALSE;
 802     bNeedRepartition = FALSE;
 803
 804     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 805             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
 806             simulationsShareState,
 807             MASTER(cr),
 808             ir->nstlist,
 809             mdrunOptions.reproducible,
 810             nstSignalComm,
 811             mdrunOptions.maximumHoursToRun,
 812             ir->nstlist == 0,
 813             fplog,
 814             step,
 815             bNS,
 816             walltime_accounting);
 817
 818     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 819             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
 820             simulationsShareState,
 821             ir->nstlist == 0,
 822             MASTER(cr),
 823             mdrunOptions.writeConfout,
 824             mdrunOptions.checkpointOptions.period);
 825
 826     const bool resetCountersIsLocal = true;
 827     auto       resetHandler         = std::make_unique<ResetHandler>(
 828             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 829             !resetCountersIsLocal,
 830             ir->nsteps,
 831             MASTER(cr),
 832             mdrunOptions.timingOptions.resetHalfway,
 833             mdrunOptions.maximumHoursToRun,
 834             mdlog,
 835             wcycle,
 836             walltime_accounting);
 837
 838     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 839
 840     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 841     {
 842         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 843     }
 844
 845     /* and stop now if we should */
 846     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 847     while (!bLastStep)
 848     {
 849
 850         /* Determine if this is a neighbor search step */
 851         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 852
 853         if (bPMETune && bNStList)
 854         {
 855             // This has to be here because PME load balancing is called so early.
 856             // TODO: Move to after all booleans are defined.
 857             if (useGpuForUpdate && !bFirstStep)
 858             {
 859                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 860                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 861             }
 862             /* PME grid + cut-off optimization with GPUs or PME nodes */
 863             pme_loadbal_do(pme_loadbal,
 864                            cr,
 865                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 866                            fplog,
 867                            mdlog,
 868                            *ir,
 869                            fr,
 870                            state->box,
 871                            state->x,
 872                            wcycle,
 873                            step,
 874                            step_rel,
 875                            &bPMETunePrinting,
 876                            simulationWork.useGpuPmePpCommunication);
 877         }
 878
 879         wallcycle_start(wcycle, WallCycleCounter::Step);
 880
 881         bLastStep = (step_rel == ir->nsteps);
 882         t         = t0 + step * ir->delta_t;
 883
 884         // TODO Refactor this, so that nstfep does not need a default value of zero
 885         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
 886         {
 887             /* find and set the current lambdas */
 888             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 889
 890             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 891                            && (!bFirstStep));
 892         }
 893
 894         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 895                      && do_per_step(step, replExParams.exchangeInterval));
 896
 897         if (doSimulatedAnnealing)
 898         {
 899             // TODO: Avoid changing inputrec (#3854)
 900             // Simulated annealing updates the reference temperature.
 901             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 902             update_annealing_target_temp(nonConstInputrec, t, &upd);
 903         }
 904
 905         /* Stop Center of Mass motion */
 906         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
 907
 908         /* Determine whether or not to do Neighbour Searching */
 909         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 910
 911         /* Note that the stopHandler will cause termination at nstglobalcomm
 912          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 913          * nstpcouple steps, we have computed the half-step kinetic energy
 914          * of the previous step and can always output energies at the last step.
 915          */
 916         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 917
 918         /* do_log triggers energy and virial calculation. Because this leads
 919          * to different code paths, forces can be different. Thus for exact
 920          * continuation we should avoid extra log output.
 921          * Note that the || bLastStep can result in non-exact continuation
 922          * beyond the last step. But we don't consider that to be an issue.
 923          */
 924         do_log     = (do_per_step(step, ir->nstlog)
 925                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 926         do_verbose = mdrunOptions.verbose
 927                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 928
 929         // On search steps, when doing the update on the GPU, copy
 930         // the coordinates and velocities to the host unless they are
 931         // already there (ie on the first step and after replica
 932         // exchange).
 933         if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
 934         {
 935             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 936             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 937             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 938             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 939         }
 940
 941         // We only need to calculate virtual velocities if we are writing them in the current step
 942         const bool needVirtualVelocitiesThisStep =
 943                 (vsite != nullptr)
 944                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
 945
 946         if (vsite != nullptr)
 947         {
 948             // Virtual sites need to be updated before domain decomposition and forces are calculated
 949             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
 950             // md-vv calculates virtual velocities once it has full-step real velocities
 951             vsite->construct(state->x,
 952                              state->v,
 953                              state->box,
 954                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
 955                                      ? VSiteOperation::PositionsAndVelocities
 956                                      : VSiteOperation::Positions);
 957             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
 958         }
 959
 960         if (bNS && !(bFirstStep && ir->bContinuation))
 961         {
 962             bMasterState = FALSE;
 963             /* Correct the new box if it is too skewed */
 964             if (inputrecDynamicBox(ir))
 965             {
 966                 if (correct_box(fplog, step, state->box))
 967                 {
 968                     bMasterState = TRUE;
 969                 }
 970             }
 971             // If update is offloaded, and the box was changed either
 972             // above or in a replica exchange on the previous step,
 973             // the GPU Update object should be informed
 974             if (useGpuForUpdate && (bMasterState || bExchanged))
 975             {
 976                 integrator->setPbc(PbcType::Xyz, state->box);
 977             }
 978             if (haveDDAtomOrdering(*cr) && bMasterState)
 979             {
 980                 dd_collect_state(cr->dd, state, state_global);
 981             }
 982
 983             if (haveDDAtomOrdering(*cr))
 984             {
 985                 /* Repartition the domain decomposition */
 986                 dd_partition_system(fplog,
 987                                     mdlog,
 988                                     step,
 989                                     cr,
 990                                     bMasterState,
 991                                     nstglobalcomm,
 992                                     state_global,
 993                                     top_global,
 994                                     *ir,
 995                                     imdSession,
 996                                     pull_work,
 997                                     state,
 998                                     &f,
 999                                     mdAtoms,
1000                                     top,
1001                                     fr,
1002                                     vsite,
1003                                     constr,
1004                                     nrnb,
1005                                     wcycle,
1006                                     do_verbose && !bPMETunePrinting);
1007                 upd.updateAfterPartition(state->natoms,
1008                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1009                                                      : gmx::ArrayRef<const unsigned short>(),
1010                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1011                                                  : gmx::ArrayRef<const unsigned short>(),
1012                                          md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
1013                                                   : gmx::ArrayRef<const unsigned short>());
1014                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1015             }
1016         }
1017
1018         // Allocate or re-size GPU halo exchange object, if necessary
1019         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1020         {
1021             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1022                                "GPU device manager has to be initialized to use GPU "
1023                                "version of halo exchange.");
1024             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1025         }
1026
1027         if (MASTER(cr) && do_log)
1028         {
1029             gmx::EnergyOutput::printHeader(
1030                     fplog, step, t); /* can we improve the information printed here? */
1031         }
1032
1033         if (ir->efep != FreeEnergyPerturbationType::No)
1034         {
1035             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1036         }
1037
1038         if (bExchanged)
1039         {
1040             /* We need the kinetic energy at minus the half step for determining
1041              * the full step kinetic energy and possibly for T-coupling.*/
1042             /* This may not be quite working correctly yet . . . . */
1043             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1044             compute_globals(gstat,
1045                             cr,
1046                             ir,
1047                             fr,
1048                             ekind,
1049                             makeConstArrayRef(state->x),
1050                             makeConstArrayRef(state->v),
1051                             state->box,
1052                             md,
1053                             nrnb,
1054                             &vcm,
1055                             wcycle,
1056                             enerd,
1057                             nullptr,
1058                             nullptr,
1059                             nullptr,
1060                             nullptr,
1061                             &nullSignaller,
1062                             state->box,
1063                             &bSumEkinhOld,
1064                             cglo_flags,
1065                             step,
1066                             &observablesReducer);
1067         }
1068         clear_mat(force_vir);
1069
1070         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1071
1072         /* Determine the energy and pressure:
1073          * at nstcalcenergy steps and at energy output steps (set below).
1074          */
1075         if (EI_VV(ir->eI) && (!bInitStep))
1076         {
1077             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1078             bCalcVir      = bCalcEnerStep
1079                        || (ir->epc != PressureCoupling::No
1080                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1081         }
1082         else
1083         {
1084             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1085             bCalcVir      = bCalcEnerStep
1086                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1087         }
1088         bCalcEner = bCalcEnerStep;
1089
1090         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1091
1092         if (do_ene || do_log || bDoReplEx)
1093         {
1094             bCalcVir  = TRUE;
1095             bCalcEner = TRUE;
1096         }
1097
1098         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1099         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1100                                   && (do_per_step(step, nstfep) || bCalcEner));
1101
1102         /* Do we need global communication ? */
1103         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1104                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1105
1106         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1107                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1108                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1109         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1110         {
1111             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1112             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1113         }
1114
1115         if (shellfc)
1116         {
1117             /* Now is the time to relax the shells */
1118             relax_shell_flexcon(fplog,
1119                                 cr,
1120                                 ms,
1121                                 mdrunOptions.verbose,
1122                                 enforcedRotation,
1123                                 step,
1124                                 ir,
1125                                 imdSession,
1126                                 pull_work,
1127                                 bNS,
1128                                 force_flags,
1129                                 top,
1130                                 constr,
1131                                 enerd,
1132                                 state->natoms,
1133                                 state->x.arrayRefWithPadding(),
1134                                 state->v.arrayRefWithPadding(),
1135                                 state->box,
1136                                 state->lambda,
1137                                 &state->hist,
1138                                 &f.view(),
1139                                 force_vir,
1140                                 *md,
1141                                 fr->longRangeNonbondeds.get(),
1142                                 nrnb,
1143                                 wcycle,
1144                                 shellfc,
1145                                 fr,
1146                                 runScheduleWork,
1147                                 t,
1148                                 mu_tot,
1149                                 vsite,
1150                                 ddBalanceRegionHandler);
1151         }
1152         else
1153         {
1154             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1155                is updated (or the AWH update will be performed twice for one step when continuing).
1156                It would be best to call this update function from do_md_trajectory_writing but that
1157                would occur after do_force. One would have to divide the update_awh function into one
1158                function applying the AWH force and one doing the AWH bias update. The update AWH
1159                bias function could then be called after do_md_trajectory_writing (then containing
1160                update_awh_history). The checkpointing will in the future probably moved to the start
1161                of the md loop which will rid of this issue. */
1162             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1163             {
1164                 awh->updateHistory(state_global->awhHistory.get());
1165             }
1166
1167             /* The coordinates (x) are shifted (to get whole molecules)
1168              * in do_force.
1169              * This is parallellized as well, and does communication too.
1170              * Check comments in sim_util.c
1171              */
1172             do_force(fplog,
1173                      cr,
1174                      ms,
1175                      *ir,
1176                      awh.get(),
1177                      enforcedRotation,
1178                      imdSession,
1179                      pull_work,
1180                      step,
1181                      nrnb,
1182                      wcycle,
1183                      top,
1184                      state->box,
1185                      state->x.arrayRefWithPadding(),
1186                      &state->hist,
1187                      &f.view(),
1188                      force_vir,
1189                      md,
1190                      enerd,
1191                      state->lambda,
1192                      fr,
1193                      runScheduleWork,
1194                      vsite,
1195                      mu_tot,
1196                      t,
1197                      ed ? ed->getLegacyED() : nullptr,
1198                      fr->longRangeNonbondeds.get(),
1199                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1200                      ddBalanceRegionHandler);
1201         }
1202
1203         // VV integrators do not need the following velocity half step
1204         // if it is the first step after starting from a checkpoint.
1205         // That is, the half step is needed on all other steps, and
1206         // also the first step when starting from a .tpr file.
1207         if (EI_VV(ir->eI))
1208         {
1209             integrateVVFirstStep(step,
1210                                  bFirstStep,
1211                                  bInitStep,
1212                                  startingBehavior,
1213                                  nstglobalcomm,
1214                                  ir,
1215                                  fr,
1216                                  cr,
1217                                  state,
1218                                  mdAtoms->mdatoms(),
1219                                  &fcdata,
1220                                  &MassQ,
1221                                  &vcm,
1222                                  enerd,
1223                                  &observablesReducer,
1224                                  ekind,
1225                                  gstat,
1226                                  &last_ekin,
1227                                  bCalcVir,
1228                                  total_vir,
1229                                  shake_vir,
1230                                  force_vir,
1231                                  pres,
1232                                  M,
1233                                  do_log,
1234                                  do_ene,
1235                                  bCalcEner,
1236                                  bGStat,
1237                                  bStopCM,
1238                                  bTrotter,
1239                                  bExchanged,
1240                                  &bSumEkinhOld,
1241                                  &saved_conserved_quantity,
1242                                  &f,
1243                                  &upd,
1244                                  constr,
1245                                  &nullSignaller,
1246                                  trotter_seq,
1247                                  nrnb,
1248                                  fplog,
1249                                  wcycle);
1250             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1251             {
1252                 // Positions were calculated earlier
1253                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1254                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1255                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1256             }
1257         }
1258
1259         /* ########  END FIRST UPDATE STEP  ############## */
1260         /* ########  If doing VV, we now have v(dt) ###### */
1261         if (bDoExpanded)
1262         {
1263             /* perform extended ensemble sampling in lambda - we don't
1264                actually move to the new state before outputting
1265                statistics, but if performing simulated tempering, we
1266                do update the velocities and the tau_t. */
1267             // TODO: Avoid changing inputrec (#3854)
1268             // Simulated tempering updates the reference temperature.
1269             // Expanded ensemble without simulated tempering does not change the inputrec.
1270             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1271             lamnew                 = ExpandedEnsembleDynamics(fplog,
1272                                               nonConstInputrec,
1273                                               enerd,
1274                                               state,
1275                                               &MassQ,
1276                                               state->fep_state,
1277                                               state->dfhist,
1278                                               step,
1279                                               state->v.rvec_array(),
1280                                               md->homenr,
1281                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1282                                                                       : gmx::ArrayRef<const unsigned short>());
1283             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1284             if (MASTER(cr))
1285             {
1286                 copy_df_history(state_global->dfhist, state->dfhist);
1287             }
1288         }
1289
1290         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1291         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1292         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1293             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1294                 || checkpointHandler->isCheckpointingStep()))
1295         {
1296             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1297             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1298         }
1299         // Copy velocities if needed for the output/checkpointing.
1300         // NOTE: Copy on the search steps is done at the beginning of the step.
1301         if (useGpuForUpdate && !bNS
1302             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1303         {
1304             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1305             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1306         }
1307         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1308         // and update is offloaded hence forces are kept on the GPU for update and have not been
1309         // already transferred in do_force().
1310         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1311         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1312         //       prior to GPU update.
1313         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1314         //       copy call in do_force(...).
1315         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1316         //       on host after the D2H copy in do_force(...).
1317         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1318             && do_per_step(step, ir->nstfout))
1319         {
1320             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1321             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1322         }
1323         /* Now we have the energies and forces corresponding to the
1324          * coordinates at time t. We must output all of this before
1325          * the update.
1326          */
1327         do_md_trajectory_writing(fplog,
1328                                  cr,
1329                                  nfile,
1330                                  fnm,
1331                                  step,
1332                                  step_rel,
1333                                  t,
1334                                  ir,
1335                                  state,
1336                                  state_global,
1337                                  observablesHistory,
1338                                  top_global,
1339                                  fr,
1340                                  outf,
1341                                  energyOutput,
1342                                  ekind,
1343                                  f.view().force(),
1344                                  checkpointHandler->isCheckpointingStep(),
1345                                  bRerunMD,
1346                                  bLastStep,
1347                                  mdrunOptions.writeConfout,
1348                                  bSumEkinhOld);
1349         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1350         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1351
1352         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1353         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1354             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1355         {
1356             copy_mat(state->svir_prev, shake_vir);
1357             copy_mat(state->fvir_prev, force_vir);
1358         }
1359
1360         stopHandler->setSignal();
1361         resetHandler->setSignal(walltime_accounting);
1362
1363         if (bGStat || !PAR(cr))
1364         {
1365             /* In parallel we only have to check for checkpointing in steps
1366              * where we do global communication,
1367              *  otherwise the other nodes don't know.
1368              */
1369             checkpointHandler->setSignal(walltime_accounting);
1370         }
1371
1372         /* #########   START SECOND UPDATE STEP ################# */
1373
1374         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1375            controlled in preprocessing */
1376
1377         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1378         {
1379             gmx_bool bIfRandomize;
1380             bIfRandomize = update_randomize_velocities(ir,
1381                                                        step,
1382                                                        cr,
1383                                                        md->homenr,
1384                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1385                                                                : gmx::ArrayRef<const unsigned short>(),
1386                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1387                                                        state->v,
1388                                                        &upd,
1389                                                        constr);
1390             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1391             if (constr && bIfRandomize)
1392             {
1393                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1394             }
1395         }
1396         /* Box is changed in update() when we do pressure coupling,
1397          * but we should still use the old box for energy corrections and when
1398          * writing it to the energy file, so it matches the trajectory files for
1399          * the same timestep above. Make a copy in a separate array.
1400          */
1401         copy_mat(state->box, lastbox);
1402
1403         dvdl_constr = 0;
1404
1405         if (!useGpuForUpdate)
1406         {
1407             wallcycle_start(wcycle, WallCycleCounter::Update);
1408         }
1409         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1410         if (bTrotter)
1411         {
1412             trotter_update(ir,
1413                            step,
1414                            ekind,
1415                            enerd,
1416                            state,
1417                            total_vir,
1418                            md->homenr,
1419                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1420                                    : gmx::ArrayRef<const unsigned short>(),
1421                            gmx::arrayRefFromArray(md->invmass, md->nr),
1422                            &MassQ,
1423                            trotter_seq,
1424                            TrotterSequence::Three);
1425             /* We can only do Berendsen coupling after we have summed
1426              * the kinetic energy or virial. Since the happens
1427              * in global_state after update, we should only do it at
1428              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1429              */
1430         }
1431         else
1432         {
1433             update_tcouple(step,
1434                            ir,
1435                            state,
1436                            ekind,
1437                            &MassQ,
1438                            md->homenr,
1439                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1440                                    : gmx::ArrayRef<const unsigned short>());
1441             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1442         }
1443
1444         /* With leap-frog type integrators we compute the kinetic energy
1445          * at a whole time step as the average of the half-time step kinetic
1446          * energies of two subsequent steps. Therefore we need to compute the
1447          * half step kinetic energy also if we need energies at the next step.
1448          */
1449         const bool needHalfStepKineticEnergy =
1450                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1451
1452         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1453         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1454         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1455                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1456
1457         if (EI_VV(ir->eI))
1458         {
1459             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1460
1461             integrateVVSecondStep(step,
1462                                   ir,
1463                                   fr,
1464                                   cr,
1465                                   state,
1466                                   mdAtoms->mdatoms(),
1467                                   &fcdata,
1468                                   &MassQ,
1469                                   &vcm,
1470                                   pull_work,
1471                                   enerd,
1472                                   &observablesReducer,
1473                                   ekind,
1474                                   gstat,
1475                                   &dvdl_constr,
1476                                   bCalcVir,
1477                                   total_vir,
1478                                   shake_vir,
1479                                   force_vir,
1480                                   pres,
1481                                   M,
1482                                   lastbox,
1483                                   do_log,
1484                                   do_ene,
1485                                   bGStat,
1486                                   &bSumEkinhOld,
1487                                   &f,
1488                                   &cbuf,
1489                                   &upd,
1490                                   constr,
1491                                   &nullSignaller,
1492                                   trotter_seq,
1493                                   nrnb,
1494                                   wcycle);
1495         }
1496         else
1497         {
1498             if (useGpuForUpdate)
1499             {
1500                 // On search steps, update handles to device vectors
1501                 if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
1502                 {
1503                     integrator->set(stateGpu->getCoordinates(),
1504                                     stateGpu->getVelocities(),
1505                                     stateGpu->getForces(),
1506                                     top->idef,
1507                                     *md);
1508
1509                     // Copy data to the GPU after buffers might have being reinitialized
1510                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1511                      * the previous step. We don't check that now. */
1512                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1513                     if (bExchanged
1514                         || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1515                             && !runScheduleWork->stepWork.useGpuXBufferOps))
1516                     {
1517                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1518                     }
1519                 }
1520
1521                 if ((simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1522                     || (!runScheduleWork->stepWork.useGpuFBufferOps))
1523                 {
1524                     // The PME forces were recieved to the host, and reduced on the CPU with the
1525                     // rest of the forces computed on the GPU, so the final forces have to be copied
1526                     // back to the GPU. Or the buffer ops were not offloaded this step, so the
1527                     // forces are on the host and have to be copied
1528                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1529                 }
1530                 const bool doTemperatureScaling =
1531                         (ir->etc != TemperatureCoupling::No
1532                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1533
1534                 // This applies Leap-Frog, LINCS and SETTLE in succession
1535                 integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
1536                                               runScheduleWork->stepWork, runScheduleWork->simulationWork),
1537                                       ir->delta_t,
1538                                       true,
1539                                       bCalcVir,
1540                                       shake_vir,
1541                                       doTemperatureScaling,
1542                                       ekind->tcstat,
1543                                       doParrinelloRahman,
1544                                       ir->nstpcouple * ir->delta_t,
1545                                       M);
1546             }
1547             else
1548             {
1549                 /* With multiple time stepping we need to do an additional normal
1550                  * update step to obtain the virial, as the actual MTS integration
1551                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1552                  * Using that acceleration would result in a virial with the slow
1553                  * force contribution would be a factor mtsFactor too large.
1554                  */
1555                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1556                 {
1557                     upd.update_for_constraint_virial(*ir,
1558                                                      md->homenr,
1559                                                      md->havePartiallyFrozenAtoms,
1560                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1561                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1562                                                      *state,
1563                                                      f.view().forceWithPadding(),
1564                                                      *ekind);
1565
1566                     constrain_coordinates(constr,
1567                                           do_log,
1568                                           do_ene,
1569                                           step,
1570                                           state,
1571                                           upd.xp()->arrayRefWithPadding(),
1572                                           &dvdl_constr,
1573                                           bCalcVir,
1574                                           shake_vir);
1575                 }
1576
1577                 ArrayRefWithPadding<const RVec> forceCombined =
1578                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1579                                 ? f.view().forceMtsCombinedWithPadding()
1580                                 : f.view().forceWithPadding();
1581                 upd.update_coords(*ir,
1582                                   step,
1583                                   md->homenr,
1584                                   md->havePartiallyFrozenAtoms,
1585                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1586                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1587                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1588                                   state,
1589                                   forceCombined,
1590                                   &fcdata,
1591                                   ekind,
1592                                   M,
1593                                   etrtPOSITION,
1594                                   cr,
1595                                   constr != nullptr);
1596
1597                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1598
1599                 constrain_coordinates(constr,
1600                                       do_log,
1601                                       do_ene,
1602                                       step,
1603                                       state,
1604                                       upd.xp()->arrayRefWithPadding(),
1605                                       &dvdl_constr,
1606                                       bCalcVir && !simulationWork.useMts,
1607                                       shake_vir);
1608
1609                 upd.update_sd_second_half(*ir,
1610                                           step,
1611                                           &dvdl_constr,
1612                                           md->homenr,
1613                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1614                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1615                                           state,
1616                                           cr,
1617                                           nrnb,
1618                                           wcycle,
1619                                           constr,
1620                                           do_log,
1621                                           do_ene);
1622                 upd.finish_update(
1623                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1624             }
1625
1626             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1627             {
1628                 updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
1629             }
1630
1631             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1632         }
1633
1634         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1635         /* With Leap-Frog we can skip compute_globals at
1636          * non-communication steps, but we need to calculate
1637          * the kinetic energy one step before communication.
1638          */
1639         {
1640             // Organize to do inter-simulation signalling on steps if
1641             // and when algorithms require it.
1642             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1643
1644             if (useGpuForUpdate)
1645             {
1646                 const bool coordinatesRequiredForStopCM =
1647                         bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1648                         && !EI_VV(ir->eI);
1649
1650                 // Copy coordinates when needed to stop the CM motion or for replica exchange
1651                 if (coordinatesRequiredForStopCM || bDoReplEx)
1652                 {
1653                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1654                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1655                 }
1656
1657                 // Copy velocities back to the host if:
1658                 // - Globals are computed this step (includes the energy output steps).
1659                 // - Temperature is needed for the next step.
1660                 // - This is a replica exchange step (even though we will only need
1661                 //     the velocities if an exchange succeeds)
1662                 if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
1663                 {
1664                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1665                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1666                 }
1667             }
1668
1669             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1670             {
1671                 // Since we're already communicating at this step, we
1672                 // can propagate intra-simulation signals. Note that
1673                 // check_nstglobalcomm has the responsibility for
1674                 // choosing the value of nstglobalcomm that is one way
1675                 // bGStat becomes true, so we can't get into a
1676                 // situation where e.g. checkpointing can't be
1677                 // signalled.
1678                 bool                doIntraSimSignal = true;
1679                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1680
1681                 compute_globals(gstat,
1682                                 cr,
1683                                 ir,
1684                                 fr,
1685                                 ekind,
1686                                 makeConstArrayRef(state->x),
1687                                 makeConstArrayRef(state->v),
1688                                 state->box,
1689                                 md,
1690                                 nrnb,
1691                                 &vcm,
1692                                 wcycle,
1693                                 enerd,
1694                                 force_vir,
1695                                 shake_vir,
1696                                 total_vir,
1697                                 pres,
1698                                 &signaller,
1699                                 lastbox,
1700                                 &bSumEkinhOld,
1701                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1702                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1703                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1704                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1705                                 step,
1706                                 &observablesReducer);
1707                 if (!EI_VV(ir->eI) && bStopCM)
1708                 {
1709                     process_and_stopcm_grp(
1710                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1711                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1712
1713                     // TODO: The special case of removing CM motion should be dealt more gracefully
1714                     if (useGpuForUpdate)
1715                     {
1716                         // Issue #3988, #4106.
1717                         stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
1718                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1719                         // Here we block until the H2D copy completes because event sync with the
1720                         // force kernels that use the coordinates on the next steps is not implemented
1721                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1722                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1723                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1724                         if (vcm.mode != ComRemovalAlgorithm::No)
1725                         {
1726                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1727                         }
1728                     }
1729                 }
1730             }
1731         }
1732
1733         /* #############  END CALC EKIN AND PRESSURE ################# */
1734
1735         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1736            the virial that should probably be addressed eventually. state->veta has better properies,
1737            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1738            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1739
1740         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1741         {
1742             /* Sum up the foreign energy and dK/dl terms for md and sd.
1743                Currently done every step so that dH/dl is correct in the .edr */
1744             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1745         }
1746
1747         bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
1748         update_pcouple_after_coordinates(fplog,
1749                                          step,
1750                                          ir,
1751                                          md->homenr,
1752                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1753                                                      : gmx::ArrayRef<const unsigned short>(),
1754                                          pres,
1755                                          force_vir,
1756                                          shake_vir,
1757                                          pressureCouplingMu,
1758                                          state,
1759                                          nrnb,
1760                                          upd.deform(),
1761                                          scaleCoordinates);
1762
1763         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1764                                                   && do_per_step(step, inputrec->nstpcouple));
1765         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1766                                                  && do_per_step(step, inputrec->nstpcouple));
1767         if (useGpuForUpdate
1768             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1769         {
1770             integrator->scaleCoordinates(pressureCouplingMu);
1771             if (doCRescalePressureCoupling)
1772             {
1773                 matrix pressureCouplingInvMu;
1774                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1775                 integrator->scaleVelocities(pressureCouplingInvMu);
1776             }
1777             integrator->setPbc(PbcType::Xyz, state->box);
1778         }
1779
1780         /* ################# END UPDATE STEP 2 ################# */
1781         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1782
1783         /* The coordinates (x) were unshifted in update */
1784         if (!bGStat)
1785         {
1786             /* We will not sum ekinh_old,
1787              * so signal that we still have to do it.
1788              */
1789             bSumEkinhOld = TRUE;
1790         }
1791
1792         if (bCalcEner)
1793         {
1794             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1795
1796             /* use the directly determined last velocity, not actually the averaged half steps */
1797             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1798             {
1799                 enerd->term[F_EKIN] = last_ekin;
1800             }
1801             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1802
1803             if (integratorHasConservedEnergyQuantity(ir))
1804             {
1805                 if (EI_VV(ir->eI))
1806                 {
1807                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1808                 }
1809                 else
1810                 {
1811                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1812                 }
1813             }
1814             /* #########  END PREPARING EDR OUTPUT  ###########  */
1815         }
1816
1817         /* Output stuff */
1818         if (MASTER(cr))
1819         {
1820             if (fplog && do_log && bDoExpanded)
1821             {
1822                 /* only needed if doing expanded ensemble */
1823                 PrintFreeEnergyInfoToFile(fplog,
1824                                           ir->fepvals.get(),
1825                                           ir->expandedvals.get(),
1826                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1827                                           state_global->dfhist,
1828                                           state->fep_state,
1829                                           ir->nstlog,
1830                                           step);
1831             }
1832             if (bCalcEner)
1833             {
1834                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1835
1836                 energyOutput.addDataAtEnergyStep(outputDHDL,
1837                                                  bCalcEnerStep,
1838                                                  t,
1839                                                  md->tmass,
1840                                                  enerd,
1841                                                  ir->fepvals.get(),
1842                                                  lastbox,
1843                                                  PTCouplingArrays{ state->boxv,
1844                                                                    state->nosehoover_xi,
1845                                                                    state->nosehoover_vxi,
1846                                                                    state->nhpres_xi,
1847                                                                    state->nhpres_vxi },
1848                                                  state->fep_state,
1849                                                  total_vir,
1850                                                  pres,
1851                                                  ekind,
1852                                                  mu_tot,
1853                                                  constr);
1854             }
1855             else
1856             {
1857                 energyOutput.recordNonEnergyStep();
1858             }
1859
1860             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1861             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1862
1863             if (doSimulatedAnnealing)
1864             {
1865                 gmx::EnergyOutput::printAnnealingTemperatures(
1866                         do_log ? fplog : nullptr, groups, &(ir->opts));
1867             }
1868             if (do_log || do_ene || do_dr || do_or)
1869             {
1870                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1871                                                    do_ene,
1872                                                    do_dr,
1873                                                    do_or,
1874                                                    do_log ? fplog : nullptr,
1875                                                    step,
1876                                                    t,
1877                                                    fr->fcdata.get(),
1878                                                    awh.get());
1879             }
1880             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1881             {
1882                 const bool isInitialOutput = false;
1883                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1884             }
1885
1886             if (ir->bPull)
1887             {
1888                 pull_print_output(pull_work, step, t);
1889             }
1890
1891             if (do_per_step(step, ir->nstlog))
1892             {
1893                 if (fflush(fplog) != 0)
1894                 {
1895                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1896                 }
1897             }
1898         }
1899         if (bDoExpanded)
1900         {
1901             /* Have to do this part _after_ outputting the logfile and the edr file */
1902             /* Gets written into the state at the beginning of next loop*/
1903             state->fep_state = lamnew;
1904         }
1905         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1906         {
1907             state->fep_state = awh->fepLambdaState();
1908         }
1909         /* Print the remaining wall clock time for the run */
1910         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1911         {
1912             if (shellfc)
1913             {
1914                 fprintf(stderr, "\n");
1915             }
1916             print_time(stderr, walltime_accounting, step, ir, cr);
1917         }
1918
1919         /* Ion/water position swapping.
1920          * Not done in last step since trajectory writing happens before this call
1921          * in the MD loop and exchanges would be lost anyway. */
1922         bNeedRepartition = FALSE;
1923         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1924             && do_per_step(step, ir->swap->nstswap))
1925         {
1926             bNeedRepartition = do_swapcoords(cr,
1927                                              step,
1928                                              t,
1929                                              ir,
1930                                              swap,
1931                                              wcycle,
1932                                              as_rvec_array(state->x.data()),
1933                                              state->box,
1934                                              MASTER(cr) && mdrunOptions.verbose,
1935                                              bRerunMD);
1936
1937             if (bNeedRepartition && haveDDAtomOrdering(*cr))
1938             {
1939                 dd_collect_state(cr->dd, state, state_global);
1940             }
1941         }
1942
1943         /* Replica exchange */
1944         bExchanged = FALSE;
1945         if (bDoReplEx)
1946         {
1947             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1948         }
1949
1950         if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
1951         {
1952             dd_partition_system(fplog,
1953                                 mdlog,
1954                                 step,
1955                                 cr,
1956                                 TRUE,
1957                                 1,
1958                                 state_global,
1959                                 top_global,
1960                                 *ir,
1961                                 imdSession,
1962                                 pull_work,
1963                                 state,
1964                                 &f,
1965                                 mdAtoms,
1966                                 top,
1967                                 fr,
1968                                 vsite,
1969                                 constr,
1970                                 nrnb,
1971                                 wcycle,
1972                                 FALSE);
1973             upd.updateAfterPartition(state->natoms,
1974                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1975                                                  : gmx::ArrayRef<const unsigned short>(),
1976                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1977                                              : gmx::ArrayRef<const unsigned short>(),
1978                                      md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
1979                                               : gmx::ArrayRef<const unsigned short>());
1980             fr->longRangeNonbondeds->updateAfterPartition(*md);
1981         }
1982
1983         bFirstStep = FALSE;
1984         bInitStep  = FALSE;
1985
1986         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1987         /* With all integrators, except VV, we need to retain the pressure
1988          * at the current step for coupling at the next step.
1989          */
1990         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1991             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1992         {
1993             /* Store the pressure in t_state for pressure coupling
1994              * at the next MD step.
1995              */
1996             copy_mat(pres, state->pres_prev);
1997         }
1998
1999         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2000
2001         if ((membed != nullptr) && (!bLastStep))
2002         {
2003             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2004         }
2005
2006         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2007         if (haveDDAtomOrdering(*cr) && wcycle)
2008         {
2009             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2010         }
2011
2012         /* increase the MD step number */
2013         step++;
2014         step_rel++;
2015         observablesReducer.markAsReadyToReduce();
2016
2017 #if GMX_FAHCORE
2018         if (MASTER(cr))
2019         {
2020             fcReportProgress(ir->nsteps + ir->init_step, step);
2021         }
2022 #endif
2023
2024         resetHandler->resetCounters(
2025                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2026
2027         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2028         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2029     }
2030     /* End of main MD loop */
2031
2032     /* Closing TNG files can include compressing data. Therefore it is good to do that
2033      * before stopping the time measurements. */
2034     mdoutf_tng_close(outf);
2035
2036     /* Stop measuring walltime */
2037     walltime_accounting_end_time(walltime_accounting);
2038
2039     if (simulationWork.haveSeparatePmeRank)
2040     {
2041         /* Tell the PME only node to finish */
2042         gmx_pme_send_finish(cr);
2043     }
2044
2045     if (MASTER(cr))
2046     {
2047         if (ir->nstcalcenergy > 0)
2048         {
2049             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2050
2051             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2052             energyOutput.printAverages(fplog, groups);
2053         }
2054     }
2055     done_mdoutf(outf);
2056
2057     if (bPMETune)
2058     {
2059         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2060     }
2061
2062     done_shellfc(fplog, shellfc, step_rel);
2063
2064     if (useReplicaExchange && MASTER(cr))
2065     {
2066         print_replica_exchange_statistics(fplog, repl_ex);
2067     }
2068
2069     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2070
2071     global_stat_destroy(gstat);
2072 }