src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/applied_forces/awh/read_params.h"
  57 #include "gromacs/commandline/filenm.h"
  58 #include "gromacs/domdec/collect.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localtopologychecker.h"
  65 #include "gromacs/domdec/mdsetup.h"
  66 #include "gromacs/domdec/partition.h"
  67 #include "gromacs/essentialdynamics/edsam.h"
  68 #include "gromacs/ewald/pme_load_balancing.h"
  69 #include "gromacs/ewald/pme_pp.h"
  70 #include "gromacs/fileio/trxio.h"
  71 #include "gromacs/gmxlib/network.h"
  72 #include "gromacs/gmxlib/nrnb.h"
  73 #include "gromacs/gpu_utils/device_stream_manager.h"
  74 #include "gromacs/gpu_utils/gpu_utils.h"
  75 #include "gromacs/imd/imd.h"
  76 #include "gromacs/listed_forces/listed_forces.h"
  77 #include "gromacs/math/functions.h"
  78 #include "gromacs/math/invertmatrix.h"
  79 #include "gromacs/math/vec.h"
  80 #include "gromacs/math/vectypes.h"
  81 #include "gromacs/mdlib/checkpointhandler.h"
  82 #include "gromacs/mdlib/compute_io.h"
  83 #include "gromacs/mdlib/constr.h"
  84 #include "gromacs/mdlib/coupling.h"
  85 #include "gromacs/mdlib/ebin.h"
  86 #include "gromacs/mdlib/enerdata_utils.h"
  87 #include "gromacs/mdlib/energyoutput.h"
  88 #include "gromacs/mdlib/expanded.h"
  89 #include "gromacs/mdlib/force.h"
  90 #include "gromacs/mdlib/force_flags.h"
  91 #include "gromacs/mdlib/forcerec.h"
  92 #include "gromacs/mdlib/freeenergyparameters.h"
  93 #include "gromacs/mdlib/md_support.h"
  94 #include "gromacs/mdlib/mdatoms.h"
  95 #include "gromacs/mdlib/mdoutf.h"
  96 #include "gromacs/mdlib/membed.h"
  97 #include "gromacs/mdlib/resethandler.h"
  98 #include "gromacs/mdlib/sighandler.h"
  99 #include "gromacs/mdlib/simulationsignal.h"
 100 #include "gromacs/mdlib/stat.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdlib/tgroup.h"
 103 #include "gromacs/mdlib/trajectory_writing.h"
 104 #include "gromacs/mdlib/update.h"
 105 #include "gromacs/mdlib/update_constrain_gpu.h"
 106 #include "gromacs/mdlib/update_vv.h"
 107 #include "gromacs/mdlib/vcm.h"
 108 #include "gromacs/mdlib/vsite.h"
 109 #include "gromacs/mdrunutility/freeenergy.h"
 110 #include "gromacs/mdrunutility/handlerestart.h"
 111 #include "gromacs/mdrunutility/multisim.h"
 112 #include "gromacs/mdrunutility/printtime.h"
 113 #include "gromacs/mdtypes/awh_history.h"
 114 #include "gromacs/mdtypes/awh_params.h"
 115 #include "gromacs/mdtypes/commrec.h"
 116 #include "gromacs/mdtypes/df_history.h"
 117 #include "gromacs/mdtypes/energyhistory.h"
 118 #include "gromacs/mdtypes/fcdata.h"
 119 #include "gromacs/mdtypes/forcebuffers.h"
 120 #include "gromacs/mdtypes/forcerec.h"
 121 #include "gromacs/mdtypes/group.h"
 122 #include "gromacs/mdtypes/inputrec.h"
 123 #include "gromacs/mdtypes/interaction_const.h"
 124 #include "gromacs/mdtypes/md_enums.h"
 125 #include "gromacs/mdtypes/mdatom.h"
 126 #include "gromacs/mdtypes/mdrunoptions.h"
 127 #include "gromacs/mdtypes/multipletimestepping.h"
 128 #include "gromacs/mdtypes/observableshistory.h"
 129 #include "gromacs/mdtypes/observablesreducer.h"
 130 #include "gromacs/mdtypes/pullhistory.h"
 131 #include "gromacs/mdtypes/simulation_workload.h"
 132 #include "gromacs/mdtypes/state.h"
 133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 134 #include "gromacs/modularsimulator/energydata.h"
 135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 136 #include "gromacs/nbnxm/nbnxm.h"
 137 #include "gromacs/pbcutil/pbc.h"
 138 #include "gromacs/pulling/output.h"
 139 #include "gromacs/pulling/pull.h"
 140 #include "gromacs/swap/swapcoords.h"
 141 #include "gromacs/timing/wallcycle.h"
 142 #include "gromacs/timing/walltime_accounting.h"
 143 #include "gromacs/topology/atoms.h"
 144 #include "gromacs/topology/idef.h"
 145 #include "gromacs/topology/mtop_util.h"
 146 #include "gromacs/topology/topology.h"
 147 #include "gromacs/trajectory/trajectoryframe.h"
 148 #include "gromacs/utility/basedefinitions.h"
 149 #include "gromacs/utility/cstringutil.h"
 150 #include "gromacs/utility/fatalerror.h"
 151 #include "gromacs/utility/logger.h"
 152 #include "gromacs/utility/real.h"
 153 #include "gromacs/utility/smalloc.h"
 154
 155 #include "legacysimulator.h"
 156 #include "replicaexchange.h"
 157 #include "shellfc.h"
 158
 159 using gmx::SimulationSignaller;
 160
 161 void gmx::LegacySimulator::do_md()
 162 {
 163     // TODO Historically, the EM and MD "integrators" used different
 164     // names for the t_inputrec *parameter, but these must have the
 165     // same name, now that it's a member of a struct. We use this ir
 166     // alias to avoid a large ripple of nearly useless changes.
 167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 168     // will go away eventually.
 169     const t_inputrec* ir = inputrec;
 170
 171     double       t, t0 = ir->init_t;
 172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 174     gmx_bool     bDoExpanded = FALSE;
 175     gmx_bool     do_ene, do_log, do_verbose;
 176     gmx_bool     bMasterState;
 177     unsigned int force_flags;
 178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 179     int    i, m;
 180     rvec   mu_tot;
 181     matrix pressureCouplingMu, M;
 182     gmx_repl_ex_t     repl_ex = nullptr;
 183     gmx_global_stat_t gstat;
 184     gmx_shellfc_t*    shellfc;
 185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 186     gmx_bool          bTrotter;
 187     real              dvdl_constr;
 188     std::vector<RVec> cbuf;
 189     matrix            lastbox;
 190     int               lamnew = 0;
 191     /* for FEP */
 192     double    cycles;
 193     real      saved_conserved_quantity = 0;
 194     real      last_ekin                = 0;
 195     t_extmass MassQ;
 196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 197
 198     /* PME load balancing data for GPU kernels */
 199     gmx_bool bPMETune         = FALSE;
 200     gmx_bool bPMETunePrinting = FALSE;
 201
 202     bool bInteractiveMDstep = false;
 203
 204     SimulationSignals signals;
 205     // Most global communnication stages don't propagate mdrun
 206     // signals, and will use this object to achieve that.
 207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 208
 209     if (!mdrunOptions.writeConfout)
 210     {
 211         // This is on by default, and the main known use case for
 212         // turning it off is for convenience in benchmarking, which is
 213         // something that should not show up in the general user
 214         // interface.
 215         GMX_LOG(mdlog.info)
 216                 .asParagraph()
 217                 .appendText(
 218                         "The -noconfout functionality is deprecated, and may be removed in a "
 219                         "future version.");
 220     }
 221
 222     /* md-vv uses averaged full step velocities for T-control
 223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 225     bTrotter = (EI_VV(ir->eI)
 226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 227
 228     const bool bRerunMD = false;
 229
 230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 231     bGStatEveryStep   = (nstglobalcomm == 1);
 232
 233     const SimulationGroups* groups = &top_global.groups;
 234
 235     std::unique_ptr<EssentialDynamics> ed = nullptr;
 236     if (opt2bSet("-ei", nfile, fnm))
 237     {
 238         /* Initialize essential dynamics sampling */
 239         ed = init_edsam(mdlog,
 240                         opt2fn_null("-ei", nfile, fnm),
 241                         opt2fn("-eo", nfile, fnm),
 242                         top_global,
 243                         *ir,
 244                         cr,
 245                         constr,
 246                         state_global,
 247                         observablesHistory,
 248                         oenv,
 249                         startingBehavior);
 250     }
 251     else if (observablesHistory->edsamHistory)
 252     {
 253         gmx_fatal(FARGS,
 254                   "The checkpoint is from a run with essential dynamics sampling, "
 255                   "but the current run did not specify the -ei option. "
 256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 257     }
 258
 259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 261     initialize_lambdas(fplog,
 262                        ir->efep,
 263                        ir->bSimTemp,
 264                        *ir->fepvals,
 265                        ir->simtempvals->temperatures,
 266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
 267                        MASTER(cr),
 268                        fep_state,
 269                        lambda);
 270     Update upd(*ir, deform);
 271     bool   doSimulatedAnnealing = false;
 272     {
 273         // TODO: Avoid changing inputrec (#3854)
 274         // Simulated annealing updates the reference temperature.
 275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
 277     }
 278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 279
 280     t_fcdata& fcdata = *fr->fcdata;
 281
 282     bool simulationsShareState = false;
 283     int  nstSignalComm         = nstglobalcomm;
 284     {
 285         // TODO This implementation of ensemble orientation restraints is nasty because
 286         // a user can't just do multi-sim with single-sim orientation restraints.
 287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
 288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
 289
 290         // Replica exchange, ensemble restraints and AWH need all
 291         // simulations to remain synchronized, so they need
 292         // checkpoints and stop conditions to act on the same step, so
 293         // the propagation of such signals must take place between
 294         // simulations, not just within simulations.
 295         // TODO: Make algorithm initializers set these flags.
 296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 297
 298         if (simulationsShareState)
 299         {
 300             // Inter-simulation signal communication does not need to happen
 301             // often, so we use a minimum of 200 steps to reduce overhead.
 302             const int c_minimumInterSimulationSignallingInterval = 200;
 303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 304                             * nstglobalcomm;
 305         }
 306     }
 307
 308     if (startingBehavior != StartingBehavior::RestartWithAppending)
 309     {
 310         pleaseCiteCouplingAlgorithms(fplog, *ir);
 311     }
 312     gmx_mdoutf*       outf = init_mdoutf(fplog,
 313                                    nfile,
 314                                    fnm,
 315                                    mdrunOptions,
 316                                    cr,
 317                                    outputProvider,
 318                                    mdModulesNotifiers,
 319                                    ir,
 320                                    top_global,
 321                                    oenv,
 322                                    wcycle,
 323                                    startingBehavior,
 324                                    simulationsShareState,
 325                                    ms);
 326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
 327                                    top_global,
 328                                    *ir,
 329                                    pull_work,
 330                                    mdoutf_get_fp_dhdl(outf),
 331                                    false,
 332                                    startingBehavior,
 333                                    simulationsShareState,
 334                                    mdModulesNotifiers);
 335
 336     gstat = global_stat_init(ir);
 337
 338     const auto& simulationWork     = runScheduleWork->simulationWork;
 339     const bool  useGpuForPme       = simulationWork.useGpuPme;
 340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 343
 344     /* Check for polarizable models and flexible constraints */
 345     shellfc = init_shell_flexcon(fplog,
 346                                  top_global,
 347                                  constr ? constr->numFlexibleConstraints() : 0,
 348                                  ir->nstcalcenergy,
 349                                  haveDDAtomOrdering(*cr),
 350                                  useGpuForPme);
 351
 352     {
 353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
 354         if ((io > 2000) && MASTER(cr))
 355         {
 356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 357         }
 358     }
 359
 360     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
 361
 362     ForceBuffers     f(simulationWork.useMts,
 363                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 364                                ? PinningPolicy::PinnedIfSupported
 365                                : PinningPolicy::CannotBePinned);
 366     const t_mdatoms* md = mdAtoms->mdatoms();
 367     if (haveDDAtomOrdering(*cr))
 368     {
 369         // Local state only becomes valid now.
 370         dd_init_local_state(*cr->dd, state_global, state);
 371
 372         /* Distribute the charge groups over the nodes from the master node */
 373         dd_partition_system(fplog,
 374                             mdlog,
 375                             ir->init_step,
 376                             cr,
 377                             TRUE,
 378                             1,
 379                             state_global,
 380                             top_global,
 381                             *ir,
 382                             imdSession,
 383                             pull_work,
 384                             state,
 385                             &f,
 386                             mdAtoms,
 387                             top,
 388                             fr,
 389                             vsite,
 390                             constr,
 391                             nrnb,
 392                             nullptr,
 393                             FALSE);
 394         upd.updateAfterPartition(state->natoms,
 395                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 396                                              : gmx::ArrayRef<const unsigned short>(),
 397                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 398                                          : gmx::ArrayRef<const unsigned short>());
 399         fr->longRangeNonbondeds->updateAfterPartition(*md);
 400     }
 401     else
 402     {
 403         state_change_natoms(state_global, state_global->natoms);
 404
 405         /* Generate and initialize new topology */
 406         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
 407
 408         upd.updateAfterPartition(state->natoms,
 409                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
 410                                              : gmx::ArrayRef<const unsigned short>(),
 411                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
 412                                          : gmx::ArrayRef<const unsigned short>());
 413         fr->longRangeNonbondeds->updateAfterPartition(*md);
 414     }
 415
 416     std::unique_ptr<UpdateConstrainGpu> integrator;
 417
 418     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 419
 420     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 421     if (useGpuForUpdate)
 422     {
 423         GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
 424                                    || constr == nullptr || constr->numConstraintsTotal() == 0,
 425                            "Constraints in domain decomposition are only supported with update "
 426                            "groups if using GPU update.\n");
 427         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
 428                                    || constr->numConstraintsTotal() == 0,
 429                            "SHAKE is not supported with GPU update.");
 430         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 431                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 432                            "the GPU to use GPU update.\n");
 433         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
 434                            "Only the md integrator is supported with the GPU update.\n");
 435         GMX_RELEASE_ASSERT(
 436                 ir->etc != TemperatureCoupling::NoseHoover,
 437                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 438         GMX_RELEASE_ASSERT(
 439                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
 440                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
 441                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 442                 "with the GPU update.\n");
 443         GMX_RELEASE_ASSERT(!md->haveVsites,
 444                            "Virtual sites are not supported with the GPU update.\n");
 445         GMX_RELEASE_ASSERT(ed == nullptr,
 446                            "Essential dynamics is not supported with the GPU update.\n");
 447         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 448                            "Constraints pulling is not supported with the GPU update.\n");
 449         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
 450                            "Orientation restraints are not supported with the GPU update.\n");
 451         GMX_RELEASE_ASSERT(
 452                 ir->efep == FreeEnergyPerturbationType::No
 453                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
 454                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 455                 "update.");
 456
 457         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 458         {
 459             GMX_LOG(mdlog.info)
 460                     .asParagraph()
 461                     .appendText("Updating coordinates and applying constraints on the GPU.");
 462         }
 463         else
 464         {
 465             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 466         }
 467         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 468                            "Device stream manager should be initialized in order to use GPU "
 469                            "update-constraints.");
 470         GMX_RELEASE_ASSERT(
 471                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 472                 "Update stream should be initialized in order to use GPU "
 473                 "update-constraints.");
 474         integrator = std::make_unique<UpdateConstrainGpu>(
 475                 *ir,
 476                 top_global,
 477                 ekind->ngtc,
 478                 fr->deviceStreamManager->context(),
 479                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 480                 wcycle);
 481
 482         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
 483
 484         integrator->setPbc(PbcType::Xyz, state->box);
 485     }
 486
 487     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 488     {
 489         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 490     }
 491     if (useGpuForUpdate)
 492     {
 493         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 494     }
 495
 496     // NOTE: The global state is no longer used at this point.
 497     // But state_global is still used as temporary storage space for writing
 498     // the global state to file and potentially for replica exchange.
 499     // (Global topology should persist.)
 500
 501     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
 502
 503     if (ir->bExpanded)
 504     {
 505         /* Check nstexpanded here, because the grompp check was broken */
 506         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 507         {
 508             gmx_fatal(FARGS,
 509                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 510         }
 511         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 512     }
 513
 514     if (MASTER(cr))
 515     {
 516         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 517     }
 518
 519     preparePrevStepPullCom(ir,
 520                            pull_work,
 521                            gmx::arrayRefFromArray(md->massT, md->nr),
 522                            state,
 523                            state_global,
 524                            cr,
 525                            startingBehavior != StartingBehavior::NewSimulation);
 526
 527     // TODO: Remove this by converting AWH into a ForceProvider
 528     auto awh = prepareAwhModule(fplog,
 529                                 *ir,
 530                                 state_global,
 531                                 cr,
 532                                 ms,
 533                                 startingBehavior != StartingBehavior::NewSimulation,
 534                                 shellfc != nullptr,
 535                                 opt2fn("-awh", nfile, fnm),
 536                                 pull_work);
 537
 538     if (useReplicaExchange && MASTER(cr))
 539     {
 540         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
 541     }
 542     /* PME tuning is only supported in the Verlet scheme, with PME for
 543      * Coulomb. It is not supported with only LJ PME. */
 544     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 545                 && ir->cutoff_scheme != CutoffScheme::Group);
 546
 547     pme_load_balancing_t* pme_loadbal = nullptr;
 548     if (bPMETune)
 549     {
 550         pme_loadbal_init(
 551                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
 552     }
 553
 554     if (!ir->bContinuation)
 555     {
 556         if (state->flags & enumValueToBitMask(StateEntry::V))
 557         {
 558             auto v = makeArrayRef(state->v);
 559             /* Set the velocities of vsites, shells and frozen atoms to zero */
 560             for (i = 0; i < md->homenr; i++)
 561             {
 562                 if (md->ptype[i] == ParticleType::Shell)
 563                 {
 564                     clear_rvec(v[i]);
 565                 }
 566                 else if (md->cFREEZE)
 567                 {
 568                     for (m = 0; m < DIM; m++)
 569                     {
 570                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
 571                         {
 572                             v[i][m] = 0;
 573                         }
 574                     }
 575                 }
 576             }
 577         }
 578
 579         if (constr)
 580         {
 581             /* Constrain the initial coordinates and velocities */
 582             do_constrain_first(fplog,
 583                                constr,
 584                                ir,
 585                                md->nr,
 586                                md->homenr,
 587                                state->x.arrayRefWithPadding(),
 588                                state->v.arrayRefWithPadding(),
 589                                state->box,
 590                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
 591         }
 592     }
 593
 594     const int nstfep = computeFepPeriod(*ir, replExParams);
 595
 596     /* Be REALLY careful about what flags you set here. You CANNOT assume
 597      * this is the first step, since we might be restarting from a checkpoint,
 598      * and in that case we should not do any modifications to the state.
 599      */
 600     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
 601
 602     // When restarting from a checkpoint, it can be appropriate to
 603     // initialize ekind from quantities in the checkpoint. Otherwise,
 604     // compute_globals must initialize ekind before the simulation
 605     // starts/restarts. However, only the master rank knows what was
 606     // found in the checkpoint file, so we have to communicate in
 607     // order to coordinate the restart.
 608     //
 609     // TODO Consider removing this communication if/when checkpoint
 610     // reading directly follows .tpr reading, because all ranks can
 611     // agree on hasReadEkinState at that time.
 612     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 613     if (PAR(cr))
 614     {
 615         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 616     }
 617     if (hasReadEkinState)
 618     {
 619         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 620     }
 621
 622     unsigned int cglo_flags =
 623             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 624              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 625
 626     bSumEkinhOld = FALSE;
 627
 628     t_vcm vcm(top_global.groups, *ir);
 629     reportComRemovalInfo(fplog, vcm);
 630
 631     int64_t step     = ir->init_step;
 632     int64_t step_rel = 0;
 633
 634     /* To minimize communication, compute_globals computes the COM velocity
 635      * and the kinetic energy for the velocities without COM motion removed.
 636      * Thus to get the kinetic energy without the COM contribution, we need
 637      * to call compute_globals twice.
 638      */
 639     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 640     {
 641         unsigned int cglo_flags_iteration = cglo_flags;
 642         if (bStopCM && cgloIteration == 0)
 643         {
 644             cglo_flags_iteration |= CGLO_STOPCM;
 645             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 646         }
 647         compute_globals(gstat,
 648                         cr,
 649                         ir,
 650                         fr,
 651                         ekind,
 652                         makeConstArrayRef(state->x),
 653                         makeConstArrayRef(state->v),
 654                         state->box,
 655                         md,
 656                         nrnb,
 657                         &vcm,
 658                         nullptr,
 659                         enerd,
 660                         force_vir,
 661                         shake_vir,
 662                         total_vir,
 663                         pres,
 664                         &nullSignaller,
 665                         state->box,
 666                         &bSumEkinhOld,
 667                         cglo_flags_iteration,
 668                         step,
 669                         &observablesReducer);
 670         // Clean up after pre-step use of compute_globals()
 671         observablesReducer.markAsReadyToReduce();
 672
 673         if (cglo_flags_iteration & CGLO_STOPCM)
 674         {
 675             /* At initialization, do not pass x with acceleration-correction mode
 676              * to avoid (incorrect) correction of the initial coordinates.
 677              */
 678             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
 679                              ? ArrayRef<RVec>()
 680                              : makeArrayRef(state->x);
 681             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
 682             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
 683         }
 684     }
 685     if (ir->eI == IntegrationAlgorithm::VVAK)
 686     {
 687         /* a second call to get the half step temperature initialized as well */
 688         /* we do the same call as above, but turn the pressure off -- internally to
 689            compute_globals, this is recognized as a velocity verlet half-step
 690            kinetic energy calculation.  This minimized excess variables, but
 691            perhaps loses some logic?*/
 692
 693         compute_globals(gstat,
 694                         cr,
 695                         ir,
 696                         fr,
 697                         ekind,
 698                         makeConstArrayRef(state->x),
 699                         makeConstArrayRef(state->v),
 700                         state->box,
 701                         md,
 702                         nrnb,
 703                         &vcm,
 704                         nullptr,
 705                         enerd,
 706                         force_vir,
 707                         shake_vir,
 708                         total_vir,
 709                         pres,
 710                         &nullSignaller,
 711                         state->box,
 712                         &bSumEkinhOld,
 713                         cglo_flags & ~CGLO_PRESSURE,
 714                         step,
 715                         &observablesReducer);
 716         // Clean up after pre-step use of compute_globals()
 717         observablesReducer.markAsReadyToReduce();
 718     }
 719
 720     /* Calculate the initial half step temperature, and save the ekinh_old */
 721     if (startingBehavior == StartingBehavior::NewSimulation)
 722     {
 723         for (i = 0; (i < ir->opts.ngtc); i++)
 724         {
 725             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 726         }
 727     }
 728
 729     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 730        for non-trotter temperature control */
 731     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 732
 733     if (MASTER(cr))
 734     {
 735         if (!ir->bContinuation)
 736         {
 737             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
 738             {
 739                 fprintf(fplog,
 740                         "RMS relative constraint deviation after constraining: %.2e\n",
 741                         constr->rmsd());
 742             }
 743             if (EI_STATE_VELOCITY(ir->eI))
 744             {
 745                 real temp = enerd->term[F_TEMP];
 746                 if (ir->eI != IntegrationAlgorithm::VV)
 747                 {
 748                     /* Result of Ekin averaged over velocities of -half
 749                      * and +half step, while we only have -half step here.
 750                      */
 751                     temp *= 2;
 752                 }
 753                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 754             }
 755         }
 756
 757         char tbuf[20];
 758         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
 759         if (ir->nsteps >= 0)
 760         {
 761             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 762         }
 763         else
 764         {
 765             sprintf(tbuf, "%s", "infinite");
 766         }
 767         if (ir->init_step > 0)
 768         {
 769             fprintf(stderr,
 770                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 771                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
 772                     tbuf,
 773                     gmx_step_str(ir->init_step, sbuf2),
 774                     ir->init_step * ir->delta_t);
 775         }
 776         else
 777         {
 778             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 779         }
 780         fprintf(fplog, "\n");
 781     }
 782
 783     walltime_accounting_start_time(walltime_accounting);
 784     wallcycle_start(wcycle, WallCycleCounter::Run);
 785     print_start(fplog, cr, walltime_accounting, "mdrun");
 786
 787     /***********************************************************
 788      *
 789      *             Loop over MD steps
 790      *
 791      ************************************************************/
 792
 793     bFirstStep = TRUE;
 794     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 795     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 796     bSumEkinhOld     = FALSE;
 797     bExchanged       = FALSE;
 798     bNeedRepartition = FALSE;
 799
 800     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 801             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
 802             simulationsShareState,
 803             MASTER(cr),
 804             ir->nstlist,
 805             mdrunOptions.reproducible,
 806             nstSignalComm,
 807             mdrunOptions.maximumHoursToRun,
 808             ir->nstlist == 0,
 809             fplog,
 810             step,
 811             bNS,
 812             walltime_accounting);
 813
 814     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 815             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
 816             simulationsShareState,
 817             ir->nstlist == 0,
 818             MASTER(cr),
 819             mdrunOptions.writeConfout,
 820             mdrunOptions.checkpointOptions.period);
 821
 822     const bool resetCountersIsLocal = true;
 823     auto       resetHandler         = std::make_unique<ResetHandler>(
 824             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 825             !resetCountersIsLocal,
 826             ir->nsteps,
 827             MASTER(cr),
 828             mdrunOptions.timingOptions.resetHalfway,
 829             mdrunOptions.maximumHoursToRun,
 830             mdlog,
 831             wcycle,
 832             walltime_accounting);
 833
 834     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 835
 836     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 837     {
 838         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 839     }
 840
 841     /* and stop now if we should */
 842     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 843     while (!bLastStep)
 844     {
 845
 846         /* Determine if this is a neighbor search step */
 847         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 848
 849         if (bPMETune && bNStList)
 850         {
 851             // This has to be here because PME load balancing is called so early.
 852             // TODO: Move to after all booleans are defined.
 853             if (useGpuForUpdate && !bFirstStep)
 854             {
 855                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 856                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 857             }
 858             /* PME grid + cut-off optimization with GPUs or PME nodes */
 859             pme_loadbal_do(pme_loadbal,
 860                            cr,
 861                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 862                            fplog,
 863                            mdlog,
 864                            *ir,
 865                            fr,
 866                            state->box,
 867                            state->x,
 868                            wcycle,
 869                            step,
 870                            step_rel,
 871                            &bPMETunePrinting,
 872                            simulationWork.useGpuPmePpCommunication);
 873         }
 874
 875         wallcycle_start(wcycle, WallCycleCounter::Step);
 876
 877         bLastStep = (step_rel == ir->nsteps);
 878         t         = t0 + step * ir->delta_t;
 879
 880         // TODO Refactor this, so that nstfep does not need a default value of zero
 881         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
 882         {
 883             /* find and set the current lambdas */
 884             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 885
 886             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 887                            && (!bFirstStep));
 888         }
 889
 890         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 891                      && do_per_step(step, replExParams.exchangeInterval));
 892
 893         if (doSimulatedAnnealing)
 894         {
 895             // TODO: Avoid changing inputrec (#3854)
 896             // Simulated annealing updates the reference temperature.
 897             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 898             update_annealing_target_temp(nonConstInputrec, t, &upd);
 899         }
 900
 901         /* Stop Center of Mass motion */
 902         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
 903
 904         /* Determine whether or not to do Neighbour Searching */
 905         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 906
 907         /* Note that the stopHandler will cause termination at nstglobalcomm
 908          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 909          * nstpcouple steps, we have computed the half-step kinetic energy
 910          * of the previous step and can always output energies at the last step.
 911          */
 912         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 913
 914         /* do_log triggers energy and virial calculation. Because this leads
 915          * to different code paths, forces can be different. Thus for exact
 916          * continuation we should avoid extra log output.
 917          * Note that the || bLastStep can result in non-exact continuation
 918          * beyond the last step. But we don't consider that to be an issue.
 919          */
 920         do_log     = (do_per_step(step, ir->nstlog)
 921                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 922         do_verbose = mdrunOptions.verbose
 923                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 924
 925         // On search steps, when doing the update on the GPU, copy
 926         // the coordinates and velocities to the host unless they are
 927         // already there (ie on the first step and after replica
 928         // exchange).
 929         if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
 930         {
 931             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 932             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 933             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 934             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 935         }
 936
 937         // We only need to calculate virtual velocities if we are writing them in the current step
 938         const bool needVirtualVelocitiesThisStep =
 939                 (vsite != nullptr)
 940                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
 941
 942         if (vsite != nullptr)
 943         {
 944             // Virtual sites need to be updated before domain decomposition and forces are calculated
 945             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
 946             // md-vv calculates virtual velocities once it has full-step real velocities
 947             vsite->construct(state->x,
 948                              state->v,
 949                              state->box,
 950                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
 951                                      ? VSiteOperation::PositionsAndVelocities
 952                                      : VSiteOperation::Positions);
 953             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
 954         }
 955
 956         if (bNS && !(bFirstStep && ir->bContinuation))
 957         {
 958             bMasterState = FALSE;
 959             /* Correct the new box if it is too skewed */
 960             if (inputrecDynamicBox(ir))
 961             {
 962                 if (correct_box(fplog, step, state->box))
 963                 {
 964                     bMasterState = TRUE;
 965                 }
 966             }
 967             // If update is offloaded, and the box was changed either
 968             // above or in a replica exchange on the previous step,
 969             // the GPU Update object should be informed
 970             if (useGpuForUpdate && (bMasterState || bExchanged))
 971             {
 972                 integrator->setPbc(PbcType::Xyz, state->box);
 973             }
 974             if (haveDDAtomOrdering(*cr) && bMasterState)
 975             {
 976                 dd_collect_state(cr->dd, state, state_global);
 977             }
 978
 979             if (haveDDAtomOrdering(*cr))
 980             {
 981                 /* Repartition the domain decomposition */
 982                 dd_partition_system(fplog,
 983                                     mdlog,
 984                                     step,
 985                                     cr,
 986                                     bMasterState,
 987                                     nstglobalcomm,
 988                                     state_global,
 989                                     top_global,
 990                                     *ir,
 991                                     imdSession,
 992                                     pull_work,
 993                                     state,
 994                                     &f,
 995                                     mdAtoms,
 996                                     top,
 997                                     fr,
 998                                     vsite,
 999                                     constr,
1000                                     nrnb,
1001                                     wcycle,
1002                                     do_verbose && !bPMETunePrinting);
1003                 upd.updateAfterPartition(state->natoms,
1004                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1005                                                      : gmx::ArrayRef<const unsigned short>(),
1006                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1007                                                  : gmx::ArrayRef<const unsigned short>());
1008                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1009             }
1010         }
1011
1012         // Allocate or re-size GPU halo exchange object, if necessary
1013         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1014         {
1015             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1016                                "GPU device manager has to be initialized to use GPU "
1017                                "version of halo exchange.");
1018             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1019         }
1020
1021         if (MASTER(cr) && do_log)
1022         {
1023             gmx::EnergyOutput::printHeader(
1024                     fplog, step, t); /* can we improve the information printed here? */
1025         }
1026
1027         if (ir->efep != FreeEnergyPerturbationType::No)
1028         {
1029             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1030         }
1031
1032         if (bExchanged)
1033         {
1034             /* We need the kinetic energy at minus the half step for determining
1035              * the full step kinetic energy and possibly for T-coupling.*/
1036             /* This may not be quite working correctly yet . . . . */
1037             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1038             compute_globals(gstat,
1039                             cr,
1040                             ir,
1041                             fr,
1042                             ekind,
1043                             makeConstArrayRef(state->x),
1044                             makeConstArrayRef(state->v),
1045                             state->box,
1046                             md,
1047                             nrnb,
1048                             &vcm,
1049                             wcycle,
1050                             enerd,
1051                             nullptr,
1052                             nullptr,
1053                             nullptr,
1054                             nullptr,
1055                             &nullSignaller,
1056                             state->box,
1057                             &bSumEkinhOld,
1058                             cglo_flags,
1059                             step,
1060                             &observablesReducer);
1061         }
1062         clear_mat(force_vir);
1063
1064         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1065
1066         /* Determine the energy and pressure:
1067          * at nstcalcenergy steps and at energy output steps (set below).
1068          */
1069         if (EI_VV(ir->eI) && (!bInitStep))
1070         {
1071             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1072             bCalcVir      = bCalcEnerStep
1073                        || (ir->epc != PressureCoupling::No
1074                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1075         }
1076         else
1077         {
1078             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1079             bCalcVir      = bCalcEnerStep
1080                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1081         }
1082         bCalcEner = bCalcEnerStep;
1083
1084         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1085
1086         if (do_ene || do_log || bDoReplEx)
1087         {
1088             bCalcVir  = TRUE;
1089             bCalcEner = TRUE;
1090         }
1091
1092         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1093         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1094                                   && (do_per_step(step, nstfep) || bCalcEner));
1095
1096         /* Do we need global communication ? */
1097         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1098                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1099
1100         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1101                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1102                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1103         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1104         {
1105             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1106             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1107         }
1108
1109         if (shellfc)
1110         {
1111             /* Now is the time to relax the shells */
1112             relax_shell_flexcon(fplog,
1113                                 cr,
1114                                 ms,
1115                                 mdrunOptions.verbose,
1116                                 enforcedRotation,
1117                                 step,
1118                                 ir,
1119                                 imdSession,
1120                                 pull_work,
1121                                 bNS,
1122                                 force_flags,
1123                                 top,
1124                                 constr,
1125                                 enerd,
1126                                 state->natoms,
1127                                 state->x.arrayRefWithPadding(),
1128                                 state->v.arrayRefWithPadding(),
1129                                 state->box,
1130                                 state->lambda,
1131                                 &state->hist,
1132                                 &f.view(),
1133                                 force_vir,
1134                                 *md,
1135                                 fr->longRangeNonbondeds.get(),
1136                                 nrnb,
1137                                 wcycle,
1138                                 shellfc,
1139                                 fr,
1140                                 runScheduleWork,
1141                                 t,
1142                                 mu_tot,
1143                                 vsite,
1144                                 ddBalanceRegionHandler);
1145         }
1146         else
1147         {
1148             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1149                is updated (or the AWH update will be performed twice for one step when continuing).
1150                It would be best to call this update function from do_md_trajectory_writing but that
1151                would occur after do_force. One would have to divide the update_awh function into one
1152                function applying the AWH force and one doing the AWH bias update. The update AWH
1153                bias function could then be called after do_md_trajectory_writing (then containing
1154                update_awh_history). The checkpointing will in the future probably moved to the start
1155                of the md loop which will rid of this issue. */
1156             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1157             {
1158                 awh->updateHistory(state_global->awhHistory.get());
1159             }
1160
1161             /* The coordinates (x) are shifted (to get whole molecules)
1162              * in do_force.
1163              * This is parallellized as well, and does communication too.
1164              * Check comments in sim_util.c
1165              */
1166             do_force(fplog,
1167                      cr,
1168                      ms,
1169                      *ir,
1170                      awh.get(),
1171                      enforcedRotation,
1172                      imdSession,
1173                      pull_work,
1174                      step,
1175                      nrnb,
1176                      wcycle,
1177                      top,
1178                      state->box,
1179                      state->x.arrayRefWithPadding(),
1180                      &state->hist,
1181                      &f.view(),
1182                      force_vir,
1183                      md,
1184                      enerd,
1185                      state->lambda,
1186                      fr,
1187                      runScheduleWork,
1188                      vsite,
1189                      mu_tot,
1190                      t,
1191                      ed ? ed->getLegacyED() : nullptr,
1192                      fr->longRangeNonbondeds.get(),
1193                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1194                      ddBalanceRegionHandler);
1195         }
1196
1197         // VV integrators do not need the following velocity half step
1198         // if it is the first step after starting from a checkpoint.
1199         // That is, the half step is needed on all other steps, and
1200         // also the first step when starting from a .tpr file.
1201         if (EI_VV(ir->eI))
1202         {
1203             integrateVVFirstStep(step,
1204                                  bFirstStep,
1205                                  bInitStep,
1206                                  startingBehavior,
1207                                  nstglobalcomm,
1208                                  ir,
1209                                  fr,
1210                                  cr,
1211                                  state,
1212                                  mdAtoms->mdatoms(),
1213                                  &fcdata,
1214                                  &MassQ,
1215                                  &vcm,
1216                                  enerd,
1217                                  &observablesReducer,
1218                                  ekind,
1219                                  gstat,
1220                                  &last_ekin,
1221                                  bCalcVir,
1222                                  total_vir,
1223                                  shake_vir,
1224                                  force_vir,
1225                                  pres,
1226                                  M,
1227                                  do_log,
1228                                  do_ene,
1229                                  bCalcEner,
1230                                  bGStat,
1231                                  bStopCM,
1232                                  bTrotter,
1233                                  bExchanged,
1234                                  &bSumEkinhOld,
1235                                  &saved_conserved_quantity,
1236                                  &f,
1237                                  &upd,
1238                                  constr,
1239                                  &nullSignaller,
1240                                  trotter_seq,
1241                                  nrnb,
1242                                  fplog,
1243                                  wcycle);
1244             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1245             {
1246                 // Positions were calculated earlier
1247                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1248                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1249                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1250             }
1251         }
1252
1253         /* ########  END FIRST UPDATE STEP  ############## */
1254         /* ########  If doing VV, we now have v(dt) ###### */
1255         if (bDoExpanded)
1256         {
1257             /* perform extended ensemble sampling in lambda - we don't
1258                actually move to the new state before outputting
1259                statistics, but if performing simulated tempering, we
1260                do update the velocities and the tau_t. */
1261             // TODO: Avoid changing inputrec (#3854)
1262             // Simulated tempering updates the reference temperature.
1263             // Expanded ensemble without simulated tempering does not change the inputrec.
1264             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1265             lamnew                 = ExpandedEnsembleDynamics(fplog,
1266                                               nonConstInputrec,
1267                                               enerd,
1268                                               state,
1269                                               &MassQ,
1270                                               state->fep_state,
1271                                               state->dfhist,
1272                                               step,
1273                                               state->v.rvec_array(),
1274                                               md->homenr,
1275                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1276                                                                       : gmx::ArrayRef<const unsigned short>());
1277             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1278             if (MASTER(cr))
1279             {
1280                 copy_df_history(state_global->dfhist, state->dfhist);
1281             }
1282         }
1283
1284         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1285         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1286         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1287             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1288                 || checkpointHandler->isCheckpointingStep()))
1289         {
1290             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1291             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1292         }
1293         // Copy velocities if needed for the output/checkpointing.
1294         // NOTE: Copy on the search steps is done at the beginning of the step.
1295         if (useGpuForUpdate && !bNS
1296             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1297         {
1298             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1299             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1300         }
1301         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1302         // and update is offloaded hence forces are kept on the GPU for update and have not been
1303         // already transferred in do_force().
1304         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1305         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1306         //       prior to GPU update.
1307         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1308         //       copy call in do_force(...).
1309         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1310         //       on host after the D2H copy in do_force(...).
1311         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1312             && do_per_step(step, ir->nstfout))
1313         {
1314             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1315             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1316         }
1317         /* Now we have the energies and forces corresponding to the
1318          * coordinates at time t. We must output all of this before
1319          * the update.
1320          */
1321         do_md_trajectory_writing(fplog,
1322                                  cr,
1323                                  nfile,
1324                                  fnm,
1325                                  step,
1326                                  step_rel,
1327                                  t,
1328                                  ir,
1329                                  state,
1330                                  state_global,
1331                                  observablesHistory,
1332                                  top_global,
1333                                  fr,
1334                                  outf,
1335                                  energyOutput,
1336                                  ekind,
1337                                  f.view().force(),
1338                                  checkpointHandler->isCheckpointingStep(),
1339                                  bRerunMD,
1340                                  bLastStep,
1341                                  mdrunOptions.writeConfout,
1342                                  bSumEkinhOld);
1343         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1344         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1345
1346         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1347         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1348             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1349         {
1350             copy_mat(state->svir_prev, shake_vir);
1351             copy_mat(state->fvir_prev, force_vir);
1352         }
1353
1354         stopHandler->setSignal();
1355         resetHandler->setSignal(walltime_accounting);
1356
1357         if (bGStat || !PAR(cr))
1358         {
1359             /* In parallel we only have to check for checkpointing in steps
1360              * where we do global communication,
1361              *  otherwise the other nodes don't know.
1362              */
1363             checkpointHandler->setSignal(walltime_accounting);
1364         }
1365
1366         /* #########   START SECOND UPDATE STEP ################# */
1367
1368         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1369            controlled in preprocessing */
1370
1371         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1372         {
1373             gmx_bool bIfRandomize;
1374             bIfRandomize = update_randomize_velocities(ir,
1375                                                        step,
1376                                                        cr,
1377                                                        md->homenr,
1378                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1379                                                                : gmx::ArrayRef<const unsigned short>(),
1380                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1381                                                        state->v,
1382                                                        &upd,
1383                                                        constr);
1384             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1385             if (constr && bIfRandomize)
1386             {
1387                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1388             }
1389         }
1390         /* Box is changed in update() when we do pressure coupling,
1391          * but we should still use the old box for energy corrections and when
1392          * writing it to the energy file, so it matches the trajectory files for
1393          * the same timestep above. Make a copy in a separate array.
1394          */
1395         copy_mat(state->box, lastbox);
1396
1397         dvdl_constr = 0;
1398
1399         if (!useGpuForUpdate)
1400         {
1401             wallcycle_start(wcycle, WallCycleCounter::Update);
1402         }
1403         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1404         if (bTrotter)
1405         {
1406             trotter_update(ir,
1407                            step,
1408                            ekind,
1409                            enerd,
1410                            state,
1411                            total_vir,
1412                            md->homenr,
1413                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1414                                    : gmx::ArrayRef<const unsigned short>(),
1415                            gmx::arrayRefFromArray(md->invmass, md->nr),
1416                            &MassQ,
1417                            trotter_seq,
1418                            TrotterSequence::Three);
1419             /* We can only do Berendsen coupling after we have summed
1420              * the kinetic energy or virial. Since the happens
1421              * in global_state after update, we should only do it at
1422              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1423              */
1424         }
1425         else
1426         {
1427             update_tcouple(step,
1428                            ir,
1429                            state,
1430                            ekind,
1431                            &MassQ,
1432                            md->homenr,
1433                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1434                                    : gmx::ArrayRef<const unsigned short>());
1435             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1436         }
1437
1438         /* With leap-frog type integrators we compute the kinetic energy
1439          * at a whole time step as the average of the half-time step kinetic
1440          * energies of two subsequent steps. Therefore we need to compute the
1441          * half step kinetic energy also if we need energies at the next step.
1442          */
1443         const bool needHalfStepKineticEnergy =
1444                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1445
1446         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1447         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1448         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1449                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1450
1451         if (EI_VV(ir->eI))
1452         {
1453             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1454
1455             integrateVVSecondStep(step,
1456                                   ir,
1457                                   fr,
1458                                   cr,
1459                                   state,
1460                                   mdAtoms->mdatoms(),
1461                                   &fcdata,
1462                                   &MassQ,
1463                                   &vcm,
1464                                   pull_work,
1465                                   enerd,
1466                                   &observablesReducer,
1467                                   ekind,
1468                                   gstat,
1469                                   &dvdl_constr,
1470                                   bCalcVir,
1471                                   total_vir,
1472                                   shake_vir,
1473                                   force_vir,
1474                                   pres,
1475                                   M,
1476                                   lastbox,
1477                                   do_log,
1478                                   do_ene,
1479                                   bGStat,
1480                                   &bSumEkinhOld,
1481                                   &f,
1482                                   &cbuf,
1483                                   &upd,
1484                                   constr,
1485                                   &nullSignaller,
1486                                   trotter_seq,
1487                                   nrnb,
1488                                   wcycle);
1489         }
1490         else
1491         {
1492             if (useGpuForUpdate)
1493             {
1494                 // On search steps, update handles to device vectors
1495                 if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
1496                 {
1497                     integrator->set(stateGpu->getCoordinates(),
1498                                     stateGpu->getVelocities(),
1499                                     stateGpu->getForces(),
1500                                     top->idef,
1501                                     *md);
1502
1503                     // Copy data to the GPU after buffers might have being reinitialized
1504                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1505                      * the previous step. We don't check that now. */
1506                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1507                     if (bExchanged
1508                         || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1509                             && !runScheduleWork->stepWork.useGpuXBufferOps))
1510                     {
1511                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1512                     }
1513                 }
1514
1515                 if ((simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1516                     || (!runScheduleWork->stepWork.useGpuFBufferOps))
1517                 {
1518                     // The PME forces were recieved to the host, and reduced on the CPU with the
1519                     // rest of the forces computed on the GPU, so the final forces have to be copied
1520                     // back to the GPU. Or the buffer ops were not offloaded this step, so the
1521                     // forces are on the host and have to be copied
1522                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1523                 }
1524                 const bool doTemperatureScaling =
1525                         (ir->etc != TemperatureCoupling::No
1526                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1527
1528                 // This applies Leap-Frog, LINCS and SETTLE in succession
1529                 integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
1530                                               runScheduleWork->stepWork, runScheduleWork->simulationWork),
1531                                       ir->delta_t,
1532                                       true,
1533                                       bCalcVir,
1534                                       shake_vir,
1535                                       doTemperatureScaling,
1536                                       ekind->tcstat,
1537                                       doParrinelloRahman,
1538                                       ir->nstpcouple * ir->delta_t,
1539                                       M);
1540             }
1541             else
1542             {
1543                 /* With multiple time stepping we need to do an additional normal
1544                  * update step to obtain the virial, as the actual MTS integration
1545                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1546                  * Using that acceleration would result in a virial with the slow
1547                  * force contribution would be a factor mtsFactor too large.
1548                  */
1549                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1550                 {
1551                     upd.update_for_constraint_virial(*ir,
1552                                                      md->homenr,
1553                                                      md->havePartiallyFrozenAtoms,
1554                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1555                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1556                                                      *state,
1557                                                      f.view().forceWithPadding(),
1558                                                      *ekind);
1559
1560                     constrain_coordinates(constr,
1561                                           do_log,
1562                                           do_ene,
1563                                           step,
1564                                           state,
1565                                           upd.xp()->arrayRefWithPadding(),
1566                                           &dvdl_constr,
1567                                           bCalcVir,
1568                                           shake_vir);
1569                 }
1570
1571                 ArrayRefWithPadding<const RVec> forceCombined =
1572                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1573                                 ? f.view().forceMtsCombinedWithPadding()
1574                                 : f.view().forceWithPadding();
1575                 upd.update_coords(*ir,
1576                                   step,
1577                                   md->homenr,
1578                                   md->havePartiallyFrozenAtoms,
1579                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1580                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1581                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1582                                   state,
1583                                   forceCombined,
1584                                   &fcdata,
1585                                   ekind,
1586                                   M,
1587                                   etrtPOSITION,
1588                                   cr,
1589                                   constr != nullptr);
1590
1591                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1592
1593                 constrain_coordinates(constr,
1594                                       do_log,
1595                                       do_ene,
1596                                       step,
1597                                       state,
1598                                       upd.xp()->arrayRefWithPadding(),
1599                                       &dvdl_constr,
1600                                       bCalcVir && !simulationWork.useMts,
1601                                       shake_vir);
1602
1603                 upd.update_sd_second_half(*ir,
1604                                           step,
1605                                           &dvdl_constr,
1606                                           md->homenr,
1607                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1608                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1609                                           state,
1610                                           cr,
1611                                           nrnb,
1612                                           wcycle,
1613                                           constr,
1614                                           do_log,
1615                                           do_ene);
1616                 upd.finish_update(
1617                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1618             }
1619
1620             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1621             {
1622                 updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
1623             }
1624
1625             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1626         }
1627
1628         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1629         /* With Leap-Frog we can skip compute_globals at
1630          * non-communication steps, but we need to calculate
1631          * the kinetic energy one step before communication.
1632          */
1633         {
1634             // Organize to do inter-simulation signalling on steps if
1635             // and when algorithms require it.
1636             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1637
1638             if (useGpuForUpdate)
1639             {
1640                 const bool coordinatesRequiredForStopCM =
1641                         bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1642                         && !EI_VV(ir->eI);
1643
1644                 // Copy coordinates when needed to stop the CM motion or for replica exchange
1645                 if (coordinatesRequiredForStopCM || bDoReplEx)
1646                 {
1647                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1648                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1649                 }
1650
1651                 // Copy velocities back to the host if:
1652                 // - Globals are computed this step (includes the energy output steps).
1653                 // - Temperature is needed for the next step.
1654                 // - This is a replica exchange step (even though we will only need
1655                 //     the velocities if an exchange succeeds)
1656                 if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
1657                 {
1658                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1659                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1660                 }
1661             }
1662
1663             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1664             {
1665                 // Since we're already communicating at this step, we
1666                 // can propagate intra-simulation signals. Note that
1667                 // check_nstglobalcomm has the responsibility for
1668                 // choosing the value of nstglobalcomm that is one way
1669                 // bGStat becomes true, so we can't get into a
1670                 // situation where e.g. checkpointing can't be
1671                 // signalled.
1672                 bool                doIntraSimSignal = true;
1673                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1674
1675                 compute_globals(gstat,
1676                                 cr,
1677                                 ir,
1678                                 fr,
1679                                 ekind,
1680                                 makeConstArrayRef(state->x),
1681                                 makeConstArrayRef(state->v),
1682                                 state->box,
1683                                 md,
1684                                 nrnb,
1685                                 &vcm,
1686                                 wcycle,
1687                                 enerd,
1688                                 force_vir,
1689                                 shake_vir,
1690                                 total_vir,
1691                                 pres,
1692                                 &signaller,
1693                                 lastbox,
1694                                 &bSumEkinhOld,
1695                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1696                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1697                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1698                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1699                                 step,
1700                                 &observablesReducer);
1701                 if (!EI_VV(ir->eI) && bStopCM)
1702                 {
1703                     process_and_stopcm_grp(
1704                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1705                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1706
1707                     // TODO: The special case of removing CM motion should be dealt more gracefully
1708                     if (useGpuForUpdate)
1709                     {
1710                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1711                         // Here we block until the H2D copy completes because event sync with the
1712                         // force kernels that use the coordinates on the next steps is not implemented
1713                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1714                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1715                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1716                         if (vcm.mode != ComRemovalAlgorithm::No)
1717                         {
1718                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1719                         }
1720                     }
1721                 }
1722             }
1723         }
1724
1725         /* #############  END CALC EKIN AND PRESSURE ################# */
1726
1727         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1728            the virial that should probably be addressed eventually. state->veta has better properies,
1729            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1730            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1731
1732         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1733         {
1734             /* Sum up the foreign energy and dK/dl terms for md and sd.
1735                Currently done every step so that dH/dl is correct in the .edr */
1736             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1737         }
1738
1739         bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
1740         update_pcouple_after_coordinates(fplog,
1741                                          step,
1742                                          ir,
1743                                          md->homenr,
1744                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1745                                                      : gmx::ArrayRef<const unsigned short>(),
1746                                          pres,
1747                                          force_vir,
1748                                          shake_vir,
1749                                          pressureCouplingMu,
1750                                          state,
1751                                          nrnb,
1752                                          upd.deform(),
1753                                          scaleCoordinates);
1754
1755         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1756                                                   && do_per_step(step, inputrec->nstpcouple));
1757         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1758                                                  && do_per_step(step, inputrec->nstpcouple));
1759         if (useGpuForUpdate
1760             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1761         {
1762             integrator->scaleCoordinates(pressureCouplingMu);
1763             if (doCRescalePressureCoupling)
1764             {
1765                 matrix pressureCouplingInvMu;
1766                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1767                 integrator->scaleVelocities(pressureCouplingInvMu);
1768             }
1769             integrator->setPbc(PbcType::Xyz, state->box);
1770         }
1771
1772         /* ################# END UPDATE STEP 2 ################# */
1773         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1774
1775         /* The coordinates (x) were unshifted in update */
1776         if (!bGStat)
1777         {
1778             /* We will not sum ekinh_old,
1779              * so signal that we still have to do it.
1780              */
1781             bSumEkinhOld = TRUE;
1782         }
1783
1784         if (bCalcEner)
1785         {
1786             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1787
1788             /* use the directly determined last velocity, not actually the averaged half steps */
1789             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1790             {
1791                 enerd->term[F_EKIN] = last_ekin;
1792             }
1793             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1794
1795             if (integratorHasConservedEnergyQuantity(ir))
1796             {
1797                 if (EI_VV(ir->eI))
1798                 {
1799                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1800                 }
1801                 else
1802                 {
1803                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1804                 }
1805             }
1806             /* #########  END PREPARING EDR OUTPUT  ###########  */
1807         }
1808
1809         /* Output stuff */
1810         if (MASTER(cr))
1811         {
1812             if (fplog && do_log && bDoExpanded)
1813             {
1814                 /* only needed if doing expanded ensemble */
1815                 PrintFreeEnergyInfoToFile(fplog,
1816                                           ir->fepvals.get(),
1817                                           ir->expandedvals.get(),
1818                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1819                                           state_global->dfhist,
1820                                           state->fep_state,
1821                                           ir->nstlog,
1822                                           step);
1823             }
1824             if (bCalcEner)
1825             {
1826                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1827
1828                 energyOutput.addDataAtEnergyStep(outputDHDL,
1829                                                  bCalcEnerStep,
1830                                                  t,
1831                                                  md->tmass,
1832                                                  enerd,
1833                                                  ir->fepvals.get(),
1834                                                  ir->expandedvals.get(),
1835                                                  lastbox,
1836                                                  PTCouplingArrays{ state->boxv,
1837                                                                    state->nosehoover_xi,
1838                                                                    state->nosehoover_vxi,
1839                                                                    state->nhpres_xi,
1840                                                                    state->nhpres_vxi },
1841                                                  state->fep_state,
1842                                                  total_vir,
1843                                                  pres,
1844                                                  ekind,
1845                                                  mu_tot,
1846                                                  constr);
1847             }
1848             else
1849             {
1850                 energyOutput.recordNonEnergyStep();
1851             }
1852
1853             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1854             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1855
1856             if (doSimulatedAnnealing)
1857             {
1858                 gmx::EnergyOutput::printAnnealingTemperatures(
1859                         do_log ? fplog : nullptr, groups, &(ir->opts));
1860             }
1861             if (do_log || do_ene || do_dr || do_or)
1862             {
1863                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1864                                                    do_ene,
1865                                                    do_dr,
1866                                                    do_or,
1867                                                    do_log ? fplog : nullptr,
1868                                                    step,
1869                                                    t,
1870                                                    fr->fcdata.get(),
1871                                                    awh.get());
1872             }
1873             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1874             {
1875                 const bool isInitialOutput = false;
1876                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1877             }
1878
1879             if (ir->bPull)
1880             {
1881                 pull_print_output(pull_work, step, t);
1882             }
1883
1884             if (do_per_step(step, ir->nstlog))
1885             {
1886                 if (fflush(fplog) != 0)
1887                 {
1888                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1889                 }
1890             }
1891         }
1892         if (bDoExpanded)
1893         {
1894             /* Have to do this part _after_ outputting the logfile and the edr file */
1895             /* Gets written into the state at the beginning of next loop*/
1896             state->fep_state = lamnew;
1897         }
1898         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1899         {
1900             state->fep_state = awh->fepLambdaState();
1901         }
1902         /* Print the remaining wall clock time for the run */
1903         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1904         {
1905             if (shellfc)
1906             {
1907                 fprintf(stderr, "\n");
1908             }
1909             print_time(stderr, walltime_accounting, step, ir, cr);
1910         }
1911
1912         /* Ion/water position swapping.
1913          * Not done in last step since trajectory writing happens before this call
1914          * in the MD loop and exchanges would be lost anyway. */
1915         bNeedRepartition = FALSE;
1916         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1917             && do_per_step(step, ir->swap->nstswap))
1918         {
1919             bNeedRepartition = do_swapcoords(cr,
1920                                              step,
1921                                              t,
1922                                              ir,
1923                                              swap,
1924                                              wcycle,
1925                                              as_rvec_array(state->x.data()),
1926                                              state->box,
1927                                              MASTER(cr) && mdrunOptions.verbose,
1928                                              bRerunMD);
1929
1930             if (bNeedRepartition && haveDDAtomOrdering(*cr))
1931             {
1932                 dd_collect_state(cr->dd, state, state_global);
1933             }
1934         }
1935
1936         /* Replica exchange */
1937         bExchanged = FALSE;
1938         if (bDoReplEx)
1939         {
1940             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1941         }
1942
1943         if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
1944         {
1945             dd_partition_system(fplog,
1946                                 mdlog,
1947                                 step,
1948                                 cr,
1949                                 TRUE,
1950                                 1,
1951                                 state_global,
1952                                 top_global,
1953                                 *ir,
1954                                 imdSession,
1955                                 pull_work,
1956                                 state,
1957                                 &f,
1958                                 mdAtoms,
1959                                 top,
1960                                 fr,
1961                                 vsite,
1962                                 constr,
1963                                 nrnb,
1964                                 wcycle,
1965                                 FALSE);
1966             upd.updateAfterPartition(state->natoms,
1967                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1968                                                  : gmx::ArrayRef<const unsigned short>(),
1969                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1970                                              : gmx::ArrayRef<const unsigned short>());
1971             fr->longRangeNonbondeds->updateAfterPartition(*md);
1972         }
1973
1974         bFirstStep = FALSE;
1975         bInitStep  = FALSE;
1976
1977         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1978         /* With all integrators, except VV, we need to retain the pressure
1979          * at the current step for coupling at the next step.
1980          */
1981         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1982             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1983         {
1984             /* Store the pressure in t_state for pressure coupling
1985              * at the next MD step.
1986              */
1987             copy_mat(pres, state->pres_prev);
1988         }
1989
1990         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1991
1992         if ((membed != nullptr) && (!bLastStep))
1993         {
1994             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1995         }
1996
1997         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1998         if (haveDDAtomOrdering(*cr) && wcycle)
1999         {
2000             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2001         }
2002
2003         /* increase the MD step number */
2004         step++;
2005         step_rel++;
2006         observablesReducer.markAsReadyToReduce();
2007
2008 #if GMX_FAHCORE
2009         if (MASTER(cr))
2010         {
2011             fcReportProgress(ir->nsteps + ir->init_step, step);
2012         }
2013 #endif
2014
2015         resetHandler->resetCounters(
2016                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2017
2018         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2019         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2020     }
2021     /* End of main MD loop */
2022
2023     /* Closing TNG files can include compressing data. Therefore it is good to do that
2024      * before stopping the time measurements. */
2025     mdoutf_tng_close(outf);
2026
2027     /* Stop measuring walltime */
2028     walltime_accounting_end_time(walltime_accounting);
2029
2030     if (simulationWork.haveSeparatePmeRank)
2031     {
2032         /* Tell the PME only node to finish */
2033         gmx_pme_send_finish(cr);
2034     }
2035
2036     if (MASTER(cr))
2037     {
2038         if (ir->nstcalcenergy > 0)
2039         {
2040             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2041
2042             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2043             energyOutput.printAverages(fplog, groups);
2044         }
2045     }
2046     done_mdoutf(outf);
2047
2048     if (bPMETune)
2049     {
2050         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2051     }
2052
2053     done_shellfc(fplog, shellfc, step_rel);
2054
2055     if (useReplicaExchange && MASTER(cr))
2056     {
2057         print_replica_exchange_statistics(fplog, repl_ex);
2058     }
2059
2060     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2061
2062     global_stat_destroy(gstat);
2063 }