src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53 #include <numeric>
  54
  55 #include "gromacs/applied_forces/awh/awh.h"
  56 #include "gromacs/applied_forces/awh/read_params.h"
  57 #include "gromacs/commandline/filenm.h"
  58 #include "gromacs/domdec/collect.h"
  59 #include "gromacs/domdec/dlbtiming.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_network.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/mdsetup.h"
  65 #include "gromacs/domdec/partition.h"
  66 #include "gromacs/essentialdynamics/edsam.h"
  67 #include "gromacs/ewald/pme_load_balancing.h"
  68 #include "gromacs/ewald/pme_pp.h"
  69 #include "gromacs/fileio/trxio.h"
  70 #include "gromacs/gmxlib/network.h"
  71 #include "gromacs/gmxlib/nrnb.h"
  72 #include "gromacs/gpu_utils/device_stream_manager.h"
  73 #include "gromacs/gpu_utils/gpu_utils.h"
  74 #include "gromacs/imd/imd.h"
  75 #include "gromacs/listed_forces/listed_forces.h"
  76 #include "gromacs/math/functions.h"
  77 #include "gromacs/math/invertmatrix.h"
  78 #include "gromacs/math/vec.h"
  79 #include "gromacs/math/vectypes.h"
  80 #include "gromacs/mdlib/checkpointhandler.h"
  81 #include "gromacs/mdlib/compute_io.h"
  82 #include "gromacs/mdlib/constr.h"
  83 #include "gromacs/mdlib/coupling.h"
  84 #include "gromacs/mdlib/ebin.h"
  85 #include "gromacs/mdlib/enerdata_utils.h"
  86 #include "gromacs/mdlib/energyoutput.h"
  87 #include "gromacs/mdlib/expanded.h"
  88 #include "gromacs/mdlib/force.h"
  89 #include "gromacs/mdlib/force_flags.h"
  90 #include "gromacs/mdlib/forcerec.h"
  91 #include "gromacs/mdlib/freeenergyparameters.h"
  92 #include "gromacs/mdlib/md_support.h"
  93 #include "gromacs/mdlib/mdatoms.h"
  94 #include "gromacs/mdlib/mdoutf.h"
  95 #include "gromacs/mdlib/membed.h"
  96 #include "gromacs/mdlib/resethandler.h"
  97 #include "gromacs/mdlib/sighandler.h"
  98 #include "gromacs/mdlib/simulationsignal.h"
  99 #include "gromacs/mdlib/stat.h"
 100 #include "gromacs/mdlib/stophandler.h"
 101 #include "gromacs/mdlib/tgroup.h"
 102 #include "gromacs/mdlib/trajectory_writing.h"
 103 #include "gromacs/mdlib/update.h"
 104 #include "gromacs/mdlib/update_constrain_gpu.h"
 105 #include "gromacs/mdlib/update_vv.h"
 106 #include "gromacs/mdlib/vcm.h"
 107 #include "gromacs/mdlib/vsite.h"
 108 #include "gromacs/mdrunutility/handlerestart.h"
 109 #include "gromacs/mdrunutility/multisim.h"
 110 #include "gromacs/mdrunutility/printtime.h"
 111 #include "gromacs/mdtypes/awh_history.h"
 112 #include "gromacs/mdtypes/awh_params.h"
 113 #include "gromacs/mdtypes/commrec.h"
 114 #include "gromacs/mdtypes/df_history.h"
 115 #include "gromacs/mdtypes/energyhistory.h"
 116 #include "gromacs/mdtypes/fcdata.h"
 117 #include "gromacs/mdtypes/forcebuffers.h"
 118 #include "gromacs/mdtypes/forcerec.h"
 119 #include "gromacs/mdtypes/group.h"
 120 #include "gromacs/mdtypes/inputrec.h"
 121 #include "gromacs/mdtypes/interaction_const.h"
 122 #include "gromacs/mdtypes/md_enums.h"
 123 #include "gromacs/mdtypes/mdatom.h"
 124 #include "gromacs/mdtypes/mdrunoptions.h"
 125 #include "gromacs/mdtypes/multipletimestepping.h"
 126 #include "gromacs/mdtypes/observableshistory.h"
 127 #include "gromacs/mdtypes/pullhistory.h"
 128 #include "gromacs/mdtypes/simulation_workload.h"
 129 #include "gromacs/mdtypes/state.h"
 130 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 131 #include "gromacs/modularsimulator/energydata.h"
 132 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 133 #include "gromacs/nbnxm/nbnxm.h"
 134 #include "gromacs/pbcutil/pbc.h"
 135 #include "gromacs/pulling/output.h"
 136 #include "gromacs/pulling/pull.h"
 137 #include "gromacs/swap/swapcoords.h"
 138 #include "gromacs/timing/wallcycle.h"
 139 #include "gromacs/timing/walltime_accounting.h"
 140 #include "gromacs/topology/atoms.h"
 141 #include "gromacs/topology/idef.h"
 142 #include "gromacs/topology/mtop_util.h"
 143 #include "gromacs/topology/topology.h"
 144 #include "gromacs/trajectory/trajectoryframe.h"
 145 #include "gromacs/utility/basedefinitions.h"
 146 #include "gromacs/utility/cstringutil.h"
 147 #include "gromacs/utility/fatalerror.h"
 148 #include "gromacs/utility/logger.h"
 149 #include "gromacs/utility/real.h"
 150 #include "gromacs/utility/smalloc.h"
 151
 152 #include "legacysimulator.h"
 153 #include "replicaexchange.h"
 154 #include "shellfc.h"
 155
 156 using gmx::SimulationSignaller;
 157
 158 void gmx::LegacySimulator::do_md()
 159 {
 160     // TODO Historically, the EM and MD "integrators" used different
 161     // names for the t_inputrec *parameter, but these must have the
 162     // same name, now that it's a member of a struct. We use this ir
 163     // alias to avoid a large ripple of nearly useless changes.
 164     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 165     // will go away eventually.
 166     const t_inputrec* ir = inputrec;
 167
 168     int64_t      step, step_rel;
 169     double       t, t0 = ir->init_t;
 170     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 171     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 172     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 173     gmx_bool     do_ene, do_log, do_verbose;
 174     gmx_bool     bMasterState;
 175     unsigned int force_flags;
 176     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
 177     int    i, m;
 178     rvec   mu_tot;
 179     matrix pressureCouplingMu, M;
 180     gmx_repl_ex_t     repl_ex = nullptr;
 181     gmx_global_stat_t gstat;
 182     gmx_shellfc_t*    shellfc;
 183     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 184     gmx_bool          bTrotter;
 185     real              dvdl_constr;
 186     std::vector<RVec> cbuf;
 187     matrix            lastbox;
 188     int               lamnew = 0;
 189     /* for FEP */
 190     int       nstfep = 0;
 191     double    cycles;
 192     real      saved_conserved_quantity = 0;
 193     real      last_ekin                = 0;
 194     t_extmass MassQ;
 195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 196
 197     /* PME load balancing data for GPU kernels */
 198     gmx_bool bPMETune         = FALSE;
 199     gmx_bool bPMETunePrinting = FALSE;
 200
 201     bool bInteractiveMDstep = false;
 202
 203     SimulationSignals signals;
 204     // Most global communnication stages don't propagate mdrun
 205     // signals, and will use this object to achieve that.
 206     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 207
 208     if (!mdrunOptions.writeConfout)
 209     {
 210         // This is on by default, and the main known use case for
 211         // turning it off is for convenience in benchmarking, which is
 212         // something that should not show up in the general user
 213         // interface.
 214         GMX_LOG(mdlog.info)
 215                 .asParagraph()
 216                 .appendText(
 217                         "The -noconfout functionality is deprecated, and may be removed in a "
 218                         "future version.");
 219     }
 220
 221     /* md-vv uses averaged full step velocities for T-control
 222        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 223        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 224     bTrotter = (EI_VV(ir->eI)
 225                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 226
 227     const bool bRerunMD = false;
 228
 229     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 230     bGStatEveryStep   = (nstglobalcomm == 1);
 231
 232     const SimulationGroups* groups = &top_global.groups;
 233
 234     std::unique_ptr<EssentialDynamics> ed = nullptr;
 235     if (opt2bSet("-ei", nfile, fnm))
 236     {
 237         /* Initialize essential dynamics sampling */
 238         ed = init_edsam(mdlog,
 239                         opt2fn_null("-ei", nfile, fnm),
 240                         opt2fn("-eo", nfile, fnm),
 241                         top_global,
 242                         *ir,
 243                         cr,
 244                         constr,
 245                         state_global,
 246                         observablesHistory,
 247                         oenv,
 248                         startingBehavior);
 249     }
 250     else if (observablesHistory->edsamHistory)
 251     {
 252         gmx_fatal(FARGS,
 253                   "The checkpoint is from a run with essential dynamics sampling, "
 254                   "but the current run did not specify the -ei option. "
 255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 256     }
 257
 258     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
 259     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
 260     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
 261     Update upd(*ir, deform);
 262     bool   doSimulatedAnnealing = false;
 263     {
 264         // TODO: Avoid changing inputrec (#3854)
 265         // Simulated annealing updates the reference temperature.
 266         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 267         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
 268     }
 269     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
 270
 271     const t_fcdata& fcdata = *fr->fcdata;
 272
 273     bool simulationsShareState = false;
 274     int  nstSignalComm         = nstglobalcomm;
 275     {
 276         // TODO This implementation of ensemble orientation restraints is nasty because
 277         // a user can't just do multi-sim with single-sim orientation restraints.
 278         bool usingEnsembleRestraints =
 279                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
 280         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
 281
 282         // Replica exchange, ensemble restraints and AWH need all
 283         // simulations to remain synchronized, so they need
 284         // checkpoints and stop conditions to act on the same step, so
 285         // the propagation of such signals must take place between
 286         // simulations, not just within simulations.
 287         // TODO: Make algorithm initializers set these flags.
 288         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 289
 290         if (simulationsShareState)
 291         {
 292             // Inter-simulation signal communication does not need to happen
 293             // often, so we use a minimum of 200 steps to reduce overhead.
 294             const int c_minimumInterSimulationSignallingInterval = 200;
 295             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 296                             * nstglobalcomm;
 297         }
 298     }
 299
 300     if (startingBehavior != StartingBehavior::RestartWithAppending)
 301     {
 302         pleaseCiteCouplingAlgorithms(fplog, *ir);
 303     }
 304     gmx_mdoutf*       outf = init_mdoutf(fplog,
 305                                    nfile,
 306                                    fnm,
 307                                    mdrunOptions,
 308                                    cr,
 309                                    outputProvider,
 310                                    mdModulesNotifier,
 311                                    ir,
 312                                    top_global,
 313                                    oenv,
 314                                    wcycle,
 315                                    startingBehavior,
 316                                    simulationsShareState,
 317                                    ms);
 318     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
 319                                    top_global,
 320                                    *ir,
 321                                    pull_work,
 322                                    mdoutf_get_fp_dhdl(outf),
 323                                    false,
 324                                    startingBehavior,
 325                                    simulationsShareState,
 326                                    mdModulesNotifier);
 327
 328     gstat = global_stat_init(ir);
 329
 330     const auto& simulationWork     = runScheduleWork->simulationWork;
 331     const bool  useGpuForPme       = simulationWork.useGpuPme;
 332     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 333     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 334     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 335
 336     /* Check for polarizable models and flexible constraints */
 337     shellfc = init_shell_flexcon(fplog,
 338                                  top_global,
 339                                  constr ? constr->numFlexibleConstraints() : 0,
 340                                  ir->nstcalcenergy,
 341                                  DOMAINDECOMP(cr),
 342                                  useGpuForPme);
 343
 344     {
 345         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
 346         if ((io > 2000) && MASTER(cr))
 347         {
 348             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 349         }
 350     }
 351
 352     // Local state only becomes valid now.
 353     std::unique_ptr<t_state> stateInstance;
 354     t_state*                 state;
 355
 356     gmx_localtop_t top(top_global.ffparams);
 357
 358     auto mdatoms = mdAtoms->mdatoms();
 359
 360     ForceBuffers f(fr->useMts,
 361                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 362                            ? PinningPolicy::PinnedIfSupported
 363                            : PinningPolicy::CannotBePinned);
 364     if (DOMAINDECOMP(cr))
 365     {
 366         stateInstance = std::make_unique<t_state>();
 367         state         = stateInstance.get();
 368         dd_init_local_state(*cr->dd, state_global, state);
 369
 370         /* Distribute the charge groups over the nodes from the master node */
 371         dd_partition_system(fplog,
 372                             mdlog,
 373                             ir->init_step,
 374                             cr,
 375                             TRUE,
 376                             1,
 377                             state_global,
 378                             top_global,
 379                             *ir,
 380                             imdSession,
 381                             pull_work,
 382                             state,
 383                             &f,
 384                             mdAtoms,
 385                             &top,
 386                             fr,
 387                             vsite,
 388                             constr,
 389                             nrnb,
 390                             nullptr,
 391                             FALSE);
 392         upd.setNumAtoms(state->natoms);
 393     }
 394     else
 395     {
 396         state_change_natoms(state_global, state_global->natoms);
 397         /* Copy the pointer to the global state */
 398         state = state_global;
 399
 400         /* Generate and initialize new topology */
 401         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
 402
 403         upd.setNumAtoms(state->natoms);
 404     }
 405
 406     std::unique_ptr<UpdateConstrainGpu> integrator;
 407
 408     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 409
 410     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
 411     if (useGpuForUpdate)
 412     {
 413         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 414                                    || constr->numConstraintsTotal() == 0,
 415                            "Constraints in domain decomposition are only supported with update "
 416                            "groups if using GPU update.\n");
 417         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
 418                                    || constr->numConstraintsTotal() == 0,
 419                            "SHAKE is not supported with GPU update.");
 420         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 421                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 422                            "the GPU to use GPU update.\n");
 423         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
 424                            "Only the md integrator is supported with the GPU update.\n");
 425         GMX_RELEASE_ASSERT(
 426                 ir->etc != TemperatureCoupling::NoseHoover,
 427                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 428         GMX_RELEASE_ASSERT(
 429                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
 430                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
 431                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
 432                 "with the GPU update.\n");
 433         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 434                            "Virtual sites are not supported with the GPU update.\n");
 435         GMX_RELEASE_ASSERT(ed == nullptr,
 436                            "Essential dynamics is not supported with the GPU update.\n");
 437         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
 438                            "Constraints pulling is not supported with the GPU update.\n");
 439         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
 440                            "Orientation restraints are not supported with the GPU update.\n");
 441         GMX_RELEASE_ASSERT(
 442                 ir->efep == FreeEnergyPerturbationType::No
 443                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
 444                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 445                 "update.");
 446
 447         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 448         {
 449             GMX_LOG(mdlog.info)
 450                     .asParagraph()
 451                     .appendText("Updating coordinates and applying constraints on the GPU.");
 452         }
 453         else
 454         {
 455             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 456         }
 457         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
 458                            "Device stream manager should be initialized in order to use GPU "
 459                            "update-constraints.");
 460         GMX_RELEASE_ASSERT(
 461                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
 462                 "Update stream should be initialized in order to use GPU "
 463                 "update-constraints.");
 464         integrator = std::make_unique<UpdateConstrainGpu>(
 465                 *ir,
 466                 top_global,
 467                 ekind->ngtc,
 468                 fr->deviceStreamManager->context(),
 469                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
 470                 stateGpu->xUpdatedOnDevice(),
 471                 wcycle);
 472
 473         integrator->setPbc(PbcType::Xyz, state->box);
 474     }
 475
 476     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 477     {
 478         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 479     }
 480     if (useGpuForUpdate)
 481     {
 482         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 483     }
 484
 485     // NOTE: The global state is no longer used at this point.
 486     // But state_global is still used as temporary storage space for writing
 487     // the global state to file and potentially for replica exchange.
 488     // (Global topology should persist.)
 489
 490     update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
 491
 492     if (ir->bExpanded)
 493     {
 494         /* Check nstexpanded here, because the grompp check was broken */
 495         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 496         {
 497             gmx_fatal(FARGS,
 498                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 499         }
 500         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 501     }
 502
 503     if (MASTER(cr))
 504     {
 505         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 506     }
 507
 508     preparePrevStepPullCom(ir,
 509                            pull_work,
 510                            gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr),
 511                            state,
 512                            state_global,
 513                            cr,
 514                            startingBehavior != StartingBehavior::NewSimulation);
 515
 516     // TODO: Remove this by converting AWH into a ForceProvider
 517     auto awh = prepareAwhModule(fplog,
 518                                 *ir,
 519                                 state_global,
 520                                 cr,
 521                                 ms,
 522                                 startingBehavior != StartingBehavior::NewSimulation,
 523                                 shellfc != nullptr,
 524                                 opt2fn("-awh", nfile, fnm),
 525                                 pull_work);
 526
 527     if (useReplicaExchange && MASTER(cr))
 528     {
 529         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
 530     }
 531     /* PME tuning is only supported in the Verlet scheme, with PME for
 532      * Coulomb. It is not supported with only LJ PME. */
 533     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 534                 && ir->cutoff_scheme != CutoffScheme::Group);
 535
 536     pme_load_balancing_t* pme_loadbal = nullptr;
 537     if (bPMETune)
 538     {
 539         pme_loadbal_init(
 540                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
 541     }
 542
 543     if (!ir->bContinuation)
 544     {
 545         if (state->flags & enumValueToBitMask(StateEntry::V))
 546         {
 547             auto v = makeArrayRef(state->v);
 548             /* Set the velocities of vsites, shells and frozen atoms to zero */
 549             for (i = 0; i < mdatoms->homenr; i++)
 550             {
 551                 if (mdatoms->ptype[i] == ParticleType::Shell)
 552                 {
 553                     clear_rvec(v[i]);
 554                 }
 555                 else if (mdatoms->cFREEZE)
 556                 {
 557                     for (m = 0; m < DIM; m++)
 558                     {
 559                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 560                         {
 561                             v[i][m] = 0;
 562                         }
 563                     }
 564                 }
 565             }
 566         }
 567
 568         if (constr)
 569         {
 570             /* Constrain the initial coordinates and velocities */
 571             do_constrain_first(fplog,
 572                                constr,
 573                                ir,
 574                                mdatoms->nr,
 575                                mdatoms->homenr,
 576                                state->x.arrayRefWithPadding(),
 577                                state->v.arrayRefWithPadding(),
 578                                state->box,
 579                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
 580         }
 581     }
 582
 583     if (ir->efep != FreeEnergyPerturbationType::No)
 584     {
 585         /* Set free energy calculation frequency as the greatest common
 586          * denominator of nstdhdl and repl_ex_nst. */
 587         nstfep = ir->fepvals->nstdhdl;
 588         if (ir->bExpanded)
 589         {
 590             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
 591         }
 592         if (useReplicaExchange)
 593         {
 594             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
 595         }
 596         if (ir->bDoAwh)
 597         {
 598             nstfep = std::gcd(ir->awhParams->nstSampleCoord(), nstfep);
 599         }
 600     }
 601
 602     /* Be REALLY careful about what flags you set here. You CANNOT assume
 603      * this is the first step, since we might be restarting from a checkpoint,
 604      * and in that case we should not do any modifications to the state.
 605      */
 606     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
 607
 608     // When restarting from a checkpoint, it can be appropriate to
 609     // initialize ekind from quantities in the checkpoint. Otherwise,
 610     // compute_globals must initialize ekind before the simulation
 611     // starts/restarts. However, only the master rank knows what was
 612     // found in the checkpoint file, so we have to communicate in
 613     // order to coordinate the restart.
 614     //
 615     // TODO Consider removing this communication if/when checkpoint
 616     // reading directly follows .tpr reading, because all ranks can
 617     // agree on hasReadEkinState at that time.
 618     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 619     if (PAR(cr))
 620     {
 621         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
 622     }
 623     if (hasReadEkinState)
 624     {
 625         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 626     }
 627
 628     unsigned int cglo_flags =
 629             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 630              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 631
 632     bSumEkinhOld = FALSE;
 633
 634     t_vcm vcm(top_global.groups, *ir);
 635     reportComRemovalInfo(fplog, vcm);
 636
 637     /* To minimize communication, compute_globals computes the COM velocity
 638      * and the kinetic energy for the velocities without COM motion removed.
 639      * Thus to get the kinetic energy without the COM contribution, we need
 640      * to call compute_globals twice.
 641      */
 642     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 643     {
 644         unsigned int cglo_flags_iteration = cglo_flags;
 645         if (bStopCM && cgloIteration == 0)
 646         {
 647             cglo_flags_iteration |= CGLO_STOPCM;
 648             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 649         }
 650         if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd) && cgloIteration == 0)
 651         {
 652             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
 653         }
 654         compute_globals(gstat,
 655                         cr,
 656                         ir,
 657                         fr,
 658                         ekind,
 659                         makeConstArrayRef(state->x),
 660                         makeConstArrayRef(state->v),
 661                         state->box,
 662                         mdatoms,
 663                         nrnb,
 664                         &vcm,
 665                         nullptr,
 666                         enerd,
 667                         force_vir,
 668                         shake_vir,
 669                         total_vir,
 670                         pres,
 671                         gmx::ArrayRef<real>{},
 672                         &nullSignaller,
 673                         state->box,
 674                         &bSumEkinhOld,
 675                         cglo_flags_iteration);
 676         if (cglo_flags_iteration & CGLO_STOPCM)
 677         {
 678             /* At initialization, do not pass x with acceleration-correction mode
 679              * to avoid (incorrect) correction of the initial coordinates.
 680              */
 681             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
 682                              ? ArrayRef<RVec>()
 683                              : makeArrayRef(state->x);
 684             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 685             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 686         }
 687     }
 688     if (DOMAINDECOMP(cr))
 689     {
 690         checkNumberOfBondedInteractions(
 691                 mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
 692     }
 693     if (ir->eI == IntegrationAlgorithm::VVAK)
 694     {
 695         /* a second call to get the half step temperature initialized as well */
 696         /* we do the same call as above, but turn the pressure off -- internally to
 697            compute_globals, this is recognized as a velocity verlet half-step
 698            kinetic energy calculation.  This minimized excess variables, but
 699            perhaps loses some logic?*/
 700
 701         compute_globals(gstat,
 702                         cr,
 703                         ir,
 704                         fr,
 705                         ekind,
 706                         makeConstArrayRef(state->x),
 707                         makeConstArrayRef(state->v),
 708                         state->box,
 709                         mdatoms,
 710                         nrnb,
 711                         &vcm,
 712                         nullptr,
 713                         enerd,
 714                         force_vir,
 715                         shake_vir,
 716                         total_vir,
 717                         pres,
 718                         gmx::ArrayRef<real>{},
 719                         &nullSignaller,
 720                         state->box,
 721                         &bSumEkinhOld,
 722                         cglo_flags & ~CGLO_PRESSURE);
 723     }
 724
 725     /* Calculate the initial half step temperature, and save the ekinh_old */
 726     if (startingBehavior == StartingBehavior::NewSimulation)
 727     {
 728         for (i = 0; (i < ir->opts.ngtc); i++)
 729         {
 730             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 731         }
 732     }
 733
 734     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 735        for non-trotter temperature control */
 736     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 737
 738     if (MASTER(cr))
 739     {
 740         if (!ir->bContinuation)
 741         {
 742             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
 743             {
 744                 fprintf(fplog,
 745                         "RMS relative constraint deviation after constraining: %.2e\n",
 746                         constr->rmsd());
 747             }
 748             if (EI_STATE_VELOCITY(ir->eI))
 749             {
 750                 real temp = enerd->term[F_TEMP];
 751                 if (ir->eI != IntegrationAlgorithm::VV)
 752                 {
 753                     /* Result of Ekin averaged over velocities of -half
 754                      * and +half step, while we only have -half step here.
 755                      */
 756                     temp *= 2;
 757                 }
 758                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 759             }
 760         }
 761
 762         char tbuf[20];
 763         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
 764         if (ir->nsteps >= 0)
 765         {
 766             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 767         }
 768         else
 769         {
 770             sprintf(tbuf, "%s", "infinite");
 771         }
 772         if (ir->init_step > 0)
 773         {
 774             fprintf(stderr,
 775                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 776                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
 777                     tbuf,
 778                     gmx_step_str(ir->init_step, sbuf2),
 779                     ir->init_step * ir->delta_t);
 780         }
 781         else
 782         {
 783             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 784         }
 785         fprintf(fplog, "\n");
 786     }
 787
 788     walltime_accounting_start_time(walltime_accounting);
 789     wallcycle_start(wcycle, ewcRUN);
 790     print_start(fplog, cr, walltime_accounting, "mdrun");
 791
 792     /***********************************************************
 793      *
 794      *             Loop over MD steps
 795      *
 796      ************************************************************/
 797
 798     bFirstStep = TRUE;
 799     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 800     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 801     bSumEkinhOld     = FALSE;
 802     bExchanged       = FALSE;
 803     bNeedRepartition = FALSE;
 804
 805     step     = ir->init_step;
 806     step_rel = 0;
 807
 808     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 809             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
 810             simulationsShareState,
 811             MASTER(cr),
 812             ir->nstlist,
 813             mdrunOptions.reproducible,
 814             nstSignalComm,
 815             mdrunOptions.maximumHoursToRun,
 816             ir->nstlist == 0,
 817             fplog,
 818             step,
 819             bNS,
 820             walltime_accounting);
 821
 822     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 823             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
 824             simulationsShareState,
 825             ir->nstlist == 0,
 826             MASTER(cr),
 827             mdrunOptions.writeConfout,
 828             mdrunOptions.checkpointOptions.period);
 829
 830     const bool resetCountersIsLocal = true;
 831     auto       resetHandler         = std::make_unique<ResetHandler>(
 832             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 833             !resetCountersIsLocal,
 834             ir->nsteps,
 835             MASTER(cr),
 836             mdrunOptions.timingOptions.resetHalfway,
 837             mdrunOptions.maximumHoursToRun,
 838             mdlog,
 839             wcycle,
 840             walltime_accounting);
 841
 842     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 843
 844     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 845     {
 846         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
 847     }
 848
 849     /* and stop now if we should */
 850     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 851     while (!bLastStep)
 852     {
 853
 854         /* Determine if this is a neighbor search step */
 855         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 856
 857         if (bPMETune && bNStList)
 858         {
 859             // This has to be here because PME load balancing is called so early.
 860             // TODO: Move to after all booleans are defined.
 861             if (useGpuForUpdate && !bFirstStep)
 862             {
 863                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 864                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 865             }
 866             /* PME grid + cut-off optimization with GPUs or PME nodes */
 867             pme_loadbal_do(pme_loadbal,
 868                            cr,
 869                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 870                            fplog,
 871                            mdlog,
 872                            *ir,
 873                            fr,
 874                            state->box,
 875                            state->x,
 876                            wcycle,
 877                            step,
 878                            step_rel,
 879                            &bPMETunePrinting,
 880                            simulationWork.useGpuPmePpCommunication);
 881         }
 882
 883         wallcycle_start(wcycle, ewcSTEP);
 884
 885         bLastStep = (step_rel == ir->nsteps);
 886         t         = t0 + step * ir->delta_t;
 887
 888         // TODO Refactor this, so that nstfep does not need a default value of zero
 889         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
 890         {
 891             /* find and set the current lambdas */
 892             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
 893
 894             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
 895             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
 896             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 897                            && (!bFirstStep));
 898         }
 899
 900         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 901                      && do_per_step(step, replExParams.exchangeInterval));
 902
 903         if (doSimulatedAnnealing)
 904         {
 905             // TODO: Avoid changing inputrec (#3854)
 906             // Simulated annealing updates the reference temperature.
 907             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
 908             update_annealing_target_temp(nonConstInputrec, t, &upd);
 909         }
 910
 911         /* Stop Center of Mass motion */
 912         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
 913
 914         /* Determine whether or not to do Neighbour Searching */
 915         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 916
 917         /* Note that the stopHandler will cause termination at nstglobalcomm
 918          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 919          * nstpcouple steps, we have computed the half-step kinetic energy
 920          * of the previous step and can always output energies at the last step.
 921          */
 922         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 923
 924         /* do_log triggers energy and virial calculation. Because this leads
 925          * to different code paths, forces can be different. Thus for exact
 926          * continuation we should avoid extra log output.
 927          * Note that the || bLastStep can result in non-exact continuation
 928          * beyond the last step. But we don't consider that to be an issue.
 929          */
 930         do_log     = (do_per_step(step, ir->nstlog)
 931                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 932         do_verbose = mdrunOptions.verbose
 933                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 934
 935         if (useGpuForUpdate && !bFirstStep && bNS)
 936         {
 937             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 938             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 939             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 940             // Copy coordinate from the GPU when needed at the search step.
 941             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 942             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 943             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 944             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 945         }
 946
 947         // We only need to calculate virtual velocities if we are writing them in the current step
 948         const bool needVirtualVelocitiesThisStep =
 949                 (vsite != nullptr)
 950                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
 951
 952         if (vsite != nullptr)
 953         {
 954             // Virtual sites need to be updated before domain decomposition and forces are calculated
 955             wallcycle_start(wcycle, ewcVSITECONSTR);
 956             // md-vv calculates virtual velocities once it has full-step real velocities
 957             vsite->construct(state->x,
 958                              state->v,
 959                              state->box,
 960                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
 961                                      ? VSiteOperation::PositionsAndVelocities
 962                                      : VSiteOperation::Positions);
 963             wallcycle_stop(wcycle, ewcVSITECONSTR);
 964         }
 965
 966         if (bNS && !(bFirstStep && ir->bContinuation))
 967         {
 968             bMasterState = FALSE;
 969             /* Correct the new box if it is too skewed */
 970             if (inputrecDynamicBox(ir))
 971             {
 972                 if (correct_box(fplog, step, state->box))
 973                 {
 974                     bMasterState = TRUE;
 975                     // If update is offloaded, it should be informed about the box size change
 976                     if (useGpuForUpdate)
 977                     {
 978                         integrator->setPbc(PbcType::Xyz, state->box);
 979                     }
 980                 }
 981             }
 982             if (DOMAINDECOMP(cr) && bMasterState)
 983             {
 984                 dd_collect_state(cr->dd, state, state_global);
 985             }
 986
 987             if (DOMAINDECOMP(cr))
 988             {
 989                 /* Repartition the domain decomposition */
 990                 dd_partition_system(fplog,
 991                                     mdlog,
 992                                     step,
 993                                     cr,
 994                                     bMasterState,
 995                                     nstglobalcomm,
 996                                     state_global,
 997                                     top_global,
 998                                     *ir,
 999                                     imdSession,
1000                                     pull_work,
1001                                     state,
1002                                     &f,
1003                                     mdAtoms,
1004                                     &top,
1005                                     fr,
1006                                     vsite,
1007                                     constr,
1008                                     nrnb,
1009                                     wcycle,
1010                                     do_verbose && !bPMETunePrinting);
1011                 upd.setNumAtoms(state->natoms);
1012             }
1013         }
1014
1015         // Allocate or re-size GPU halo exchange object, if necessary
1016         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1017         {
1018             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1019                                "GPU device manager has to be initialized to use GPU "
1020                                "version of halo exchange.");
1021             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1022         }
1023
1024         if (MASTER(cr) && do_log)
1025         {
1026             gmx::EnergyOutput::printHeader(
1027                     fplog, step, t); /* can we improve the information printed here? */
1028         }
1029
1030         if (ir->efep != FreeEnergyPerturbationType::No)
1031         {
1032             update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1033         }
1034
1035         if (bExchanged)
1036         {
1037             /* We need the kinetic energy at minus the half step for determining
1038              * the full step kinetic energy and possibly for T-coupling.*/
1039             /* This may not be quite working correctly yet . . . . */
1040             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1041             if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd))
1042             {
1043                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1044             }
1045             compute_globals(gstat,
1046                             cr,
1047                             ir,
1048                             fr,
1049                             ekind,
1050                             makeConstArrayRef(state->x),
1051                             makeConstArrayRef(state->v),
1052                             state->box,
1053                             mdatoms,
1054                             nrnb,
1055                             &vcm,
1056                             wcycle,
1057                             enerd,
1058                             nullptr,
1059                             nullptr,
1060                             nullptr,
1061                             nullptr,
1062                             gmx::ArrayRef<real>{},
1063                             &nullSignaller,
1064                             state->box,
1065                             &bSumEkinhOld,
1066                             cglo_flags);
1067             if (DOMAINDECOMP(cr))
1068             {
1069                 checkNumberOfBondedInteractions(
1070                         mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1071             }
1072         }
1073         clear_mat(force_vir);
1074
1075         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1076
1077         /* Determine the energy and pressure:
1078          * at nstcalcenergy steps and at energy output steps (set below).
1079          */
1080         if (EI_VV(ir->eI) && (!bInitStep))
1081         {
1082             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1083             bCalcVir      = bCalcEnerStep
1084                        || (ir->epc != PressureCoupling::No
1085                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1086         }
1087         else
1088         {
1089             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1090             bCalcVir      = bCalcEnerStep
1091                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1092         }
1093         bCalcEner = bCalcEnerStep;
1094
1095         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1096
1097         if (do_ene || do_log || bDoReplEx)
1098         {
1099             bCalcVir  = TRUE;
1100             bCalcEner = TRUE;
1101         }
1102
1103         /* Do we need global communication ? */
1104         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1105                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1106
1107         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1108                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1109                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1110         if (fr->useMts && !do_per_step(step, ir->nstfout))
1111         {
1112             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1113         }
1114
1115         if (shellfc)
1116         {
1117             /* Now is the time to relax the shells */
1118             relax_shell_flexcon(fplog,
1119                                 cr,
1120                                 ms,
1121                                 mdrunOptions.verbose,
1122                                 enforcedRotation,
1123                                 step,
1124                                 ir,
1125                                 imdSession,
1126                                 pull_work,
1127                                 bNS,
1128                                 force_flags,
1129                                 &top,
1130                                 constr,
1131                                 enerd,
1132                                 state->natoms,
1133                                 state->x.arrayRefWithPadding(),
1134                                 state->v.arrayRefWithPadding(),
1135                                 state->box,
1136                                 state->lambda,
1137                                 &state->hist,
1138                                 &f.view(),
1139                                 force_vir,
1140                                 *mdatoms,
1141                                 nrnb,
1142                                 wcycle,
1143                                 shellfc,
1144                                 fr,
1145                                 runScheduleWork,
1146                                 t,
1147                                 mu_tot,
1148                                 vsite,
1149                                 ddBalanceRegionHandler);
1150         }
1151         else
1152         {
1153             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1154                is updated (or the AWH update will be performed twice for one step when continuing).
1155                It would be best to call this update function from do_md_trajectory_writing but that
1156                would occur after do_force. One would have to divide the update_awh function into one
1157                function applying the AWH force and one doing the AWH bias update. The update AWH
1158                bias function could then be called after do_md_trajectory_writing (then containing
1159                update_awh_history). The checkpointing will in the future probably moved to the start
1160                of the md loop which will rid of this issue. */
1161             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1162             {
1163                 awh->updateHistory(state_global->awhHistory.get());
1164             }
1165
1166             /* The coordinates (x) are shifted (to get whole molecules)
1167              * in do_force.
1168              * This is parallellized as well, and does communication too.
1169              * Check comments in sim_util.c
1170              */
1171             do_force(fplog,
1172                      cr,
1173                      ms,
1174                      *ir,
1175                      awh.get(),
1176                      enforcedRotation,
1177                      imdSession,
1178                      pull_work,
1179                      step,
1180                      nrnb,
1181                      wcycle,
1182                      &top,
1183                      state->box,
1184                      state->x.arrayRefWithPadding(),
1185                      &state->hist,
1186                      &f.view(),
1187                      force_vir,
1188                      mdatoms,
1189                      enerd,
1190                      state->lambda,
1191                      fr,
1192                      runScheduleWork,
1193                      vsite,
1194                      mu_tot,
1195                      t,
1196                      ed ? ed->getLegacyED() : nullptr,
1197                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1198                      ddBalanceRegionHandler);
1199         }
1200
1201         // VV integrators do not need the following velocity half step
1202         // if it is the first step after starting from a checkpoint.
1203         // That is, the half step is needed on all other steps, and
1204         // also the first step when starting from a .tpr file.
1205         if (EI_VV(ir->eI))
1206         {
1207             integrateVVFirstStep(step,
1208                                  bFirstStep,
1209                                  bInitStep,
1210                                  startingBehavior,
1211                                  nstglobalcomm,
1212                                  ir,
1213                                  fr,
1214                                  cr,
1215                                  state,
1216                                  mdatoms,
1217                                  fcdata,
1218                                  &MassQ,
1219                                  &vcm,
1220                                  top_global,
1221                                  top,
1222                                  enerd,
1223                                  ekind,
1224                                  gstat,
1225                                  &last_ekin,
1226                                  bCalcVir,
1227                                  total_vir,
1228                                  shake_vir,
1229                                  force_vir,
1230                                  pres,
1231                                  M,
1232                                  do_log,
1233                                  do_ene,
1234                                  bCalcEner,
1235                                  bGStat,
1236                                  bStopCM,
1237                                  bTrotter,
1238                                  bExchanged,
1239                                  &bSumEkinhOld,
1240                                  &saved_conserved_quantity,
1241                                  &f,
1242                                  &upd,
1243                                  constr,
1244                                  &nullSignaller,
1245                                  trotter_seq,
1246                                  nrnb,
1247                                  mdlog,
1248                                  fplog,
1249                                  wcycle);
1250             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1251             {
1252                 // Positions were calculated earlier
1253                 wallcycle_start(wcycle, ewcVSITECONSTR);
1254                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1255                 wallcycle_stop(wcycle, ewcVSITECONSTR);
1256             }
1257         }
1258
1259         /* ########  END FIRST UPDATE STEP  ############## */
1260         /* ########  If doing VV, we now have v(dt) ###### */
1261         if (bDoExpanded)
1262         {
1263             /* perform extended ensemble sampling in lambda - we don't
1264                actually move to the new state before outputting
1265                statistics, but if performing simulated tempering, we
1266                do update the velocities and the tau_t. */
1267             // TODO: Avoid changing inputrec (#3854)
1268             // Simulated tempering updates the reference temperature.
1269             // Expanded ensemble without simulated tempering does not change the inputrec.
1270             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1271             lamnew                 = ExpandedEnsembleDynamics(fplog,
1272                                               nonConstInputrec,
1273                                               enerd,
1274                                               state,
1275                                               &MassQ,
1276                                               state->fep_state,
1277                                               state->dfhist,
1278                                               step,
1279                                               state->v.rvec_array(),
1280                                               mdatoms);
1281             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1282             if (MASTER(cr))
1283             {
1284                 copy_df_history(state_global->dfhist, state->dfhist);
1285             }
1286         }
1287
1288         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1289         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1290         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1291             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1292                 || checkpointHandler->isCheckpointingStep()))
1293         {
1294             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1295             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1296         }
1297         // Copy velocities if needed for the output/checkpointing.
1298         // NOTE: Copy on the search steps is done at the beginning of the step.
1299         if (useGpuForUpdate && !bNS
1300             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1301         {
1302             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1303             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1304         }
1305         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1306         // and update is offloaded hence forces are kept on the GPU for update and have not been
1307         // already transferred in do_force().
1308         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1309         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1310         //       prior to GPU update.
1311         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1312         //       copy call in do_force(...).
1313         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1314         //       on host after the D2H copy in do_force(...).
1315         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1316             && do_per_step(step, ir->nstfout))
1317         {
1318             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1319             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1320         }
1321         /* Now we have the energies and forces corresponding to the
1322          * coordinates at time t. We must output all of this before
1323          * the update.
1324          */
1325         do_md_trajectory_writing(fplog,
1326                                  cr,
1327                                  nfile,
1328                                  fnm,
1329                                  step,
1330                                  step_rel,
1331                                  t,
1332                                  ir,
1333                                  state,
1334                                  state_global,
1335                                  observablesHistory,
1336                                  top_global,
1337                                  fr,
1338                                  outf,
1339                                  energyOutput,
1340                                  ekind,
1341                                  f.view().force(),
1342                                  checkpointHandler->isCheckpointingStep(),
1343                                  bRerunMD,
1344                                  bLastStep,
1345                                  mdrunOptions.writeConfout,
1346                                  bSumEkinhOld);
1347         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1348         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1349
1350         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1351         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1352             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1353         {
1354             copy_mat(state->svir_prev, shake_vir);
1355             copy_mat(state->fvir_prev, force_vir);
1356         }
1357
1358         stopHandler->setSignal();
1359         resetHandler->setSignal(walltime_accounting);
1360
1361         if (bGStat || !PAR(cr))
1362         {
1363             /* In parallel we only have to check for checkpointing in steps
1364              * where we do global communication,
1365              *  otherwise the other nodes don't know.
1366              */
1367             checkpointHandler->setSignal(walltime_accounting);
1368         }
1369
1370         /* #########   START SECOND UPDATE STEP ################# */
1371
1372         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1373            controlled in preprocessing */
1374
1375         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1376         {
1377             gmx_bool bIfRandomize;
1378             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1379             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1380             if (constr && bIfRandomize)
1381             {
1382                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1383             }
1384         }
1385         /* Box is changed in update() when we do pressure coupling,
1386          * but we should still use the old box for energy corrections and when
1387          * writing it to the energy file, so it matches the trajectory files for
1388          * the same timestep above. Make a copy in a separate array.
1389          */
1390         copy_mat(state->box, lastbox);
1391
1392         dvdl_constr = 0;
1393
1394         if (!useGpuForUpdate)
1395         {
1396             wallcycle_start(wcycle, ewcUPDATE);
1397         }
1398         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1399         if (bTrotter)
1400         {
1401             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1402             /* We can only do Berendsen coupling after we have summed
1403              * the kinetic energy or virial. Since the happens
1404              * in global_state after update, we should only do it at
1405              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1406              */
1407         }
1408         else
1409         {
1410             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1411             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1412         }
1413
1414         /* With leap-frog type integrators we compute the kinetic energy
1415          * at a whole time step as the average of the half-time step kinetic
1416          * energies of two subsequent steps. Therefore we need to compute the
1417          * half step kinetic energy also if we need energies at the next step.
1418          */
1419         const bool needHalfStepKineticEnergy =
1420                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1421
1422         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1423         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1424         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1425                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1426
1427         if (EI_VV(ir->eI))
1428         {
1429             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1430
1431             integrateVVSecondStep(step,
1432                                   ir,
1433                                   fr,
1434                                   cr,
1435                                   state,
1436                                   mdatoms,
1437                                   fcdata,
1438                                   &MassQ,
1439                                   &vcm,
1440                                   pull_work,
1441                                   enerd,
1442                                   ekind,
1443                                   gstat,
1444                                   &dvdl_constr,
1445                                   bCalcVir,
1446                                   total_vir,
1447                                   shake_vir,
1448                                   force_vir,
1449                                   pres,
1450                                   M,
1451                                   lastbox,
1452                                   do_log,
1453                                   do_ene,
1454                                   bGStat,
1455                                   &bSumEkinhOld,
1456                                   &f,
1457                                   &cbuf,
1458                                   &upd,
1459                                   constr,
1460                                   &nullSignaller,
1461                                   trotter_seq,
1462                                   nrnb,
1463                                   wcycle);
1464         }
1465         else
1466         {
1467             if (useGpuForUpdate)
1468             {
1469                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1470                 {
1471                     integrator->set(stateGpu->getCoordinates(),
1472                                     stateGpu->getVelocities(),
1473                                     stateGpu->getForces(),
1474                                     top.idef,
1475                                     *mdatoms);
1476
1477                     // Copy data to the GPU after buffers might have being reinitialized
1478                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1479                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1480                 }
1481
1482                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1483                     && !thisRankHasDuty(cr, DUTY_PME))
1484                 {
1485                     // The PME forces were recieved to the host, so have to be copied
1486                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1487                 }
1488                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1489                 {
1490                     // The buffer ops were not offloaded this step, so the forces are on the
1491                     // host and have to be copied
1492                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1493                 }
1494
1495                 const bool doTemperatureScaling =
1496                         (ir->etc != TemperatureCoupling::No
1497                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1498
1499                 // This applies Leap-Frog, LINCS and SETTLE in succession
1500                 integrator->integrate(
1501                         stateGpu->getForcesReadyOnDeviceEvent(
1502                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1503                         ir->delta_t,
1504                         true,
1505                         bCalcVir,
1506                         shake_vir,
1507                         doTemperatureScaling,
1508                         ekind->tcstat,
1509                         doParrinelloRahman,
1510                         ir->nstpcouple * ir->delta_t,
1511                         M);
1512
1513                 // Copy velocities D2H after update if:
1514                 // - Globals are computed this step (includes the energy output steps).
1515                 // - Temperature is needed for the next step.
1516                 if (bGStat || needHalfStepKineticEnergy)
1517                 {
1518                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1519                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1520                 }
1521             }
1522             else
1523             {
1524                 /* With multiple time stepping we need to do an additional normal
1525                  * update step to obtain the virial, as the actual MTS integration
1526                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1527                  * Using that acceleration would result in a virial with the slow
1528                  * force contribution would be a factor mtsFactor too large.
1529                  */
1530                 if (fr->useMts && bCalcVir && constr != nullptr)
1531                 {
1532                     upd.update_for_constraint_virial(
1533                             *ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1534
1535                     constrain_coordinates(constr,
1536                                           do_log,
1537                                           do_ene,
1538                                           step,
1539                                           state,
1540                                           upd.xp()->arrayRefWithPadding(),
1541                                           &dvdl_constr,
1542                                           bCalcVir,
1543                                           shake_vir);
1544                 }
1545
1546                 ArrayRefWithPadding<const RVec> forceCombined =
1547                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1548                                 ? f.view().forceMtsCombinedWithPadding()
1549                                 : f.view().forceWithPadding();
1550                 upd.update_coords(
1551                         *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
1552
1553                 wallcycle_stop(wcycle, ewcUPDATE);
1554
1555                 constrain_coordinates(constr,
1556                                       do_log,
1557                                       do_ene,
1558                                       step,
1559                                       state,
1560                                       upd.xp()->arrayRefWithPadding(),
1561                                       &dvdl_constr,
1562                                       bCalcVir && !fr->useMts,
1563                                       shake_vir);
1564
1565                 upd.update_sd_second_half(
1566                         *ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle, constr, do_log, do_ene);
1567                 upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1568             }
1569
1570             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1571             {
1572                 updatePrevStepPullCom(pull_work, state);
1573             }
1574
1575             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1576         }
1577
1578         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1579         /* With Leap-Frog we can skip compute_globals at
1580          * non-communication steps, but we need to calculate
1581          * the kinetic energy one step before communication.
1582          */
1583         {
1584             // Organize to do inter-simulation signalling on steps if
1585             // and when algorithms require it.
1586             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1587
1588             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1589             {
1590                 // Copy coordinates when needed to stop the CM motion.
1591                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1592                 {
1593                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1594                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1595                 }
1596                 // Since we're already communicating at this step, we
1597                 // can propagate intra-simulation signals. Note that
1598                 // check_nstglobalcomm has the responsibility for
1599                 // choosing the value of nstglobalcomm that is one way
1600                 // bGStat becomes true, so we can't get into a
1601                 // situation where e.g. checkpointing can't be
1602                 // signalled.
1603                 bool                doIntraSimSignal = true;
1604                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1605
1606                 compute_globals(
1607                         gstat,
1608                         cr,
1609                         ir,
1610                         fr,
1611                         ekind,
1612                         makeConstArrayRef(state->x),
1613                         makeConstArrayRef(state->v),
1614                         state->box,
1615                         mdatoms,
1616                         nrnb,
1617                         &vcm,
1618                         wcycle,
1619                         enerd,
1620                         force_vir,
1621                         shake_vir,
1622                         total_vir,
1623                         pres,
1624                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1625                                                                            : gmx::ArrayRef<real>{},
1626                         &signaller,
1627                         lastbox,
1628                         &bSumEkinhOld,
1629                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1630                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1631                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1632                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1633                                 | (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd)
1634                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1635                                            : 0));
1636                 if (DOMAINDECOMP(cr))
1637                 {
1638                     checkNumberOfBondedInteractions(
1639                             mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1640                 }
1641                 if (!EI_VV(ir->eI) && bStopCM)
1642                 {
1643                     process_and_stopcm_grp(
1644                             fplog, &vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
1645                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1646
1647                     // TODO: The special case of removing CM motion should be dealt more gracefully
1648                     if (useGpuForUpdate)
1649                     {
1650                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1651                         // Here we block until the H2D copy completes because event sync with the
1652                         // force kernels that use the coordinates on the next steps is not implemented
1653                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1654                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1655                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1656                         if (vcm.mode != ComRemovalAlgorithm::No)
1657                         {
1658                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1659                         }
1660                     }
1661                 }
1662             }
1663         }
1664
1665         /* #############  END CALC EKIN AND PRESSURE ################# */
1666
1667         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1668            the virial that should probably be addressed eventually. state->veta has better properies,
1669            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1670            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1671
1672         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1673         {
1674             /* Sum up the foreign energy and dK/dl terms for md and sd.
1675                Currently done every step so that dH/dl is correct in the .edr */
1676             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1677         }
1678
1679         update_pcouple_after_coordinates(
1680                 fplog, step, ir, mdatoms, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1681
1682         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1683                                                   && do_per_step(step, inputrec->nstpcouple));
1684         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1685                                                  && do_per_step(step, inputrec->nstpcouple));
1686         if (useGpuForUpdate
1687             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1688         {
1689             integrator->scaleCoordinates(pressureCouplingMu);
1690             if (doCRescalePressureCoupling)
1691             {
1692                 matrix pressureCouplingInvMu;
1693                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1694                 integrator->scaleVelocities(pressureCouplingInvMu);
1695             }
1696             integrator->setPbc(PbcType::Xyz, state->box);
1697         }
1698
1699         /* ################# END UPDATE STEP 2 ################# */
1700         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1701
1702         /* The coordinates (x) were unshifted in update */
1703         if (!bGStat)
1704         {
1705             /* We will not sum ekinh_old,
1706              * so signal that we still have to do it.
1707              */
1708             bSumEkinhOld = TRUE;
1709         }
1710
1711         if (bCalcEner)
1712         {
1713             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1714
1715             /* use the directly determined last velocity, not actually the averaged half steps */
1716             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1717             {
1718                 enerd->term[F_EKIN] = last_ekin;
1719             }
1720             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1721
1722             if (integratorHasConservedEnergyQuantity(ir))
1723             {
1724                 if (EI_VV(ir->eI))
1725                 {
1726                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1727                 }
1728                 else
1729                 {
1730                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1731                 }
1732             }
1733             /* #########  END PREPARING EDR OUTPUT  ###########  */
1734         }
1735
1736         /* Output stuff */
1737         if (MASTER(cr))
1738         {
1739             if (fplog && do_log && bDoExpanded)
1740             {
1741                 /* only needed if doing expanded ensemble */
1742                 PrintFreeEnergyInfoToFile(fplog,
1743                                           ir->fepvals.get(),
1744                                           ir->expandedvals.get(),
1745                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1746                                           state_global->dfhist,
1747                                           state->fep_state,
1748                                           ir->nstlog,
1749                                           step);
1750             }
1751             if (bCalcEner)
1752             {
1753                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1754                                                  bCalcEnerStep,
1755                                                  t,
1756                                                  mdatoms->tmass,
1757                                                  enerd,
1758                                                  ir->fepvals.get(),
1759                                                  ir->expandedvals.get(),
1760                                                  lastbox,
1761                                                  PTCouplingArrays{ state->boxv,
1762                                                                    state->nosehoover_xi,
1763                                                                    state->nosehoover_vxi,
1764                                                                    state->nhpres_xi,
1765                                                                    state->nhpres_vxi },
1766                                                  state->fep_state,
1767                                                  shake_vir,
1768                                                  force_vir,
1769                                                  total_vir,
1770                                                  pres,
1771                                                  ekind,
1772                                                  mu_tot,
1773                                                  constr);
1774             }
1775             else
1776             {
1777                 energyOutput.recordNonEnergyStep();
1778             }
1779
1780             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1781             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1782
1783             if (doSimulatedAnnealing)
1784             {
1785                 gmx::EnergyOutput::printAnnealingTemperatures(
1786                         do_log ? fplog : nullptr, groups, &(ir->opts));
1787             }
1788             if (do_log || do_ene || do_dr || do_or)
1789             {
1790                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1791                                                    do_ene,
1792                                                    do_dr,
1793                                                    do_or,
1794                                                    do_log ? fplog : nullptr,
1795                                                    step,
1796                                                    t,
1797                                                    fr->fcdata.get(),
1798                                                    awh.get());
1799             }
1800             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1801             {
1802                 const bool isInitialOutput = false;
1803                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1804             }
1805
1806             if (ir->bPull)
1807             {
1808                 pull_print_output(pull_work, step, t);
1809             }
1810
1811             if (do_per_step(step, ir->nstlog))
1812             {
1813                 if (fflush(fplog) != 0)
1814                 {
1815                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1816                 }
1817             }
1818         }
1819         if (bDoExpanded)
1820         {
1821             /* Have to do this part _after_ outputting the logfile and the edr file */
1822             /* Gets written into the state at the beginning of next loop*/
1823             state->fep_state = lamnew;
1824         }
1825         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1826         {
1827             state->fep_state = awh->fepLambdaState();
1828         }
1829         /* Print the remaining wall clock time for the run */
1830         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1831         {
1832             if (shellfc)
1833             {
1834                 fprintf(stderr, "\n");
1835             }
1836             print_time(stderr, walltime_accounting, step, ir, cr);
1837         }
1838
1839         /* Ion/water position swapping.
1840          * Not done in last step since trajectory writing happens before this call
1841          * in the MD loop and exchanges would be lost anyway. */
1842         bNeedRepartition = FALSE;
1843         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1844             && do_per_step(step, ir->swap->nstswap))
1845         {
1846             bNeedRepartition = do_swapcoords(cr,
1847                                              step,
1848                                              t,
1849                                              ir,
1850                                              swap,
1851                                              wcycle,
1852                                              as_rvec_array(state->x.data()),
1853                                              state->box,
1854                                              MASTER(cr) && mdrunOptions.verbose,
1855                                              bRerunMD);
1856
1857             if (bNeedRepartition && DOMAINDECOMP(cr))
1858             {
1859                 dd_collect_state(cr->dd, state, state_global);
1860             }
1861         }
1862
1863         /* Replica exchange */
1864         bExchanged = FALSE;
1865         if (bDoReplEx)
1866         {
1867             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1868         }
1869
1870         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1871         {
1872             dd_partition_system(fplog,
1873                                 mdlog,
1874                                 step,
1875                                 cr,
1876                                 TRUE,
1877                                 1,
1878                                 state_global,
1879                                 top_global,
1880                                 *ir,
1881                                 imdSession,
1882                                 pull_work,
1883                                 state,
1884                                 &f,
1885                                 mdAtoms,
1886                                 &top,
1887                                 fr,
1888                                 vsite,
1889                                 constr,
1890                                 nrnb,
1891                                 wcycle,
1892                                 FALSE);
1893             upd.setNumAtoms(state->natoms);
1894         }
1895
1896         bFirstStep = FALSE;
1897         bInitStep  = FALSE;
1898
1899         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1900         /* With all integrators, except VV, we need to retain the pressure
1901          * at the current step for coupling at the next step.
1902          */
1903         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1904             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1905         {
1906             /* Store the pressure in t_state for pressure coupling
1907              * at the next MD step.
1908              */
1909             copy_mat(pres, state->pres_prev);
1910         }
1911
1912         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1913
1914         if ((membed != nullptr) && (!bLastStep))
1915         {
1916             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1917         }
1918
1919         cycles = wallcycle_stop(wcycle, ewcSTEP);
1920         if (DOMAINDECOMP(cr) && wcycle)
1921         {
1922             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1923         }
1924
1925         /* increase the MD step number */
1926         step++;
1927         step_rel++;
1928
1929 #if GMX_FAHCORE
1930         if (MASTER(cr))
1931         {
1932             fcReportProgress(ir->nsteps + ir->init_step, step);
1933         }
1934 #endif
1935
1936         resetHandler->resetCounters(
1937                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1938
1939         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1940         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1941     }
1942     /* End of main MD loop */
1943
1944     /* Closing TNG files can include compressing data. Therefore it is good to do that
1945      * before stopping the time measurements. */
1946     mdoutf_tng_close(outf);
1947
1948     /* Stop measuring walltime */
1949     walltime_accounting_end_time(walltime_accounting);
1950
1951     if (!thisRankHasDuty(cr, DUTY_PME))
1952     {
1953         /* Tell the PME only node to finish */
1954         gmx_pme_send_finish(cr);
1955     }
1956
1957     if (MASTER(cr))
1958     {
1959         if (ir->nstcalcenergy > 0)
1960         {
1961             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
1962
1963             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1964             energyOutput.printAverages(fplog, groups);
1965         }
1966     }
1967     done_mdoutf(outf);
1968
1969     if (bPMETune)
1970     {
1971         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1972     }
1973
1974     done_shellfc(fplog, shellfc, step_rel);
1975
1976     if (useReplicaExchange && MASTER(cr))
1977     {
1978         print_replica_exchange_statistics(fplog, repl_ex);
1979     }
1980
1981     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1982
1983     global_stat_destroy(gstat);
1984 }