src/gromacs/mdrun/md.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the integrator for normal molecular dynamics simulations
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include <cinttypes>
  47 #include <cmath>
  48 #include <cstdio>
  49 #include <cstdlib>
  50
  51 #include <algorithm>
  52 #include <memory>
  53
  54 #include "gromacs/awh/awh.h"
  55 #include "gromacs/commandline/filenm.h"
  56 #include "gromacs/domdec/collect.h"
  57 #include "gromacs/domdec/dlbtiming.h"
  58 #include "gromacs/domdec/domdec.h"
  59 #include "gromacs/domdec/domdec_network.h"
  60 #include "gromacs/domdec/domdec_struct.h"
  61 #include "gromacs/domdec/gpuhaloexchange.h"
  62 #include "gromacs/domdec/mdsetup.h"
  63 #include "gromacs/domdec/partition.h"
  64 #include "gromacs/essentialdynamics/edsam.h"
  65 #include "gromacs/ewald/pme_load_balancing.h"
  66 #include "gromacs/ewald/pme_pp.h"
  67 #include "gromacs/fileio/trxio.h"
  68 #include "gromacs/gmxlib/network.h"
  69 #include "gromacs/gmxlib/nrnb.h"
  70 #include "gromacs/gpu_utils/gpu_utils.h"
  71 #include "gromacs/imd/imd.h"
  72 #include "gromacs/listed_forces/manage_threading.h"
  73 #include "gromacs/math/functions.h"
  74 #include "gromacs/math/utilities.h"
  75 #include "gromacs/math/vec.h"
  76 #include "gromacs/math/vectypes.h"
  77 #include "gromacs/mdlib/checkpointhandler.h"
  78 #include "gromacs/mdlib/compute_io.h"
  79 #include "gromacs/mdlib/constr.h"
  80 #include "gromacs/mdlib/ebin.h"
  81 #include "gromacs/mdlib/enerdata_utils.h"
  82 #include "gromacs/mdlib/energyoutput.h"
  83 #include "gromacs/mdlib/expanded.h"
  84 #include "gromacs/mdlib/force.h"
  85 #include "gromacs/mdlib/force_flags.h"
  86 #include "gromacs/mdlib/forcerec.h"
  87 #include "gromacs/mdlib/md_support.h"
  88 #include "gromacs/mdlib/mdatoms.h"
  89 #include "gromacs/mdlib/mdoutf.h"
  90 #include "gromacs/mdlib/membed.h"
  91 #include "gromacs/mdlib/resethandler.h"
  92 #include "gromacs/mdlib/sighandler.h"
  93 #include "gromacs/mdlib/simulationsignal.h"
  94 #include "gromacs/mdlib/stat.h"
  95 #include "gromacs/mdlib/stophandler.h"
  96 #include "gromacs/mdlib/tgroup.h"
  97 #include "gromacs/mdlib/trajectory_writing.h"
  98 #include "gromacs/mdlib/update.h"
  99 #include "gromacs/mdlib/update_constrain_gpu.h"
 100 #include "gromacs/mdlib/vcm.h"
 101 #include "gromacs/mdlib/vsite.h"
 102 #include "gromacs/mdrunutility/handlerestart.h"
 103 #include "gromacs/mdrunutility/multisim.h"
 104 #include "gromacs/mdrunutility/printtime.h"
 105 #include "gromacs/mdtypes/awh_history.h"
 106 #include "gromacs/mdtypes/awh_params.h"
 107 #include "gromacs/mdtypes/commrec.h"
 108 #include "gromacs/mdtypes/df_history.h"
 109 #include "gromacs/mdtypes/energyhistory.h"
 110 #include "gromacs/mdtypes/fcdata.h"
 111 #include "gromacs/mdtypes/forcerec.h"
 112 #include "gromacs/mdtypes/group.h"
 113 #include "gromacs/mdtypes/inputrec.h"
 114 #include "gromacs/mdtypes/interaction_const.h"
 115 #include "gromacs/mdtypes/md_enums.h"
 116 #include "gromacs/mdtypes/mdatom.h"
 117 #include "gromacs/mdtypes/mdrunoptions.h"
 118 #include "gromacs/mdtypes/observableshistory.h"
 119 #include "gromacs/mdtypes/pullhistory.h"
 120 #include "gromacs/mdtypes/simulation_workload.h"
 121 #include "gromacs/mdtypes/state.h"
 122 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 123 #include "gromacs/modularsimulator/energyelement.h"
 124 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 125 #include "gromacs/nbnxm/nbnxm.h"
 126 #include "gromacs/pbcutil/mshift.h"
 127 #include "gromacs/pbcutil/pbc.h"
 128 #include "gromacs/pulling/output.h"
 129 #include "gromacs/pulling/pull.h"
 130 #include "gromacs/swap/swapcoords.h"
 131 #include "gromacs/timing/wallcycle.h"
 132 #include "gromacs/timing/walltime_accounting.h"
 133 #include "gromacs/topology/atoms.h"
 134 #include "gromacs/topology/idef.h"
 135 #include "gromacs/topology/mtop_util.h"
 136 #include "gromacs/topology/topology.h"
 137 #include "gromacs/trajectory/trajectoryframe.h"
 138 #include "gromacs/utility/basedefinitions.h"
 139 #include "gromacs/utility/cstringutil.h"
 140 #include "gromacs/utility/fatalerror.h"
 141 #include "gromacs/utility/logger.h"
 142 #include "gromacs/utility/real.h"
 143 #include "gromacs/utility/smalloc.h"
 144
 145 #include "legacysimulator.h"
 146 #include "replicaexchange.h"
 147 #include "shellfc.h"
 148
 149 #if GMX_FAHCORE
 150 #    include "corewrap.h"
 151 #endif
 152
 153 using gmx::SimulationSignaller;
 154
 155 void gmx::LegacySimulator::do_md()
 156 {
 157     // TODO Historically, the EM and MD "integrators" used different
 158     // names for the t_inputrec *parameter, but these must have the
 159     // same name, now that it's a member of a struct. We use this ir
 160     // alias to avoid a large ripple of nearly useless changes.
 161     // t_inputrec is being replaced by IMdpOptionsProvider, so this
 162     // will go away eventually.
 163     t_inputrec*  ir = inputrec;
 164     int64_t      step, step_rel;
 165     double       t, t0 = ir->init_t, lam0[efptNR];
 166     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
 167     gmx_bool     bNS, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
 168     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
 169     gmx_bool     do_ene, do_log, do_verbose;
 170     gmx_bool     bMasterState;
 171     unsigned int force_flags;
 172     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, tmp_vir = { { 0 } },
 173            pres = { { 0 } };
 174     int                         i, m;
 175     rvec                        mu_tot;
 176     matrix                      pressureCouplingMu, M;
 177     gmx_repl_ex_t               repl_ex = nullptr;
 178     PaddedHostVector<gmx::RVec> f{};
 179     gmx_global_stat_t           gstat;
 180     t_graph*                    graph = nullptr;
 181     gmx_shellfc_t*              shellfc;
 182     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
 183     gmx_bool                    bTemp, bPres, bTrotter;
 184     real                        dvdl_constr;
 185     std::vector<RVec>           cbuf;
 186     matrix                      lastbox;
 187     int                         lamnew = 0;
 188     /* for FEP */
 189     int       nstfep = 0;
 190     double    cycles;
 191     real      saved_conserved_quantity = 0;
 192     real      last_ekin                = 0;
 193     t_extmass MassQ;
 194     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
 195
 196     /* PME load balancing data for GPU kernels */
 197     gmx_bool bPMETune         = FALSE;
 198     gmx_bool bPMETunePrinting = FALSE;
 199
 200     bool bInteractiveMDstep = false;
 201
 202     /* Domain decomposition could incorrectly miss a bonded
 203        interaction, but checking for that requires a global
 204        communication stage, which does not otherwise happen in DD
 205        code. So we do that alongside the first global energy reduction
 206        after a new DD is made. These variables handle whether the
 207        check happens, and the result it returns. */
 208     bool shouldCheckNumberOfBondedInteractions = false;
 209     int  totalNumberOfBondedInteractions       = -1;
 210
 211     SimulationSignals signals;
 212     // Most global communnication stages don't propagate mdrun
 213     // signals, and will use this object to achieve that.
 214     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
 215
 216     if (!mdrunOptions.writeConfout)
 217     {
 218         // This is on by default, and the main known use case for
 219         // turning it off is for convenience in benchmarking, which is
 220         // something that should not show up in the general user
 221         // interface.
 222         GMX_LOG(mdlog.info)
 223                 .asParagraph()
 224                 .appendText(
 225                         "The -noconfout functionality is deprecated, and may be removed in a "
 226                         "future version.");
 227     }
 228
 229     /* md-vv uses averaged full step velocities for T-control
 230        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 231        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 232     bTrotter = (EI_VV(ir->eI)
 233                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
 234
 235     const bool bRerunMD = false;
 236
 237     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
 238     bGStatEveryStep   = (nstglobalcomm == 1);
 239
 240     SimulationGroups* groups = &top_global->groups;
 241
 242     std::unique_ptr<EssentialDynamics> ed = nullptr;
 243     if (opt2bSet("-ei", nfile, fnm))
 244     {
 245         /* Initialize essential dynamics sampling */
 246         ed = init_edsam(mdlog, opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm), top_global,
 247                         ir, cr, constr, state_global, observablesHistory, oenv, startingBehavior);
 248     }
 249     else if (observablesHistory->edsamHistory)
 250     {
 251         gmx_fatal(FARGS,
 252                   "The checkpoint is from a run with essential dynamics sampling, "
 253                   "but the current run did not specify the -ei option. "
 254                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
 255     }
 256
 257     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
 258     Update     upd(ir, deform);
 259     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
 260     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
 261
 262     bool simulationsShareState = false;
 263     int  nstSignalComm         = nstglobalcomm;
 264     {
 265         // TODO This implementation of ensemble orientation restraints is nasty because
 266         // a user can't just do multi-sim with single-sim orientation restraints.
 267         bool usingEnsembleRestraints =
 268                 (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
 269         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
 270
 271         // Replica exchange, ensemble restraints and AWH need all
 272         // simulations to remain synchronized, so they need
 273         // checkpoints and stop conditions to act on the same step, so
 274         // the propagation of such signals must take place between
 275         // simulations, not just within simulations.
 276         // TODO: Make algorithm initializers set these flags.
 277         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
 278
 279         if (simulationsShareState)
 280         {
 281             // Inter-simulation signal communication does not need to happen
 282             // often, so we use a minimum of 200 steps to reduce overhead.
 283             const int c_minimumInterSimulationSignallingInterval = 200;
 284             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
 285                             * nstglobalcomm;
 286         }
 287     }
 288
 289     if (startingBehavior != StartingBehavior::RestartWithAppending)
 290     {
 291         pleaseCiteCouplingAlgorithms(fplog, *ir);
 292     }
 293     gmx_mdoutf* outf =
 294             init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir,
 295                         top_global, oenv, wcycle, startingBehavior, simulationsShareState, ms);
 296     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work,
 297                                    mdoutf_get_fp_dhdl(outf), false, startingBehavior, mdModulesNotifier);
 298
 299     gstat = global_stat_init(ir);
 300
 301     /* Check for polarizable models and flexible constraints */
 302     shellfc = init_shell_flexcon(fplog, top_global, constr ? constr->numFlexibleConstraints() : 0,
 303                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
 304
 305     {
 306         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
 307         if ((io > 2000) && MASTER(cr))
 308         {
 309             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
 310         }
 311     }
 312
 313     // Local state only becomes valid now.
 314     std::unique_ptr<t_state> stateInstance;
 315     t_state*                 state;
 316
 317
 318     gmx_localtop_t top(top_global->ffparams);
 319
 320     auto mdatoms = mdAtoms->mdatoms();
 321
 322     std::unique_ptr<UpdateConstrainGpu> integrator;
 323
 324     if (DOMAINDECOMP(cr))
 325     {
 326         stateInstance = std::make_unique<t_state>();
 327         state         = stateInstance.get();
 328         dd_init_local_state(cr->dd, state_global, state);
 329
 330         /* Distribute the charge groups over the nodes from the master node */
 331         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1, state_global, *top_global, ir,
 332                             imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
 333                             nrnb, nullptr, FALSE);
 334         shouldCheckNumberOfBondedInteractions = true;
 335         upd.setNumAtoms(state->natoms);
 336     }
 337     else
 338     {
 339         state_change_natoms(state_global, state_global->natoms);
 340         f.resizeWithPadding(state_global->natoms);
 341         /* Copy the pointer to the global state */
 342         state = state_global;
 343
 344         /* Generate and initialize new topology */
 345         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &graph, mdAtoms, constr, vsite, shellfc);
 346
 347         upd.setNumAtoms(state->natoms);
 348     }
 349
 350     const auto& simulationWork     = runScheduleWork->simulationWork;
 351     const bool  useGpuForPme       = simulationWork.useGpuPme;
 352     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
 353     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
 354     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
 355
 356     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 357
 358     if (useGpuForUpdate)
 359     {
 360         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
 361                                    || constr->numConstraintsTotal() == 0,
 362                            "Constraints in domain decomposition are only supported with update "
 363                            "groups if using GPU update.\n");
 364         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
 365                                    || constr->numConstraintsTotal() == 0,
 366                            "SHAKE is not supported with GPU update.");
 367         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
 368                            "Either PME or short-ranged non-bonded interaction tasks must run on "
 369                            "the GPU to use GPU update.\n");
 370         GMX_RELEASE_ASSERT(ir->eI == eiMD,
 371                            "Only the md integrator is supported with the GPU update.\n");
 372         GMX_RELEASE_ASSERT(
 373                 ir->etc != etcNOSEHOOVER,
 374                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
 375         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
 376                            "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
 377                            "with the GPU update.\n");
 378         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
 379                            "Virtual sites are not supported with the GPU update.\n");
 380         GMX_RELEASE_ASSERT(ed == nullptr,
 381                            "Essential dynamics is not supported with the GPU update.\n");
 382         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(ir->pull),
 383                            "Constraints pulling is not supported with the GPU update.\n");
 384         GMX_RELEASE_ASSERT(fcd->orires.nr == 0,
 385                            "Orientation restraints are not supported with the GPU update.\n");
 386         GMX_RELEASE_ASSERT(
 387                 ir->efep == efepNO
 388                         || (!haveFreeEnergyType(*ir, efptBONDED) && !haveFreeEnergyType(*ir, efptMASS)),
 389                 "Free energy perturbation of masses and constraints are not supported with the GPU "
 390                 "update.");
 391         GMX_RELEASE_ASSERT(graph == nullptr, "The graph is not supported with GPU update.");
 392
 393         if (constr != nullptr && constr->numConstraintsTotal() > 0)
 394         {
 395             GMX_LOG(mdlog.info)
 396                     .asParagraph()
 397                     .appendText("Updating coordinates and applying constraints on the GPU.");
 398         }
 399         else
 400         {
 401             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
 402         }
 403
 404         GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
 405                            "GPU device context should be initialized to use GPU update.");
 406
 407         integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
 408                                                           stateGpu->getUpdateStream(),
 409                                                           stateGpu->xUpdatedOnDevice());
 410
 411         integrator->setPbc(PbcType::Xyz, state->box);
 412     }
 413
 414     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 415     {
 416         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
 417     }
 418     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
 419     {
 420         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
 421     }
 422     if (useGpuForUpdate)
 423     {
 424         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
 425     }
 426
 427     // NOTE: The global state is no longer used at this point.
 428     // But state_global is still used as temporary storage space for writing
 429     // the global state to file and potentially for replica exchange.
 430     // (Global topology should persist.)
 431
 432     update_mdatoms(mdatoms, state->lambda[efptMASS]);
 433
 434     if (ir->bExpanded)
 435     {
 436         /* Check nstexpanded here, because the grompp check was broken */
 437         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
 438         {
 439             gmx_fatal(FARGS,
 440                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
 441         }
 442         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
 443     }
 444
 445     if (MASTER(cr))
 446     {
 447         EnergyElement::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
 448     }
 449
 450     preparePrevStepPullCom(ir, pull_work, mdatoms, state, state_global, cr,
 451                            startingBehavior != StartingBehavior::NewSimulation);
 452
 453     // TODO: Remove this by converting AWH into a ForceProvider
 454     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms,
 455                                 startingBehavior != StartingBehavior::NewSimulation,
 456                                 shellfc != nullptr, opt2fn("-awh", nfile, fnm), pull_work);
 457
 458     if (useReplicaExchange && MASTER(cr))
 459     {
 460         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
 461     }
 462     /* PME tuning is only supported in the Verlet scheme, with PME for
 463      * Coulomb. It is not supported with only LJ PME. */
 464     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
 465                 && ir->cutoff_scheme != ecutsGROUP);
 466
 467     pme_load_balancing_t* pme_loadbal = nullptr;
 468     if (bPMETune)
 469     {
 470         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata,
 471                          fr->nbv->useGpu());
 472     }
 473
 474     if (!ir->bContinuation)
 475     {
 476         if (state->flags & (1U << estV))
 477         {
 478             auto v = makeArrayRef(state->v);
 479             /* Set the velocities of vsites, shells and frozen atoms to zero */
 480             for (i = 0; i < mdatoms->homenr; i++)
 481             {
 482                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
 483                 {
 484                     clear_rvec(v[i]);
 485                 }
 486                 else if (mdatoms->cFREEZE)
 487                 {
 488                     for (m = 0; m < DIM; m++)
 489                     {
 490                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 491                         {
 492                             v[i][m] = 0;
 493                         }
 494                     }
 495                 }
 496             }
 497         }
 498
 499         if (constr)
 500         {
 501             /* Constrain the initial coordinates and velocities */
 502             do_constrain_first(fplog, constr, ir, mdatoms, state->natoms, state->x.arrayRefWithPadding(),
 503                                state->v.arrayRefWithPadding(), state->box, state->lambda[efptBONDED]);
 504         }
 505         if (vsite)
 506         {
 507             /* Construct the virtual sites for the initial configuration */
 508             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr, top.idef.iparams,
 509                              top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
 510         }
 511     }
 512
 513     if (ir->efep != efepNO)
 514     {
 515         /* Set free energy calculation frequency as the greatest common
 516          * denominator of nstdhdl and repl_ex_nst. */
 517         nstfep = ir->fepvals->nstdhdl;
 518         if (ir->bExpanded)
 519         {
 520             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
 521         }
 522         if (useReplicaExchange)
 523         {
 524             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
 525         }
 526     }
 527
 528     /* Be REALLY careful about what flags you set here. You CANNOT assume
 529      * this is the first step, since we might be restarting from a checkpoint,
 530      * and in that case we should not do any modifications to the state.
 531      */
 532     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
 533
 534     // When restarting from a checkpoint, it can be appropriate to
 535     // initialize ekind from quantities in the checkpoint. Otherwise,
 536     // compute_globals must initialize ekind before the simulation
 537     // starts/restarts. However, only the master rank knows what was
 538     // found in the checkpoint file, so we have to communicate in
 539     // order to coordinate the restart.
 540     //
 541     // TODO Consider removing this communication if/when checkpoint
 542     // reading directly follows .tpr reading, because all ranks can
 543     // agree on hasReadEkinState at that time.
 544     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
 545     if (PAR(cr))
 546     {
 547         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
 548     }
 549     if (hasReadEkinState)
 550     {
 551         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
 552     }
 553
 554     unsigned int cglo_flags =
 555             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
 556              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
 557
 558     bSumEkinhOld = FALSE;
 559
 560     t_vcm vcm(top_global->groups, *ir);
 561     reportComRemovalInfo(fplog, vcm);
 562
 563     /* To minimize communication, compute_globals computes the COM velocity
 564      * and the kinetic energy for the velocities without COM motion removed.
 565      * Thus to get the kinetic energy without the COM contribution, we need
 566      * to call compute_globals twice.
 567      */
 568     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
 569     {
 570         unsigned int cglo_flags_iteration = cglo_flags;
 571         if (bStopCM && cgloIteration == 0)
 572         {
 573             cglo_flags_iteration |= CGLO_STOPCM;
 574             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
 575         }
 576         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 577                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 578                         nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 579                         &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 580                         cglo_flags_iteration
 581                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
 582                                                                          : 0));
 583         if (cglo_flags_iteration & CGLO_STOPCM)
 584         {
 585             /* At initialization, do not pass x with acceleration-correction mode
 586              * to avoid (incorrect) correction of the initial coordinates.
 587              */
 588             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
 589                                                                      : makeArrayRef(state->x);
 590             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
 591             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
 592         }
 593     }
 594     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global, &top,
 595                                     makeConstArrayRef(state->x), state->box,
 596                                     &shouldCheckNumberOfBondedInteractions);
 597     if (ir->eI == eiVVAK)
 598     {
 599         /* a second call to get the half step temperature initialized as well */
 600         /* we do the same call as above, but turn the pressure off -- internally to
 601            compute_globals, this is recognized as a velocity verlet half-step
 602            kinetic energy calculation.  This minimized excess variables, but
 603            perhaps loses some logic?*/
 604
 605         compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 606                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 607                         nrnb, &vcm, nullptr, enerd, force_vir, shake_vir, total_vir, pres, constr,
 608                         &nullSignaller, state->box, nullptr, &bSumEkinhOld, cglo_flags & ~CGLO_PRESSURE);
 609     }
 610
 611     /* Calculate the initial half step temperature, and save the ekinh_old */
 612     if (startingBehavior == StartingBehavior::NewSimulation)
 613     {
 614         for (i = 0; (i < ir->opts.ngtc); i++)
 615         {
 616             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
 617         }
 618     }
 619
 620     /* need to make an initiation call to get the Trotter variables set, as well as other constants
 621        for non-trotter temperature control */
 622     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
 623
 624     if (MASTER(cr))
 625     {
 626         if (!ir->bContinuation)
 627         {
 628             if (constr && ir->eConstrAlg == econtLINCS)
 629             {
 630                 fprintf(fplog, "RMS relative constraint deviation after constraining: %.2e\n",
 631                         constr->rmsd());
 632             }
 633             if (EI_STATE_VELOCITY(ir->eI))
 634             {
 635                 real temp = enerd->term[F_TEMP];
 636                 if (ir->eI != eiVV)
 637                 {
 638                     /* Result of Ekin averaged over velocities of -half
 639                      * and +half step, while we only have -half step here.
 640                      */
 641                     temp *= 2;
 642                 }
 643                 fprintf(fplog, "Initial temperature: %g K\n", temp);
 644             }
 645         }
 646
 647         char tbuf[20];
 648         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
 649         if (ir->nsteps >= 0)
 650         {
 651             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
 652         }
 653         else
 654         {
 655             sprintf(tbuf, "%s", "infinite");
 656         }
 657         if (ir->init_step > 0)
 658         {
 659             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 660                     gmx_step_str(ir->init_step + ir->nsteps, sbuf), tbuf,
 661                     gmx_step_str(ir->init_step, sbuf2), ir->init_step * ir->delta_t);
 662         }
 663         else
 664         {
 665             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
 666         }
 667         fprintf(fplog, "\n");
 668     }
 669
 670     walltime_accounting_start_time(walltime_accounting);
 671     wallcycle_start(wcycle, ewcRUN);
 672     print_start(fplog, cr, walltime_accounting, "mdrun");
 673
 674 #if GMX_FAHCORE
 675     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 676     int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0);
 677     if (chkpt_ret == 0)
 678     {
 679         gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0);
 680     }
 681 #endif
 682
 683     /***********************************************************
 684      *
 685      *             Loop over MD steps
 686      *
 687      ************************************************************/
 688
 689     bFirstStep = TRUE;
 690     /* Skip the first Nose-Hoover integration when we get the state from tpx */
 691     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
 692     bSumEkinhOld     = FALSE;
 693     bExchanged       = FALSE;
 694     bNeedRepartition = FALSE;
 695
 696     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
 697             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
 698             MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
 699             mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
 700
 701     auto checkpointHandler = std::make_unique<CheckpointHandler>(
 702             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]), simulationsShareState,
 703             ir->nstlist == 0, MASTER(cr), mdrunOptions.writeConfout,
 704             mdrunOptions.checkpointOptions.period);
 705
 706     const bool resetCountersIsLocal = true;
 707     auto       resetHandler         = std::make_unique<ResetHandler>(
 708             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
 709             !resetCountersIsLocal, ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
 710             mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
 711
 712     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
 713
 714     step     = ir->init_step;
 715     step_rel = 0;
 716
 717     // TODO extract this to new multi-simulation module
 718     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
 719     {
 720         if (!multisim_int_all_are_equal(ms, ir->nsteps))
 721         {
 722             GMX_LOG(mdlog.warning)
 723                     .appendText(
 724                             "Note: The number of steps is not consistent across multi "
 725                             "simulations,\n"
 726                             "but we are proceeding anyway!");
 727         }
 728         if (!multisim_int_all_are_equal(ms, ir->init_step))
 729         {
 730             if (simulationsShareState)
 731             {
 732                 if (MASTER(cr))
 733                 {
 734                     gmx_fatal(FARGS,
 735                               "The initial step is not consistent across multi simulations which "
 736                               "share the state");
 737                 }
 738                 gmx_barrier(cr);
 739             }
 740             else
 741             {
 742                 GMX_LOG(mdlog.warning)
 743                         .appendText(
 744                                 "Note: The initial step is not consistent across multi "
 745                                 "simulations,\n"
 746                                 "but we are proceeding anyway!");
 747             }
 748         }
 749     }
 750
 751     /* and stop now if we should */
 752     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
 753     while (!bLastStep)
 754     {
 755
 756         /* Determine if this is a neighbor search step */
 757         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
 758
 759         if (bPMETune && bNStList)
 760         {
 761             // This has to be here because PME load balancing is called so early.
 762             // TODO: Move to after all booleans are defined.
 763             if (useGpuForUpdate && !bFirstStep)
 764             {
 765                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 766                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 767             }
 768             /* PME grid + cut-off optimization with GPUs or PME nodes */
 769             pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
 770                            fplog, mdlog, *ir, fr, state->box, state->x, wcycle, step, step_rel,
 771                            &bPMETunePrinting, simulationWork.useGpuPmePpCommunication);
 772         }
 773
 774         wallcycle_start(wcycle, ewcSTEP);
 775
 776         bLastStep = (step_rel == ir->nsteps);
 777         t         = t0 + step * ir->delta_t;
 778
 779         // TODO Refactor this, so that nstfep does not need a default value of zero
 780         if (ir->efep != efepNO || ir->bSimTemp)
 781         {
 782             /* find and set the current lambdas */
 783             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
 784
 785             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
 786             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
 787             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
 788                            && (step > 0) && (startingBehavior == StartingBehavior::NewSimulation));
 789         }
 790
 791         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
 792                      && do_per_step(step, replExParams.exchangeInterval));
 793
 794         if (doSimulatedAnnealing)
 795         {
 796             update_annealing_target_temp(ir, t, &upd);
 797         }
 798
 799         /* Stop Center of Mass motion */
 800         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
 801
 802         /* Determine whether or not to do Neighbour Searching */
 803         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
 804
 805         /* Note that the stopHandler will cause termination at nstglobalcomm
 806          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
 807          * nstpcouple steps, we have computed the half-step kinetic energy
 808          * of the previous step and can always output energies at the last step.
 809          */
 810         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
 811
 812         /* do_log triggers energy and virial calculation. Because this leads
 813          * to different code paths, forces can be different. Thus for exact
 814          * continuation we should avoid extra log output.
 815          * Note that the || bLastStep can result in non-exact continuation
 816          * beyond the last step. But we don't consider that to be an issue.
 817          */
 818         do_log     = (do_per_step(step, ir->nstlog)
 819                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
 820         do_verbose = mdrunOptions.verbose
 821                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
 822
 823         if (useGpuForUpdate && !bFirstStep && bNS)
 824         {
 825             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
 826             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
 827             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
 828             // Copy coordinate from the GPU when needed at the search step.
 829             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
 830             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
 831             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
 832             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
 833         }
 834
 835         if (bNS && !(bFirstStep && ir->bContinuation))
 836         {
 837             bMasterState = FALSE;
 838             /* Correct the new box if it is too skewed */
 839             if (inputrecDynamicBox(ir))
 840             {
 841                 if (correct_box(fplog, step, state->box, graph))
 842                 {
 843                     bMasterState = TRUE;
 844                     // If update is offloaded, it should be informed about the box size change
 845                     if (useGpuForUpdate)
 846                     {
 847                         integrator->setPbc(PbcType::Xyz, state->box);
 848                     }
 849                 }
 850             }
 851             if (DOMAINDECOMP(cr) && bMasterState)
 852             {
 853                 dd_collect_state(cr->dd, state, state_global);
 854             }
 855
 856             if (DOMAINDECOMP(cr))
 857             {
 858                 /* Repartition the domain decomposition */
 859                 dd_partition_system(fplog, mdlog, step, cr, bMasterState, nstglobalcomm, state_global,
 860                                     *top_global, ir, imdSession, pull_work, state, &f, mdAtoms, &top,
 861                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
 862                 shouldCheckNumberOfBondedInteractions = true;
 863                 upd.setNumAtoms(state->natoms);
 864
 865                 // Allocate or re-size GPU halo exchange object, if necessary
 866                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
 867                     && useGpuForNonbonded && is1D(*cr->dd))
 868                 {
 869                     // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
 870                     void* streamLocal =
 871                             Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
 872                     void* streamNonLocal = Nbnxm::gpu_get_command_stream(
 873                             fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
 874                     GMX_RELEASE_ASSERT(
 875                             fr->deviceContext != nullptr,
 876                             "GPU device context should be initialized to use GPU halo exchange.");
 877                     constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, streamLocal, streamNonLocal);
 878                 }
 879             }
 880         }
 881
 882         if (MASTER(cr) && do_log)
 883         {
 884             energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
 885         }
 886
 887         if (ir->efep != efepNO)
 888         {
 889             update_mdatoms(mdatoms, state->lambda[efptMASS]);
 890         }
 891
 892         if (bExchanged)
 893         {
 894
 895             /* We need the kinetic energy at minus the half step for determining
 896              * the full step kinetic energy and possibly for T-coupling.*/
 897             /* This may not be quite working correctly yet . . . . */
 898             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
 899                             makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
 900                             nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr, nullptr, constr,
 901                             &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
 902                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
 903             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions, top_global,
 904                                             &top, makeConstArrayRef(state->x), state->box,
 905                                             &shouldCheckNumberOfBondedInteractions);
 906         }
 907         clear_mat(force_vir);
 908
 909         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
 910
 911         /* Determine the energy and pressure:
 912          * at nstcalcenergy steps and at energy output steps (set below).
 913          */
 914         if (EI_VV(ir->eI) && (!bInitStep))
 915         {
 916             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 917             bCalcVir      = bCalcEnerStep
 918                        || (ir->epc != epcNO
 919                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
 920         }
 921         else
 922         {
 923             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
 924             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
 925         }
 926         bCalcEner = bCalcEnerStep;
 927
 928         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 929
 930         if (do_ene || do_log || bDoReplEx)
 931         {
 932             bCalcVir  = TRUE;
 933             bCalcEner = TRUE;
 934         }
 935
 936         /* Do we need global communication ? */
 937         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
 938                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
 939
 940         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
 941                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
 942                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
 943
 944         if (shellfc)
 945         {
 946             /* Now is the time to relax the shells */
 947             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose, enforcedRotation, step, ir,
 948                                 imdSession, pull_work, bNS, force_flags, &top, constr, enerd, fcd,
 949                                 state->natoms, state->x.arrayRefWithPadding(),
 950                                 state->v.arrayRefWithPadding(), state->box, state->lambda, &state->hist,
 951                                 f.arrayRefWithPadding(), force_vir, mdatoms, nrnb, wcycle, graph,
 952                                 shellfc, fr, runScheduleWork, t, mu_tot, vsite, ddBalanceRegionHandler);
 953         }
 954         else
 955         {
 956             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
 957                is updated (or the AWH update will be performed twice for one step when continuing).
 958                It would be best to call this update function from do_md_trajectory_writing but that
 959                would occur after do_force. One would have to divide the update_awh function into one
 960                function applying the AWH force and one doing the AWH bias update. The update AWH
 961                bias function could then be called after do_md_trajectory_writing (then containing
 962                update_awh_history). The checkpointing will in the future probably moved to the start
 963                of the md loop which will rid of this issue. */
 964             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
 965             {
 966                 awh->updateHistory(state_global->awhHistory.get());
 967             }
 968
 969             /* The coordinates (x) are shifted (to get whole molecules)
 970              * in do_force.
 971              * This is parallellized as well, and does communication too.
 972              * Check comments in sim_util.c
 973              */
 974             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession, pull_work, step,
 975                      nrnb, wcycle, &top, state->box, state->x.arrayRefWithPadding(), &state->hist,
 976                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd, state->lambda, graph,
 977                      fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
 978                      (bNS ? GMX_FORCE_NS : 0) | force_flags, ddBalanceRegionHandler);
 979         }
 980
 981         // VV integrators do not need the following velocity half step
 982         // if it is the first step after starting from a checkpoint.
 983         // That is, the half step is needed on all other steps, and
 984         // also the first step when starting from a .tpr file.
 985         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
 986         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 987         {
 988             rvec* vbuf = nullptr;
 989
 990             wallcycle_start(wcycle, ewcUPDATE);
 991             if (ir->eI == eiVV && bInitStep)
 992             {
 993                 /* if using velocity verlet with full time step Ekin,
 994                  * take the first half step only to compute the
 995                  * virial for the first step. From there,
 996                  * revert back to the initial coordinates
 997                  * so that the input is actually the initial step.
 998                  */
 999                 snew(vbuf, state->natoms);
1000                 copy_rvecn(state->v.rvec_array(), vbuf, 0,
1001                            state->natoms); /* should make this better for parallelizing? */
1002             }
1003             else
1004             {
1005                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1006                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1007                                trotter_seq, ettTSEQ1);
1008             }
1009
1010             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1011                           etrtVELOCITY1, cr, constr);
1012
1013             wallcycle_stop(wcycle, ewcUPDATE);
1014             constrain_velocities(step, nullptr, state, shake_vir, constr, bCalcVir, do_log, do_ene);
1015             wallcycle_start(wcycle, ewcUPDATE);
1016             /* if VV, compute the pressure and constraints */
1017             /* For VV2, we strictly only need this if using pressure
1018              * control, but we really would like to have accurate pressures
1019              * printed out.
1020              * Think about ways around this in the future?
1021              * For now, keep this choice in comments.
1022              */
1023             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1024             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1025             bPres = TRUE;
1026             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1027             if (bCalcEner && ir->eI == eiVVAK)
1028             {
1029                 bSumEkinhOld = TRUE;
1030             }
1031             /* for vv, the first half of the integration actually corresponds to the previous step.
1032                So we need information from the last step in the first half of the integration */
1033             if (bGStat || do_per_step(step - 1, nstglobalcomm))
1034             {
1035                 wallcycle_stop(wcycle, ewcUPDATE);
1036                 compute_globals(
1037                         gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1038                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
1039                         nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1040                         &nullSignaller, state->box, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1041                         (bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
1042                                 | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
1043                                 | (bPres ? CGLO_CONSTRAINT : 0) | (bStopCM ? CGLO_STOPCM : 0)
1044                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1045                                                                          : 0)
1046                                 | CGLO_SCALEEKIN);
1047                 /* explanation of above:
1048                    a) We compute Ekin at the full time step
1049                    if 1) we are using the AveVel Ekin, and it's not the
1050                    initial step, or 2) if we are using AveEkin, but need the full
1051                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1052                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1053                    EkinAveVel because it's needed for the pressure */
1054                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1055                                                 top_global, &top, makeConstArrayRef(state->x),
1056                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1057                 if (bStopCM)
1058                 {
1059                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1060                                            makeArrayRef(state->v));
1061                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1062                 }
1063                 wallcycle_start(wcycle, ewcUPDATE);
1064             }
1065             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1066             if (!bInitStep)
1067             {
1068                 if (bTrotter)
1069                 {
1070                     m_add(force_vir, shake_vir,
1071                           total_vir); /* we need the un-dispersion corrected total vir here */
1072                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ,
1073                                    trotter_seq, ettTSEQ2);
1074
1075                     /* TODO This is only needed when we're about to write
1076                      * a checkpoint, because we use it after the restart
1077                      * (in a kludge?). But what should we be doing if
1078                      * the startingBehavior is NewSimulation or bInitStep are true? */
1079                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1080                     {
1081                         copy_mat(shake_vir, state->svir_prev);
1082                         copy_mat(force_vir, state->fvir_prev);
1083                     }
1084                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1085                     {
1086                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1087                         enerd->term[F_TEMP] =
1088                                 sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1089                         enerd->term[F_EKIN] = trace(ekind->ekin);
1090                     }
1091                 }
1092                 else if (bExchanged)
1093                 {
1094                     wallcycle_stop(wcycle, ewcUPDATE);
1095                     /* We need the kinetic energy at minus the half step for determining
1096                      * the full step kinetic energy and possibly for T-coupling.*/
1097                     /* This may not be quite working correctly yet . . . . */
1098                     compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1099                                     makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
1100                                     mdatoms, nrnb, &vcm, wcycle, enerd, nullptr, nullptr, nullptr,
1101                                     nullptr, constr, &nullSignaller, state->box, nullptr,
1102                                     &bSumEkinhOld, CGLO_GSTAT | CGLO_TEMPERATURE);
1103                     wallcycle_start(wcycle, ewcUPDATE);
1104                 }
1105             }
1106             /* if it's the initial step, we performed this first step just to get the constraint virial */
1107             if (ir->eI == eiVV && bInitStep)
1108             {
1109                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1110                 sfree(vbuf);
1111             }
1112             wallcycle_stop(wcycle, ewcUPDATE);
1113         }
1114
1115         /* compute the conserved quantity */
1116         if (EI_VV(ir->eI))
1117         {
1118             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1119             if (ir->eI == eiVV)
1120             {
1121                 last_ekin = enerd->term[F_EKIN];
1122             }
1123             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1124             {
1125                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1126             }
1127             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1128             if (ir->efep != efepNO)
1129             {
1130                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1131             }
1132         }
1133
1134         /* ########  END FIRST UPDATE STEP  ############## */
1135         /* ########  If doing VV, we now have v(dt) ###### */
1136         if (bDoExpanded)
1137         {
1138             /* perform extended ensemble sampling in lambda - we don't
1139                actually move to the new state before outputting
1140                statistics, but if performing simulated tempering, we
1141                do update the velocities and the tau_t. */
1142
1143             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state,
1144                                               state->dfhist, step, state->v.rvec_array(), mdatoms);
1145             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1146             if (MASTER(cr))
1147             {
1148                 copy_df_history(state_global->dfhist, state->dfhist);
1149             }
1150         }
1151
1152         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1153         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1154         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1155             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1156                 || checkpointHandler->isCheckpointingStep()))
1157         {
1158             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1159             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1160         }
1161         // Copy velocities if needed for the output/checkpointing.
1162         // NOTE: Copy on the search steps is done at the beginning of the step.
1163         if (useGpuForUpdate && !bNS
1164             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1165         {
1166             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1167             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1168         }
1169         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1170         // and update is offloaded hence forces are kept on the GPU for update and have not been
1171         // already transferred in do_force().
1172         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1173         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1174         //       prior to GPU update.
1175         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1176         //       copy call in do_force(...).
1177         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1178         //       on host after the D2H copy in do_force(...).
1179         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1180             && do_per_step(step, ir->nstfout))
1181         {
1182             stateGpu->copyForcesFromGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1183             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1184         }
1185         /* Now we have the energies and forces corresponding to the
1186          * coordinates at time t. We must output all of this before
1187          * the update.
1188          */
1189         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t, ir, state, state_global,
1190                                  observablesHistory, top_global, fr, outf, energyOutput, ekind, f,
1191                                  checkpointHandler->isCheckpointingStep(), bRerunMD, bLastStep,
1192                                  mdrunOptions.writeConfout, bSumEkinhOld);
1193         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1194         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1195
1196         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1197         if (startingBehavior != StartingBehavior::NewSimulation
1198             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1199         {
1200             copy_mat(state->svir_prev, shake_vir);
1201             copy_mat(state->fvir_prev, force_vir);
1202         }
1203
1204         stopHandler->setSignal();
1205         resetHandler->setSignal(walltime_accounting);
1206
1207         if (bGStat || !PAR(cr))
1208         {
1209             /* In parallel we only have to check for checkpointing in steps
1210              * where we do global communication,
1211              *  otherwise the other nodes don't know.
1212              */
1213             checkpointHandler->setSignal(walltime_accounting);
1214         }
1215
1216         /* #########   START SECOND UPDATE STEP ################# */
1217
1218         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1219            controlled in preprocessing */
1220
1221         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1222         {
1223             gmx_bool bIfRandomize;
1224             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1225             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1226             if (constr && bIfRandomize)
1227             {
1228                 constrain_velocities(step, nullptr, state, tmp_vir, constr, bCalcVir, do_log, do_ene);
1229             }
1230         }
1231         /* Box is changed in update() when we do pressure coupling,
1232          * but we should still use the old box for energy corrections and when
1233          * writing it to the energy file, so it matches the trajectory files for
1234          * the same timestep above. Make a copy in a separate array.
1235          */
1236         copy_mat(state->box, lastbox);
1237
1238         dvdl_constr = 0;
1239
1240         wallcycle_start(wcycle, ewcUPDATE);
1241         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1242         if (bTrotter)
1243         {
1244             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1245             /* We can only do Berendsen coupling after we have summed
1246              * the kinetic energy or virial. Since the happens
1247              * in global_state after update, we should only do it at
1248              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1249              */
1250         }
1251         else
1252         {
1253             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1254             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1255         }
1256
1257         if (EI_VV(ir->eI))
1258         {
1259             /* velocity half-step update */
1260             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1261                           etrtVELOCITY2, cr, constr);
1262         }
1263
1264         /* Above, initialize just copies ekinh into ekin,
1265          * it doesn't copy position (for VV),
1266          * and entire integrator for MD.
1267          */
1268
1269         if (ir->eI == eiVVAK)
1270         {
1271             cbuf.resize(state->x.size());
1272             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1273         }
1274
1275         /* With leap-frog type integrators we compute the kinetic energy
1276          * at a whole time step as the average of the half-time step kinetic
1277          * energies of two subsequent steps. Therefore we need to compute the
1278          * half step kinetic energy also if we need energies at the next step.
1279          */
1280         const bool needHalfStepKineticEnergy =
1281                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1282
1283         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1284         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1285         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1286                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1287
1288         if (useGpuForUpdate)
1289         {
1290             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1291             {
1292                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
1293                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
1294
1295                 // Copy data to the GPU after buffers might have being reinitialized
1296                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1297                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1298             }
1299
1300             // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
1301             if (!runScheduleWork->stepWork.useGpuFBufferOps)
1302             {
1303                 stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
1304             }
1305
1306             const bool doTemperatureScaling =
1307                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1308
1309             // This applies Leap-Frog, LINCS and SETTLE in succession
1310             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
1311                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1312                                   ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
1313                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
1314
1315             // Copy velocities D2H after update if:
1316             // - Globals are computed this step (includes the energy output steps).
1317             // - Temperature is needed for the next step.
1318             if (bGStat || needHalfStepKineticEnergy)
1319             {
1320                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1321                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1322             }
1323         }
1324         else
1325         {
1326             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1327                           etrtPOSITION, cr, constr);
1328
1329             wallcycle_stop(wcycle, ewcUPDATE);
1330
1331             constrain_coordinates(step, &dvdl_constr, state, shake_vir, &upd, constr, bCalcVir,
1332                                   do_log, do_ene);
1333
1334             update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state, cr, nrnb, wcycle, &upd,
1335                                   constr, do_log, do_ene);
1336             finish_update(ir, mdatoms, state, graph, nrnb, wcycle, &upd, constr);
1337         }
1338
1339         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1340         {
1341             updatePrevStepPullCom(pull_work, state);
1342         }
1343
1344         if (ir->eI == eiVVAK)
1345         {
1346             /* erase F_EKIN and F_TEMP here? */
1347             /* just compute the kinetic energy at the half step to perform a trotter step */
1348             compute_globals(gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1349                             makeConstArrayRef(state->v), state->box, state->lambda[efptVDW],
1350                             mdatoms, nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir,
1351                             pres, constr, &nullSignaller, lastbox, nullptr, &bSumEkinhOld,
1352                             (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
1353             wallcycle_start(wcycle, ewcUPDATE);
1354             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1355             /* now we know the scaling, we can compute the positions again */
1356             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1357
1358             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd, ekind, M, &upd,
1359                           etrtPOSITION, cr, constr);
1360             wallcycle_stop(wcycle, ewcUPDATE);
1361
1362             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1363             /* are the small terms in the shake_vir here due
1364              * to numerical errors, or are they important
1365              * physically? I'm thinking they are just errors, but not completely sure.
1366              * For now, will call without actually constraining, constr=NULL*/
1367             finish_update(ir, mdatoms, state, graph, nrnb, wcycle, &upd, nullptr);
1368         }
1369         if (EI_VV(ir->eI))
1370         {
1371             /* this factor or 2 correction is necessary
1372                because half of the constraint force is removed
1373                in the vv step, so we have to double it.  See
1374                the Redmine issue #1255.  It is not yet clear
1375                if the factor of 2 is exact, or just a very
1376                good approximation, and this will be
1377                investigated.  The next step is to see if this
1378                can be done adding a dhdl contribution from the
1379                rattle step, but this is somewhat more
1380                complicated with the current code. Will be
1381                investigated, hopefully for 4.6.3. However,
1382                this current solution is much better than
1383                having it completely wrong.
1384              */
1385             enerd->term[F_DVDL_CONSTR] += 2 * dvdl_constr;
1386         }
1387         else
1388         {
1389             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1390         }
1391
1392         if (vsite != nullptr)
1393         {
1394             wallcycle_start(wcycle, ewcVSITECONSTR);
1395             if (graph != nullptr)
1396             {
1397                 shift_self(graph, state->box, state->x.rvec_array());
1398             }
1399             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
1400                              top.idef.iparams, top.idef.il, fr->pbcType, fr->bMolPBC, cr, state->box);
1401
1402             if (graph != nullptr)
1403             {
1404                 unshift_self(graph, state->box, state->x.rvec_array());
1405             }
1406             wallcycle_stop(wcycle, ewcVSITECONSTR);
1407         }
1408
1409         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1410         /* With Leap-Frog we can skip compute_globals at
1411          * non-communication steps, but we need to calculate
1412          * the kinetic energy one step before communication.
1413          */
1414         {
1415             // Organize to do inter-simulation signalling on steps if
1416             // and when algorithms require it.
1417             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1418
1419             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1420             {
1421                 // Copy coordinates when needed to stop the CM motion.
1422                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1423                 {
1424                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1425                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1426                 }
1427                 // Since we're already communicating at this step, we
1428                 // can propagate intra-simulation signals. Note that
1429                 // check_nstglobalcomm has the responsibility for
1430                 // choosing the value of nstglobalcomm that is one way
1431                 // bGStat becomes true, so we can't get into a
1432                 // situation where e.g. checkpointing can't be
1433                 // signalled.
1434                 bool                doIntraSimSignal = true;
1435                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1436
1437                 compute_globals(
1438                         gstat, cr, ir, fr, ekind, makeConstArrayRef(state->x),
1439                         makeConstArrayRef(state->v), state->box, state->lambda[efptVDW], mdatoms,
1440                         nrnb, &vcm, wcycle, enerd, force_vir, shake_vir, total_vir, pres, constr,
1441                         &signaller, lastbox, &totalNumberOfBondedInteractions, &bSumEkinhOld,
1442                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1443                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1444                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1445                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1446                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1447                                                                          : 0));
1448                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1449                                                 top_global, &top, makeConstArrayRef(state->x),
1450                                                 state->box, &shouldCheckNumberOfBondedInteractions);
1451                 if (!EI_VV(ir->eI) && bStopCM)
1452                 {
1453                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, makeArrayRef(state->x),
1454                                            makeArrayRef(state->v));
1455                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1456
1457                     // TODO: The special case of removing CM motion should be dealt more gracefully
1458                     if (useGpuForUpdate)
1459                     {
1460                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1461                         // Here we block until the H2D copy completes because event sync with the
1462                         // force kernels that use the coordinates on the next steps is not implemented
1463                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1464                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1465                     }
1466                 }
1467             }
1468         }
1469
1470         /* #############  END CALC EKIN AND PRESSURE ################# */
1471
1472         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1473            the virial that should probably be addressed eventually. state->veta has better properies,
1474            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1475            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1476
1477         if (ir->efep != efepNO && !EI_VV(ir->eI))
1478         {
1479             /* Sum up the foreign energy and dhdl terms for md and sd.
1480                Currently done every step so that dhdl is correct in the .edr */
1481             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1482         }
1483
1484         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
1485                                          pressureCouplingMu, state, nrnb, &upd, !useGpuForUpdate);
1486
1487         const bool doBerendsenPressureCoupling =
1488                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1489         if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
1490         {
1491             integrator->scaleCoordinates(pressureCouplingMu);
1492             integrator->setPbc(PbcType::Xyz, state->box);
1493         }
1494
1495         /* ################# END UPDATE STEP 2 ################# */
1496         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1497
1498         /* The coordinates (x) were unshifted in update */
1499         if (!bGStat)
1500         {
1501             /* We will not sum ekinh_old,
1502              * so signal that we still have to do it.
1503              */
1504             bSumEkinhOld = TRUE;
1505         }
1506
1507         if (bCalcEner)
1508         {
1509             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1510
1511             /* use the directly determined last velocity, not actually the averaged half steps */
1512             if (bTrotter && ir->eI == eiVV)
1513             {
1514                 enerd->term[F_EKIN] = last_ekin;
1515             }
1516             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1517
1518             if (integratorHasConservedEnergyQuantity(ir))
1519             {
1520                 if (EI_VV(ir->eI))
1521                 {
1522                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1523                 }
1524                 else
1525                 {
1526                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1527                 }
1528             }
1529             /* #########  END PREPARING EDR OUTPUT  ###########  */
1530         }
1531
1532         /* Output stuff */
1533         if (MASTER(cr))
1534         {
1535             if (fplog && do_log && bDoExpanded)
1536             {
1537                 /* only needed if doing expanded ensemble */
1538                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals,
1539                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1540                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1541             }
1542             if (bCalcEner)
1543             {
1544                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep, t, mdatoms->tmass, enerd, state,
1545                                                  ir->fepvals, ir->expandedvals, lastbox, shake_vir,
1546                                                  force_vir, total_vir, pres, ekind, mu_tot, constr);
1547             }
1548             else
1549             {
1550                 energyOutput.recordNonEnergyStep();
1551             }
1552
1553             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1554             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1555
1556             if (doSimulatedAnnealing)
1557             {
1558                 energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
1559             }
1560             if (do_log || do_ene || do_dr || do_or)
1561             {
1562                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1563                                                    do_log ? fplog : nullptr, step, t, fcd, awh.get());
1564             }
1565
1566             if (ir->bPull)
1567             {
1568                 pull_print_output(pull_work, step, t);
1569             }
1570
1571             if (do_per_step(step, ir->nstlog))
1572             {
1573                 if (fflush(fplog) != 0)
1574                 {
1575                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1576                 }
1577             }
1578         }
1579         if (bDoExpanded)
1580         {
1581             /* Have to do this part _after_ outputting the logfile and the edr file */
1582             /* Gets written into the state at the beginning of next loop*/
1583             state->fep_state = lamnew;
1584         }
1585         /* Print the remaining wall clock time for the run */
1586         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1587         {
1588             if (shellfc)
1589             {
1590                 fprintf(stderr, "\n");
1591             }
1592             print_time(stderr, walltime_accounting, step, ir, cr);
1593         }
1594
1595         /* Ion/water position swapping.
1596          * Not done in last step since trajectory writing happens before this call
1597          * in the MD loop and exchanges would be lost anyway. */
1598         bNeedRepartition = FALSE;
1599         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1600         {
1601             bNeedRepartition =
1602                     do_swapcoords(cr, step, t, ir, swap, wcycle, as_rvec_array(state->x.data()),
1603                                   state->box, MASTER(cr) && mdrunOptions.verbose, bRerunMD);
1604
1605             if (bNeedRepartition && DOMAINDECOMP(cr))
1606             {
1607                 dd_collect_state(cr->dd, state, state_global);
1608             }
1609         }
1610
1611         /* Replica exchange */
1612         bExchanged = FALSE;
1613         if (bDoReplEx)
1614         {
1615             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1616         }
1617
1618         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1619         {
1620             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1, state_global, *top_global, ir,
1621                                 imdSession, pull_work, state, &f, mdAtoms, &top, fr, vsite, constr,
1622                                 nrnb, wcycle, FALSE);
1623             shouldCheckNumberOfBondedInteractions = true;
1624             upd.setNumAtoms(state->natoms);
1625         }
1626
1627         bFirstStep = FALSE;
1628         bInitStep  = FALSE;
1629
1630         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1631         /* With all integrators, except VV, we need to retain the pressure
1632          * at the current step for coupling at the next step.
1633          */
1634         if ((state->flags & (1U << estPRES_PREV))
1635             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1636         {
1637             /* Store the pressure in t_state for pressure coupling
1638              * at the next MD step.
1639              */
1640             copy_mat(pres, state->pres_prev);
1641         }
1642
1643         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1644
1645         if ((membed != nullptr) && (!bLastStep))
1646         {
1647             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1648         }
1649
1650         cycles = wallcycle_stop(wcycle, ewcSTEP);
1651         if (DOMAINDECOMP(cr) && wcycle)
1652         {
1653             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1654         }
1655
1656         /* increase the MD step number */
1657         step++;
1658         step_rel++;
1659
1660         resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb,
1661                                     fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1662
1663         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1664         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1665     }
1666     /* End of main MD loop */
1667
1668     /* Closing TNG files can include compressing data. Therefore it is good to do that
1669      * before stopping the time measurements. */
1670     mdoutf_tng_close(outf);
1671
1672     /* Stop measuring walltime */
1673     walltime_accounting_end_time(walltime_accounting);
1674
1675     if (!thisRankHasDuty(cr, DUTY_PME))
1676     {
1677         /* Tell the PME only node to finish */
1678         gmx_pme_send_finish(cr);
1679     }
1680
1681     if (MASTER(cr))
1682     {
1683         if (ir->nstcalcenergy > 0)
1684         {
1685             energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
1686             energyOutput.printAverages(fplog, groups);
1687         }
1688     }
1689     done_mdoutf(outf);
1690
1691     if (bPMETune)
1692     {
1693         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1694     }
1695
1696     done_shellfc(fplog, shellfc, step_rel);
1697
1698     if (useReplicaExchange && MASTER(cr))
1699     {
1700         print_replica_exchange_statistics(fplog, repl_ex);
1701     }
1702
1703     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1704
1705     global_stat_destroy(gstat);
1706 }