7ac685d7f2fd5e6c8c92cda4cc538d9c4721ae2a
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/mdsetup.h"
65 #include "gromacs/domdec/partition.h"
66 #include "gromacs/essentialdynamics/edsam.h"
67 #include "gromacs/ewald/pme_load_balancing.h"
68 #include "gromacs/ewald/pme_pp.h"
69 #include "gromacs/fileio/trxio.h"
70 #include "gromacs/gmxlib/network.h"
71 #include "gromacs/gmxlib/nrnb.h"
72 #include "gromacs/gpu_utils/device_stream_manager.h"
73 #include "gromacs/gpu_utils/gpu_utils.h"
74 #include "gromacs/imd/imd.h"
75 #include "gromacs/listed_forces/listed_forces.h"
76 #include "gromacs/math/functions.h"
77 #include "gromacs/math/invertmatrix.h"
78 #include "gromacs/math/vec.h"
79 #include "gromacs/math/vectypes.h"
80 #include "gromacs/mdlib/checkpointhandler.h"
81 #include "gromacs/mdlib/compute_io.h"
82 #include "gromacs/mdlib/constr.h"
83 #include "gromacs/mdlib/coupling.h"
84 #include "gromacs/mdlib/ebin.h"
85 #include "gromacs/mdlib/enerdata_utils.h"
86 #include "gromacs/mdlib/energyoutput.h"
87 #include "gromacs/mdlib/expanded.h"
88 #include "gromacs/mdlib/force.h"
89 #include "gromacs/mdlib/force_flags.h"
90 #include "gromacs/mdlib/forcerec.h"
91 #include "gromacs/mdlib/freeenergyparameters.h"
92 #include "gromacs/mdlib/md_support.h"
93 #include "gromacs/mdlib/mdatoms.h"
94 #include "gromacs/mdlib/mdoutf.h"
95 #include "gromacs/mdlib/membed.h"
96 #include "gromacs/mdlib/resethandler.h"
97 #include "gromacs/mdlib/sighandler.h"
98 #include "gromacs/mdlib/simulationsignal.h"
99 #include "gromacs/mdlib/stat.h"
100 #include "gromacs/mdlib/stophandler.h"
101 #include "gromacs/mdlib/tgroup.h"
102 #include "gromacs/mdlib/trajectory_writing.h"
103 #include "gromacs/mdlib/update.h"
104 #include "gromacs/mdlib/update_constrain_gpu.h"
105 #include "gromacs/mdlib/update_vv.h"
106 #include "gromacs/mdlib/vcm.h"
107 #include "gromacs/mdlib/vsite.h"
108 #include "gromacs/mdrunutility/handlerestart.h"
109 #include "gromacs/mdrunutility/multisim.h"
110 #include "gromacs/mdrunutility/printtime.h"
111 #include "gromacs/mdtypes/awh_history.h"
112 #include "gromacs/mdtypes/awh_params.h"
113 #include "gromacs/mdtypes/commrec.h"
114 #include "gromacs/mdtypes/df_history.h"
115 #include "gromacs/mdtypes/energyhistory.h"
116 #include "gromacs/mdtypes/fcdata.h"
117 #include "gromacs/mdtypes/forcebuffers.h"
118 #include "gromacs/mdtypes/forcerec.h"
119 #include "gromacs/mdtypes/group.h"
120 #include "gromacs/mdtypes/inputrec.h"
121 #include "gromacs/mdtypes/interaction_const.h"
122 #include "gromacs/mdtypes/md_enums.h"
123 #include "gromacs/mdtypes/mdatom.h"
124 #include "gromacs/mdtypes/mdrunoptions.h"
125 #include "gromacs/mdtypes/multipletimestepping.h"
126 #include "gromacs/mdtypes/observableshistory.h"
127 #include "gromacs/mdtypes/pullhistory.h"
128 #include "gromacs/mdtypes/simulation_workload.h"
129 #include "gromacs/mdtypes/state.h"
130 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
131 #include "gromacs/modularsimulator/energydata.h"
132 #include "gromacs/nbnxm/gpu_data_mgmt.h"
133 #include "gromacs/nbnxm/nbnxm.h"
134 #include "gromacs/pbcutil/pbc.h"
135 #include "gromacs/pulling/output.h"
136 #include "gromacs/pulling/pull.h"
137 #include "gromacs/swap/swapcoords.h"
138 #include "gromacs/timing/wallcycle.h"
139 #include "gromacs/timing/walltime_accounting.h"
140 #include "gromacs/topology/atoms.h"
141 #include "gromacs/topology/idef.h"
142 #include "gromacs/topology/mtop_util.h"
143 #include "gromacs/topology/topology.h"
144 #include "gromacs/trajectory/trajectoryframe.h"
145 #include "gromacs/utility/basedefinitions.h"
146 #include "gromacs/utility/cstringutil.h"
147 #include "gromacs/utility/fatalerror.h"
148 #include "gromacs/utility/logger.h"
149 #include "gromacs/utility/real.h"
150 #include "gromacs/utility/smalloc.h"
151
152 #include "legacysimulator.h"
153 #include "replicaexchange.h"
154 #include "shellfc.h"
155
156 using gmx::SimulationSignaller;
157
158 void gmx::LegacySimulator::do_md()
159 {
160     // TODO Historically, the EM and MD "integrators" used different
161     // names for the t_inputrec *parameter, but these must have the
162     // same name, now that it's a member of a struct. We use this ir
163     // alias to avoid a large ripple of nearly useless changes.
164     // t_inputrec is being replaced by IMdpOptionsProvider, so this
165     // will go away eventually.
166     const t_inputrec* ir = inputrec;
167
168     int64_t      step, step_rel;
169     double       t, t0 = ir->init_t;
170     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
171     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
172     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
173     gmx_bool     do_ene, do_log, do_verbose;
174     gmx_bool     bMasterState;
175     unsigned int force_flags;
176     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
177     int    i, m;
178     rvec   mu_tot;
179     matrix pressureCouplingMu, M;
180     gmx_repl_ex_t     repl_ex = nullptr;
181     gmx_global_stat_t gstat;
182     gmx_shellfc_t*    shellfc;
183     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
184     gmx_bool          bTrotter;
185     real              dvdl_constr;
186     std::vector<RVec> cbuf;
187     matrix            lastbox;
188     int               lamnew = 0;
189     /* for FEP */
190     int       nstfep = 0;
191     double    cycles;
192     real      saved_conserved_quantity = 0;
193     real      last_ekin                = 0;
194     t_extmass MassQ;
195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
196
197     /* PME load balancing data for GPU kernels */
198     gmx_bool bPMETune         = FALSE;
199     gmx_bool bPMETunePrinting = FALSE;
200
201     bool bInteractiveMDstep = false;
202
203     SimulationSignals signals;
204     // Most global communnication stages don't propagate mdrun
205     // signals, and will use this object to achieve that.
206     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
207
208     if (!mdrunOptions.writeConfout)
209     {
210         // This is on by default, and the main known use case for
211         // turning it off is for convenience in benchmarking, which is
212         // something that should not show up in the general user
213         // interface.
214         GMX_LOG(mdlog.info)
215                 .asParagraph()
216                 .appendText(
217                         "The -noconfout functionality is deprecated, and may be removed in a "
218                         "future version.");
219     }
220
221     /* md-vv uses averaged full step velocities for T-control
222        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
223        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
224     bTrotter = (EI_VV(ir->eI)
225                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
226
227     const bool bRerunMD = false;
228
229     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
230     bGStatEveryStep   = (nstglobalcomm == 1);
231
232     const SimulationGroups* groups = &top_global.groups;
233
234     std::unique_ptr<EssentialDynamics> ed = nullptr;
235     if (opt2bSet("-ei", nfile, fnm))
236     {
237         /* Initialize essential dynamics sampling */
238         ed = init_edsam(mdlog,
239                         opt2fn_null("-ei", nfile, fnm),
240                         opt2fn("-eo", nfile, fnm),
241                         top_global,
242                         *ir,
243                         cr,
244                         constr,
245                         state_global,
246                         observablesHistory,
247                         oenv,
248                         startingBehavior);
249     }
250     else if (observablesHistory->edsamHistory)
251     {
252         gmx_fatal(FARGS,
253                   "The checkpoint is from a run with essential dynamics sampling, "
254                   "but the current run did not specify the -ei option. "
255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
256     }
257
258     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
259     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
260     initialize_lambdas(
261             fplog, *ir, gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc), MASTER(cr), fep_state, lambda);
262     Update upd(*ir, deform);
263     bool   doSimulatedAnnealing = false;
264     {
265         // TODO: Avoid changing inputrec (#3854)
266         // Simulated annealing updates the reference temperature.
267         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
268         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
269     }
270     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
271
272     const t_fcdata& fcdata = *fr->fcdata;
273
274     bool simulationsShareState = false;
275     int  nstSignalComm         = nstglobalcomm;
276     {
277         // TODO This implementation of ensemble orientation restraints is nasty because
278         // a user can't just do multi-sim with single-sim orientation restraints.
279         bool usingEnsembleRestraints =
280                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
281         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
282
283         // Replica exchange, ensemble restraints and AWH need all
284         // simulations to remain synchronized, so they need
285         // checkpoints and stop conditions to act on the same step, so
286         // the propagation of such signals must take place between
287         // simulations, not just within simulations.
288         // TODO: Make algorithm initializers set these flags.
289         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
290
291         if (simulationsShareState)
292         {
293             // Inter-simulation signal communication does not need to happen
294             // often, so we use a minimum of 200 steps to reduce overhead.
295             const int c_minimumInterSimulationSignallingInterval = 200;
296             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
297                             * nstglobalcomm;
298         }
299     }
300
301     if (startingBehavior != StartingBehavior::RestartWithAppending)
302     {
303         pleaseCiteCouplingAlgorithms(fplog, *ir);
304     }
305     gmx_mdoutf*       outf = init_mdoutf(fplog,
306                                    nfile,
307                                    fnm,
308                                    mdrunOptions,
309                                    cr,
310                                    outputProvider,
311                                    mdModulesNotifiers,
312                                    ir,
313                                    top_global,
314                                    oenv,
315                                    wcycle,
316                                    startingBehavior,
317                                    simulationsShareState,
318                                    ms);
319     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
320                                    top_global,
321                                    *ir,
322                                    pull_work,
323                                    mdoutf_get_fp_dhdl(outf),
324                                    false,
325                                    startingBehavior,
326                                    simulationsShareState,
327                                    mdModulesNotifiers);
328
329     gstat = global_stat_init(ir);
330
331     const auto& simulationWork     = runScheduleWork->simulationWork;
332     const bool  useGpuForPme       = simulationWork.useGpuPme;
333     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
334     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
335     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
336
337     /* Check for polarizable models and flexible constraints */
338     shellfc = init_shell_flexcon(fplog,
339                                  top_global,
340                                  constr ? constr->numFlexibleConstraints() : 0,
341                                  ir->nstcalcenergy,
342                                  DOMAINDECOMP(cr),
343                                  useGpuForPme);
344
345     {
346         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
347         if ((io > 2000) && MASTER(cr))
348         {
349             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
350         }
351     }
352
353     // Local state only becomes valid now.
354     std::unique_ptr<t_state> stateInstance;
355     t_state*                 state;
356
357     gmx_localtop_t top(top_global.ffparams);
358
359     auto mdatoms = mdAtoms->mdatoms();
360
361     ForceBuffers f(fr->useMts,
362                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
363                            ? PinningPolicy::PinnedIfSupported
364                            : PinningPolicy::CannotBePinned);
365     if (DOMAINDECOMP(cr))
366     {
367         stateInstance = std::make_unique<t_state>();
368         state         = stateInstance.get();
369         dd_init_local_state(*cr->dd, state_global, state);
370
371         /* Distribute the charge groups over the nodes from the master node */
372         dd_partition_system(fplog,
373                             mdlog,
374                             ir->init_step,
375                             cr,
376                             TRUE,
377                             1,
378                             state_global,
379                             top_global,
380                             *ir,
381                             imdSession,
382                             pull_work,
383                             state,
384                             &f,
385                             mdAtoms,
386                             &top,
387                             fr,
388                             vsite,
389                             constr,
390                             nrnb,
391                             nullptr,
392                             FALSE);
393         upd.setNumAtoms(state->natoms);
394     }
395     else
396     {
397         state_change_natoms(state_global, state_global->natoms);
398         /* Copy the pointer to the global state */
399         state = state_global;
400
401         /* Generate and initialize new topology */
402         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
403
404         upd.setNumAtoms(state->natoms);
405     }
406
407     std::unique_ptr<UpdateConstrainGpu> integrator;
408
409     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
410
411     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
412     if (useGpuForUpdate)
413     {
414         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
415                                    || constr->numConstraintsTotal() == 0,
416                            "Constraints in domain decomposition are only supported with update "
417                            "groups if using GPU update.\n");
418         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
419                                    || constr->numConstraintsTotal() == 0,
420                            "SHAKE is not supported with GPU update.");
421         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
422                            "Either PME or short-ranged non-bonded interaction tasks must run on "
423                            "the GPU to use GPU update.\n");
424         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
425                            "Only the md integrator is supported with the GPU update.\n");
426         GMX_RELEASE_ASSERT(
427                 ir->etc != TemperatureCoupling::NoseHoover,
428                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
429         GMX_RELEASE_ASSERT(
430                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
431                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
432                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
433                 "with the GPU update.\n");
434         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
435                            "Virtual sites are not supported with the GPU update.\n");
436         GMX_RELEASE_ASSERT(ed == nullptr,
437                            "Essential dynamics is not supported with the GPU update.\n");
438         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
439                            "Constraints pulling is not supported with the GPU update.\n");
440         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
441                            "Orientation restraints are not supported with the GPU update.\n");
442         GMX_RELEASE_ASSERT(
443                 ir->efep == FreeEnergyPerturbationType::No
444                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
445                 "Free energy perturbation of masses and constraints are not supported with the GPU "
446                 "update.");
447
448         if (constr != nullptr && constr->numConstraintsTotal() > 0)
449         {
450             GMX_LOG(mdlog.info)
451                     .asParagraph()
452                     .appendText("Updating coordinates and applying constraints on the GPU.");
453         }
454         else
455         {
456             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
457         }
458         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
459                            "Device stream manager should be initialized in order to use GPU "
460                            "update-constraints.");
461         GMX_RELEASE_ASSERT(
462                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
463                 "Update stream should be initialized in order to use GPU "
464                 "update-constraints.");
465         integrator = std::make_unique<UpdateConstrainGpu>(
466                 *ir,
467                 top_global,
468                 ekind->ngtc,
469                 fr->deviceStreamManager->context(),
470                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
471                 stateGpu->xUpdatedOnDevice(),
472                 wcycle);
473
474         integrator->setPbc(PbcType::Xyz, state->box);
475     }
476
477     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
478     {
479         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
480     }
481     if (useGpuForUpdate)
482     {
483         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
484     }
485
486     // NOTE: The global state is no longer used at this point.
487     // But state_global is still used as temporary storage space for writing
488     // the global state to file and potentially for replica exchange.
489     // (Global topology should persist.)
490
491     update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
492
493     if (ir->bExpanded)
494     {
495         /* Check nstexpanded here, because the grompp check was broken */
496         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
497         {
498             gmx_fatal(FARGS,
499                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
500         }
501         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
502     }
503
504     if (MASTER(cr))
505     {
506         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
507     }
508
509     preparePrevStepPullCom(ir,
510                            pull_work,
511                            gmx::arrayRefFromArray(mdatoms->massT, mdatoms->nr),
512                            state,
513                            state_global,
514                            cr,
515                            startingBehavior != StartingBehavior::NewSimulation);
516
517     // TODO: Remove this by converting AWH into a ForceProvider
518     auto awh = prepareAwhModule(fplog,
519                                 *ir,
520                                 state_global,
521                                 cr,
522                                 ms,
523                                 startingBehavior != StartingBehavior::NewSimulation,
524                                 shellfc != nullptr,
525                                 opt2fn("-awh", nfile, fnm),
526                                 pull_work);
527
528     if (useReplicaExchange && MASTER(cr))
529     {
530         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
531     }
532     /* PME tuning is only supported in the Verlet scheme, with PME for
533      * Coulomb. It is not supported with only LJ PME. */
534     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
535                 && ir->cutoff_scheme != CutoffScheme::Group);
536
537     pme_load_balancing_t* pme_loadbal = nullptr;
538     if (bPMETune)
539     {
540         pme_loadbal_init(
541                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
542     }
543
544     if (!ir->bContinuation)
545     {
546         if (state->flags & enumValueToBitMask(StateEntry::V))
547         {
548             auto v = makeArrayRef(state->v);
549             /* Set the velocities of vsites, shells and frozen atoms to zero */
550             for (i = 0; i < mdatoms->homenr; i++)
551             {
552                 if (mdatoms->ptype[i] == ParticleType::Shell)
553                 {
554                     clear_rvec(v[i]);
555                 }
556                 else if (mdatoms->cFREEZE)
557                 {
558                     for (m = 0; m < DIM; m++)
559                     {
560                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
561                         {
562                             v[i][m] = 0;
563                         }
564                     }
565                 }
566             }
567         }
568
569         if (constr)
570         {
571             /* Constrain the initial coordinates and velocities */
572             do_constrain_first(fplog,
573                                constr,
574                                ir,
575                                mdatoms->nr,
576                                mdatoms->homenr,
577                                state->x.arrayRefWithPadding(),
578                                state->v.arrayRefWithPadding(),
579                                state->box,
580                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
581         }
582     }
583
584     if (ir->efep != FreeEnergyPerturbationType::No)
585     {
586         /* Set free energy calculation frequency as the greatest common
587          * denominator of nstdhdl and repl_ex_nst. */
588         nstfep = ir->fepvals->nstdhdl;
589         if (ir->bExpanded)
590         {
591             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
592         }
593         if (useReplicaExchange)
594         {
595             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
596         }
597         if (ir->bDoAwh)
598         {
599             nstfep = std::gcd(ir->awhParams->nstSampleCoord(), nstfep);
600         }
601     }
602
603     /* Be REALLY careful about what flags you set here. You CANNOT assume
604      * this is the first step, since we might be restarting from a checkpoint,
605      * and in that case we should not do any modifications to the state.
606      */
607     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
608
609     // When restarting from a checkpoint, it can be appropriate to
610     // initialize ekind from quantities in the checkpoint. Otherwise,
611     // compute_globals must initialize ekind before the simulation
612     // starts/restarts. However, only the master rank knows what was
613     // found in the checkpoint file, so we have to communicate in
614     // order to coordinate the restart.
615     //
616     // TODO Consider removing this communication if/when checkpoint
617     // reading directly follows .tpr reading, because all ranks can
618     // agree on hasReadEkinState at that time.
619     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
620     if (PAR(cr))
621     {
622         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
623     }
624     if (hasReadEkinState)
625     {
626         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
627     }
628
629     unsigned int cglo_flags =
630             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
631              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
632
633     bSumEkinhOld = FALSE;
634
635     t_vcm vcm(top_global.groups, *ir);
636     reportComRemovalInfo(fplog, vcm);
637
638     /* To minimize communication, compute_globals computes the COM velocity
639      * and the kinetic energy for the velocities without COM motion removed.
640      * Thus to get the kinetic energy without the COM contribution, we need
641      * to call compute_globals twice.
642      */
643     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
644     {
645         unsigned int cglo_flags_iteration = cglo_flags;
646         if (bStopCM && cgloIteration == 0)
647         {
648             cglo_flags_iteration |= CGLO_STOPCM;
649             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
650         }
651         if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd) && cgloIteration == 0)
652         {
653             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
654         }
655         compute_globals(gstat,
656                         cr,
657                         ir,
658                         fr,
659                         ekind,
660                         makeConstArrayRef(state->x),
661                         makeConstArrayRef(state->v),
662                         state->box,
663                         mdatoms,
664                         nrnb,
665                         &vcm,
666                         nullptr,
667                         enerd,
668                         force_vir,
669                         shake_vir,
670                         total_vir,
671                         pres,
672                         gmx::ArrayRef<real>{},
673                         &nullSignaller,
674                         state->box,
675                         &bSumEkinhOld,
676                         cglo_flags_iteration);
677         if (cglo_flags_iteration & CGLO_STOPCM)
678         {
679             /* At initialization, do not pass x with acceleration-correction mode
680              * to avoid (incorrect) correction of the initial coordinates.
681              */
682             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
683                              ? ArrayRef<RVec>()
684                              : makeArrayRef(state->x);
685             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
686             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
687         }
688     }
689     if (DOMAINDECOMP(cr))
690     {
691         checkNumberOfBondedInteractions(
692                 mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
693     }
694     if (ir->eI == IntegrationAlgorithm::VVAK)
695     {
696         /* a second call to get the half step temperature initialized as well */
697         /* we do the same call as above, but turn the pressure off -- internally to
698            compute_globals, this is recognized as a velocity verlet half-step
699            kinetic energy calculation.  This minimized excess variables, but
700            perhaps loses some logic?*/
701
702         compute_globals(gstat,
703                         cr,
704                         ir,
705                         fr,
706                         ekind,
707                         makeConstArrayRef(state->x),
708                         makeConstArrayRef(state->v),
709                         state->box,
710                         mdatoms,
711                         nrnb,
712                         &vcm,
713                         nullptr,
714                         enerd,
715                         force_vir,
716                         shake_vir,
717                         total_vir,
718                         pres,
719                         gmx::ArrayRef<real>{},
720                         &nullSignaller,
721                         state->box,
722                         &bSumEkinhOld,
723                         cglo_flags & ~CGLO_PRESSURE);
724     }
725
726     /* Calculate the initial half step temperature, and save the ekinh_old */
727     if (startingBehavior == StartingBehavior::NewSimulation)
728     {
729         for (i = 0; (i < ir->opts.ngtc); i++)
730         {
731             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
732         }
733     }
734
735     /* need to make an initiation call to get the Trotter variables set, as well as other constants
736        for non-trotter temperature control */
737     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
738
739     if (MASTER(cr))
740     {
741         if (!ir->bContinuation)
742         {
743             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
744             {
745                 fprintf(fplog,
746                         "RMS relative constraint deviation after constraining: %.2e\n",
747                         constr->rmsd());
748             }
749             if (EI_STATE_VELOCITY(ir->eI))
750             {
751                 real temp = enerd->term[F_TEMP];
752                 if (ir->eI != IntegrationAlgorithm::VV)
753                 {
754                     /* Result of Ekin averaged over velocities of -half
755                      * and +half step, while we only have -half step here.
756                      */
757                     temp *= 2;
758                 }
759                 fprintf(fplog, "Initial temperature: %g K\n", temp);
760             }
761         }
762
763         char tbuf[20];
764         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
765         if (ir->nsteps >= 0)
766         {
767             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
768         }
769         else
770         {
771             sprintf(tbuf, "%s", "infinite");
772         }
773         if (ir->init_step > 0)
774         {
775             fprintf(stderr,
776                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
777                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
778                     tbuf,
779                     gmx_step_str(ir->init_step, sbuf2),
780                     ir->init_step * ir->delta_t);
781         }
782         else
783         {
784             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
785         }
786         fprintf(fplog, "\n");
787     }
788
789     walltime_accounting_start_time(walltime_accounting);
790     wallcycle_start(wcycle, ewcRUN);
791     print_start(fplog, cr, walltime_accounting, "mdrun");
792
793     /***********************************************************
794      *
795      *             Loop over MD steps
796      *
797      ************************************************************/
798
799     bFirstStep = TRUE;
800     /* Skip the first Nose-Hoover integration when we get the state from tpx */
801     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
802     bSumEkinhOld     = FALSE;
803     bExchanged       = FALSE;
804     bNeedRepartition = FALSE;
805
806     step     = ir->init_step;
807     step_rel = 0;
808
809     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
810             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
811             simulationsShareState,
812             MASTER(cr),
813             ir->nstlist,
814             mdrunOptions.reproducible,
815             nstSignalComm,
816             mdrunOptions.maximumHoursToRun,
817             ir->nstlist == 0,
818             fplog,
819             step,
820             bNS,
821             walltime_accounting);
822
823     auto checkpointHandler = std::make_unique<CheckpointHandler>(
824             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
825             simulationsShareState,
826             ir->nstlist == 0,
827             MASTER(cr),
828             mdrunOptions.writeConfout,
829             mdrunOptions.checkpointOptions.period);
830
831     const bool resetCountersIsLocal = true;
832     auto       resetHandler         = std::make_unique<ResetHandler>(
833             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
834             !resetCountersIsLocal,
835             ir->nsteps,
836             MASTER(cr),
837             mdrunOptions.timingOptions.resetHalfway,
838             mdrunOptions.maximumHoursToRun,
839             mdlog,
840             wcycle,
841             walltime_accounting);
842
843     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
844
845     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
846     {
847         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
848     }
849
850     /* and stop now if we should */
851     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
852     while (!bLastStep)
853     {
854
855         /* Determine if this is a neighbor search step */
856         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
857
858         if (bPMETune && bNStList)
859         {
860             // This has to be here because PME load balancing is called so early.
861             // TODO: Move to after all booleans are defined.
862             if (useGpuForUpdate && !bFirstStep)
863             {
864                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
865                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
866             }
867             /* PME grid + cut-off optimization with GPUs or PME nodes */
868             pme_loadbal_do(pme_loadbal,
869                            cr,
870                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
871                            fplog,
872                            mdlog,
873                            *ir,
874                            fr,
875                            state->box,
876                            state->x,
877                            wcycle,
878                            step,
879                            step_rel,
880                            &bPMETunePrinting,
881                            simulationWork.useGpuPmePpCommunication);
882         }
883
884         wallcycle_start(wcycle, ewcSTEP);
885
886         bLastStep = (step_rel == ir->nsteps);
887         t         = t0 + step * ir->delta_t;
888
889         // TODO Refactor this, so that nstfep does not need a default value of zero
890         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
891         {
892             /* find and set the current lambdas */
893             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
894
895             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
896             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
897             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
898                            && (!bFirstStep));
899         }
900
901         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
902                      && do_per_step(step, replExParams.exchangeInterval));
903
904         if (doSimulatedAnnealing)
905         {
906             // TODO: Avoid changing inputrec (#3854)
907             // Simulated annealing updates the reference temperature.
908             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
909             update_annealing_target_temp(nonConstInputrec, t, &upd);
910         }
911
912         /* Stop Center of Mass motion */
913         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
914
915         /* Determine whether or not to do Neighbour Searching */
916         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
917
918         /* Note that the stopHandler will cause termination at nstglobalcomm
919          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
920          * nstpcouple steps, we have computed the half-step kinetic energy
921          * of the previous step and can always output energies at the last step.
922          */
923         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
924
925         /* do_log triggers energy and virial calculation. Because this leads
926          * to different code paths, forces can be different. Thus for exact
927          * continuation we should avoid extra log output.
928          * Note that the || bLastStep can result in non-exact continuation
929          * beyond the last step. But we don't consider that to be an issue.
930          */
931         do_log     = (do_per_step(step, ir->nstlog)
932                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
933         do_verbose = mdrunOptions.verbose
934                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
935
936         if (useGpuForUpdate && !bFirstStep && bNS)
937         {
938             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
939             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
940             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
941             // Copy coordinate from the GPU when needed at the search step.
942             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
943             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
944             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
945             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
946         }
947
948         // We only need to calculate virtual velocities if we are writing them in the current step
949         const bool needVirtualVelocitiesThisStep =
950                 (vsite != nullptr)
951                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
952
953         if (vsite != nullptr)
954         {
955             // Virtual sites need to be updated before domain decomposition and forces are calculated
956             wallcycle_start(wcycle, ewcVSITECONSTR);
957             // md-vv calculates virtual velocities once it has full-step real velocities
958             vsite->construct(state->x,
959                              state->v,
960                              state->box,
961                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
962                                      ? VSiteOperation::PositionsAndVelocities
963                                      : VSiteOperation::Positions);
964             wallcycle_stop(wcycle, ewcVSITECONSTR);
965         }
966
967         if (bNS && !(bFirstStep && ir->bContinuation))
968         {
969             bMasterState = FALSE;
970             /* Correct the new box if it is too skewed */
971             if (inputrecDynamicBox(ir))
972             {
973                 if (correct_box(fplog, step, state->box))
974                 {
975                     bMasterState = TRUE;
976                     // If update is offloaded, it should be informed about the box size change
977                     if (useGpuForUpdate)
978                     {
979                         integrator->setPbc(PbcType::Xyz, state->box);
980                     }
981                 }
982             }
983             if (DOMAINDECOMP(cr) && bMasterState)
984             {
985                 dd_collect_state(cr->dd, state, state_global);
986             }
987
988             if (DOMAINDECOMP(cr))
989             {
990                 /* Repartition the domain decomposition */
991                 dd_partition_system(fplog,
992                                     mdlog,
993                                     step,
994                                     cr,
995                                     bMasterState,
996                                     nstglobalcomm,
997                                     state_global,
998                                     top_global,
999                                     *ir,
1000                                     imdSession,
1001                                     pull_work,
1002                                     state,
1003                                     &f,
1004                                     mdAtoms,
1005                                     &top,
1006                                     fr,
1007                                     vsite,
1008                                     constr,
1009                                     nrnb,
1010                                     wcycle,
1011                                     do_verbose && !bPMETunePrinting);
1012                 upd.setNumAtoms(state->natoms);
1013             }
1014         }
1015
1016         // Allocate or re-size GPU halo exchange object, if necessary
1017         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1018         {
1019             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1020                                "GPU device manager has to be initialized to use GPU "
1021                                "version of halo exchange.");
1022             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1023         }
1024
1025         if (MASTER(cr) && do_log)
1026         {
1027             gmx::EnergyOutput::printHeader(
1028                     fplog, step, t); /* can we improve the information printed here? */
1029         }
1030
1031         if (ir->efep != FreeEnergyPerturbationType::No)
1032         {
1033             update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1034         }
1035
1036         if (bExchanged)
1037         {
1038             /* We need the kinetic energy at minus the half step for determining
1039              * the full step kinetic energy and possibly for T-coupling.*/
1040             /* This may not be quite working correctly yet . . . . */
1041             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1042             if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd))
1043             {
1044                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1045             }
1046             compute_globals(gstat,
1047                             cr,
1048                             ir,
1049                             fr,
1050                             ekind,
1051                             makeConstArrayRef(state->x),
1052                             makeConstArrayRef(state->v),
1053                             state->box,
1054                             mdatoms,
1055                             nrnb,
1056                             &vcm,
1057                             wcycle,
1058                             enerd,
1059                             nullptr,
1060                             nullptr,
1061                             nullptr,
1062                             nullptr,
1063                             gmx::ArrayRef<real>{},
1064                             &nullSignaller,
1065                             state->box,
1066                             &bSumEkinhOld,
1067                             cglo_flags);
1068             if (DOMAINDECOMP(cr))
1069             {
1070                 checkNumberOfBondedInteractions(
1071                         mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1072             }
1073         }
1074         clear_mat(force_vir);
1075
1076         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1077
1078         /* Determine the energy and pressure:
1079          * at nstcalcenergy steps and at energy output steps (set below).
1080          */
1081         if (EI_VV(ir->eI) && (!bInitStep))
1082         {
1083             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1084             bCalcVir      = bCalcEnerStep
1085                        || (ir->epc != PressureCoupling::No
1086                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1087         }
1088         else
1089         {
1090             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1091             bCalcVir      = bCalcEnerStep
1092                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1093         }
1094         bCalcEner = bCalcEnerStep;
1095
1096         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1097
1098         if (do_ene || do_log || bDoReplEx)
1099         {
1100             bCalcVir  = TRUE;
1101             bCalcEner = TRUE;
1102         }
1103
1104         /* Do we need global communication ? */
1105         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1106                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1107
1108         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1109                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1110                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1111         if (fr->useMts && !do_per_step(step, ir->nstfout))
1112         {
1113             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1114         }
1115
1116         if (shellfc)
1117         {
1118             /* Now is the time to relax the shells */
1119             relax_shell_flexcon(fplog,
1120                                 cr,
1121                                 ms,
1122                                 mdrunOptions.verbose,
1123                                 enforcedRotation,
1124                                 step,
1125                                 ir,
1126                                 imdSession,
1127                                 pull_work,
1128                                 bNS,
1129                                 force_flags,
1130                                 &top,
1131                                 constr,
1132                                 enerd,
1133                                 state->natoms,
1134                                 state->x.arrayRefWithPadding(),
1135                                 state->v.arrayRefWithPadding(),
1136                                 state->box,
1137                                 state->lambda,
1138                                 &state->hist,
1139                                 &f.view(),
1140                                 force_vir,
1141                                 *mdatoms,
1142                                 nrnb,
1143                                 wcycle,
1144                                 shellfc,
1145                                 fr,
1146                                 runScheduleWork,
1147                                 t,
1148                                 mu_tot,
1149                                 vsite,
1150                                 ddBalanceRegionHandler);
1151         }
1152         else
1153         {
1154             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1155                is updated (or the AWH update will be performed twice for one step when continuing).
1156                It would be best to call this update function from do_md_trajectory_writing but that
1157                would occur after do_force. One would have to divide the update_awh function into one
1158                function applying the AWH force and one doing the AWH bias update. The update AWH
1159                bias function could then be called after do_md_trajectory_writing (then containing
1160                update_awh_history). The checkpointing will in the future probably moved to the start
1161                of the md loop which will rid of this issue. */
1162             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1163             {
1164                 awh->updateHistory(state_global->awhHistory.get());
1165             }
1166
1167             /* The coordinates (x) are shifted (to get whole molecules)
1168              * in do_force.
1169              * This is parallellized as well, and does communication too.
1170              * Check comments in sim_util.c
1171              */
1172             do_force(fplog,
1173                      cr,
1174                      ms,
1175                      *ir,
1176                      awh.get(),
1177                      enforcedRotation,
1178                      imdSession,
1179                      pull_work,
1180                      step,
1181                      nrnb,
1182                      wcycle,
1183                      &top,
1184                      state->box,
1185                      state->x.arrayRefWithPadding(),
1186                      &state->hist,
1187                      &f.view(),
1188                      force_vir,
1189                      mdatoms,
1190                      enerd,
1191                      state->lambda,
1192                      fr,
1193                      runScheduleWork,
1194                      vsite,
1195                      mu_tot,
1196                      t,
1197                      ed ? ed->getLegacyED() : nullptr,
1198                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1199                      ddBalanceRegionHandler);
1200         }
1201
1202         // VV integrators do not need the following velocity half step
1203         // if it is the first step after starting from a checkpoint.
1204         // That is, the half step is needed on all other steps, and
1205         // also the first step when starting from a .tpr file.
1206         if (EI_VV(ir->eI))
1207         {
1208             integrateVVFirstStep(step,
1209                                  bFirstStep,
1210                                  bInitStep,
1211                                  startingBehavior,
1212                                  nstglobalcomm,
1213                                  ir,
1214                                  fr,
1215                                  cr,
1216                                  state,
1217                                  mdatoms,
1218                                  fcdata,
1219                                  &MassQ,
1220                                  &vcm,
1221                                  top_global,
1222                                  top,
1223                                  enerd,
1224                                  ekind,
1225                                  gstat,
1226                                  &last_ekin,
1227                                  bCalcVir,
1228                                  total_vir,
1229                                  shake_vir,
1230                                  force_vir,
1231                                  pres,
1232                                  M,
1233                                  do_log,
1234                                  do_ene,
1235                                  bCalcEner,
1236                                  bGStat,
1237                                  bStopCM,
1238                                  bTrotter,
1239                                  bExchanged,
1240                                  &bSumEkinhOld,
1241                                  &saved_conserved_quantity,
1242                                  &f,
1243                                  &upd,
1244                                  constr,
1245                                  &nullSignaller,
1246                                  trotter_seq,
1247                                  nrnb,
1248                                  mdlog,
1249                                  fplog,
1250                                  wcycle);
1251             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1252             {
1253                 // Positions were calculated earlier
1254                 wallcycle_start(wcycle, ewcVSITECONSTR);
1255                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1256                 wallcycle_stop(wcycle, ewcVSITECONSTR);
1257             }
1258         }
1259
1260         /* ########  END FIRST UPDATE STEP  ############## */
1261         /* ########  If doing VV, we now have v(dt) ###### */
1262         if (bDoExpanded)
1263         {
1264             /* perform extended ensemble sampling in lambda - we don't
1265                actually move to the new state before outputting
1266                statistics, but if performing simulated tempering, we
1267                do update the velocities and the tau_t. */
1268             // TODO: Avoid changing inputrec (#3854)
1269             // Simulated tempering updates the reference temperature.
1270             // Expanded ensemble without simulated tempering does not change the inputrec.
1271             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1272             lamnew                 = ExpandedEnsembleDynamics(fplog,
1273                                               nonConstInputrec,
1274                                               enerd,
1275                                               state,
1276                                               &MassQ,
1277                                               state->fep_state,
1278                                               state->dfhist,
1279                                               step,
1280                                               state->v.rvec_array(),
1281                                               mdatoms);
1282             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1283             if (MASTER(cr))
1284             {
1285                 copy_df_history(state_global->dfhist, state->dfhist);
1286             }
1287         }
1288
1289         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1290         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1291         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1292             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1293                 || checkpointHandler->isCheckpointingStep()))
1294         {
1295             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1296             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1297         }
1298         // Copy velocities if needed for the output/checkpointing.
1299         // NOTE: Copy on the search steps is done at the beginning of the step.
1300         if (useGpuForUpdate && !bNS
1301             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1302         {
1303             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1304             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1305         }
1306         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1307         // and update is offloaded hence forces are kept on the GPU for update and have not been
1308         // already transferred in do_force().
1309         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1310         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1311         //       prior to GPU update.
1312         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1313         //       copy call in do_force(...).
1314         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1315         //       on host after the D2H copy in do_force(...).
1316         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1317             && do_per_step(step, ir->nstfout))
1318         {
1319             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1320             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1321         }
1322         /* Now we have the energies and forces corresponding to the
1323          * coordinates at time t. We must output all of this before
1324          * the update.
1325          */
1326         do_md_trajectory_writing(fplog,
1327                                  cr,
1328                                  nfile,
1329                                  fnm,
1330                                  step,
1331                                  step_rel,
1332                                  t,
1333                                  ir,
1334                                  state,
1335                                  state_global,
1336                                  observablesHistory,
1337                                  top_global,
1338                                  fr,
1339                                  outf,
1340                                  energyOutput,
1341                                  ekind,
1342                                  f.view().force(),
1343                                  checkpointHandler->isCheckpointingStep(),
1344                                  bRerunMD,
1345                                  bLastStep,
1346                                  mdrunOptions.writeConfout,
1347                                  bSumEkinhOld);
1348         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1349         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1350
1351         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1352         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1353             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1354         {
1355             copy_mat(state->svir_prev, shake_vir);
1356             copy_mat(state->fvir_prev, force_vir);
1357         }
1358
1359         stopHandler->setSignal();
1360         resetHandler->setSignal(walltime_accounting);
1361
1362         if (bGStat || !PAR(cr))
1363         {
1364             /* In parallel we only have to check for checkpointing in steps
1365              * where we do global communication,
1366              *  otherwise the other nodes don't know.
1367              */
1368             checkpointHandler->setSignal(walltime_accounting);
1369         }
1370
1371         /* #########   START SECOND UPDATE STEP ################# */
1372
1373         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1374            controlled in preprocessing */
1375
1376         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1377         {
1378             gmx_bool bIfRandomize;
1379             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1380             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1381             if (constr && bIfRandomize)
1382             {
1383                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1384             }
1385         }
1386         /* Box is changed in update() when we do pressure coupling,
1387          * but we should still use the old box for energy corrections and when
1388          * writing it to the energy file, so it matches the trajectory files for
1389          * the same timestep above. Make a copy in a separate array.
1390          */
1391         copy_mat(state->box, lastbox);
1392
1393         dvdl_constr = 0;
1394
1395         if (!useGpuForUpdate)
1396         {
1397             wallcycle_start(wcycle, ewcUPDATE);
1398         }
1399         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1400         if (bTrotter)
1401         {
1402             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1403             /* We can only do Berendsen coupling after we have summed
1404              * the kinetic energy or virial. Since the happens
1405              * in global_state after update, we should only do it at
1406              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1407              */
1408         }
1409         else
1410         {
1411             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1412             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1413         }
1414
1415         /* With leap-frog type integrators we compute the kinetic energy
1416          * at a whole time step as the average of the half-time step kinetic
1417          * energies of two subsequent steps. Therefore we need to compute the
1418          * half step kinetic energy also if we need energies at the next step.
1419          */
1420         const bool needHalfStepKineticEnergy =
1421                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1422
1423         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1424         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1425         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1426                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1427
1428         if (EI_VV(ir->eI))
1429         {
1430             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1431
1432             integrateVVSecondStep(step,
1433                                   ir,
1434                                   fr,
1435                                   cr,
1436                                   state,
1437                                   mdatoms,
1438                                   fcdata,
1439                                   &MassQ,
1440                                   &vcm,
1441                                   pull_work,
1442                                   enerd,
1443                                   ekind,
1444                                   gstat,
1445                                   &dvdl_constr,
1446                                   bCalcVir,
1447                                   total_vir,
1448                                   shake_vir,
1449                                   force_vir,
1450                                   pres,
1451                                   M,
1452                                   lastbox,
1453                                   do_log,
1454                                   do_ene,
1455                                   bGStat,
1456                                   &bSumEkinhOld,
1457                                   &f,
1458                                   &cbuf,
1459                                   &upd,
1460                                   constr,
1461                                   &nullSignaller,
1462                                   trotter_seq,
1463                                   nrnb,
1464                                   wcycle);
1465         }
1466         else
1467         {
1468             if (useGpuForUpdate)
1469             {
1470                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1471                 {
1472                     integrator->set(stateGpu->getCoordinates(),
1473                                     stateGpu->getVelocities(),
1474                                     stateGpu->getForces(),
1475                                     top.idef,
1476                                     *mdatoms);
1477
1478                     // Copy data to the GPU after buffers might have being reinitialized
1479                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1480                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1481                 }
1482
1483                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1484                     && !thisRankHasDuty(cr, DUTY_PME))
1485                 {
1486                     // The PME forces were recieved to the host, so have to be copied
1487                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1488                 }
1489                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1490                 {
1491                     // The buffer ops were not offloaded this step, so the forces are on the
1492                     // host and have to be copied
1493                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1494                 }
1495
1496                 const bool doTemperatureScaling =
1497                         (ir->etc != TemperatureCoupling::No
1498                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1499
1500                 // This applies Leap-Frog, LINCS and SETTLE in succession
1501                 integrator->integrate(
1502                         stateGpu->getForcesReadyOnDeviceEvent(
1503                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1504                         ir->delta_t,
1505                         true,
1506                         bCalcVir,
1507                         shake_vir,
1508                         doTemperatureScaling,
1509                         ekind->tcstat,
1510                         doParrinelloRahman,
1511                         ir->nstpcouple * ir->delta_t,
1512                         M);
1513
1514                 // Copy velocities D2H after update if:
1515                 // - Globals are computed this step (includes the energy output steps).
1516                 // - Temperature is needed for the next step.
1517                 if (bGStat || needHalfStepKineticEnergy)
1518                 {
1519                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1520                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1521                 }
1522             }
1523             else
1524             {
1525                 /* With multiple time stepping we need to do an additional normal
1526                  * update step to obtain the virial, as the actual MTS integration
1527                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1528                  * Using that acceleration would result in a virial with the slow
1529                  * force contribution would be a factor mtsFactor too large.
1530                  */
1531                 if (fr->useMts && bCalcVir && constr != nullptr)
1532                 {
1533                     upd.update_for_constraint_virial(
1534                             *ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1535
1536                     constrain_coordinates(constr,
1537                                           do_log,
1538                                           do_ene,
1539                                           step,
1540                                           state,
1541                                           upd.xp()->arrayRefWithPadding(),
1542                                           &dvdl_constr,
1543                                           bCalcVir,
1544                                           shake_vir);
1545                 }
1546
1547                 ArrayRefWithPadding<const RVec> forceCombined =
1548                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1549                                 ? f.view().forceMtsCombinedWithPadding()
1550                                 : f.view().forceWithPadding();
1551                 upd.update_coords(
1552                         *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
1553
1554                 wallcycle_stop(wcycle, ewcUPDATE);
1555
1556                 constrain_coordinates(constr,
1557                                       do_log,
1558                                       do_ene,
1559                                       step,
1560                                       state,
1561                                       upd.xp()->arrayRefWithPadding(),
1562                                       &dvdl_constr,
1563                                       bCalcVir && !fr->useMts,
1564                                       shake_vir);
1565
1566                 upd.update_sd_second_half(
1567                         *ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle, constr, do_log, do_ene);
1568                 upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1569             }
1570
1571             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1572             {
1573                 updatePrevStepPullCom(pull_work, state);
1574             }
1575
1576             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1577         }
1578
1579         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1580         /* With Leap-Frog we can skip compute_globals at
1581          * non-communication steps, but we need to calculate
1582          * the kinetic energy one step before communication.
1583          */
1584         {
1585             // Organize to do inter-simulation signalling on steps if
1586             // and when algorithms require it.
1587             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1588
1589             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1590             {
1591                 // Copy coordinates when needed to stop the CM motion.
1592                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1593                 {
1594                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1595                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1596                 }
1597                 // Since we're already communicating at this step, we
1598                 // can propagate intra-simulation signals. Note that
1599                 // check_nstglobalcomm has the responsibility for
1600                 // choosing the value of nstglobalcomm that is one way
1601                 // bGStat becomes true, so we can't get into a
1602                 // situation where e.g. checkpointing can't be
1603                 // signalled.
1604                 bool                doIntraSimSignal = true;
1605                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1606
1607                 compute_globals(
1608                         gstat,
1609                         cr,
1610                         ir,
1611                         fr,
1612                         ekind,
1613                         makeConstArrayRef(state->x),
1614                         makeConstArrayRef(state->v),
1615                         state->box,
1616                         mdatoms,
1617                         nrnb,
1618                         &vcm,
1619                         wcycle,
1620                         enerd,
1621                         force_vir,
1622                         shake_vir,
1623                         total_vir,
1624                         pres,
1625                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1626                                                                            : gmx::ArrayRef<real>{},
1627                         &signaller,
1628                         lastbox,
1629                         &bSumEkinhOld,
1630                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1631                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1632                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1633                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1634                                 | (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd)
1635                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1636                                            : 0));
1637                 if (DOMAINDECOMP(cr))
1638                 {
1639                     checkNumberOfBondedInteractions(
1640                             mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1641                 }
1642                 if (!EI_VV(ir->eI) && bStopCM)
1643                 {
1644                     process_and_stopcm_grp(
1645                             fplog, &vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
1646                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1647
1648                     // TODO: The special case of removing CM motion should be dealt more gracefully
1649                     if (useGpuForUpdate)
1650                     {
1651                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1652                         // Here we block until the H2D copy completes because event sync with the
1653                         // force kernels that use the coordinates on the next steps is not implemented
1654                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1655                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1656                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1657                         if (vcm.mode != ComRemovalAlgorithm::No)
1658                         {
1659                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1660                         }
1661                     }
1662                 }
1663             }
1664         }
1665
1666         /* #############  END CALC EKIN AND PRESSURE ################# */
1667
1668         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1669            the virial that should probably be addressed eventually. state->veta has better properies,
1670            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1671            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1672
1673         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1674         {
1675             /* Sum up the foreign energy and dK/dl terms for md and sd.
1676                Currently done every step so that dH/dl is correct in the .edr */
1677             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1678         }
1679
1680         update_pcouple_after_coordinates(
1681                 fplog, step, ir, mdatoms, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1682
1683         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1684                                                   && do_per_step(step, inputrec->nstpcouple));
1685         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1686                                                  && do_per_step(step, inputrec->nstpcouple));
1687         if (useGpuForUpdate
1688             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1689         {
1690             integrator->scaleCoordinates(pressureCouplingMu);
1691             if (doCRescalePressureCoupling)
1692             {
1693                 matrix pressureCouplingInvMu;
1694                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1695                 integrator->scaleVelocities(pressureCouplingInvMu);
1696             }
1697             integrator->setPbc(PbcType::Xyz, state->box);
1698         }
1699
1700         /* ################# END UPDATE STEP 2 ################# */
1701         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1702
1703         /* The coordinates (x) were unshifted in update */
1704         if (!bGStat)
1705         {
1706             /* We will not sum ekinh_old,
1707              * so signal that we still have to do it.
1708              */
1709             bSumEkinhOld = TRUE;
1710         }
1711
1712         if (bCalcEner)
1713         {
1714             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1715
1716             /* use the directly determined last velocity, not actually the averaged half steps */
1717             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1718             {
1719                 enerd->term[F_EKIN] = last_ekin;
1720             }
1721             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1722
1723             if (integratorHasConservedEnergyQuantity(ir))
1724             {
1725                 if (EI_VV(ir->eI))
1726                 {
1727                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1728                 }
1729                 else
1730                 {
1731                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1732                 }
1733             }
1734             /* #########  END PREPARING EDR OUTPUT  ###########  */
1735         }
1736
1737         /* Output stuff */
1738         if (MASTER(cr))
1739         {
1740             if (fplog && do_log && bDoExpanded)
1741             {
1742                 /* only needed if doing expanded ensemble */
1743                 PrintFreeEnergyInfoToFile(fplog,
1744                                           ir->fepvals.get(),
1745                                           ir->expandedvals.get(),
1746                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1747                                           state_global->dfhist,
1748                                           state->fep_state,
1749                                           ir->nstlog,
1750                                           step);
1751             }
1752             if (bCalcEner)
1753             {
1754                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1755                                                  bCalcEnerStep,
1756                                                  t,
1757                                                  mdatoms->tmass,
1758                                                  enerd,
1759                                                  ir->fepvals.get(),
1760                                                  ir->expandedvals.get(),
1761                                                  lastbox,
1762                                                  PTCouplingArrays{ state->boxv,
1763                                                                    state->nosehoover_xi,
1764                                                                    state->nosehoover_vxi,
1765                                                                    state->nhpres_xi,
1766                                                                    state->nhpres_vxi },
1767                                                  state->fep_state,
1768                                                  shake_vir,
1769                                                  force_vir,
1770                                                  total_vir,
1771                                                  pres,
1772                                                  ekind,
1773                                                  mu_tot,
1774                                                  constr);
1775             }
1776             else
1777             {
1778                 energyOutput.recordNonEnergyStep();
1779             }
1780
1781             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1782             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1783
1784             if (doSimulatedAnnealing)
1785             {
1786                 gmx::EnergyOutput::printAnnealingTemperatures(
1787                         do_log ? fplog : nullptr, groups, &(ir->opts));
1788             }
1789             if (do_log || do_ene || do_dr || do_or)
1790             {
1791                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1792                                                    do_ene,
1793                                                    do_dr,
1794                                                    do_or,
1795                                                    do_log ? fplog : nullptr,
1796                                                    step,
1797                                                    t,
1798                                                    fr->fcdata.get(),
1799                                                    awh.get());
1800             }
1801             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1802             {
1803                 const bool isInitialOutput = false;
1804                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1805             }
1806
1807             if (ir->bPull)
1808             {
1809                 pull_print_output(pull_work, step, t);
1810             }
1811
1812             if (do_per_step(step, ir->nstlog))
1813             {
1814                 if (fflush(fplog) != 0)
1815                 {
1816                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1817                 }
1818             }
1819         }
1820         if (bDoExpanded)
1821         {
1822             /* Have to do this part _after_ outputting the logfile and the edr file */
1823             /* Gets written into the state at the beginning of next loop*/
1824             state->fep_state = lamnew;
1825         }
1826         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1827         {
1828             state->fep_state = awh->fepLambdaState();
1829         }
1830         /* Print the remaining wall clock time for the run */
1831         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1832         {
1833             if (shellfc)
1834             {
1835                 fprintf(stderr, "\n");
1836             }
1837             print_time(stderr, walltime_accounting, step, ir, cr);
1838         }
1839
1840         /* Ion/water position swapping.
1841          * Not done in last step since trajectory writing happens before this call
1842          * in the MD loop and exchanges would be lost anyway. */
1843         bNeedRepartition = FALSE;
1844         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1845             && do_per_step(step, ir->swap->nstswap))
1846         {
1847             bNeedRepartition = do_swapcoords(cr,
1848                                              step,
1849                                              t,
1850                                              ir,
1851                                              swap,
1852                                              wcycle,
1853                                              as_rvec_array(state->x.data()),
1854                                              state->box,
1855                                              MASTER(cr) && mdrunOptions.verbose,
1856                                              bRerunMD);
1857
1858             if (bNeedRepartition && DOMAINDECOMP(cr))
1859             {
1860                 dd_collect_state(cr->dd, state, state_global);
1861             }
1862         }
1863
1864         /* Replica exchange */
1865         bExchanged = FALSE;
1866         if (bDoReplEx)
1867         {
1868             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1869         }
1870
1871         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1872         {
1873             dd_partition_system(fplog,
1874                                 mdlog,
1875                                 step,
1876                                 cr,
1877                                 TRUE,
1878                                 1,
1879                                 state_global,
1880                                 top_global,
1881                                 *ir,
1882                                 imdSession,
1883                                 pull_work,
1884                                 state,
1885                                 &f,
1886                                 mdAtoms,
1887                                 &top,
1888                                 fr,
1889                                 vsite,
1890                                 constr,
1891                                 nrnb,
1892                                 wcycle,
1893                                 FALSE);
1894             upd.setNumAtoms(state->natoms);
1895         }
1896
1897         bFirstStep = FALSE;
1898         bInitStep  = FALSE;
1899
1900         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1901         /* With all integrators, except VV, we need to retain the pressure
1902          * at the current step for coupling at the next step.
1903          */
1904         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1905             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1906         {
1907             /* Store the pressure in t_state for pressure coupling
1908              * at the next MD step.
1909              */
1910             copy_mat(pres, state->pres_prev);
1911         }
1912
1913         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1914
1915         if ((membed != nullptr) && (!bLastStep))
1916         {
1917             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1918         }
1919
1920         cycles = wallcycle_stop(wcycle, ewcSTEP);
1921         if (DOMAINDECOMP(cr) && wcycle)
1922         {
1923             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1924         }
1925
1926         /* increase the MD step number */
1927         step++;
1928         step_rel++;
1929
1930 #if GMX_FAHCORE
1931         if (MASTER(cr))
1932         {
1933             fcReportProgress(ir->nsteps + ir->init_step, step);
1934         }
1935 #endif
1936
1937         resetHandler->resetCounters(
1938                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1939
1940         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1941         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1942     }
1943     /* End of main MD loop */
1944
1945     /* Closing TNG files can include compressing data. Therefore it is good to do that
1946      * before stopping the time measurements. */
1947     mdoutf_tng_close(outf);
1948
1949     /* Stop measuring walltime */
1950     walltime_accounting_end_time(walltime_accounting);
1951
1952     if (!thisRankHasDuty(cr, DUTY_PME))
1953     {
1954         /* Tell the PME only node to finish */
1955         gmx_pme_send_finish(cr);
1956     }
1957
1958     if (MASTER(cr))
1959     {
1960         if (ir->nstcalcenergy > 0)
1961         {
1962             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
1963
1964             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1965             energyOutput.printAverages(fplog, groups);
1966         }
1967     }
1968     done_mdoutf(outf);
1969
1970     if (bPMETune)
1971     {
1972         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1973     }
1974
1975     done_shellfc(fplog, shellfc, step_rel);
1976
1977     if (useReplicaExchange && MASTER(cr))
1978     {
1979         print_replica_exchange_statistics(fplog, repl_ex);
1980     }
1981
1982     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1983
1984     global_stat_destroy(gstat);
1985 }