Make temperature and pressure coupling enums enum classes
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/commandline/filenm.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/dlbtiming.h"
59 #include "gromacs/domdec/domdec.h"
60 #include "gromacs/domdec/domdec_network.h"
61 #include "gromacs/domdec/domdec_struct.h"
62 #include "gromacs/domdec/gpuhaloexchange.h"
63 #include "gromacs/domdec/mdsetup.h"
64 #include "gromacs/domdec/partition.h"
65 #include "gromacs/essentialdynamics/edsam.h"
66 #include "gromacs/ewald/pme_load_balancing.h"
67 #include "gromacs/ewald/pme_pp.h"
68 #include "gromacs/fileio/trxio.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/imd/imd.h"
74 #include "gromacs/listed_forces/listed_forces.h"
75 #include "gromacs/math/functions.h"
76 #include "gromacs/math/invertmatrix.h"
77 #include "gromacs/math/vec.h"
78 #include "gromacs/math/vectypes.h"
79 #include "gromacs/mdlib/checkpointhandler.h"
80 #include "gromacs/mdlib/compute_io.h"
81 #include "gromacs/mdlib/constr.h"
82 #include "gromacs/mdlib/coupling.h"
83 #include "gromacs/mdlib/ebin.h"
84 #include "gromacs/mdlib/enerdata_utils.h"
85 #include "gromacs/mdlib/energyoutput.h"
86 #include "gromacs/mdlib/expanded.h"
87 #include "gromacs/mdlib/force.h"
88 #include "gromacs/mdlib/force_flags.h"
89 #include "gromacs/mdlib/forcerec.h"
90 #include "gromacs/mdlib/freeenergyparameters.h"
91 #include "gromacs/mdlib/md_support.h"
92 #include "gromacs/mdlib/mdatoms.h"
93 #include "gromacs/mdlib/mdoutf.h"
94 #include "gromacs/mdlib/membed.h"
95 #include "gromacs/mdlib/resethandler.h"
96 #include "gromacs/mdlib/sighandler.h"
97 #include "gromacs/mdlib/simulationsignal.h"
98 #include "gromacs/mdlib/stat.h"
99 #include "gromacs/mdlib/stophandler.h"
100 #include "gromacs/mdlib/tgroup.h"
101 #include "gromacs/mdlib/trajectory_writing.h"
102 #include "gromacs/mdlib/update.h"
103 #include "gromacs/mdlib/update_constrain_gpu.h"
104 #include "gromacs/mdlib/update_vv.h"
105 #include "gromacs/mdlib/vcm.h"
106 #include "gromacs/mdlib/vsite.h"
107 #include "gromacs/mdrunutility/handlerestart.h"
108 #include "gromacs/mdrunutility/multisim.h"
109 #include "gromacs/mdrunutility/printtime.h"
110 #include "gromacs/mdtypes/awh_history.h"
111 #include "gromacs/mdtypes/awh_params.h"
112 #include "gromacs/mdtypes/commrec.h"
113 #include "gromacs/mdtypes/df_history.h"
114 #include "gromacs/mdtypes/energyhistory.h"
115 #include "gromacs/mdtypes/fcdata.h"
116 #include "gromacs/mdtypes/forcebuffers.h"
117 #include "gromacs/mdtypes/forcerec.h"
118 #include "gromacs/mdtypes/group.h"
119 #include "gromacs/mdtypes/inputrec.h"
120 #include "gromacs/mdtypes/interaction_const.h"
121 #include "gromacs/mdtypes/md_enums.h"
122 #include "gromacs/mdtypes/mdatom.h"
123 #include "gromacs/mdtypes/mdrunoptions.h"
124 #include "gromacs/mdtypes/multipletimestepping.h"
125 #include "gromacs/mdtypes/observableshistory.h"
126 #include "gromacs/mdtypes/pullhistory.h"
127 #include "gromacs/mdtypes/simulation_workload.h"
128 #include "gromacs/mdtypes/state.h"
129 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
130 #include "gromacs/modularsimulator/energydata.h"
131 #include "gromacs/nbnxm/gpu_data_mgmt.h"
132 #include "gromacs/nbnxm/nbnxm.h"
133 #include "gromacs/pbcutil/pbc.h"
134 #include "gromacs/pulling/output.h"
135 #include "gromacs/pulling/pull.h"
136 #include "gromacs/swap/swapcoords.h"
137 #include "gromacs/timing/wallcycle.h"
138 #include "gromacs/timing/walltime_accounting.h"
139 #include "gromacs/topology/atoms.h"
140 #include "gromacs/topology/idef.h"
141 #include "gromacs/topology/mtop_util.h"
142 #include "gromacs/topology/topology.h"
143 #include "gromacs/trajectory/trajectoryframe.h"
144 #include "gromacs/utility/basedefinitions.h"
145 #include "gromacs/utility/cstringutil.h"
146 #include "gromacs/utility/fatalerror.h"
147 #include "gromacs/utility/logger.h"
148 #include "gromacs/utility/real.h"
149 #include "gromacs/utility/smalloc.h"
150
151 #include "legacysimulator.h"
152 #include "replicaexchange.h"
153 #include "shellfc.h"
154
155 using gmx::SimulationSignaller;
156
157 void gmx::LegacySimulator::do_md()
158 {
159     // TODO Historically, the EM and MD "integrators" used different
160     // names for the t_inputrec *parameter, but these must have the
161     // same name, now that it's a member of a struct. We use this ir
162     // alias to avoid a large ripple of nearly useless changes.
163     // t_inputrec is being replaced by IMdpOptionsProvider, so this
164     // will go away eventually.
165     const t_inputrec* ir = inputrec;
166
167     int64_t      step, step_rel;
168     double       t, t0 = ir->init_t;
169     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
170     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
171     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
172     gmx_bool     do_ene, do_log, do_verbose;
173     gmx_bool     bMasterState;
174     unsigned int force_flags;
175     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
176     int    i, m;
177     rvec   mu_tot;
178     matrix pressureCouplingMu, M;
179     gmx_repl_ex_t     repl_ex = nullptr;
180     gmx_global_stat_t gstat;
181     gmx_shellfc_t*    shellfc;
182     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
183     gmx_bool          bTrotter;
184     real              dvdl_constr;
185     std::vector<RVec> cbuf;
186     matrix            lastbox;
187     int               lamnew = 0;
188     /* for FEP */
189     int       nstfep = 0;
190     double    cycles;
191     real      saved_conserved_quantity = 0;
192     real      last_ekin                = 0;
193     t_extmass MassQ;
194     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
195
196     /* PME load balancing data for GPU kernels */
197     gmx_bool bPMETune         = FALSE;
198     gmx_bool bPMETunePrinting = FALSE;
199
200     bool bInteractiveMDstep = false;
201
202     /* Domain decomposition could incorrectly miss a bonded
203        interaction, but checking for that requires a global
204        communication stage, which does not otherwise happen in DD
205        code. So we do that alongside the first global energy reduction
206        after a new DD is made. These variables handle whether the
207        check happens, and the result it returns. */
208     bool shouldCheckNumberOfBondedInteractions = false;
209     int  totalNumberOfBondedInteractions       = -1;
210
211     SimulationSignals signals;
212     // Most global communnication stages don't propagate mdrun
213     // signals, and will use this object to achieve that.
214     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
215
216     if (!mdrunOptions.writeConfout)
217     {
218         // This is on by default, and the main known use case for
219         // turning it off is for convenience in benchmarking, which is
220         // something that should not show up in the general user
221         // interface.
222         GMX_LOG(mdlog.info)
223                 .asParagraph()
224                 .appendText(
225                         "The -noconfout functionality is deprecated, and may be removed in a "
226                         "future version.");
227     }
228
229     /* md-vv uses averaged full step velocities for T-control
230        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
231        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
232     bTrotter = (EI_VV(ir->eI)
233                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
234
235     const bool bRerunMD = false;
236
237     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
238     bGStatEveryStep   = (nstglobalcomm == 1);
239
240     const SimulationGroups* groups = &top_global->groups;
241
242     std::unique_ptr<EssentialDynamics> ed = nullptr;
243     if (opt2bSet("-ei", nfile, fnm))
244     {
245         /* Initialize essential dynamics sampling */
246         ed = init_edsam(mdlog,
247                         opt2fn_null("-ei", nfile, fnm),
248                         opt2fn("-eo", nfile, fnm),
249                         top_global,
250                         ir,
251                         cr,
252                         constr,
253                         state_global,
254                         observablesHistory,
255                         oenv,
256                         startingBehavior);
257     }
258     else if (observablesHistory->edsamHistory)
259     {
260         gmx_fatal(FARGS,
261                   "The checkpoint is from a run with essential dynamics sampling, "
262                   "but the current run did not specify the -ei option. "
263                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
264     }
265
266     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
267     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
268     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
269     Update upd(*ir, deform);
270     bool   doSimulatedAnnealing = false;
271     {
272         // TODO: Avoid changing inputrec (#3854)
273         // Simulated annealing updates the reference temperature.
274         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
275         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
276     }
277     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
278
279     const t_fcdata& fcdata = *fr->fcdata;
280
281     bool simulationsShareState = false;
282     int  nstSignalComm         = nstglobalcomm;
283     {
284         // TODO This implementation of ensemble orientation restraints is nasty because
285         // a user can't just do multi-sim with single-sim orientation restraints.
286         bool usingEnsembleRestraints =
287                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifier,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifier);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t top(top_global->ffparams);
365
366     auto mdatoms = mdAtoms->mdatoms();
367
368     ForceBuffers f(fr->useMts,
369                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
370                            ? PinningPolicy::PinnedIfSupported
371                            : PinningPolicy::CannotBePinned);
372     if (DOMAINDECOMP(cr))
373     {
374         stateInstance = std::make_unique<t_state>();
375         state         = stateInstance.get();
376         dd_init_local_state(cr->dd, state_global, state);
377
378         /* Distribute the charge groups over the nodes from the master node */
379         dd_partition_system(fplog,
380                             mdlog,
381                             ir->init_step,
382                             cr,
383                             TRUE,
384                             1,
385                             state_global,
386                             *top_global,
387                             ir,
388                             imdSession,
389                             pull_work,
390                             state,
391                             &f,
392                             mdAtoms,
393                             &top,
394                             fr,
395                             vsite,
396                             constr,
397                             nrnb,
398                             nullptr,
399                             FALSE);
400         shouldCheckNumberOfBondedInteractions = true;
401         upd.setNumAtoms(state->natoms);
402     }
403     else
404     {
405         state_change_natoms(state_global, state_global->natoms);
406         /* Copy the pointer to the global state */
407         state = state_global;
408
409         /* Generate and initialize new topology */
410         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
411
412         upd.setNumAtoms(state->natoms);
413     }
414
415     std::unique_ptr<UpdateConstrainGpu> integrator;
416
417     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
418
419     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
420     if (useGpuForUpdate)
421     {
422         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
423                                    || constr->numConstraintsTotal() == 0,
424                            "Constraints in domain decomposition are only supported with update "
425                            "groups if using GPU update.\n");
426         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
427                                    || constr->numConstraintsTotal() == 0,
428                            "SHAKE is not supported with GPU update.");
429         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
430                            "Either PME or short-ranged non-bonded interaction tasks must run on "
431                            "the GPU to use GPU update.\n");
432         GMX_RELEASE_ASSERT(ir->eI == eiMD,
433                            "Only the md integrator is supported with the GPU update.\n");
434         GMX_RELEASE_ASSERT(
435                 ir->etc != TemperatureCoupling::NoseHoover,
436                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
437         GMX_RELEASE_ASSERT(
438                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
439                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
440                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
441                 "with the GPU update.\n");
442         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
443                            "Virtual sites are not supported with the GPU update.\n");
444         GMX_RELEASE_ASSERT(ed == nullptr,
445                            "Essential dynamics is not supported with the GPU update.\n");
446         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
447                            "Constraints pulling is not supported with the GPU update.\n");
448         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
449                            "Orientation restraints are not supported with the GPU update.\n");
450         GMX_RELEASE_ASSERT(
451                 ir->efep == efepNO
452                         || (!haveFepPerturbedMasses(*top_global) && !havePerturbedConstraints(*top_global)),
453                 "Free energy perturbation of masses and constraints are not supported with the GPU "
454                 "update.");
455
456         if (constr != nullptr && constr->numConstraintsTotal() > 0)
457         {
458             GMX_LOG(mdlog.info)
459                     .asParagraph()
460                     .appendText("Updating coordinates and applying constraints on the GPU.");
461         }
462         else
463         {
464             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
465         }
466         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
467                            "Device stream manager should be initialized in order to use GPU "
468                            "update-constraints.");
469         GMX_RELEASE_ASSERT(
470                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
471                 "Update stream should be initialized in order to use GPU "
472                 "update-constraints.");
473         integrator = std::make_unique<UpdateConstrainGpu>(
474                 *ir,
475                 *top_global,
476                 fr->deviceStreamManager->context(),
477                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
478                 stateGpu->xUpdatedOnDevice(),
479                 wcycle);
480
481         integrator->setPbc(PbcType::Xyz, state->box);
482     }
483
484     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
485     {
486         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
487     }
488     if (useGpuForUpdate)
489     {
490         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
491     }
492
493     // NOTE: The global state is no longer used at this point.
494     // But state_global is still used as temporary storage space for writing
495     // the global state to file and potentially for replica exchange.
496     // (Global topology should persist.)
497
498     update_mdatoms(mdatoms, state->lambda[efptMASS]);
499
500     if (ir->bExpanded)
501     {
502         /* Check nstexpanded here, because the grompp check was broken */
503         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
504         {
505             gmx_fatal(FARGS,
506                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
507         }
508         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
509     }
510
511     if (MASTER(cr))
512     {
513         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
514     }
515
516     preparePrevStepPullCom(
517             ir, pull_work, mdatoms->massT, state, state_global, cr, startingBehavior != StartingBehavior::NewSimulation);
518
519     // TODO: Remove this by converting AWH into a ForceProvider
520     auto awh = prepareAwhModule(fplog,
521                                 *ir,
522                                 state_global,
523                                 cr,
524                                 ms,
525                                 startingBehavior != StartingBehavior::NewSimulation,
526                                 shellfc != nullptr,
527                                 opt2fn("-awh", nfile, fnm),
528                                 pull_work);
529
530     if (useReplicaExchange && MASTER(cr))
531     {
532         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
533     }
534     /* PME tuning is only supported in the Verlet scheme, with PME for
535      * Coulomb. It is not supported with only LJ PME. */
536     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
537                 && ir->cutoff_scheme != ecutsGROUP);
538
539     pme_load_balancing_t* pme_loadbal = nullptr;
540     if (bPMETune)
541     {
542         pme_loadbal_init(
543                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
544     }
545
546     if (!ir->bContinuation)
547     {
548         if (state->flags & (1U << estV))
549         {
550             auto v = makeArrayRef(state->v);
551             /* Set the velocities of vsites, shells and frozen atoms to zero */
552             for (i = 0; i < mdatoms->homenr; i++)
553             {
554                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
555                 {
556                     clear_rvec(v[i]);
557                 }
558                 else if (mdatoms->cFREEZE)
559                 {
560                     for (m = 0; m < DIM; m++)
561                     {
562                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
563                         {
564                             v[i][m] = 0;
565                         }
566                     }
567                 }
568             }
569         }
570
571         if (constr)
572         {
573             /* Constrain the initial coordinates and velocities */
574             do_constrain_first(fplog,
575                                constr,
576                                ir,
577                                mdatoms->nr,
578                                mdatoms->homenr,
579                                state->x.arrayRefWithPadding(),
580                                state->v.arrayRefWithPadding(),
581                                state->box,
582                                state->lambda[efptBONDED]);
583         }
584         if (vsite)
585         {
586             /* Construct the virtual sites for the initial configuration */
587             vsite->construct(state->x, ir->delta_t, {}, state->box);
588         }
589     }
590
591     if (ir->efep != efepNO)
592     {
593         /* Set free energy calculation frequency as the greatest common
594          * denominator of nstdhdl and repl_ex_nst. */
595         nstfep = ir->fepvals->nstdhdl;
596         if (ir->bExpanded)
597         {
598             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
599         }
600         if (useReplicaExchange)
601         {
602             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
603         }
604         if (ir->bDoAwh)
605         {
606             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
607         }
608     }
609
610     /* Be REALLY careful about what flags you set here. You CANNOT assume
611      * this is the first step, since we might be restarting from a checkpoint,
612      * and in that case we should not do any modifications to the state.
613      */
614     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
615
616     // When restarting from a checkpoint, it can be appropriate to
617     // initialize ekind from quantities in the checkpoint. Otherwise,
618     // compute_globals must initialize ekind before the simulation
619     // starts/restarts. However, only the master rank knows what was
620     // found in the checkpoint file, so we have to communicate in
621     // order to coordinate the restart.
622     //
623     // TODO Consider removing this communication if/when checkpoint
624     // reading directly follows .tpr reading, because all ranks can
625     // agree on hasReadEkinState at that time.
626     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
627     if (PAR(cr))
628     {
629         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
630     }
631     if (hasReadEkinState)
632     {
633         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
634     }
635
636     unsigned int cglo_flags =
637             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
638              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
639
640     bSumEkinhOld = FALSE;
641
642     t_vcm vcm(top_global->groups, *ir);
643     reportComRemovalInfo(fplog, vcm);
644
645     /* To minimize communication, compute_globals computes the COM velocity
646      * and the kinetic energy for the velocities without COM motion removed.
647      * Thus to get the kinetic energy without the COM contribution, we need
648      * to call compute_globals twice.
649      */
650     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
651     {
652         unsigned int cglo_flags_iteration = cglo_flags;
653         if (bStopCM && cgloIteration == 0)
654         {
655             cglo_flags_iteration |= CGLO_STOPCM;
656             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
657         }
658         compute_globals(gstat,
659                         cr,
660                         ir,
661                         fr,
662                         ekind,
663                         makeConstArrayRef(state->x),
664                         makeConstArrayRef(state->v),
665                         state->box,
666                         mdatoms,
667                         nrnb,
668                         &vcm,
669                         nullptr,
670                         enerd,
671                         force_vir,
672                         shake_vir,
673                         total_vir,
674                         pres,
675                         gmx::ArrayRef<real>{},
676                         &nullSignaller,
677                         state->box,
678                         &totalNumberOfBondedInteractions,
679                         &bSumEkinhOld,
680                         cglo_flags_iteration
681                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
682                                                                          : 0));
683         if (cglo_flags_iteration & CGLO_STOPCM)
684         {
685             /* At initialization, do not pass x with acceleration-correction mode
686              * to avoid (incorrect) correction of the initial coordinates.
687              */
688             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
689                                                                      : makeArrayRef(state->x);
690             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
691             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
692         }
693     }
694     checkNumberOfBondedInteractions(mdlog,
695                                     cr,
696                                     totalNumberOfBondedInteractions,
697                                     top_global,
698                                     &top,
699                                     makeConstArrayRef(state->x),
700                                     state->box,
701                                     &shouldCheckNumberOfBondedInteractions);
702     if (ir->eI == eiVVAK)
703     {
704         /* a second call to get the half step temperature initialized as well */
705         /* we do the same call as above, but turn the pressure off -- internally to
706            compute_globals, this is recognized as a velocity verlet half-step
707            kinetic energy calculation.  This minimized excess variables, but
708            perhaps loses some logic?*/
709
710         compute_globals(gstat,
711                         cr,
712                         ir,
713                         fr,
714                         ekind,
715                         makeConstArrayRef(state->x),
716                         makeConstArrayRef(state->v),
717                         state->box,
718                         mdatoms,
719                         nrnb,
720                         &vcm,
721                         nullptr,
722                         enerd,
723                         force_vir,
724                         shake_vir,
725                         total_vir,
726                         pres,
727                         gmx::ArrayRef<real>{},
728                         &nullSignaller,
729                         state->box,
730                         nullptr,
731                         &bSumEkinhOld,
732                         cglo_flags & ~CGLO_PRESSURE);
733     }
734
735     /* Calculate the initial half step temperature, and save the ekinh_old */
736     if (startingBehavior == StartingBehavior::NewSimulation)
737     {
738         for (i = 0; (i < ir->opts.ngtc); i++)
739         {
740             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
741         }
742     }
743
744     /* need to make an initiation call to get the Trotter variables set, as well as other constants
745        for non-trotter temperature control */
746     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
747
748     if (MASTER(cr))
749     {
750         if (!ir->bContinuation)
751         {
752             if (constr && ir->eConstrAlg == econtLINCS)
753             {
754                 fprintf(fplog,
755                         "RMS relative constraint deviation after constraining: %.2e\n",
756                         constr->rmsd());
757             }
758             if (EI_STATE_VELOCITY(ir->eI))
759             {
760                 real temp = enerd->term[F_TEMP];
761                 if (ir->eI != eiVV)
762                 {
763                     /* Result of Ekin averaged over velocities of -half
764                      * and +half step, while we only have -half step here.
765                      */
766                     temp *= 2;
767                 }
768                 fprintf(fplog, "Initial temperature: %g K\n", temp);
769             }
770         }
771
772         char tbuf[20];
773         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
774         if (ir->nsteps >= 0)
775         {
776             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
777         }
778         else
779         {
780             sprintf(tbuf, "%s", "infinite");
781         }
782         if (ir->init_step > 0)
783         {
784             fprintf(stderr,
785                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
786                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
787                     tbuf,
788                     gmx_step_str(ir->init_step, sbuf2),
789                     ir->init_step * ir->delta_t);
790         }
791         else
792         {
793             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
794         }
795         fprintf(fplog, "\n");
796     }
797
798     walltime_accounting_start_time(walltime_accounting);
799     wallcycle_start(wcycle, ewcRUN);
800     print_start(fplog, cr, walltime_accounting, "mdrun");
801
802     /***********************************************************
803      *
804      *             Loop over MD steps
805      *
806      ************************************************************/
807
808     bFirstStep = TRUE;
809     /* Skip the first Nose-Hoover integration when we get the state from tpx */
810     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
811     bSumEkinhOld     = FALSE;
812     bExchanged       = FALSE;
813     bNeedRepartition = FALSE;
814
815     step     = ir->init_step;
816     step_rel = 0;
817
818     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
819             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
820             simulationsShareState,
821             MASTER(cr),
822             ir->nstlist,
823             mdrunOptions.reproducible,
824             nstSignalComm,
825             mdrunOptions.maximumHoursToRun,
826             ir->nstlist == 0,
827             fplog,
828             step,
829             bNS,
830             walltime_accounting);
831
832     auto checkpointHandler = std::make_unique<CheckpointHandler>(
833             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
834             simulationsShareState,
835             ir->nstlist == 0,
836             MASTER(cr),
837             mdrunOptions.writeConfout,
838             mdrunOptions.checkpointOptions.period);
839
840     const bool resetCountersIsLocal = true;
841     auto       resetHandler         = std::make_unique<ResetHandler>(
842             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
843             !resetCountersIsLocal,
844             ir->nsteps,
845             MASTER(cr),
846             mdrunOptions.timingOptions.resetHalfway,
847             mdrunOptions.maximumHoursToRun,
848             mdlog,
849             wcycle,
850             walltime_accounting);
851
852     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
853
854     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
855     {
856         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
857     }
858
859     /* and stop now if we should */
860     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
861     while (!bLastStep)
862     {
863
864         /* Determine if this is a neighbor search step */
865         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
866
867         if (bPMETune && bNStList)
868         {
869             // This has to be here because PME load balancing is called so early.
870             // TODO: Move to after all booleans are defined.
871             if (useGpuForUpdate && !bFirstStep)
872             {
873                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
874                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
875             }
876             /* PME grid + cut-off optimization with GPUs or PME nodes */
877             pme_loadbal_do(pme_loadbal,
878                            cr,
879                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
880                            fplog,
881                            mdlog,
882                            *ir,
883                            fr,
884                            state->box,
885                            state->x,
886                            wcycle,
887                            step,
888                            step_rel,
889                            &bPMETunePrinting,
890                            simulationWork.useGpuPmePpCommunication);
891         }
892
893         wallcycle_start(wcycle, ewcSTEP);
894
895         bLastStep = (step_rel == ir->nsteps);
896         t         = t0 + step * ir->delta_t;
897
898         // TODO Refactor this, so that nstfep does not need a default value of zero
899         if (ir->efep != efepNO || ir->bSimTemp)
900         {
901             /* find and set the current lambdas */
902             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
903
904             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
905             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
906             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
907                            && (!bFirstStep));
908         }
909
910         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
911                      && do_per_step(step, replExParams.exchangeInterval));
912
913         if (doSimulatedAnnealing)
914         {
915             // TODO: Avoid changing inputrec (#3854)
916             // Simulated annealing updates the reference temperature.
917             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
918             update_annealing_target_temp(nonConstInputrec, t, &upd);
919         }
920
921         /* Stop Center of Mass motion */
922         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
923
924         /* Determine whether or not to do Neighbour Searching */
925         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
926
927         /* Note that the stopHandler will cause termination at nstglobalcomm
928          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
929          * nstpcouple steps, we have computed the half-step kinetic energy
930          * of the previous step and can always output energies at the last step.
931          */
932         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
933
934         /* do_log triggers energy and virial calculation. Because this leads
935          * to different code paths, forces can be different. Thus for exact
936          * continuation we should avoid extra log output.
937          * Note that the || bLastStep can result in non-exact continuation
938          * beyond the last step. But we don't consider that to be an issue.
939          */
940         do_log     = (do_per_step(step, ir->nstlog)
941                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
942         do_verbose = mdrunOptions.verbose
943                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
944
945         if (useGpuForUpdate && !bFirstStep && bNS)
946         {
947             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
948             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
949             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
950             // Copy coordinate from the GPU when needed at the search step.
951             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
952             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
953             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
954             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
955         }
956
957         if (bNS && !(bFirstStep && ir->bContinuation))
958         {
959             bMasterState = FALSE;
960             /* Correct the new box if it is too skewed */
961             if (inputrecDynamicBox(ir))
962             {
963                 if (correct_box(fplog, step, state->box))
964                 {
965                     bMasterState = TRUE;
966                     // If update is offloaded, it should be informed about the box size change
967                     if (useGpuForUpdate)
968                     {
969                         integrator->setPbc(PbcType::Xyz, state->box);
970                     }
971                 }
972             }
973             if (DOMAINDECOMP(cr) && bMasterState)
974             {
975                 dd_collect_state(cr->dd, state, state_global);
976             }
977
978             if (DOMAINDECOMP(cr))
979             {
980                 /* Repartition the domain decomposition */
981                 dd_partition_system(fplog,
982                                     mdlog,
983                                     step,
984                                     cr,
985                                     bMasterState,
986                                     nstglobalcomm,
987                                     state_global,
988                                     *top_global,
989                                     ir,
990                                     imdSession,
991                                     pull_work,
992                                     state,
993                                     &f,
994                                     mdAtoms,
995                                     &top,
996                                     fr,
997                                     vsite,
998                                     constr,
999                                     nrnb,
1000                                     wcycle,
1001                                     do_verbose && !bPMETunePrinting);
1002                 shouldCheckNumberOfBondedInteractions = true;
1003                 upd.setNumAtoms(state->natoms);
1004             }
1005         }
1006
1007         // Allocate or re-size GPU halo exchange object, if necessary
1008         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1009         {
1010             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1011                                "GPU device manager has to be initialized to use GPU "
1012                                "version of halo exchange.");
1013             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1014         }
1015
1016         if (MASTER(cr) && do_log)
1017         {
1018             gmx::EnergyOutput::printHeader(
1019                     fplog, step, t); /* can we improve the information printed here? */
1020         }
1021
1022         if (ir->efep != efepNO)
1023         {
1024             update_mdatoms(mdatoms, state->lambda[efptMASS]);
1025         }
1026
1027         if (bExchanged)
1028         {
1029             /* We need the kinetic energy at minus the half step for determining
1030              * the full step kinetic energy and possibly for T-coupling.*/
1031             /* This may not be quite working correctly yet . . . . */
1032             compute_globals(gstat,
1033                             cr,
1034                             ir,
1035                             fr,
1036                             ekind,
1037                             makeConstArrayRef(state->x),
1038                             makeConstArrayRef(state->v),
1039                             state->box,
1040                             mdatoms,
1041                             nrnb,
1042                             &vcm,
1043                             wcycle,
1044                             enerd,
1045                             nullptr,
1046                             nullptr,
1047                             nullptr,
1048                             nullptr,
1049                             gmx::ArrayRef<real>{},
1050                             &nullSignaller,
1051                             state->box,
1052                             &totalNumberOfBondedInteractions,
1053                             &bSumEkinhOld,
1054                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
1055             checkNumberOfBondedInteractions(mdlog,
1056                                             cr,
1057                                             totalNumberOfBondedInteractions,
1058                                             top_global,
1059                                             &top,
1060                                             makeConstArrayRef(state->x),
1061                                             state->box,
1062                                             &shouldCheckNumberOfBondedInteractions);
1063         }
1064         clear_mat(force_vir);
1065
1066         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1067
1068         /* Determine the energy and pressure:
1069          * at nstcalcenergy steps and at energy output steps (set below).
1070          */
1071         if (EI_VV(ir->eI) && (!bInitStep))
1072         {
1073             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1074             bCalcVir      = bCalcEnerStep
1075                        || (ir->epc != PressureCoupling::No
1076                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1077         }
1078         else
1079         {
1080             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1081             bCalcVir      = bCalcEnerStep
1082                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1083         }
1084         bCalcEner = bCalcEnerStep;
1085
1086         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1087
1088         if (do_ene || do_log || bDoReplEx)
1089         {
1090             bCalcVir  = TRUE;
1091             bCalcEner = TRUE;
1092         }
1093
1094         /* Do we need global communication ? */
1095         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1096                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1097
1098         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1099                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1100                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1101         if (fr->useMts && !do_per_step(step, ir->nstfout))
1102         {
1103             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1104         }
1105
1106         if (shellfc)
1107         {
1108             /* Now is the time to relax the shells */
1109             relax_shell_flexcon(fplog,
1110                                 cr,
1111                                 ms,
1112                                 mdrunOptions.verbose,
1113                                 enforcedRotation,
1114                                 step,
1115                                 ir,
1116                                 imdSession,
1117                                 pull_work,
1118                                 bNS,
1119                                 force_flags,
1120                                 &top,
1121                                 constr,
1122                                 enerd,
1123                                 state->natoms,
1124                                 state->x.arrayRefWithPadding(),
1125                                 state->v.arrayRefWithPadding(),
1126                                 state->box,
1127                                 state->lambda,
1128                                 &state->hist,
1129                                 &f.view(),
1130                                 force_vir,
1131                                 mdatoms,
1132                                 nrnb,
1133                                 wcycle,
1134                                 shellfc,
1135                                 fr,
1136                                 runScheduleWork,
1137                                 t,
1138                                 mu_tot,
1139                                 vsite,
1140                                 ddBalanceRegionHandler);
1141         }
1142         else
1143         {
1144             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1145                is updated (or the AWH update will be performed twice for one step when continuing).
1146                It would be best to call this update function from do_md_trajectory_writing but that
1147                would occur after do_force. One would have to divide the update_awh function into one
1148                function applying the AWH force and one doing the AWH bias update. The update AWH
1149                bias function could then be called after do_md_trajectory_writing (then containing
1150                update_awh_history). The checkpointing will in the future probably moved to the start
1151                of the md loop which will rid of this issue. */
1152             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1153             {
1154                 awh->updateHistory(state_global->awhHistory.get());
1155             }
1156
1157             /* The coordinates (x) are shifted (to get whole molecules)
1158              * in do_force.
1159              * This is parallellized as well, and does communication too.
1160              * Check comments in sim_util.c
1161              */
1162             do_force(fplog,
1163                      cr,
1164                      ms,
1165                      ir,
1166                      awh.get(),
1167                      enforcedRotation,
1168                      imdSession,
1169                      pull_work,
1170                      step,
1171                      nrnb,
1172                      wcycle,
1173                      &top,
1174                      state->box,
1175                      state->x.arrayRefWithPadding(),
1176                      &state->hist,
1177                      &f.view(),
1178                      force_vir,
1179                      mdatoms,
1180                      enerd,
1181                      state->lambda,
1182                      fr,
1183                      runScheduleWork,
1184                      vsite,
1185                      mu_tot,
1186                      t,
1187                      ed ? ed->getLegacyED() : nullptr,
1188                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1189                      ddBalanceRegionHandler);
1190         }
1191
1192         // VV integrators do not need the following velocity half step
1193         // if it is the first step after starting from a checkpoint.
1194         // That is, the half step is needed on all other steps, and
1195         // also the first step when starting from a .tpr file.
1196         if (EI_VV(ir->eI))
1197         {
1198             integrateVVFirstStep(step,
1199                                  bFirstStep,
1200                                  bInitStep,
1201                                  startingBehavior,
1202                                  nstglobalcomm,
1203                                  ir,
1204                                  fr,
1205                                  cr,
1206                                  state,
1207                                  mdatoms,
1208                                  fcdata,
1209                                  &MassQ,
1210                                  &vcm,
1211                                  top_global,
1212                                  top,
1213                                  enerd,
1214                                  ekind,
1215                                  gstat,
1216                                  &last_ekin,
1217                                  bCalcVir,
1218                                  total_vir,
1219                                  shake_vir,
1220                                  force_vir,
1221                                  pres,
1222                                  M,
1223                                  do_log,
1224                                  do_ene,
1225                                  bCalcEner,
1226                                  bGStat,
1227                                  bStopCM,
1228                                  bTrotter,
1229                                  bExchanged,
1230                                  &bSumEkinhOld,
1231                                  &shouldCheckNumberOfBondedInteractions,
1232                                  &saved_conserved_quantity,
1233                                  &f,
1234                                  &upd,
1235                                  constr,
1236                                  &nullSignaller,
1237                                  trotter_seq,
1238                                  nrnb,
1239                                  mdlog,
1240                                  fplog,
1241                                  wcycle);
1242         }
1243
1244         /* ########  END FIRST UPDATE STEP  ############## */
1245         /* ########  If doing VV, we now have v(dt) ###### */
1246         if (bDoExpanded)
1247         {
1248             /* perform extended ensemble sampling in lambda - we don't
1249                actually move to the new state before outputting
1250                statistics, but if performing simulated tempering, we
1251                do update the velocities and the tau_t. */
1252             // TODO: Avoid changing inputrec (#3854)
1253             // Simulated tempering updates the reference temperature.
1254             // Expanded ensemble without simulated tempering does not change the inputrec.
1255             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1256             lamnew                 = ExpandedEnsembleDynamics(fplog,
1257                                               nonConstInputrec,
1258                                               enerd,
1259                                               state,
1260                                               &MassQ,
1261                                               state->fep_state,
1262                                               state->dfhist,
1263                                               step,
1264                                               state->v.rvec_array(),
1265                                               mdatoms);
1266             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1267             if (MASTER(cr))
1268             {
1269                 copy_df_history(state_global->dfhist, state->dfhist);
1270             }
1271         }
1272
1273         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1274         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1275         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1276             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1277                 || checkpointHandler->isCheckpointingStep()))
1278         {
1279             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1280             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1281         }
1282         // Copy velocities if needed for the output/checkpointing.
1283         // NOTE: Copy on the search steps is done at the beginning of the step.
1284         if (useGpuForUpdate && !bNS
1285             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1286         {
1287             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1288             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1289         }
1290         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1291         // and update is offloaded hence forces are kept on the GPU for update and have not been
1292         // already transferred in do_force().
1293         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1294         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1295         //       prior to GPU update.
1296         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1297         //       copy call in do_force(...).
1298         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1299         //       on host after the D2H copy in do_force(...).
1300         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1301             && do_per_step(step, ir->nstfout))
1302         {
1303             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1304             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1305         }
1306         /* Now we have the energies and forces corresponding to the
1307          * coordinates at time t. We must output all of this before
1308          * the update.
1309          */
1310         do_md_trajectory_writing(fplog,
1311                                  cr,
1312                                  nfile,
1313                                  fnm,
1314                                  step,
1315                                  step_rel,
1316                                  t,
1317                                  ir,
1318                                  state,
1319                                  state_global,
1320                                  observablesHistory,
1321                                  top_global,
1322                                  fr,
1323                                  outf,
1324                                  energyOutput,
1325                                  ekind,
1326                                  f.view().force(),
1327                                  checkpointHandler->isCheckpointingStep(),
1328                                  bRerunMD,
1329                                  bLastStep,
1330                                  mdrunOptions.writeConfout,
1331                                  bSumEkinhOld);
1332         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1333         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1334
1335         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1336         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1337             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1338         {
1339             copy_mat(state->svir_prev, shake_vir);
1340             copy_mat(state->fvir_prev, force_vir);
1341         }
1342
1343         stopHandler->setSignal();
1344         resetHandler->setSignal(walltime_accounting);
1345
1346         if (bGStat || !PAR(cr))
1347         {
1348             /* In parallel we only have to check for checkpointing in steps
1349              * where we do global communication,
1350              *  otherwise the other nodes don't know.
1351              */
1352             checkpointHandler->setSignal(walltime_accounting);
1353         }
1354
1355         /* #########   START SECOND UPDATE STEP ################# */
1356
1357         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1358            controlled in preprocessing */
1359
1360         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1361         {
1362             gmx_bool bIfRandomize;
1363             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1364             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1365             if (constr && bIfRandomize)
1366             {
1367                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1368             }
1369         }
1370         /* Box is changed in update() when we do pressure coupling,
1371          * but we should still use the old box for energy corrections and when
1372          * writing it to the energy file, so it matches the trajectory files for
1373          * the same timestep above. Make a copy in a separate array.
1374          */
1375         copy_mat(state->box, lastbox);
1376
1377         dvdl_constr = 0;
1378
1379         if (!useGpuForUpdate)
1380         {
1381             wallcycle_start(wcycle, ewcUPDATE);
1382         }
1383         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1384         if (bTrotter)
1385         {
1386             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1387             /* We can only do Berendsen coupling after we have summed
1388              * the kinetic energy or virial. Since the happens
1389              * in global_state after update, we should only do it at
1390              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1391              */
1392         }
1393         else
1394         {
1395             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1396             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1397         }
1398
1399         /* With leap-frog type integrators we compute the kinetic energy
1400          * at a whole time step as the average of the half-time step kinetic
1401          * energies of two subsequent steps. Therefore we need to compute the
1402          * half step kinetic energy also if we need energies at the next step.
1403          */
1404         const bool needHalfStepKineticEnergy =
1405                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1406
1407         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1408         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1409         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1410                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1411
1412         if (EI_VV(ir->eI))
1413         {
1414             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1415
1416             integrateVVSecondStep(step,
1417                                   ir,
1418                                   fr,
1419                                   cr,
1420                                   state,
1421                                   mdatoms,
1422                                   fcdata,
1423                                   &MassQ,
1424                                   &vcm,
1425                                   pull_work,
1426                                   enerd,
1427                                   ekind,
1428                                   gstat,
1429                                   &dvdl_constr,
1430                                   bCalcVir,
1431                                   total_vir,
1432                                   shake_vir,
1433                                   force_vir,
1434                                   pres,
1435                                   M,
1436                                   lastbox,
1437                                   do_log,
1438                                   do_ene,
1439                                   bGStat,
1440                                   &bSumEkinhOld,
1441                                   &f,
1442                                   &cbuf,
1443                                   &upd,
1444                                   constr,
1445                                   &nullSignaller,
1446                                   trotter_seq,
1447                                   nrnb,
1448                                   wcycle);
1449         }
1450         else
1451         {
1452             if (useGpuForUpdate)
1453             {
1454
1455                 wallcycle_stop(wcycle, ewcUPDATE);
1456
1457                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1458                 {
1459                     integrator->set(stateGpu->getCoordinates(),
1460                                     stateGpu->getVelocities(),
1461                                     stateGpu->getForces(),
1462                                     top.idef,
1463                                     *mdatoms,
1464                                     ekind->ngtc);
1465
1466                     // Copy data to the GPU after buffers might have being reinitialized
1467                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1468                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1469                 }
1470
1471                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1472                     && !thisRankHasDuty(cr, DUTY_PME))
1473                 {
1474                     // The PME forces were recieved to the host, so have to be copied
1475                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1476                 }
1477                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1478                 {
1479                     // The buffer ops were not offloaded this step, so the forces are on the
1480                     // host and have to be copied
1481                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1482                 }
1483
1484                 const bool doTemperatureScaling =
1485                         (ir->etc != TemperatureCoupling::No
1486                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1487
1488                 // This applies Leap-Frog, LINCS and SETTLE in succession
1489                 integrator->integrate(
1490                         stateGpu->getForcesReadyOnDeviceEvent(
1491                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1492                         ir->delta_t,
1493                         true,
1494                         bCalcVir,
1495                         shake_vir,
1496                         doTemperatureScaling,
1497                         ekind->tcstat,
1498                         doParrinelloRahman,
1499                         ir->nstpcouple * ir->delta_t,
1500                         M);
1501
1502                 // Copy velocities D2H after update if:
1503                 // - Globals are computed this step (includes the energy output steps).
1504                 // - Temperature is needed for the next step.
1505                 if (bGStat || needHalfStepKineticEnergy)
1506                 {
1507                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1508                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1509                 }
1510             }
1511             else
1512             {
1513                 /* With multiple time stepping we need to do an additional normal
1514                  * update step to obtain the virial, as the actual MTS integration
1515                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1516                  * Using that acceleration would result in a virial with the slow
1517                  * force contribution would be a factor mtsFactor too large.
1518                  */
1519                 if (fr->useMts && bCalcVir && constr != nullptr)
1520                 {
1521                     upd.update_for_constraint_virial(
1522                             *ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1523
1524                     constrain_coordinates(constr,
1525                                           do_log,
1526                                           do_ene,
1527                                           step,
1528                                           state,
1529                                           upd.xp()->arrayRefWithPadding(),
1530                                           &dvdl_constr,
1531                                           bCalcVir,
1532                                           shake_vir);
1533                 }
1534
1535                 ArrayRefWithPadding<const RVec> forceCombined =
1536                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1537                                 ? f.view().forceMtsCombinedWithPadding()
1538                                 : f.view().forceWithPadding();
1539                 upd.update_coords(
1540                         *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
1541
1542                 wallcycle_stop(wcycle, ewcUPDATE);
1543
1544                 constrain_coordinates(constr,
1545                                       do_log,
1546                                       do_ene,
1547                                       step,
1548                                       state,
1549                                       upd.xp()->arrayRefWithPadding(),
1550                                       &dvdl_constr,
1551                                       bCalcVir && !fr->useMts,
1552                                       shake_vir);
1553
1554                 upd.update_sd_second_half(
1555                         *ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle, constr, do_log, do_ene);
1556                 upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1557             }
1558
1559             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1560             {
1561                 updatePrevStepPullCom(pull_work, state);
1562             }
1563
1564             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1565         }
1566
1567         if (vsite != nullptr)
1568         {
1569             wallcycle_start(wcycle, ewcVSITECONSTR);
1570             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1571             wallcycle_stop(wcycle, ewcVSITECONSTR);
1572         }
1573
1574         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1575         /* With Leap-Frog we can skip compute_globals at
1576          * non-communication steps, but we need to calculate
1577          * the kinetic energy one step before communication.
1578          */
1579         {
1580             // Organize to do inter-simulation signalling on steps if
1581             // and when algorithms require it.
1582             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1583
1584             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1585             {
1586                 // Copy coordinates when needed to stop the CM motion.
1587                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1588                 {
1589                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1590                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1591                 }
1592                 // Since we're already communicating at this step, we
1593                 // can propagate intra-simulation signals. Note that
1594                 // check_nstglobalcomm has the responsibility for
1595                 // choosing the value of nstglobalcomm that is one way
1596                 // bGStat becomes true, so we can't get into a
1597                 // situation where e.g. checkpointing can't be
1598                 // signalled.
1599                 bool                doIntraSimSignal = true;
1600                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1601
1602                 compute_globals(
1603                         gstat,
1604                         cr,
1605                         ir,
1606                         fr,
1607                         ekind,
1608                         makeConstArrayRef(state->x),
1609                         makeConstArrayRef(state->v),
1610                         state->box,
1611                         mdatoms,
1612                         nrnb,
1613                         &vcm,
1614                         wcycle,
1615                         enerd,
1616                         force_vir,
1617                         shake_vir,
1618                         total_vir,
1619                         pres,
1620                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1621                                                                            : gmx::ArrayRef<real>{},
1622                         &signaller,
1623                         lastbox,
1624                         &totalNumberOfBondedInteractions,
1625                         &bSumEkinhOld,
1626                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1627                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1628                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1629                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1630                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1631                                                                          : 0));
1632                 checkNumberOfBondedInteractions(mdlog,
1633                                                 cr,
1634                                                 totalNumberOfBondedInteractions,
1635                                                 top_global,
1636                                                 &top,
1637                                                 makeConstArrayRef(state->x),
1638                                                 state->box,
1639                                                 &shouldCheckNumberOfBondedInteractions);
1640                 if (!EI_VV(ir->eI) && bStopCM)
1641                 {
1642                     process_and_stopcm_grp(
1643                             fplog, &vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
1644                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1645
1646                     // TODO: The special case of removing CM motion should be dealt more gracefully
1647                     if (useGpuForUpdate)
1648                     {
1649                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1650                         // Here we block until the H2D copy completes because event sync with the
1651                         // force kernels that use the coordinates on the next steps is not implemented
1652                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1653                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1654                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1655                         if (vcm.mode != ecmNO)
1656                         {
1657                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1658                         }
1659                     }
1660                 }
1661             }
1662         }
1663
1664         /* #############  END CALC EKIN AND PRESSURE ################# */
1665
1666         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1667            the virial that should probably be addressed eventually. state->veta has better properies,
1668            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1669            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1670
1671         if (ir->efep != efepNO && !EI_VV(ir->eI))
1672         {
1673             /* Sum up the foreign energy and dK/dl terms for md and sd.
1674                Currently done every step so that dH/dl is correct in the .edr */
1675             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1676         }
1677
1678         update_pcouple_after_coordinates(
1679                 fplog, step, ir, mdatoms, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1680
1681         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1682                                                   && do_per_step(step, inputrec->nstpcouple));
1683         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1684                                                  && do_per_step(step, inputrec->nstpcouple));
1685         if (useGpuForUpdate
1686             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1687         {
1688             integrator->scaleCoordinates(pressureCouplingMu);
1689             if (doCRescalePressureCoupling)
1690             {
1691                 matrix pressureCouplingInvMu;
1692                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1693                 integrator->scaleVelocities(pressureCouplingInvMu);
1694             }
1695             integrator->setPbc(PbcType::Xyz, state->box);
1696         }
1697
1698         /* ################# END UPDATE STEP 2 ################# */
1699         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1700
1701         /* The coordinates (x) were unshifted in update */
1702         if (!bGStat)
1703         {
1704             /* We will not sum ekinh_old,
1705              * so signal that we still have to do it.
1706              */
1707             bSumEkinhOld = TRUE;
1708         }
1709
1710         if (bCalcEner)
1711         {
1712             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1713
1714             /* use the directly determined last velocity, not actually the averaged half steps */
1715             if (bTrotter && ir->eI == eiVV)
1716             {
1717                 enerd->term[F_EKIN] = last_ekin;
1718             }
1719             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1720
1721             if (integratorHasConservedEnergyQuantity(ir))
1722             {
1723                 if (EI_VV(ir->eI))
1724                 {
1725                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1726                 }
1727                 else
1728                 {
1729                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1730                 }
1731             }
1732             /* #########  END PREPARING EDR OUTPUT  ###########  */
1733         }
1734
1735         /* Output stuff */
1736         if (MASTER(cr))
1737         {
1738             if (fplog && do_log && bDoExpanded)
1739             {
1740                 /* only needed if doing expanded ensemble */
1741                 PrintFreeEnergyInfoToFile(fplog,
1742                                           ir->fepvals,
1743                                           ir->expandedvals,
1744                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1745                                           state_global->dfhist,
1746                                           state->fep_state,
1747                                           ir->nstlog,
1748                                           step);
1749             }
1750             if (bCalcEner)
1751             {
1752                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1753                                                  bCalcEnerStep,
1754                                                  t,
1755                                                  mdatoms->tmass,
1756                                                  enerd,
1757                                                  ir->fepvals,
1758                                                  ir->expandedvals,
1759                                                  lastbox,
1760                                                  PTCouplingArrays{ state->boxv,
1761                                                                    state->nosehoover_xi,
1762                                                                    state->nosehoover_vxi,
1763                                                                    state->nhpres_xi,
1764                                                                    state->nhpres_vxi },
1765                                                  state->fep_state,
1766                                                  shake_vir,
1767                                                  force_vir,
1768                                                  total_vir,
1769                                                  pres,
1770                                                  ekind,
1771                                                  mu_tot,
1772                                                  constr);
1773             }
1774             else
1775             {
1776                 energyOutput.recordNonEnergyStep();
1777             }
1778
1779             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1780             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1781
1782             if (doSimulatedAnnealing)
1783             {
1784                 gmx::EnergyOutput::printAnnealingTemperatures(
1785                         do_log ? fplog : nullptr, groups, &(ir->opts));
1786             }
1787             if (do_log || do_ene || do_dr || do_or)
1788             {
1789                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1790                                                    do_ene,
1791                                                    do_dr,
1792                                                    do_or,
1793                                                    do_log ? fplog : nullptr,
1794                                                    step,
1795                                                    t,
1796                                                    fr->fcdata.get(),
1797                                                    awh.get());
1798             }
1799             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1800             {
1801                 const bool isInitialOutput = false;
1802                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1803             }
1804
1805             if (ir->bPull)
1806             {
1807                 pull_print_output(pull_work, step, t);
1808             }
1809
1810             if (do_per_step(step, ir->nstlog))
1811             {
1812                 if (fflush(fplog) != 0)
1813                 {
1814                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1815                 }
1816             }
1817         }
1818         if (bDoExpanded)
1819         {
1820             /* Have to do this part _after_ outputting the logfile and the edr file */
1821             /* Gets written into the state at the beginning of next loop*/
1822             state->fep_state = lamnew;
1823         }
1824         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1825         {
1826             state->fep_state = awh->fepLambdaState();
1827         }
1828         /* Print the remaining wall clock time for the run */
1829         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1830         {
1831             if (shellfc)
1832             {
1833                 fprintf(stderr, "\n");
1834             }
1835             print_time(stderr, walltime_accounting, step, ir, cr);
1836         }
1837
1838         /* Ion/water position swapping.
1839          * Not done in last step since trajectory writing happens before this call
1840          * in the MD loop and exchanges would be lost anyway. */
1841         bNeedRepartition = FALSE;
1842         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1843         {
1844             bNeedRepartition = do_swapcoords(cr,
1845                                              step,
1846                                              t,
1847                                              ir,
1848                                              swap,
1849                                              wcycle,
1850                                              as_rvec_array(state->x.data()),
1851                                              state->box,
1852                                              MASTER(cr) && mdrunOptions.verbose,
1853                                              bRerunMD);
1854
1855             if (bNeedRepartition && DOMAINDECOMP(cr))
1856             {
1857                 dd_collect_state(cr->dd, state, state_global);
1858             }
1859         }
1860
1861         /* Replica exchange */
1862         bExchanged = FALSE;
1863         if (bDoReplEx)
1864         {
1865             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1866         }
1867
1868         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1869         {
1870             dd_partition_system(fplog,
1871                                 mdlog,
1872                                 step,
1873                                 cr,
1874                                 TRUE,
1875                                 1,
1876                                 state_global,
1877                                 *top_global,
1878                                 ir,
1879                                 imdSession,
1880                                 pull_work,
1881                                 state,
1882                                 &f,
1883                                 mdAtoms,
1884                                 &top,
1885                                 fr,
1886                                 vsite,
1887                                 constr,
1888                                 nrnb,
1889                                 wcycle,
1890                                 FALSE);
1891             shouldCheckNumberOfBondedInteractions = true;
1892             upd.setNumAtoms(state->natoms);
1893         }
1894
1895         bFirstStep = FALSE;
1896         bInitStep  = FALSE;
1897
1898         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1899         /* With all integrators, except VV, we need to retain the pressure
1900          * at the current step for coupling at the next step.
1901          */
1902         if ((state->flags & (1U << estPRES_PREV))
1903             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1904         {
1905             /* Store the pressure in t_state for pressure coupling
1906              * at the next MD step.
1907              */
1908             copy_mat(pres, state->pres_prev);
1909         }
1910
1911         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1912
1913         if ((membed != nullptr) && (!bLastStep))
1914         {
1915             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1916         }
1917
1918         cycles = wallcycle_stop(wcycle, ewcSTEP);
1919         if (DOMAINDECOMP(cr) && wcycle)
1920         {
1921             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1922         }
1923
1924         /* increase the MD step number */
1925         step++;
1926         step_rel++;
1927
1928 #if GMX_FAHCORE
1929         if (MASTER(cr))
1930         {
1931             fcReportProgress(ir->nsteps + ir->init_step, step);
1932         }
1933 #endif
1934
1935         resetHandler->resetCounters(
1936                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1937
1938         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1939         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1940     }
1941     /* End of main MD loop */
1942
1943     /* Closing TNG files can include compressing data. Therefore it is good to do that
1944      * before stopping the time measurements. */
1945     mdoutf_tng_close(outf);
1946
1947     /* Stop measuring walltime */
1948     walltime_accounting_end_time(walltime_accounting);
1949
1950     if (!thisRankHasDuty(cr, DUTY_PME))
1951     {
1952         /* Tell the PME only node to finish */
1953         gmx_pme_send_finish(cr);
1954     }
1955
1956     if (MASTER(cr))
1957     {
1958         if (ir->nstcalcenergy > 0)
1959         {
1960             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
1961
1962             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1963             energyOutput.printAverages(fplog, groups);
1964         }
1965     }
1966     done_mdoutf(outf);
1967
1968     if (bPMETune)
1969     {
1970         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1971     }
1972
1973     done_shellfc(fplog, shellfc, step_rel);
1974
1975     if (useReplicaExchange && MASTER(cr))
1976     {
1977         print_replica_exchange_statistics(fplog, repl_ex);
1978     }
1979
1980     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1981
1982     global_stat_destroy(gstat);
1983 }