d35ff541aebde12b428aaecbb6fe50458654bec1
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/commandline/filenm.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/dlbtiming.h"
59 #include "gromacs/domdec/domdec.h"
60 #include "gromacs/domdec/domdec_network.h"
61 #include "gromacs/domdec/domdec_struct.h"
62 #include "gromacs/domdec/gpuhaloexchange.h"
63 #include "gromacs/domdec/mdsetup.h"
64 #include "gromacs/domdec/partition.h"
65 #include "gromacs/essentialdynamics/edsam.h"
66 #include "gromacs/ewald/pme_load_balancing.h"
67 #include "gromacs/ewald/pme_pp.h"
68 #include "gromacs/fileio/trxio.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/imd/imd.h"
74 #include "gromacs/listed_forces/listed_forces.h"
75 #include "gromacs/math/functions.h"
76 #include "gromacs/math/invertmatrix.h"
77 #include "gromacs/math/vec.h"
78 #include "gromacs/math/vectypes.h"
79 #include "gromacs/mdlib/checkpointhandler.h"
80 #include "gromacs/mdlib/compute_io.h"
81 #include "gromacs/mdlib/constr.h"
82 #include "gromacs/mdlib/coupling.h"
83 #include "gromacs/mdlib/ebin.h"
84 #include "gromacs/mdlib/enerdata_utils.h"
85 #include "gromacs/mdlib/energyoutput.h"
86 #include "gromacs/mdlib/expanded.h"
87 #include "gromacs/mdlib/force.h"
88 #include "gromacs/mdlib/force_flags.h"
89 #include "gromacs/mdlib/forcerec.h"
90 #include "gromacs/mdlib/freeenergyparameters.h"
91 #include "gromacs/mdlib/md_support.h"
92 #include "gromacs/mdlib/mdatoms.h"
93 #include "gromacs/mdlib/mdoutf.h"
94 #include "gromacs/mdlib/membed.h"
95 #include "gromacs/mdlib/resethandler.h"
96 #include "gromacs/mdlib/sighandler.h"
97 #include "gromacs/mdlib/simulationsignal.h"
98 #include "gromacs/mdlib/stat.h"
99 #include "gromacs/mdlib/stophandler.h"
100 #include "gromacs/mdlib/tgroup.h"
101 #include "gromacs/mdlib/trajectory_writing.h"
102 #include "gromacs/mdlib/update.h"
103 #include "gromacs/mdlib/update_constrain_gpu.h"
104 #include "gromacs/mdlib/update_vv.h"
105 #include "gromacs/mdlib/vcm.h"
106 #include "gromacs/mdlib/vsite.h"
107 #include "gromacs/mdrunutility/handlerestart.h"
108 #include "gromacs/mdrunutility/multisim.h"
109 #include "gromacs/mdrunutility/printtime.h"
110 #include "gromacs/mdtypes/awh_history.h"
111 #include "gromacs/mdtypes/awh_params.h"
112 #include "gromacs/mdtypes/commrec.h"
113 #include "gromacs/mdtypes/df_history.h"
114 #include "gromacs/mdtypes/energyhistory.h"
115 #include "gromacs/mdtypes/fcdata.h"
116 #include "gromacs/mdtypes/forcebuffers.h"
117 #include "gromacs/mdtypes/forcerec.h"
118 #include "gromacs/mdtypes/group.h"
119 #include "gromacs/mdtypes/inputrec.h"
120 #include "gromacs/mdtypes/interaction_const.h"
121 #include "gromacs/mdtypes/md_enums.h"
122 #include "gromacs/mdtypes/mdatom.h"
123 #include "gromacs/mdtypes/mdrunoptions.h"
124 #include "gromacs/mdtypes/multipletimestepping.h"
125 #include "gromacs/mdtypes/observableshistory.h"
126 #include "gromacs/mdtypes/pullhistory.h"
127 #include "gromacs/mdtypes/simulation_workload.h"
128 #include "gromacs/mdtypes/state.h"
129 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
130 #include "gromacs/modularsimulator/energydata.h"
131 #include "gromacs/nbnxm/gpu_data_mgmt.h"
132 #include "gromacs/nbnxm/nbnxm.h"
133 #include "gromacs/pbcutil/pbc.h"
134 #include "gromacs/pulling/output.h"
135 #include "gromacs/pulling/pull.h"
136 #include "gromacs/swap/swapcoords.h"
137 #include "gromacs/timing/wallcycle.h"
138 #include "gromacs/timing/walltime_accounting.h"
139 #include "gromacs/topology/atoms.h"
140 #include "gromacs/topology/idef.h"
141 #include "gromacs/topology/mtop_util.h"
142 #include "gromacs/topology/topology.h"
143 #include "gromacs/trajectory/trajectoryframe.h"
144 #include "gromacs/utility/basedefinitions.h"
145 #include "gromacs/utility/cstringutil.h"
146 #include "gromacs/utility/fatalerror.h"
147 #include "gromacs/utility/logger.h"
148 #include "gromacs/utility/real.h"
149 #include "gromacs/utility/smalloc.h"
150
151 #include "legacysimulator.h"
152 #include "replicaexchange.h"
153 #include "shellfc.h"
154
155 using gmx::SimulationSignaller;
156
157 void gmx::LegacySimulator::do_md()
158 {
159     // TODO Historically, the EM and MD "integrators" used different
160     // names for the t_inputrec *parameter, but these must have the
161     // same name, now that it's a member of a struct. We use this ir
162     // alias to avoid a large ripple of nearly useless changes.
163     // t_inputrec is being replaced by IMdpOptionsProvider, so this
164     // will go away eventually.
165     const t_inputrec* ir = inputrec;
166
167     int64_t      step, step_rel;
168     double       t, t0 = ir->init_t;
169     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
170     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
171     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
172     gmx_bool     do_ene, do_log, do_verbose;
173     gmx_bool     bMasterState;
174     unsigned int force_flags;
175     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
176     int    i, m;
177     rvec   mu_tot;
178     matrix pressureCouplingMu, M;
179     gmx_repl_ex_t     repl_ex = nullptr;
180     gmx_global_stat_t gstat;
181     gmx_shellfc_t*    shellfc;
182     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
183     gmx_bool          bTrotter;
184     real              dvdl_constr;
185     std::vector<RVec> cbuf;
186     matrix            lastbox;
187     int               lamnew = 0;
188     /* for FEP */
189     int       nstfep = 0;
190     double    cycles;
191     real      saved_conserved_quantity = 0;
192     real      last_ekin                = 0;
193     t_extmass MassQ;
194     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
195
196     /* PME load balancing data for GPU kernels */
197     gmx_bool bPMETune         = FALSE;
198     gmx_bool bPMETunePrinting = FALSE;
199
200     bool bInteractiveMDstep = false;
201
202     /* Domain decomposition could incorrectly miss a bonded
203        interaction, but checking for that requires a global
204        communication stage, which does not otherwise happen in DD
205        code. So we do that alongside the first global energy reduction
206        after a new DD is made. These variables handle whether the
207        check happens, and the result it returns. */
208     bool shouldCheckNumberOfBondedInteractions = false;
209     int  totalNumberOfBondedInteractions       = -1;
210
211     SimulationSignals signals;
212     // Most global communication stages don't propagate mdrun
213     // signals, and use this null signaller object to achieve that.
214     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
215
216     if (!mdrunOptions.writeConfout)
217     {
218         // This is on by default, and the main known use case for
219         // turning it off is for convenience in benchmarking, which is
220         // something that should not show up in the general user
221         // interface.
222         GMX_LOG(mdlog.info)
223                 .asParagraph()
224                 .appendText(
225                         "The -noconfout functionality is deprecated, and may be removed in a "
226                         "future version.");
227     }
228
229     /* md-vv uses averaged full-step velocities for T-control,
230        md-vv-avek uses averaged half-step velocities for T-control (but full-step ekin for P control),
231        md uses averaged half-step kinetic energies to determine the temperature, unless overridden by GMX_EKIN_AVE_VEL. */
232     bTrotter = (EI_VV(ir->eI)
233                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
234
235     const bool bRerunMD = false;
236
237     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
238     bGStatEveryStep   = (nstglobalcomm == 1);
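    // Descriptive note: nstglobalcomm is the period (in MD steps) between global
    // reduction/communication stages; bGStatEveryStep is true only when that period
    // is a single step, in which case global statistics are available on every step.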
239
240     const SimulationGroups* groups = &top_global->groups;
241
242     std::unique_ptr<EssentialDynamics> ed = nullptr;
243     if (opt2bSet("-ei", nfile, fnm))
244     {
245         /* Initialize essential dynamics sampling */
246         ed = init_edsam(mdlog,
247                         opt2fn_null("-ei", nfile, fnm),
248                         opt2fn("-eo", nfile, fnm),
249                         *top_global,
250                         *ir,
251                         cr,
252                         constr,
253                         state_global,
254                         observablesHistory,
255                         oenv,
256                         startingBehavior);
257     }
258     else if (observablesHistory->edsamHistory)
259     {
260         gmx_fatal(FARGS,
261                   "The checkpoint is from a run with essential dynamics sampling, "
262                   "but the current run did not specify the -ei option. "
263                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
264     }
265
266     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
267     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
268     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
269     Update upd(*ir, deform);
270     bool   doSimulatedAnnealing = false;
271     {
272         // TODO: Avoid changing inputrec (#3854)
273         // Simulated annealing updates the reference temperature.
274         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
275         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
276     }
277     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
278
279     const t_fcdata& fcdata = *fr->fcdata;
280
281     bool simulationsShareState = false;
282     int  nstSignalComm         = nstglobalcomm;
283     {
284         // TODO This implementation of ensemble orientation restraints is nasty because
285         // a user can't just do multi-sim with single-sim orientation restraints.
286         bool usingEnsembleRestraints =
287                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so checkpoints and stop
292         // conditions must act on the same step in every simulation;
293         // the propagation of such signals therefore has to take place
294         // between simulations, not just within them.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
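            // The integer arithmetic above rounds the minimum signalling interval up to
            // the nearest multiple of nstglobalcomm, e.g. with nstglobalcomm = 150:
            //   ((200 + 150 - 1) / 150) * 150 = 2 * 150 = 300 steps between signals.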
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifier,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    *top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifier);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t top(top_global->ffparams);
365
366     auto mdatoms = mdAtoms->mdatoms();
367
368     ForceBuffers f(fr->useMts,
369                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
370                            ? PinningPolicy::PinnedIfSupported
371                            : PinningPolicy::CannotBePinned);
372     if (DOMAINDECOMP(cr))
373     {
374         stateInstance = std::make_unique<t_state>();
375         state         = stateInstance.get();
376         dd_init_local_state(*cr->dd, state_global, state);
377
378         /* Distribute the atoms over the ranks, starting from the master rank */
379         dd_partition_system(fplog,
380                             mdlog,
381                             ir->init_step,
382                             cr,
383                             TRUE,
384                             1,
385                             state_global,
386                             *top_global,
387                             *ir,
388                             imdSession,
389                             pull_work,
390                             state,
391                             &f,
392                             mdAtoms,
393                             &top,
394                             fr,
395                             vsite,
396                             constr,
397                             nrnb,
398                             nullptr,
399                             FALSE);
400         shouldCheckNumberOfBondedInteractions = true;
401         upd.setNumAtoms(state->natoms);
402     }
403     else
404     {
405         state_change_natoms(state_global, state_global->natoms);
406         /* Copy the pointer to the global state */
407         state = state_global;
408
409         /* Generate and initialize new topology */
410         mdAlgorithmsSetupAtomData(cr, *ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
411
412         upd.setNumAtoms(state->natoms);
413     }
414
415     std::unique_ptr<UpdateConstrainGpu> integrator;
416
417     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
418
419     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
420     if (useGpuForUpdate)
421     {
422         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
423                                    || constr->numConstraintsTotal() == 0,
424                            "Constraints in domain decomposition are only supported with update "
425                            "groups if using GPU update.\n");
426         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
427                                    || constr->numConstraintsTotal() == 0,
428                            "SHAKE is not supported with GPU update.");
429         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
430                            "Either PME or short-ranged non-bonded interaction tasks must run on "
431                            "the GPU to use GPU update.\n");
432         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
433                            "Only the md integrator is supported with the GPU update.\n");
434         GMX_RELEASE_ASSERT(
435                 ir->etc != TemperatureCoupling::NoseHoover,
436                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
437         GMX_RELEASE_ASSERT(
438                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
439                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
440                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
441                 "with the GPU update.\n");
442         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
443                            "Virtual sites are not supported with the GPU update.\n");
444         GMX_RELEASE_ASSERT(ed == nullptr,
445                            "Essential dynamics is not supported with the GPU update.\n");
446         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
447                            "Constraints pulling is not supported with the GPU update.\n");
448         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
449                            "Orientation restraints are not supported with the GPU update.\n");
450         GMX_RELEASE_ASSERT(
451                 ir->efep == FreeEnergyPerturbationType::No
452                         || (!haveFepPerturbedMasses(*top_global) && !havePerturbedConstraints(*top_global)),
453                 "Free energy perturbation of masses and constraints are not supported with the GPU "
454                 "update.");
455
456         if (constr != nullptr && constr->numConstraintsTotal() > 0)
457         {
458             GMX_LOG(mdlog.info)
459                     .asParagraph()
460                     .appendText("Updating coordinates and applying constraints on the GPU.");
461         }
462         else
463         {
464             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
465         }
466         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
467                            "Device stream manager should be initialized in order to use GPU "
468                            "update-constraints.");
469         GMX_RELEASE_ASSERT(
470                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
471                 "Update stream should be initialized in order to use GPU "
472                 "update-constraints.");
473         integrator = std::make_unique<UpdateConstrainGpu>(
474                 *ir,
475                 *top_global,
476                 ekind->ngtc,
477                 fr->deviceStreamManager->context(),
478                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
479                 stateGpu->xUpdatedOnDevice(),
480                 wcycle);
481
482         integrator->setPbc(PbcType::Xyz, state->box);
483     }
484
485     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
486     {
487         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
488     }
489     if (useGpuForUpdate)
490     {
491         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
492     }
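    // Descriptive note: pinned (page-locked) host memory enables asynchronous
    // host<->device copies, so the coordinates are pinned whenever any GPU path will
    // transfer them, and the velocities additionally when the update runs on the GPU.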
493
494     // NOTE: The global state is no longer used for propagation at this point.
495     // But state_global is still used as temporary storage space for writing
496     // the global state to file and potentially for replica exchange.
497     // (Global topology should persist.)
498
499     update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
500
501     if (ir->bExpanded)
502     {
503         /* Check nstexpanded here, because the grompp check was broken */
504         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
505         {
506             gmx_fatal(FARGS,
507                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
508         }
509         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
510     }
511
512     if (MASTER(cr))
513     {
514         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
515     }
516
517     preparePrevStepPullCom(
518             ir, pull_work, mdatoms->massT, state, state_global, cr, startingBehavior != StartingBehavior::NewSimulation);
519
520     // TODO: Remove this by converting AWH into a ForceProvider
521     auto awh = prepareAwhModule(fplog,
522                                 *ir,
523                                 state_global,
524                                 cr,
525                                 ms,
526                                 startingBehavior != StartingBehavior::NewSimulation,
527                                 shellfc != nullptr,
528                                 opt2fn("-awh", nfile, fnm),
529                                 pull_work);
530
531     if (useReplicaExchange && MASTER(cr))
532     {
533         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
534     }
535     /* PME tuning is only supported in the Verlet scheme, with PME for
536      * Coulomb. It is not supported with only LJ PME. */
537     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
538                 && ir->cutoff_scheme != CutoffScheme::Group);
539
540     pme_load_balancing_t* pme_loadbal = nullptr;
541     if (bPMETune)
542     {
543         pme_loadbal_init(
544                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
545     }
546
547     if (!ir->bContinuation)
548     {
549         if (state->flags & enumValueToBitMask(StateEntry::V))
550         {
551             auto v = makeArrayRef(state->v);
552             /* Set the velocities of vsites, shells and frozen atoms to zero */
553             for (i = 0; i < mdatoms->homenr; i++)
554             {
555                 if (mdatoms->ptype[i] == eptShell)
556                 {
557                     clear_rvec(v[i]);
558                 }
559                 else if (mdatoms->cFREEZE)
560                 {
561                     for (m = 0; m < DIM; m++)
562                     {
563                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
564                         {
565                             v[i][m] = 0;
566                         }
567                     }
568                 }
569             }
570         }
571
572         if (constr)
573         {
574             /* Constrain the initial coordinates and velocities */
575             do_constrain_first(fplog,
576                                constr,
577                                ir,
578                                mdatoms->nr,
579                                mdatoms->homenr,
580                                state->x.arrayRefWithPadding(),
581                                state->v.arrayRefWithPadding(),
582                                state->box,
583                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
584         }
585     }
586
587     if (ir->efep != FreeEnergyPerturbationType::No)
588     {
589         /* Set the free energy calculation frequency as the greatest common
590          * divisor of nstdhdl, nstexpanded, the replica exchange interval and the AWH sampling interval. */
591         nstfep = ir->fepvals->nstdhdl;
592         if (ir->bExpanded)
593         {
594             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
595         }
596         if (useReplicaExchange)
597         {
598             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
599         }
600         if (ir->bDoAwh)
601         {
602             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
603         }
604     }
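    // Worked example: with nstdhdl = 100 and a replica-exchange interval of 250,
    // nstfep = gcd(100, 250) = 50, so free-energy quantities are computed on every
    // step where any of the participating algorithms needs them.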
605
606     /* Be REALLY careful about what flags you set here. You CANNOT assume
607      * this is the first step, since we might be restarting from a checkpoint,
608      * and in that case we should not do any modifications to the state.
609      */
610     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
611
612     // When restarting from a checkpoint, it can be appropriate to
613     // initialize ekind from quantities in the checkpoint. Otherwise,
614     // compute_globals must initialize ekind before the simulation
615     // starts/restarts. However, only the master rank knows what was
616     // found in the checkpoint file, so we have to communicate in
617     // order to coordinate the restart.
618     //
619     // TODO Consider removing this communication if/when checkpoint
620     // reading directly follows .tpr reading, because all ranks can
621     // agree on hasReadEkinState at that time.
622     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
623     if (PAR(cr))
624     {
625         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
626     }
627     if (hasReadEkinState)
628     {
629         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
630     }
631
632     unsigned int cglo_flags =
633             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
634              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
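    // Descriptive note: these flags select what the initial compute_globals() calls
    // reduce: temperature and global statistics always, pressure and the constraint
    // virial only for velocity-Verlet integrators, and the kinetic-energy state is
    // taken from the checkpoint when one was read.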
635
636     bSumEkinhOld = FALSE;
637
638     t_vcm vcm(top_global->groups, *ir);
639     reportComRemovalInfo(fplog, vcm);
640
641     /* To minimize communication, compute_globals computes the COM velocity
642      * and the kinetic energy for the velocities without COM motion removed.
643      * Thus to get the kinetic energy without the COM contribution, we need
644      * to call compute_globals twice.
645      */
646     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
647     {
648         unsigned int cglo_flags_iteration = cglo_flags;
649         if (bStopCM && cgloIteration == 0)
650         {
651             cglo_flags_iteration |= CGLO_STOPCM;
652             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
653         }
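        // On the first pass the COM velocity is removed but the temperature flag is
        // cleared, so that the kinetic energy reported comes from the second pass,
        // computed with the COM-corrected velocities (see the comment above the loop).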
654         compute_globals(gstat,
655                         cr,
656                         ir,
657                         fr,
658                         ekind,
659                         makeConstArrayRef(state->x),
660                         makeConstArrayRef(state->v),
661                         state->box,
662                         mdatoms,
663                         nrnb,
664                         &vcm,
665                         nullptr,
666                         enerd,
667                         force_vir,
668                         shake_vir,
669                         total_vir,
670                         pres,
671                         gmx::ArrayRef<real>{},
672                         &nullSignaller,
673                         state->box,
674                         &totalNumberOfBondedInteractions,
675                         &bSumEkinhOld,
676                         cglo_flags_iteration
677                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
678                                                                          : 0));
679         if (cglo_flags_iteration & CGLO_STOPCM)
680         {
681             /* At initialization, do not pass x with acceleration-correction mode
682              * to avoid (incorrect) correction of the initial coordinates.
683              */
684             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
685                              ? ArrayRef<RVec>()
686                              : makeArrayRef(state->x);
687             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
688             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
689         }
690     }
691     checkNumberOfBondedInteractions(mdlog,
692                                     cr,
693                                     totalNumberOfBondedInteractions,
694                                     *top_global,
695                                     &top,
696                                     makeConstArrayRef(state->x),
697                                     state->box,
698                                     &shouldCheckNumberOfBondedInteractions);
699     if (ir->eI == IntegrationAlgorithm::VVAK)
700     {
701         /* A second call to get the half-step temperature initialized as well. */
702         /* We do the same call as above, but turn the pressure off -- internally to
703            compute_globals, this is recognized as a velocity-Verlet half-step
704            kinetic energy calculation. This minimizes excess variables, but
705            perhaps loses some logic? */
706
707         compute_globals(gstat,
708                         cr,
709                         ir,
710                         fr,
711                         ekind,
712                         makeConstArrayRef(state->x),
713                         makeConstArrayRef(state->v),
714                         state->box,
715                         mdatoms,
716                         nrnb,
717                         &vcm,
718                         nullptr,
719                         enerd,
720                         force_vir,
721                         shake_vir,
722                         total_vir,
723                         pres,
724                         gmx::ArrayRef<real>{},
725                         &nullSignaller,
726                         state->box,
727                         nullptr,
728                         &bSumEkinhOld,
729                         cglo_flags & ~CGLO_PRESSURE);
730     }
731
732     /* Calculate the initial half step temperature, and save the ekinh_old */
733     if (startingBehavior == StartingBehavior::NewSimulation)
734     {
735         for (i = 0; (i < ir->opts.ngtc); i++)
736         {
737             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
738         }
739     }
740
741     /* We need to make an initialization call to get the Trotter variables set, as well as
742        other constants for non-Trotter temperature control. */
743     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
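    // Descriptive note (best-effort): trotter_seq holds the ordered list of Trotter-
    // decomposition operations (thermostat/barostat updates) applied around the two
    // velocity-Verlet half steps; for non-Trotter coupling the call mainly fills MassQ.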
744
745     if (MASTER(cr))
746     {
747         if (!ir->bContinuation)
748         {
749             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
750             {
751                 fprintf(fplog,
752                         "RMS relative constraint deviation after constraining: %.2e\n",
753                         constr->rmsd());
754             }
755             if (EI_STATE_VELOCITY(ir->eI))
756             {
757                 real temp = enerd->term[F_TEMP];
758                 if (ir->eI != IntegrationAlgorithm::VV)
759                 {
760                     /* Result of Ekin averaged over velocities of -half
761                      * and +half step, while we only have -half step here.
762                      */
763                     temp *= 2;
764                 }
765                 fprintf(fplog, "Initial temperature: %g K\n", temp);
766             }
767         }
768
769         char tbuf[20];
770         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
771         if (ir->nsteps >= 0)
772         {
773             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
774         }
775         else
776         {
777             sprintf(tbuf, "%s", "infinite");
778         }
779         if (ir->init_step > 0)
780         {
781             fprintf(stderr,
782                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
783                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
784                     tbuf,
785                     gmx_step_str(ir->init_step, sbuf2),
786                     ir->init_step * ir->delta_t);
787         }
788         else
789         {
790             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
791         }
792         fprintf(fplog, "\n");
793     }
794
795     walltime_accounting_start_time(walltime_accounting);
796     wallcycle_start(wcycle, ewcRUN);
797     print_start(fplog, cr, walltime_accounting, "mdrun");
798
799     /***********************************************************
800      *
801      *             Loop over MD steps
802      *
803      ************************************************************/
804
805     bFirstStep = TRUE;
806     /* Skip the first Nose-Hoover integration when we get the state from tpx */
807     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
808     bSumEkinhOld     = FALSE;
809     bExchanged       = FALSE;
810     bNeedRepartition = FALSE;
811
812     step     = ir->init_step;
813     step_rel = 0;
814
815     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
816             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
817             simulationsShareState,
818             MASTER(cr),
819             ir->nstlist,
820             mdrunOptions.reproducible,
821             nstSignalComm,
822             mdrunOptions.maximumHoursToRun,
823             ir->nstlist == 0,
824             fplog,
825             step,
826             bNS,
827             walltime_accounting);
828
829     auto checkpointHandler = std::make_unique<CheckpointHandler>(
830             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
831             simulationsShareState,
832             ir->nstlist == 0,
833             MASTER(cr),
834             mdrunOptions.writeConfout,
835             mdrunOptions.checkpointOptions.period);
836
837     const bool resetCountersIsLocal = true;
838     auto       resetHandler         = std::make_unique<ResetHandler>(
839             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
840             !resetCountersIsLocal,
841             ir->nsteps,
842             MASTER(cr),
843             mdrunOptions.timingOptions.resetHalfway,
844             mdrunOptions.maximumHoursToRun,
845             mdlog,
846             wcycle,
847             walltime_accounting);
848
849     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
850
851     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
852     {
853         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
854     }
855
856     /* and stop now if we should */
857     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
858     while (!bLastStep)
859     {
860
861         /* Determine if this is a neighbor search step */
862         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
863
864         if (bPMETune && bNStList)
865         {
866             // This has to be here because PME load balancing is called so early.
867             // TODO: Move to after all booleans are defined.
868             if (useGpuForUpdate && !bFirstStep)
869             {
870                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
871                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
872             }
873             /* PME grid + cut-off optimization with GPUs or PME nodes */
874             pme_loadbal_do(pme_loadbal,
875                            cr,
876                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
877                            fplog,
878                            mdlog,
879                            *ir,
880                            fr,
881                            state->box,
882                            state->x,
883                            wcycle,
884                            step,
885                            step_rel,
886                            &bPMETunePrinting,
887                            simulationWork.useGpuPmePpCommunication);
888         }
889
890         wallcycle_start(wcycle, ewcSTEP);
891
892         bLastStep = (step_rel == ir->nsteps);
893         t         = t0 + step * ir->delta_t;
894
895         // TODO Refactor this, so that nstfep does not need a default value of zero
896         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
897         {
898             /* find and set the current lambdas */
899             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
900
901             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
902             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
903             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
904                            && (!bFirstStep));
905         }
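        // Descriptive note: do_per_step(step, n) is true when n > 0 and step is a
        // multiple of n, so dH/dlambda writing (bDoDHDL), free-energy force terms
        // (bDoFEP) and expanded-ensemble moves (bDoExpanded) each fire on their own interval.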
906
907         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
908                      && do_per_step(step, replExParams.exchangeInterval));
909
910         if (doSimulatedAnnealing)
911         {
912             // TODO: Avoid changing inputrec (#3854)
913             // Simulated annealing updates the reference temperature.
914             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
915             update_annealing_target_temp(nonConstInputrec, t, &upd);
916         }
917
918         /* Stop Center of Mass motion */
919         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
920
921         /* Determine whether or not to do Neighbour Searching */
922         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
923
924         /* Note that the stopHandler will cause termination at nstglobalcomm
925          * steps. Since this coincides with nstcalcenergy, nsttcouple and/or
926          * nstpcouple steps, we have computed the half-step kinetic energy
927          * of the previous step and can always output energies at the last step.
928          */
929         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
930
931         /* do_log triggers energy and virial calculation. Because this leads
932          * to different code paths, forces can be different. Thus for exact
933          * continuation we should avoid extra log output.
934          * Note that the || bLastStep can result in non-exact continuation
935          * beyond the last step. But we don't consider that to be an issue.
936          */
937         do_log     = (do_per_step(step, ir->nstlog)
938                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
939         do_verbose = mdrunOptions.verbose
940                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
941
942         if (useGpuForUpdate && !bFirstStep && bNS)
943         {
944             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
945             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
946             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
947             // Copy coordinates from the GPU when needed at the search step.
948             // NOTE: The cases where coordinates are needed on the CPU for force evaluation are handled in sim_utils.
949             // NOTE: If the coordinates are to be written to an output file, they are also copied separately before the output.
950             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
951             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
952         }
953
954         // We only need to calculate virtual velocities if we are writing them in the current step
955         const bool needVirtualVelocitiesThisStep =
956                 (vsite != nullptr)
957                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
958
959         if (vsite != nullptr)
960         {
961             // Virtual sites need to be updated before domain decomposition and forces are calculated
962             wallcycle_start(wcycle, ewcVSITECONSTR);
963             // md-vv calculates virtual velocities once it has full-step real velocities
964             vsite->construct(state->x,
965                              state->v,
966                              state->box,
967                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
968                                      ? VSiteOperation::PositionsAndVelocities
969                                      : VSiteOperation::Positions);
970             wallcycle_stop(wcycle, ewcVSITECONSTR);
971         }
972
973         if (bNS && !(bFirstStep && ir->bContinuation))
974         {
975             bMasterState = FALSE;
976             /* Correct the new box if it is too skewed */
977             if (inputrecDynamicBox(ir))
978             {
979                 if (correct_box(fplog, step, state->box))
980                 {
981                     bMasterState = TRUE;
982                     // If update is offloaded, it should be informed about the box size change
983                     if (useGpuForUpdate)
984                     {
985                         integrator->setPbc(PbcType::Xyz, state->box);
986                     }
987                 }
988             }
989             if (DOMAINDECOMP(cr) && bMasterState)
990             {
991                 dd_collect_state(cr->dd, state, state_global);
992             }
993
994             if (DOMAINDECOMP(cr))
995             {
996                 /* Repartition the domain decomposition */
997                 dd_partition_system(fplog,
998                                     mdlog,
999                                     step,
1000                                     cr,
1001                                     bMasterState,
1002                                     nstglobalcomm,
1003                                     state_global,
1004                                     *top_global,
1005                                     *ir,
1006                                     imdSession,
1007                                     pull_work,
1008                                     state,
1009                                     &f,
1010                                     mdAtoms,
1011                                     &top,
1012                                     fr,
1013                                     vsite,
1014                                     constr,
1015                                     nrnb,
1016                                     wcycle,
1017                                     do_verbose && !bPMETunePrinting);
1018                 shouldCheckNumberOfBondedInteractions = true;
1019                 upd.setNumAtoms(state->natoms);
1020             }
1021         }
1022
1023         // Allocate or re-size GPU halo exchange object, if necessary
1024         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1025         {
1026             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1027                                "GPU device manager has to be initialized to use GPU "
1028                                "version of halo exchange.");
1029             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1030         }
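        // Descriptive note: the GPU halo-exchange object depends on the current domain
        // decomposition, so it is (re)constructed on search steps, i.e. whenever the
        // domains may have been repartitioned just above.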
1031
1032         if (MASTER(cr) && do_log)
1033         {
1034             gmx::EnergyOutput::printHeader(
1035                     fplog, step, t); /* can we improve the information printed here? */
1036         }
1037
1038         if (ir->efep != FreeEnergyPerturbationType::No)
1039         {
1040             update_mdatoms(mdatoms, state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1041         }
1042
1043         if (bExchanged)
1044         {
1045             /* We need the kinetic energy at minus the half step for determining
1046              * the full-step kinetic energy and possibly for T-coupling. */
1047             /* This may not be quite working correctly yet... */
1048             compute_globals(gstat,
1049                             cr,
1050                             ir,
1051                             fr,
1052                             ekind,
1053                             makeConstArrayRef(state->x),
1054                             makeConstArrayRef(state->v),
1055                             state->box,
1056                             mdatoms,
1057                             nrnb,
1058                             &vcm,
1059                             wcycle,
1060                             enerd,
1061                             nullptr,
1062                             nullptr,
1063                             nullptr,
1064                             nullptr,
1065                             gmx::ArrayRef<real>{},
1066                             &nullSignaller,
1067                             state->box,
1068                             &totalNumberOfBondedInteractions,
1069                             &bSumEkinhOld,
1070                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
1071             checkNumberOfBondedInteractions(mdlog,
1072                                             cr,
1073                                             totalNumberOfBondedInteractions,
1074                                             *top_global,
1075                                             &top,
1076                                             makeConstArrayRef(state->x),
1077                                             state->box,
1078                                             &shouldCheckNumberOfBondedInteractions);
1079         }
1080         clear_mat(force_vir);
1081
1082         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1083
1084         /* Determine the energy and pressure:
1085          * at nstcalcenergy steps and at energy output steps (set below).
1086          */
1087         if (EI_VV(ir->eI) && (!bInitStep))
1088         {
1089             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1090             bCalcVir      = bCalcEnerStep
1091                        || (ir->epc != PressureCoupling::No
1092                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1093         }
1094         else
1095         {
1096             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1097             bCalcVir      = bCalcEnerStep
1098                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1099         }
1100         bCalcEner = bCalcEnerStep;
1101
1102         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1103
1104         if (do_ene || do_log || bDoReplEx)
1105         {
1106             bCalcVir  = TRUE;
1107             bCalcEner = TRUE;
1108         }
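        // Taken together: the virial and energies are evaluated on nstcalcenergy steps,
        // on pressure-coupling steps (virial only), and on any step where energies are
        // written, logged or needed for replica exchange.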
1109
1110         /* Do we need global communication ? */
1111         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1112                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1113
1114         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1115                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1116                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1117         if (fr->useMts && !do_per_step(step, ir->nstfout))
1118         {
1119             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1120         }
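        // Descriptive note: with multiple time stepping the combined (normal) force
        // buffer is only required on force-output steps; on other steps the flag above
        // lets the force schedule skip assembling it.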
1121
1122         if (shellfc)
1123         {
1124             /* Now is the time to relax the shells */
1125             relax_shell_flexcon(fplog,
1126                                 cr,
1127                                 ms,
1128                                 mdrunOptions.verbose,
1129                                 enforcedRotation,
1130                                 step,
1131                                 ir,
1132                                 imdSession,
1133                                 pull_work,
1134                                 bNS,
1135                                 force_flags,
1136                                 &top,
1137                                 constr,
1138                                 enerd,
1139                                 state->natoms,
1140                                 state->x.arrayRefWithPadding(),
1141                                 state->v.arrayRefWithPadding(),
1142                                 state->box,
1143                                 state->lambda,
1144                                 &state->hist,
1145                                 &f.view(),
1146                                 force_vir,
1147                                 mdatoms,
1148                                 nrnb,
1149                                 wcycle,
1150                                 shellfc,
1151                                 fr,
1152                                 runScheduleWork,
1153                                 t,
1154                                 mu_tot,
1155                                 vsite,
1156                                 ddBalanceRegionHandler);
1157         }
1158         else
1159         {
1160             /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias
1161                is updated (or the AWH update will be performed twice for one step when continuing).
1162                It would be best to call this update function from do_md_trajectory_writing, but that
1163                would occur after do_force. One would have to divide the update_awh function into one
1164                function applying the AWH force and one doing the AWH bias update. The AWH bias update
1165                function could then be called after do_md_trajectory_writing (then containing
1166                update_awh_history). The checkpointing will in the future probably be moved to the start
1167                of the MD loop, which will get rid of this issue. */
1168             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1169             {
1170                 awh->updateHistory(state_global->awhHistory.get());
1171             }
1172
1173             /* The coordinates (x) are shifted (to get whole molecules)
1174              * in do_force.
1175              * This is parallelized as well, and does communication too.
1176              * Check comments in sim_util.c
1177              */
1178             do_force(fplog,
1179                      cr,
1180                      ms,
1181                      *ir,
1182                      awh.get(),
1183                      enforcedRotation,
1184                      imdSession,
1185                      pull_work,
1186                      step,
1187                      nrnb,
1188                      wcycle,
1189                      &top,
1190                      state->box,
1191                      state->x.arrayRefWithPadding(),
1192                      &state->hist,
1193                      &f.view(),
1194                      force_vir,
1195                      mdatoms,
1196                      enerd,
1197                      state->lambda,
1198                      fr,
1199                      runScheduleWork,
1200                      vsite,
1201                      mu_tot,
1202                      t,
1203                      ed ? ed->getLegacyED() : nullptr,
1204                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1205                      ddBalanceRegionHandler);
1206         }
1207
1208         // VV integrators do not need the following velocity half step
1209         // if it is the first step after starting from a checkpoint.
1210         // That is, the half step is needed on all other steps, and
1211         // also the first step when starting from a .tpr file.
1212         if (EI_VV(ir->eI))
1213         {
1214             integrateVVFirstStep(step,
1215                                  bFirstStep,
1216                                  bInitStep,
1217                                  startingBehavior,
1218                                  nstglobalcomm,
1219                                  ir,
1220                                  fr,
1221                                  cr,
1222                                  state,
1223                                  mdatoms,
1224                                  fcdata,
1225                                  &MassQ,
1226                                  &vcm,
1227                                  top_global,
1228                                  top,
1229                                  enerd,
1230                                  ekind,
1231                                  gstat,
1232                                  &last_ekin,
1233                                  bCalcVir,
1234                                  total_vir,
1235                                  shake_vir,
1236                                  force_vir,
1237                                  pres,
1238                                  M,
1239                                  do_log,
1240                                  do_ene,
1241                                  bCalcEner,
1242                                  bGStat,
1243                                  bStopCM,
1244                                  bTrotter,
1245                                  bExchanged,
1246                                  &bSumEkinhOld,
1247                                  &shouldCheckNumberOfBondedInteractions,
1248                                  &saved_conserved_quantity,
1249                                  &f,
1250                                  &upd,
1251                                  constr,
1252                                  &nullSignaller,
1253                                  trotter_seq,
1254                                  nrnb,
1255                                  mdlog,
1256                                  fplog,
1257                                  wcycle);
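                 // Virtual sites carry no independent degrees of freedom; when their
                 // velocities are needed this step, they are reconstructed below from
                 // the velocities of their constructing atoms.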
1258             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1259             {
1260                 // Positions were calculated earlier
1261                 wallcycle_start(wcycle, ewcVSITECONSTR);
1262                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1263                 wallcycle_stop(wcycle, ewcVSITECONSTR);
1264             }
1265         }
1266
1267         /* ########  END FIRST UPDATE STEP  ############## */
1268         /* ########  If doing VV, we now have v(dt) ###### */
1269         if (bDoExpanded)
1270         {
1271             /* perform extended ensemble sampling in lambda - we don't
1272                actually move to the new state before outputting
1273                statistics, but if performing simulated tempering, we
1274                do update the velocities and the tau_t. */
1275             // TODO: Avoid changing inputrec (#3854)
1276             // Simulated tempering updates the reference temperature.
1277             // Expanded ensemble without simulated tempering does not change the inputrec.
1278             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1279             lamnew                 = ExpandedEnsembleDynamics(fplog,
1280                                               nonConstInputrec,
1281                                               enerd,
1282                                               state,
1283                                               &MassQ,
1284                                               state->fep_state,
1285                                               state->dfhist,
1286                                               step,
1287                                               state->v.rvec_array(),
1288                                               mdatoms);
1289             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1290             if (MASTER(cr))
1291             {
1292                 copy_df_history(state_global->dfhist, state->dfhist);
1293             }
1294         }
1295
1296         // Copy coordinates from the GPU for the output/checkpointing if the update is offloaded and
1297         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1298         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1299             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1300                 || checkpointHandler->isCheckpointingStep()))
1301         {
1302             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1303             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1304         }
1305         // Copy velocities if needed for the output/checkpointing.
1306         // NOTE: Copy on the search steps is done at the beginning of the step.
1307         if (useGpuForUpdate && !bNS
1308             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1309         {
1310             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1311             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1312         }
1313         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1314         // and the update is offloaded, in which case the forces are kept on the GPU for the update and
1315         // have not already been transferred in do_force().
1316         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1317         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1318         //       prior to GPU update.
1319         // TODO: When the output flags are included in the step workload, this copy can be combined with
1320         //       the copy call in do_force(...).
1321         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1322         //       on host after the D2H copy in do_force(...).
1323         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1324             && do_per_step(step, ir->nstfout))
1325         {
1326             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1327             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1328         }
1329         /* Now we have the energies and forces corresponding to the
1330          * coordinates at time t. We must output all of this before
1331          * the update.
1332          */
1333         do_md_trajectory_writing(fplog,
1334                                  cr,
1335                                  nfile,
1336                                  fnm,
1337                                  step,
1338                                  step_rel,
1339                                  t,
1340                                  ir,
1341                                  state,
1342                                  state_global,
1343                                  observablesHistory,
1344                                  top_global,
1345                                  fr,
1346                                  outf,
1347                                  energyOutput,
1348                                  ekind,
1349                                  f.view().force(),
1350                                  checkpointHandler->isCheckpointingStep(),
1351                                  bRerunMD,
1352                                  bLastStep,
1353                                  mdrunOptions.writeConfout,
1354                                  bSumEkinhOld);
1355         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1356         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1357
1358         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1359         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1360             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1361         {
1362             copy_mat(state->svir_prev, shake_vir);
1363             copy_mat(state->fvir_prev, force_vir);
1364         }
1365
1366         stopHandler->setSignal();
1367         resetHandler->setSignal(walltime_accounting);
1368
1369         if (bGStat || !PAR(cr))
1370         {
1371             /* In parallel we only have to check for checkpointing in steps
1372              * where we do global communication,
1373              * otherwise the other nodes don't know.
1374              */
1375             checkpointHandler->setSignal(walltime_accounting);
1376         }
1377
1378         /* #########   START SECOND UPDATE STEP ################# */
1379
1380         /* At the start of the step, randomize or scale the velocities (if VV). Restrictions on
1381            Andersen coupling are checked during preprocessing. */
1382
1383         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1384         {
1385             gmx_bool bIfRandomize;
1386             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1387             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1388             if (constr && bIfRandomize)
1389             {
1390                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1391             }
1392         }
1393         /* Box is changed in update() when we do pressure coupling,
1394          * but we should still use the old box for energy corrections and when
1395          * writing it to the energy file, so it matches the trajectory files for
1396          * the same timestep above. Make a copy in a separate array.
1397          */
1398         copy_mat(state->box, lastbox);
1399
1400         dvdl_constr = 0;
1401
1402         if (!useGpuForUpdate)
1403         {
1404             wallcycle_start(wcycle, ewcUPDATE);
1405         }
1406         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1407         if (bTrotter)
1408         {
1409             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1410             /* We can only do Berendsen coupling after we have summed
1411              * the kinetic energy or virial. Since this happens
1412              * in global_state after update, we should only do it at
1413              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1414              */
1415         }
1416         else
1417         {
1418             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1419             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1420         }
1421
1422         /* With leap-frog type integrators we compute the kinetic energy
1423          * at a whole time step as the average of the half-time step kinetic
1424          * energies of two subsequent steps. Therefore we need to compute the
1425          * half step kinetic energy also if we need energies at the next step.
1426          */
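             // That is, Ekin(t) ~= 0.5*(Ekin(t - dt/2) + Ekin(t + dt/2)), so the half-step
             // kinetic energy has to be computed one step before the energies are needed.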
1427         const bool needHalfStepKineticEnergy =
1428                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1429
1430         // Parrinello-Rahman requires the pressure to be available before the update to compute
1431         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1432         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1433                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
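             // E.g. with nstpcouple = 10 this is true on steps 1, 11, 21, ..., i.e. one step
             // after the steps 0, 10, 20, ... on which the pressure becomes available.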
1434
1435         if (EI_VV(ir->eI))
1436         {
1437             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1438
1439             integrateVVSecondStep(step,
1440                                   ir,
1441                                   fr,
1442                                   cr,
1443                                   state,
1444                                   mdatoms,
1445                                   fcdata,
1446                                   &MassQ,
1447                                   &vcm,
1448                                   pull_work,
1449                                   enerd,
1450                                   ekind,
1451                                   gstat,
1452                                   &dvdl_constr,
1453                                   bCalcVir,
1454                                   total_vir,
1455                                   shake_vir,
1456                                   force_vir,
1457                                   pres,
1458                                   M,
1459                                   lastbox,
1460                                   do_log,
1461                                   do_ene,
1462                                   bGStat,
1463                                   &bSumEkinhOld,
1464                                   &f,
1465                                   &cbuf,
1466                                   &upd,
1467                                   constr,
1468                                   &nullSignaller,
1469                                   trotter_seq,
1470                                   nrnb,
1471                                   wcycle);
1472         }
1473         else
1474         {
1475             if (useGpuForUpdate)
1476             {
1477
1478                 wallcycle_stop(wcycle, ewcUPDATE);
1479
1480                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1481                 {
1482                     integrator->set(stateGpu->getCoordinates(),
1483                                     stateGpu->getVelocities(),
1484                                     stateGpu->getForces(),
1485                                     top.idef,
1486                                     *mdatoms);
1487
1488                     // Copy data to the GPU after buffers might have been reinitialized
1489                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1490                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1491                 }
1492
1493                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1494                     && !thisRankHasDuty(cr, DUTY_PME))
1495                 {
1496                     // The PME forces were received on the host, so they have to be copied
1497                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1498                 }
1499                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1500                 {
1501                     // The buffer ops were not offloaded this step, so the forces are on the
1502                     // host and have to be copied
1503                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1504                 }
1505
1506                 const bool doTemperatureScaling =
1507                         (ir->etc != TemperatureCoupling::No
1508                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
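                     // Analogous to doParrinelloRahman above: this is true one step after
                     // the nsttcouple step, i.e. when step % nsttcouple == 1.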
1509
1510                 // This applies Leap-Frog, LINCS and SETTLE in succession
1511                 integrator->integrate(
1512                         stateGpu->getForcesReadyOnDeviceEvent(
1513                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1514                         ir->delta_t,
1515                         true,
1516                         bCalcVir,
1517                         shake_vir,
1518                         doTemperatureScaling,
1519                         ekind->tcstat,
1520                         doParrinelloRahman,
1521                         ir->nstpcouple * ir->delta_t,
1522                         M);
1523
1524                 // Copy velocities D2H after update if:
1525                 // - Globals are computed this step (includes the energy output steps).
1526                 // - The half-step kinetic energy (temperature) is needed for the next step.
1527                 if (bGStat || needHalfStepKineticEnergy)
1528                 {
1529                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1530                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1531                 }
1532             }
1533             else
1534             {
1535                 /* With multiple time stepping we need to do an additional normal
1536                  * update step to obtain the virial, as the actual MTS integration
1537                  * uses an acceleration where the slow forces are multiplied by mtsFactor.
1538                  * Using that acceleration would result in a virial where the slow
1539                  * force contribution is a factor mtsFactor too large.
1540                  */
1541                 if (fr->useMts && bCalcVir && constr != nullptr)
1542                 {
1543                     upd.update_for_constraint_virial(
1544                             *ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1545
1546                     constrain_coordinates(constr,
1547                                           do_log,
1548                                           do_ene,
1549                                           step,
1550                                           state,
1551                                           upd.xp()->arrayRefWithPadding(),
1552                                           &dvdl_constr,
1553                                           bCalcVir,
1554                                           shake_vir);
1555                 }
1556
1557                 ArrayRefWithPadding<const RVec> forceCombined =
1558                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1559                                 ? f.view().forceMtsCombinedWithPadding()
1560                                 : f.view().forceWithPadding();
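                     // On steps where the slow MTS level is integrated, the combined buffer
                     // (fast forces plus mtsFactor-weighted slow forces) drives the update;
                     // on all other steps only the fast forces are used.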
1561                 upd.update_coords(
1562                         *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
1563
1564                 wallcycle_stop(wcycle, ewcUPDATE);
1565
1566                 constrain_coordinates(constr,
1567                                       do_log,
1568                                       do_ene,
1569                                       step,
1570                                       state,
1571                                       upd.xp()->arrayRefWithPadding(),
1572                                       &dvdl_constr,
1573                                       bCalcVir && !fr->useMts,
1574                                       shake_vir);
1575
1576                 upd.update_sd_second_half(
1577                         *ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle, constr, do_log, do_ene);
1578                 upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1579             }
1580
1581             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1582             {
1583                 updatePrevStepPullCom(pull_work, state);
1584             }
1585
1586             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1587         }
1588
1589         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1590         /* With Leap-Frog we can skip compute_globals at
1591          * non-communication steps, but we need to calculate
1592          * the kinetic energy one step before communication.
1593          */
1594         {
1595             // Organize to do inter-simulation signalling on steps if
1596             // and when algorithms require it.
1597             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1598
1599             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1600             {
1601                 // Copy coordinates when needed to stop the CM motion.
1602                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1603                 {
1604                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1605                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1606                 }
1607                 // Since we're already communicating at this step, we
1608                 // can propagate intra-simulation signals. Note that
1609                 // check_nstglobalcomm is responsible for choosing
1610                 // the value of nstglobalcomm, which is one way that
1611                 // bGStat becomes true, so we can't get into a
1612                 // situation where e.g. checkpointing can't be
1613                 // signalled.
1614                 bool                doIntraSimSignal = true;
1615                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1616
1617                 compute_globals(
1618                         gstat,
1619                         cr,
1620                         ir,
1621                         fr,
1622                         ekind,
1623                         makeConstArrayRef(state->x),
1624                         makeConstArrayRef(state->v),
1625                         state->box,
1626                         mdatoms,
1627                         nrnb,
1628                         &vcm,
1629                         wcycle,
1630                         enerd,
1631                         force_vir,
1632                         shake_vir,
1633                         total_vir,
1634                         pres,
1635                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1636                                                                            : gmx::ArrayRef<real>{},
1637                         &signaller,
1638                         lastbox,
1639                         &totalNumberOfBondedInteractions,
1640                         &bSumEkinhOld,
1641                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1642                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1643                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1644                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1645                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1646                                                                          : 0));
1647                 checkNumberOfBondedInteractions(mdlog,
1648                                                 cr,
1649                                                 totalNumberOfBondedInteractions,
1650                                                 *top_global,
1651                                                 &top,
1652                                                 makeConstArrayRef(state->x),
1653                                                 state->box,
1654                                                 &shouldCheckNumberOfBondedInteractions);
1655                 if (!EI_VV(ir->eI) && bStopCM)
1656                 {
1657                     process_and_stopcm_grp(
1658                             fplog, &vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
1659                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1660
1661                     // TODO: The special case of removing CM motion should be dealt more gracefully
1662                     if (useGpuForUpdate)
1663                     {
1664                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1665                         // Here we block until the H2D copy completes because event sync with the
1666                         // force kernels that use the coordinates on the next steps is not implemented
1667                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1668                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1669                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1670                         if (vcm.mode != ComRemovalAlgorithm::No)
1671                         {
1672                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1673                         }
1674                     }
1675                 }
1676             }
1677         }
1678
1679         /* #############  END CALC EKIN AND PRESSURE ################# */
1680
1681         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1682            the virial that should probably be addressed eventually. state->veta has better properties,
1683            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1684            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1685
1686         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1687         {
1688             /* Sum up the foreign energy and dK/dl terms for md and sd.
1689                Currently done every step so that dH/dl is correct in the .edr */
1690             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1691         }
1692
1693         update_pcouple_after_coordinates(
1694                 fplog, step, ir, mdatoms, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1695
1696         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1697                                                   && do_per_step(step, inputrec->nstpcouple));
1698         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1699                                                  && do_per_step(step, inputrec->nstpcouple));
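             // With GPU update, apply the pressure-coupling scaling computed on the CPU to the
             // device copies: coordinates are scaled with mu and, for C-rescale, velocities
             // with the inverse of mu.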
1700         if (useGpuForUpdate
1701             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1702         {
1703             integrator->scaleCoordinates(pressureCouplingMu);
1704             if (doCRescalePressureCoupling)
1705             {
1706                 matrix pressureCouplingInvMu;
1707                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1708                 integrator->scaleVelocities(pressureCouplingInvMu);
1709             }
1710             integrator->setPbc(PbcType::Xyz, state->box);
1711         }
1712
1713         /* ################# END UPDATE STEP 2 ################# */
1714         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1715
1716         /* The coordinates (x) were unshifted in update */
1717         if (!bGStat)
1718         {
1719             /* We will not sum ekinh_old,
1720              * so signal that we still have to do it.
1721              */
1722             bSumEkinhOld = TRUE;
1723         }
1724
1725         if (bCalcEner)
1726         {
1727             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1728
1729             /* use the directly determined last velocity, not actually the averaged half steps */
1730             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1731             {
1732                 enerd->term[F_EKIN] = last_ekin;
1733             }
1734             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1735
1736             if (integratorHasConservedEnergyQuantity(ir))
1737             {
1738                 if (EI_VV(ir->eI))
1739                 {
1740                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1741                 }
1742                 else
1743                 {
1744                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1745                 }
1746             }
1747             /* #########  END PREPARING EDR OUTPUT  ###########  */
1748         }
1749
1750         /* Output stuff */
1751         if (MASTER(cr))
1752         {
1753             if (fplog && do_log && bDoExpanded)
1754             {
1755                 /* only needed if doing expanded ensemble */
1756                 PrintFreeEnergyInfoToFile(fplog,
1757                                           ir->fepvals.get(),
1758                                           ir->expandedvals.get(),
1759                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1760                                           state_global->dfhist,
1761                                           state->fep_state,
1762                                           ir->nstlog,
1763                                           step);
1764             }
1765             if (bCalcEner)
1766             {
1767                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1768                                                  bCalcEnerStep,
1769                                                  t,
1770                                                  mdatoms->tmass,
1771                                                  enerd,
1772                                                  ir->fepvals.get(),
1773                                                  ir->expandedvals.get(),
1774                                                  lastbox,
1775                                                  PTCouplingArrays{ state->boxv,
1776                                                                    state->nosehoover_xi,
1777                                                                    state->nosehoover_vxi,
1778                                                                    state->nhpres_xi,
1779                                                                    state->nhpres_vxi },
1780                                                  state->fep_state,
1781                                                  shake_vir,
1782                                                  force_vir,
1783                                                  total_vir,
1784                                                  pres,
1785                                                  ekind,
1786                                                  mu_tot,
1787                                                  constr);
1788             }
1789             else
1790             {
1791                 energyOutput.recordNonEnergyStep();
1792             }
1793
1794             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1795             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1796
1797             if (doSimulatedAnnealing)
1798             {
1799                 gmx::EnergyOutput::printAnnealingTemperatures(
1800                         do_log ? fplog : nullptr, groups, &(ir->opts));
1801             }
1802             if (do_log || do_ene || do_dr || do_or)
1803             {
1804                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1805                                                    do_ene,
1806                                                    do_dr,
1807                                                    do_or,
1808                                                    do_log ? fplog : nullptr,
1809                                                    step,
1810                                                    t,
1811                                                    fr->fcdata.get(),
1812                                                    awh.get());
1813             }
1814             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1815             {
1816                 const bool isInitialOutput = false;
1817                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1818             }
1819
1820             if (ir->bPull)
1821             {
1822                 pull_print_output(pull_work, step, t);
1823             }
1824
1825             if (do_per_step(step, ir->nstlog))
1826             {
1827                 if (fflush(fplog) != 0)
1828                 {
1829                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1830                 }
1831             }
1832         }
1833         if (bDoExpanded)
1834         {
1835             /* Have to do this part _after_ outputting the logfile and the edr file */
1836             /* Gets written into the state at the beginning of the next loop */
1837             state->fep_state = lamnew;
1838         }
1839         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1840         {
1841             state->fep_state = awh->fepLambdaState();
1842         }
1843         /* Print the remaining wall clock time for the run */
1844         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1845         {
1846             if (shellfc)
1847             {
1848                 fprintf(stderr, "\n");
1849             }
1850             print_time(stderr, walltime_accounting, step, ir, cr);
1851         }
1852
1853         /* Ion/water position swapping.
1854          * Not done in last step since trajectory writing happens before this call
1855          * in the MD loop and exchanges would be lost anyway. */
1856         bNeedRepartition = FALSE;
1857         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1858             && do_per_step(step, ir->swap->nstswap))
1859         {
1860             bNeedRepartition = do_swapcoords(cr,
1861                                              step,
1862                                              t,
1863                                              ir,
1864                                              swap,
1865                                              wcycle,
1866                                              as_rvec_array(state->x.data()),
1867                                              state->box,
1868                                              MASTER(cr) && mdrunOptions.verbose,
1869                                              bRerunMD);
1870
1871             if (bNeedRepartition && DOMAINDECOMP(cr))
1872             {
1873                 dd_collect_state(cr->dd, state, state_global);
1874             }
1875         }
1876
1877         /* Replica exchange */
1878         bExchanged = FALSE;
1879         if (bDoReplEx)
1880         {
1881             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1882         }
1883
1884         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1885         {
1886             dd_partition_system(fplog,
1887                                 mdlog,
1888                                 step,
1889                                 cr,
1890                                 TRUE,
1891                                 1,
1892                                 state_global,
1893                                 *top_global,
1894                                 *ir,
1895                                 imdSession,
1896                                 pull_work,
1897                                 state,
1898                                 &f,
1899                                 mdAtoms,
1900                                 &top,
1901                                 fr,
1902                                 vsite,
1903                                 constr,
1904                                 nrnb,
1905                                 wcycle,
1906                                 FALSE);
1907             shouldCheckNumberOfBondedInteractions = true;
1908             upd.setNumAtoms(state->natoms);
1909         }
1910
1911         bFirstStep = FALSE;
1912         bInitStep  = FALSE;
1913
1914         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1915         /* With all integrators, except VV, we need to retain the pressure
1916          * at the current step for coupling at the next step.
1917          */
1918         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1919             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1920         {
1921             /* Store the pressure in t_state for pressure coupling
1922              * at the next MD step.
1923              */
1924             copy_mat(pres, state->pres_prev);
1925         }
1926
1927         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1928
1929         if ((membed != nullptr) && (!bLastStep))
1930         {
1931             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1932         }
1933
1934         cycles = wallcycle_stop(wcycle, ewcSTEP);
1935         if (DOMAINDECOMP(cr) && wcycle)
1936         {
1937             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1938         }
1939
1940         /* increase the MD step number */
1941         step++;
1942         step_rel++;
1943
1944 #if GMX_FAHCORE
1945         if (MASTER(cr))
1946         {
1947             fcReportProgress(ir->nsteps + ir->init_step, step);
1948         }
1949 #endif
1950
1951         resetHandler->resetCounters(
1952                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1953
1954         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1955         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1956     }
1957     /* End of main MD loop */
1958
1959     /* Closing TNG files can include compressing data. Therefore it is good to do that
1960      * before stopping the time measurements. */
1961     mdoutf_tng_close(outf);
1962
1963     /* Stop measuring walltime */
1964     walltime_accounting_end_time(walltime_accounting);
1965
1966     if (!thisRankHasDuty(cr, DUTY_PME))
1967     {
1968         /* Tell the PME-only ranks to finish */
1969         gmx_pme_send_finish(cr);
1970     }
1971
1972     if (MASTER(cr))
1973     {
1974         if (ir->nstcalcenergy > 0)
1975         {
1976             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
1977
1978             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1979             energyOutput.printAverages(fplog, groups);
1980         }
1981     }
1982     done_mdoutf(outf);
1983
1984     if (bPMETune)
1985     {
1986         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1987     }
1988
1989     done_shellfc(fplog, shellfc, step_rel);
1990
1991     if (useReplicaExchange && MASTER(cr))
1992     {
1993         print_replica_exchange_statistics(fplog, repl_ex);
1994     }
1995
1996     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1997
1998     global_stat_destroy(gstat);
1999 }