src/gromacs/mdrun/md.cpp (alexxy/gromacs.git, commit ea5494534c9d0739200115c71420be08a644f726)
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/mdsetup.h"
65 #include "gromacs/domdec/partition.h"
66 #include "gromacs/essentialdynamics/edsam.h"
67 #include "gromacs/ewald/pme_load_balancing.h"
68 #include "gromacs/ewald/pme_pp.h"
69 #include "gromacs/fileio/trxio.h"
70 #include "gromacs/gmxlib/network.h"
71 #include "gromacs/gmxlib/nrnb.h"
72 #include "gromacs/gpu_utils/device_stream_manager.h"
73 #include "gromacs/gpu_utils/gpu_utils.h"
74 #include "gromacs/imd/imd.h"
75 #include "gromacs/listed_forces/listed_forces.h"
76 #include "gromacs/math/functions.h"
77 #include "gromacs/math/invertmatrix.h"
78 #include "gromacs/math/vec.h"
79 #include "gromacs/math/vectypes.h"
80 #include "gromacs/mdlib/checkpointhandler.h"
81 #include "gromacs/mdlib/compute_io.h"
82 #include "gromacs/mdlib/constr.h"
83 #include "gromacs/mdlib/coupling.h"
84 #include "gromacs/mdlib/ebin.h"
85 #include "gromacs/mdlib/enerdata_utils.h"
86 #include "gromacs/mdlib/energyoutput.h"
87 #include "gromacs/mdlib/expanded.h"
88 #include "gromacs/mdlib/force.h"
89 #include "gromacs/mdlib/force_flags.h"
90 #include "gromacs/mdlib/forcerec.h"
91 #include "gromacs/mdlib/freeenergyparameters.h"
92 #include "gromacs/mdlib/md_support.h"
93 #include "gromacs/mdlib/mdatoms.h"
94 #include "gromacs/mdlib/mdoutf.h"
95 #include "gromacs/mdlib/membed.h"
96 #include "gromacs/mdlib/resethandler.h"
97 #include "gromacs/mdlib/sighandler.h"
98 #include "gromacs/mdlib/simulationsignal.h"
99 #include "gromacs/mdlib/stat.h"
100 #include "gromacs/mdlib/stophandler.h"
101 #include "gromacs/mdlib/tgroup.h"
102 #include "gromacs/mdlib/trajectory_writing.h"
103 #include "gromacs/mdlib/update.h"
104 #include "gromacs/mdlib/update_constrain_gpu.h"
105 #include "gromacs/mdlib/update_vv.h"
106 #include "gromacs/mdlib/vcm.h"
107 #include "gromacs/mdlib/vsite.h"
108 #include "gromacs/mdrunutility/handlerestart.h"
109 #include "gromacs/mdrunutility/multisim.h"
110 #include "gromacs/mdrunutility/printtime.h"
111 #include "gromacs/mdtypes/awh_history.h"
112 #include "gromacs/mdtypes/awh_params.h"
113 #include "gromacs/mdtypes/commrec.h"
114 #include "gromacs/mdtypes/df_history.h"
115 #include "gromacs/mdtypes/energyhistory.h"
116 #include "gromacs/mdtypes/fcdata.h"
117 #include "gromacs/mdtypes/forcebuffers.h"
118 #include "gromacs/mdtypes/forcerec.h"
119 #include "gromacs/mdtypes/group.h"
120 #include "gromacs/mdtypes/inputrec.h"
121 #include "gromacs/mdtypes/interaction_const.h"
122 #include "gromacs/mdtypes/md_enums.h"
123 #include "gromacs/mdtypes/mdatom.h"
124 #include "gromacs/mdtypes/mdrunoptions.h"
125 #include "gromacs/mdtypes/multipletimestepping.h"
126 #include "gromacs/mdtypes/observableshistory.h"
127 #include "gromacs/mdtypes/pullhistory.h"
128 #include "gromacs/mdtypes/simulation_workload.h"
129 #include "gromacs/mdtypes/state.h"
130 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
131 #include "gromacs/modularsimulator/energydata.h"
132 #include "gromacs/nbnxm/gpu_data_mgmt.h"
133 #include "gromacs/nbnxm/nbnxm.h"
134 #include "gromacs/pbcutil/pbc.h"
135 #include "gromacs/pulling/output.h"
136 #include "gromacs/pulling/pull.h"
137 #include "gromacs/swap/swapcoords.h"
138 #include "gromacs/timing/wallcycle.h"
139 #include "gromacs/timing/walltime_accounting.h"
140 #include "gromacs/topology/atoms.h"
141 #include "gromacs/topology/idef.h"
142 #include "gromacs/topology/mtop_util.h"
143 #include "gromacs/topology/topology.h"
144 #include "gromacs/trajectory/trajectoryframe.h"
145 #include "gromacs/utility/basedefinitions.h"
146 #include "gromacs/utility/cstringutil.h"
147 #include "gromacs/utility/fatalerror.h"
148 #include "gromacs/utility/logger.h"
149 #include "gromacs/utility/real.h"
150 #include "gromacs/utility/smalloc.h"
151
152 #include "legacysimulator.h"
153 #include "replicaexchange.h"
154 #include "shellfc.h"
155
156 using gmx::SimulationSignaller;
157
158 void gmx::LegacySimulator::do_md()
159 {
160     // TODO Historically, the EM and MD "integrators" used different
161     // names for the t_inputrec *parameter, but these must have the
162     // same name, now that it's a member of a struct. We use this ir
163     // alias to avoid a large ripple of nearly useless changes.
164     // t_inputrec is being replaced by IMdpOptionsProvider, so this
165     // will go away eventually.
166     const t_inputrec* ir = inputrec;
167
168     int64_t      step, step_rel;
169     double       t, t0 = ir->init_t;
170     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
171     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
172     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
173     gmx_bool     do_ene, do_log, do_verbose;
174     gmx_bool     bMasterState;
175     unsigned int force_flags;
176     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
177     int    i, m;
178     rvec   mu_tot;
179     matrix pressureCouplingMu, M;
180     gmx_repl_ex_t     repl_ex = nullptr;
181     gmx_global_stat_t gstat;
182     gmx_shellfc_t*    shellfc;
183     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
184     gmx_bool          bTrotter;
185     real              dvdl_constr;
186     std::vector<RVec> cbuf;
187     matrix            lastbox;
188     int               lamnew = 0;
189     /* for FEP */
190     int       nstfep = 0;
191     double    cycles;
192     real      saved_conserved_quantity = 0;
193     real      last_ekin                = 0;
194     t_extmass MassQ;
195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
196
197     /* PME load balancing data for GPU kernels */
198     gmx_bool bPMETune         = FALSE;
199     gmx_bool bPMETunePrinting = FALSE;
200
201     bool bInteractiveMDstep = false;
202
203     SimulationSignals signals;
204     // Most global communication stages don't propagate mdrun
205     // signals, and will use this object to achieve that.
206     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
207
208     if (!mdrunOptions.writeConfout)
209     {
210         // This is on by default, and the main known use case for
211         // turning it off is for convenience in benchmarking, which is
212         // something that should not show up in the general user
213         // interface.
214         GMX_LOG(mdlog.info)
215                 .asParagraph()
216                 .appendText(
217                         "The -noconfout functionality is deprecated, and may be removed in a "
218                         "future version.");
219     }
220
221     /* md-vv uses averaged full step velocities for T-control
222        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
223        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
224     bTrotter = (EI_VV(ir->eI)
225                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
226
227     const bool bRerunMD = false;
228
229     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
230     bGStatEveryStep   = (nstglobalcomm == 1);
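    // nstglobalcomm is the period (in steps) between global communication/reduction steps;
    // with a period of 1, global statistics are computed every step.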
231
232     const SimulationGroups* groups = &top_global.groups;
233
234     std::unique_ptr<EssentialDynamics> ed = nullptr;
235     if (opt2bSet("-ei", nfile, fnm))
236     {
237         /* Initialize essential dynamics sampling */
238         ed = init_edsam(mdlog,
239                         opt2fn_null("-ei", nfile, fnm),
240                         opt2fn("-eo", nfile, fnm),
241                         top_global,
242                         *ir,
243                         cr,
244                         constr,
245                         state_global,
246                         observablesHistory,
247                         oenv,
248                         startingBehavior);
249     }
250     else if (observablesHistory->edsamHistory)
251     {
252         gmx_fatal(FARGS,
253                   "The checkpoint is from a run with essential dynamics sampling, "
254                   "but the current run did not specify the -ei option. "
255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
256     }
257
258     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
259     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
260     initialize_lambdas(fplog,
261                        ir->efep,
262                        ir->bSimTemp,
263                        *ir->fepvals,
264                        ir->simtempvals->temperatures,
265                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
266                        MASTER(cr),
267                        fep_state,
268                        lambda);
269     Update upd(*ir, deform);
270     bool   doSimulatedAnnealing = false;
271     {
272         // TODO: Avoid changing inputrec (#3854)
273         // Simulated annealing updates the reference temperature.
274         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
275         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
276     }
277     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
278
279     const t_fcdata& fcdata = *fr->fcdata;
280
281     bool simulationsShareState = false;
282     int  nstSignalComm         = nstglobalcomm;
283     {
284         // TODO This implementation of ensemble orientation restraints is nasty because
285         // a user can't just do multi-sim with single-sim orientation restraints.
286         bool usingEnsembleRestraints =
287                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
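            // This rounds the minimum signalling interval up to the next multiple of
            // nstglobalcomm, e.g. a period of 15 steps turns 200 into 210.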
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t top(top_global.ffparams);
365
366     ForceBuffers     f(fr->useMts,
367                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
368                            ? PinningPolicy::PinnedIfSupported
369                            : PinningPolicy::CannotBePinned);
370     const t_mdatoms* md = mdAtoms->mdatoms();
371     if (DOMAINDECOMP(cr))
372     {
373         stateInstance = std::make_unique<t_state>();
374         state         = stateInstance.get();
375         dd_init_local_state(*cr->dd, state_global, state);
376
377         /* Distribute the atoms over the ranks from the master rank */
378         dd_partition_system(fplog,
379                             mdlog,
380                             ir->init_step,
381                             cr,
382                             TRUE,
383                             1,
384                             state_global,
385                             top_global,
386                             *ir,
387                             imdSession,
388                             pull_work,
389                             state,
390                             &f,
391                             mdAtoms,
392                             &top,
393                             fr,
394                             vsite,
395                             constr,
396                             nrnb,
397                             nullptr,
398                             FALSE);
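        // Inform the update object of the new local atom count and the per-atom
        // freeze-group and temperature-coupling-group assignments (empty refs when unused).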
399         upd.updateAfterPartition(state->natoms,
400                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
401                                              : gmx::ArrayRef<const unsigned short>(),
402                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
403                                          : gmx::ArrayRef<const unsigned short>());
404     }
405     else
406     {
407         state_change_natoms(state_global, state_global->natoms);
408         /* Copy the pointer to the global state */
409         state = state_global;
410
411         /* Generate and initialize new topology */
412         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
413
414         upd.updateAfterPartition(state->natoms,
415                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
416                                              : gmx::ArrayRef<const unsigned short>(),
417                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
418                                          : gmx::ArrayRef<const unsigned short>());
419     }
420
421     std::unique_ptr<UpdateConstrainGpu> integrator;
422
423     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
424
425     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
426     if (useGpuForUpdate)
427     {
428         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
429                                    || constr->numConstraintsTotal() == 0,
430                            "Constraints in domain decomposition are only supported with update "
431                            "groups if using GPU update.\n");
432         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
433                                    || constr->numConstraintsTotal() == 0,
434                            "SHAKE is not supported with GPU update.");
435         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
436                            "Either PME or short-ranged non-bonded interaction tasks must run on "
437                            "the GPU to use GPU update.\n");
438         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
439                            "Only the md integrator is supported with the GPU update.\n");
440         GMX_RELEASE_ASSERT(
441                 ir->etc != TemperatureCoupling::NoseHoover,
442                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
443         GMX_RELEASE_ASSERT(
444                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
445                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
446                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
447                 "with the GPU update.\n");
448         GMX_RELEASE_ASSERT(!md->haveVsites,
449                            "Virtual sites are not supported with the GPU update.\n");
450         GMX_RELEASE_ASSERT(ed == nullptr,
451                            "Essential dynamics is not supported with the GPU update.\n");
452         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
453                            "Constraint pulling is not supported with the GPU update.\n");
454         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
455                            "Orientation restraints are not supported with the GPU update.\n");
456         GMX_RELEASE_ASSERT(
457                 ir->efep == FreeEnergyPerturbationType::No
458                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
459                 "Free energy perturbation of masses and constraints is not supported with the GPU "
460                 "update.");
461
462         if (constr != nullptr && constr->numConstraintsTotal() > 0)
463         {
464             GMX_LOG(mdlog.info)
465                     .asParagraph()
466                     .appendText("Updating coordinates and applying constraints on the GPU.");
467         }
468         else
469         {
470             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
471         }
472         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
473                            "Device stream manager should be initialized in order to use GPU "
474                            "update-constraints.");
475         GMX_RELEASE_ASSERT(
476                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
477                 "Update stream should be initialized in order to use GPU "
478                 "update-constraints.");
479         integrator = std::make_unique<UpdateConstrainGpu>(
480                 *ir,
481                 top_global,
482                 ekind->ngtc,
483                 fr->deviceStreamManager->context(),
484                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
485                 stateGpu->xUpdatedOnDevice(),
486                 wcycle);
487
488         integrator->setPbc(PbcType::Xyz, state->box);
489     }
490
491     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
492     {
493         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
494     }
495     if (useGpuForUpdate)
496     {
497         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
498     }
499
500     // NOTE: The global state is no longer used directly at this point.
501     // state_global now only serves as temporary storage space for writing
502     // the global state to file and potentially for replica exchange.
503     // (Global topology should persist.)
504
505     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
506
507     if (ir->bExpanded)
508     {
509         /* Check nstexpanded here, because the grompp check was broken */
510         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
511         {
512             gmx_fatal(FARGS,
513                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
514         }
515         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
516     }
517
518     if (MASTER(cr))
519     {
520         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
521     }
522
523     preparePrevStepPullCom(ir,
524                            pull_work,
525                            gmx::arrayRefFromArray(md->massT, md->nr),
526                            state,
527                            state_global,
528                            cr,
529                            startingBehavior != StartingBehavior::NewSimulation);
530
531     // TODO: Remove this by converting AWH into a ForceProvider
532     auto awh = prepareAwhModule(fplog,
533                                 *ir,
534                                 state_global,
535                                 cr,
536                                 ms,
537                                 startingBehavior != StartingBehavior::NewSimulation,
538                                 shellfc != nullptr,
539                                 opt2fn("-awh", nfile, fnm),
540                                 pull_work);
541
542     if (useReplicaExchange && MASTER(cr))
543     {
544         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
545     }
546     /* PME tuning is only supported in the Verlet scheme, with PME for
547      * Coulomb. It is not supported with only LJ PME. */
548     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
549                 && ir->cutoff_scheme != CutoffScheme::Group);
550
551     pme_load_balancing_t* pme_loadbal = nullptr;
552     if (bPMETune)
553     {
554         pme_loadbal_init(
555                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
556     }
557
558     if (!ir->bContinuation)
559     {
560         if (state->flags & enumValueToBitMask(StateEntry::V))
561         {
562             auto v = makeArrayRef(state->v);
563             /* Set the velocities of vsites, shells and frozen atoms to zero */
564             for (i = 0; i < md->homenr; i++)
565             {
566                 if (md->ptype[i] == ParticleType::Shell)
567                 {
568                     clear_rvec(v[i]);
569                 }
570                 else if (md->cFREEZE)
571                 {
572                     for (m = 0; m < DIM; m++)
573                     {
574                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
575                         {
576                             v[i][m] = 0;
577                         }
578                     }
579                 }
580             }
581         }
582
583         if (constr)
584         {
585             /* Constrain the initial coordinates and velocities */
586             do_constrain_first(fplog,
587                                constr,
588                                ir,
589                                md->nr,
590                                md->homenr,
591                                state->x.arrayRefWithPadding(),
592                                state->v.arrayRefWithPadding(),
593                                state->box,
594                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
595         }
596     }
597
598     if (ir->efep != FreeEnergyPerturbationType::No)
599     {
600         /* Set the free energy calculation frequency as the greatest common
601          * divisor of nstdhdl, nstexpanded, the AWH sampling interval and repl_ex_nst. */
602         nstfep = ir->fepvals->nstdhdl;
603         if (ir->bExpanded)
604         {
605             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
606         }
607         if (useReplicaExchange)
608         {
609             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
610         }
611         if (ir->bDoAwh)
612         {
613             nstfep = std::gcd(ir->awhParams->nstSampleCoord(), nstfep);
614         }
615     }
616
617     /* Be REALLY careful about what flags you set here. You CANNOT assume
618      * this is the first step, since we might be restarting from a checkpoint,
619      * and in that case we should not do any modifications to the state.
620      */
621     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
622
623     // When restarting from a checkpoint, it can be appropriate to
624     // initialize ekind from quantities in the checkpoint. Otherwise,
625     // compute_globals must initialize ekind before the simulation
626     // starts/restarts. However, only the master rank knows what was
627     // found in the checkpoint file, so we have to communicate in
628     // order to coordinate the restart.
629     //
630     // TODO Consider removing this communication if/when checkpoint
631     // reading directly follows .tpr reading, because all ranks can
632     // agree on hasReadEkinState at that time.
633     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
634     if (PAR(cr))
635     {
636         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
637     }
638     if (hasReadEkinState)
639     {
640         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
641     }
642
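    // The initial compute_globals call(s) request global statistics and the temperature;
    // with velocity-Verlet integrators also the pressure and constraint virial, and the
    // kinetic energy is taken from the checkpoint when it was read (CGLO_READEKIN).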
643     unsigned int cglo_flags =
644             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
645              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
646
647     bSumEkinhOld = FALSE;
648
649     t_vcm vcm(top_global.groups, *ir);
650     reportComRemovalInfo(fplog, vcm);
651
652     /* To minimize communication, compute_globals computes the COM velocity
653      * and the kinetic energy for the velocities without COM motion removed.
654      * Thus to get the kinetic energy without the COM contribution, we need
655      * to call compute_globals twice.
656      */
657     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
658     {
659         unsigned int cglo_flags_iteration = cglo_flags;
660         if (bStopCM && cgloIteration == 0)
661         {
662             cglo_flags_iteration |= CGLO_STOPCM;
663             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
664         }
665         if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd) && cgloIteration == 0)
666         {
667             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
668         }
669         compute_globals(gstat,
670                         cr,
671                         ir,
672                         fr,
673                         ekind,
674                         makeConstArrayRef(state->x),
675                         makeConstArrayRef(state->v),
676                         state->box,
677                         md,
678                         nrnb,
679                         &vcm,
680                         nullptr,
681                         enerd,
682                         force_vir,
683                         shake_vir,
684                         total_vir,
685                         pres,
686                         gmx::ArrayRef<real>{},
687                         &nullSignaller,
688                         state->box,
689                         &bSumEkinhOld,
690                         cglo_flags_iteration);
691         if (cglo_flags_iteration & CGLO_STOPCM)
692         {
693             /* At initialization, do not pass x with acceleration-correction mode
694              * to avoid (incorrect) correction of the initial coordinates.
695              */
696             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
697                              ? ArrayRef<RVec>()
698                              : makeArrayRef(state->x);
699             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
700             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
701         }
702     }
703     if (DOMAINDECOMP(cr))
704     {
705         checkNumberOfBondedInteractions(
706                 mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
707     }
708     if (ir->eI == IntegrationAlgorithm::VVAK)
709     {
710         /* a second call to get the half step temperature initialized as well */
711         /* we do the same call as above, but turn the pressure off -- internally to
712            compute_globals, this is recognized as a velocity verlet half-step
713            kinetic energy calculation.  This minimized excess variables, but
714            perhaps loses some logic?*/
715
716         compute_globals(gstat,
717                         cr,
718                         ir,
719                         fr,
720                         ekind,
721                         makeConstArrayRef(state->x),
722                         makeConstArrayRef(state->v),
723                         state->box,
724                         md,
725                         nrnb,
726                         &vcm,
727                         nullptr,
728                         enerd,
729                         force_vir,
730                         shake_vir,
731                         total_vir,
732                         pres,
733                         gmx::ArrayRef<real>{},
734                         &nullSignaller,
735                         state->box,
736                         &bSumEkinhOld,
737                         cglo_flags & ~CGLO_PRESSURE);
738     }
739
740     /* Calculate the initial half step temperature, and save the ekinh_old */
741     if (startingBehavior == StartingBehavior::NewSimulation)
742     {
743         for (i = 0; (i < ir->opts.ngtc); i++)
744         {
745             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
746         }
747     }
748
749     /* We need to make an initialization call to get the Trotter variables set, as well as other
750        constants for non-Trotter temperature control */
751     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
752
753     if (MASTER(cr))
754     {
755         if (!ir->bContinuation)
756         {
757             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
758             {
759                 fprintf(fplog,
760                         "RMS relative constraint deviation after constraining: %.2e\n",
761                         constr->rmsd());
762             }
763             if (EI_STATE_VELOCITY(ir->eI))
764             {
765                 real temp = enerd->term[F_TEMP];
766                 if (ir->eI != IntegrationAlgorithm::VV)
767                 {
768                     /* Result of Ekin averaged over velocities of -half
769                      * and +half step, while we only have -half step here.
770                      */
771                     temp *= 2;
772                 }
773                 fprintf(fplog, "Initial temperature: %g K\n", temp);
774             }
775         }
776
777         char tbuf[20];
778         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
779         if (ir->nsteps >= 0)
780         {
781             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
782         }
783         else
784         {
785             sprintf(tbuf, "%s", "infinite");
786         }
787         if (ir->init_step > 0)
788         {
789             fprintf(stderr,
790                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
791                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
792                     tbuf,
793                     gmx_step_str(ir->init_step, sbuf2),
794                     ir->init_step * ir->delta_t);
795         }
796         else
797         {
798             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
799         }
800         fprintf(fplog, "\n");
801     }
802
803     walltime_accounting_start_time(walltime_accounting);
804     wallcycle_start(wcycle, WallCycleCounter::Run);
805     print_start(fplog, cr, walltime_accounting, "mdrun");
806
807     /***********************************************************
808      *
809      *             Loop over MD steps
810      *
811      ************************************************************/
812
813     bFirstStep = TRUE;
814     /* Skip the first Nose-Hoover integration when we get the state from tpx */
815     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
816     bSumEkinhOld     = FALSE;
817     bExchanged       = FALSE;
818     bNeedRepartition = FALSE;
819
820     step     = ir->init_step;
821     step_rel = 0;
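    // step is the absolute step number (continuing from init_step);
    // step_rel counts steps performed in this run.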
822
823     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
824             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
825             simulationsShareState,
826             MASTER(cr),
827             ir->nstlist,
828             mdrunOptions.reproducible,
829             nstSignalComm,
830             mdrunOptions.maximumHoursToRun,
831             ir->nstlist == 0,
832             fplog,
833             step,
834             bNS,
835             walltime_accounting);
836
837     auto checkpointHandler = std::make_unique<CheckpointHandler>(
838             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
839             simulationsShareState,
840             ir->nstlist == 0,
841             MASTER(cr),
842             mdrunOptions.writeConfout,
843             mdrunOptions.checkpointOptions.period);
844
845     const bool resetCountersIsLocal = true;
846     auto       resetHandler         = std::make_unique<ResetHandler>(
847             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
848             !resetCountersIsLocal,
849             ir->nsteps,
850             MASTER(cr),
851             mdrunOptions.timingOptions.resetHalfway,
852             mdrunOptions.maximumHoursToRun,
853             mdlog,
854             wcycle,
855             walltime_accounting);
856
857     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
858
859     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
860     {
861         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
862     }
863
864     /* and stop now if we should */
865     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
866     while (!bLastStep)
867     {
868
869         /* Determine if this is a neighbor search step */
870         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
871
872         if (bPMETune && bNStList)
873         {
874             // This has to be here because PME load balancing is called so early.
875             // TODO: Move to after all booleans are defined.
876             if (useGpuForUpdate && !bFirstStep)
877             {
878                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
879                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
880             }
881             /* PME grid + cut-off optimization with GPUs or PME nodes */
882             pme_loadbal_do(pme_loadbal,
883                            cr,
884                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
885                            fplog,
886                            mdlog,
887                            *ir,
888                            fr,
889                            state->box,
890                            state->x,
891                            wcycle,
892                            step,
893                            step_rel,
894                            &bPMETunePrinting,
895                            simulationWork.useGpuPmePpCommunication);
896         }
897
898         wallcycle_start(wcycle, WallCycleCounter::Step);
899
900         bLastStep = (step_rel == ir->nsteps);
901         t         = t0 + step * ir->delta_t;
902
903         // TODO Refactor this, so that nstfep does not need a default value of zero
904         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
905         {
906             /* find and set the current lambdas */
907             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
908
909             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
910             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
911             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
912                            && (!bFirstStep));
913         }
914
915         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
916                      && do_per_step(step, replExParams.exchangeInterval));
917
918         if (doSimulatedAnnealing)
919         {
920             // TODO: Avoid changing inputrec (#3854)
921             // Simulated annealing updates the reference temperature.
922             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
923             update_annealing_target_temp(nonConstInputrec, t, &upd);
924         }
925
926         /* Stop Center of Mass motion */
927         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
928
929         /* Determine whether or not to do Neighbour Searching */
930         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
931
932         /* Note that the stopHandler will cause termination at nstglobalcomm
933          * steps. Since this coincides with nstcalcenergy, nsttcouple and/or
934          * nstpcouple steps, we have computed the half-step kinetic energy
935          * of the previous step and can always output energies at the last step.
936          */
937         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
938
939         /* do_log triggers energy and virial calculation. Because this leads
940          * to different code paths, forces can be different. Thus for exact
941          * continuation we should avoid extra log output.
942          * Note that the || bLastStep can result in non-exact continuation
943          * beyond the last step. But we don't consider that to be an issue.
944          */
945         do_log     = (do_per_step(step, ir->nstlog)
946                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
947         do_verbose = mdrunOptions.verbose
948                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
949
950         if (useGpuForUpdate && !bFirstStep && bNS)
951         {
952             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
953             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
954             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
955             // Copy coordinates from the GPU when needed at the search step.
956             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
957             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
958             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
959             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
960         }
961
962         // We only need to calculate virtual site velocities if we are writing them in the current step
963         const bool needVirtualVelocitiesThisStep =
964                 (vsite != nullptr)
965                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
966
967         if (vsite != nullptr)
968         {
969             // Virtual sites need to be updated before domain decomposition and forces are calculated
970             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
971             // md-vv calculates virtual site velocities once it has full-step real velocities
972             vsite->construct(state->x,
973                              state->v,
974                              state->box,
975                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
976                                      ? VSiteOperation::PositionsAndVelocities
977                                      : VSiteOperation::Positions);
978             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
979         }
980
981         if (bNS && !(bFirstStep && ir->bContinuation))
982         {
983             bMasterState = FALSE;
984             /* Correct the new box if it is too skewed */
985             if (inputrecDynamicBox(ir))
986             {
987                 if (correct_box(fplog, step, state->box))
988                 {
989                     bMasterState = TRUE;
990                     // If update is offloaded, it should be informed about the box size change
991                     if (useGpuForUpdate)
992                     {
993                         integrator->setPbc(PbcType::Xyz, state->box);
994                     }
995                 }
996             }
997             if (DOMAINDECOMP(cr) && bMasterState)
998             {
999                 dd_collect_state(cr->dd, state, state_global);
1000             }
1001
1002             if (DOMAINDECOMP(cr))
1003             {
1004                 /* Repartition the domain decomposition */
1005                 dd_partition_system(fplog,
1006                                     mdlog,
1007                                     step,
1008                                     cr,
1009                                     bMasterState,
1010                                     nstglobalcomm,
1011                                     state_global,
1012                                     top_global,
1013                                     *ir,
1014                                     imdSession,
1015                                     pull_work,
1016                                     state,
1017                                     &f,
1018                                     mdAtoms,
1019                                     &top,
1020                                     fr,
1021                                     vsite,
1022                                     constr,
1023                                     nrnb,
1024                                     wcycle,
1025                                     do_verbose && !bPMETunePrinting);
1026                 upd.updateAfterPartition(state->natoms,
1027                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1028                                                      : gmx::ArrayRef<const unsigned short>(),
1029                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1030                                                  : gmx::ArrayRef<const unsigned short>());
1031             }
1032         }
1033
1034         // Allocate or re-size GPU halo exchange object, if necessary
1035         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1036         {
1037             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1038                                "GPU device manager has to be initialized to use GPU "
1039                                "version of halo exchange.");
1040             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1041         }
1042
1043         if (MASTER(cr) && do_log)
1044         {
1045             gmx::EnergyOutput::printHeader(
1046                     fplog, step, t); /* can we improve the information printed here? */
1047         }
1048
1049         if (ir->efep != FreeEnergyPerturbationType::No)
1050         {
1051             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1052         }
1053
1054         if (bExchanged)
1055         {
1056             /* We need the kinetic energy at minus the half step for determining
1057              * the full step kinetic energy and possibly for T-coupling.*/
1058             /* This may not be quite working correctly yet . . . . */
1059             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1060             if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd))
1061             {
1062                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1063             }
1064             compute_globals(gstat,
1065                             cr,
1066                             ir,
1067                             fr,
1068                             ekind,
1069                             makeConstArrayRef(state->x),
1070                             makeConstArrayRef(state->v),
1071                             state->box,
1072                             md,
1073                             nrnb,
1074                             &vcm,
1075                             wcycle,
1076                             enerd,
1077                             nullptr,
1078                             nullptr,
1079                             nullptr,
1080                             nullptr,
1081                             gmx::ArrayRef<real>{},
1082                             &nullSignaller,
1083                             state->box,
1084                             &bSumEkinhOld,
1085                             cglo_flags);
1086             if (DOMAINDECOMP(cr))
1087             {
1088                 checkNumberOfBondedInteractions(
1089                         mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1090             }
1091         }
1092         clear_mat(force_vir);
1093
1094         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1095
1096         /* Determine the energy and pressure:
1097          * at nstcalcenergy steps and at energy output steps (set below).
1098          */
1099         if (EI_VV(ir->eI) && (!bInitStep))
1100         {
1101             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1102             bCalcVir      = bCalcEnerStep
1103                        || (ir->epc != PressureCoupling::No
1104                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1105         }
1106         else
1107         {
1108             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1109             bCalcVir      = bCalcEnerStep
1110                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1111         }
1112         bCalcEner = bCalcEnerStep;
1113
1114         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1115
1116         if (do_ene || do_log || bDoReplEx)
1117         {
1118             bCalcVir  = TRUE;
1119             bCalcEner = TRUE;
1120         }
1121
1122         /* Do we need global communication ? */
1123         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1124                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1125
1126         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1127                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1128                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
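        // With multiple time stepping, the plain (non-combined) forces are only needed
        // on steps where forces are written to the trajectory file.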
1129         if (fr->useMts && !do_per_step(step, ir->nstfout))
1130         {
1131             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1132         }
1133
1134         if (shellfc)
1135         {
1136             /* Now is the time to relax the shells */
1137             relax_shell_flexcon(fplog,
1138                                 cr,
1139                                 ms,
1140                                 mdrunOptions.verbose,
1141                                 enforcedRotation,
1142                                 step,
1143                                 ir,
1144                                 imdSession,
1145                                 pull_work,
1146                                 bNS,
1147                                 force_flags,
1148                                 &top,
1149                                 constr,
1150                                 enerd,
1151                                 state->natoms,
1152                                 state->x.arrayRefWithPadding(),
1153                                 state->v.arrayRefWithPadding(),
1154                                 state->box,
1155                                 state->lambda,
1156                                 &state->hist,
1157                                 &f.view(),
1158                                 force_vir,
1159                                 *md,
1160                                 nrnb,
1161                                 wcycle,
1162                                 shellfc,
1163                                 fr,
1164                                 runScheduleWork,
1165                                 t,
1166                                 mu_tot,
1167                                 vsite,
1168                                 ddBalanceRegionHandler);
1169         }
1170         else
1171         {
1172             /* The AWH history needs to be saved _before_ doing force calculations where the AWH bias
1173                is updated (or the AWH update will be performed twice for one step when continuing).
1174                It would be best to call this update function from do_md_trajectory_writing, but that
1175                would occur after do_force. One would have to divide the update_awh function into one
1176                function applying the AWH force and one doing the AWH bias update. The update AWH
1177                bias function could then be called after do_md_trajectory_writing (then containing
1178                update_awh_history). The checkpointing will in the future probably be moved to the
1179                start of the md loop, which will get rid of this issue. */
1180             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1181             {
1182                 awh->updateHistory(state_global->awhHistory.get());
1183             }
1184
1185             /* The coordinates (x) are shifted (to get whole molecules)
1186              * in do_force.
1187              * This is parallelized as well, and does communication too.
1188              * Check comments in sim_util.c
1189              */
1190             do_force(fplog,
1191                      cr,
1192                      ms,
1193                      *ir,
1194                      awh.get(),
1195                      enforcedRotation,
1196                      imdSession,
1197                      pull_work,
1198                      step,
1199                      nrnb,
1200                      wcycle,
1201                      &top,
1202                      state->box,
1203                      state->x.arrayRefWithPadding(),
1204                      &state->hist,
1205                      &f.view(),
1206                      force_vir,
1207                      md,
1208                      enerd,
1209                      state->lambda,
1210                      fr,
1211                      runScheduleWork,
1212                      vsite,
1213                      mu_tot,
1214                      t,
1215                      ed ? ed->getLegacyED() : nullptr,
1216                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1217                      ddBalanceRegionHandler);
1218         }
1219
1220         // VV integrators do not need the following velocity half step
1221         // if it is the first step after starting from a checkpoint.
1222         // That is, the half step is needed on all other steps, and
1223         // also the first step when starting from a .tpr file.
1224         if (EI_VV(ir->eI))
1225         {
1226             integrateVVFirstStep(step,
1227                                  bFirstStep,
1228                                  bInitStep,
1229                                  startingBehavior,
1230                                  nstglobalcomm,
1231                                  ir,
1232                                  fr,
1233                                  cr,
1234                                  state,
1235                                  mdAtoms->mdatoms(),
1236                                  fcdata,
1237                                  &MassQ,
1238                                  &vcm,
1239                                  top_global,
1240                                  top,
1241                                  enerd,
1242                                  ekind,
1243                                  gstat,
1244                                  &last_ekin,
1245                                  bCalcVir,
1246                                  total_vir,
1247                                  shake_vir,
1248                                  force_vir,
1249                                  pres,
1250                                  M,
1251                                  do_log,
1252                                  do_ene,
1253                                  bCalcEner,
1254                                  bGStat,
1255                                  bStopCM,
1256                                  bTrotter,
1257                                  bExchanged,
1258                                  &bSumEkinhOld,
1259                                  &saved_conserved_quantity,
1260                                  &f,
1261                                  &upd,
1262                                  constr,
1263                                  &nullSignaller,
1264                                  trotter_seq,
1265                                  nrnb,
1266                                  mdlog,
1267                                  fplog,
1268                                  wcycle);
1269             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1270             {
1271                 // Positions were calculated earlier
1272                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1273                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1274                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1275             }
1276         }
1277
1278         /* ########  END FIRST UPDATE STEP  ############## */
1279         /* ########  If doing VV, we now have v(dt) ###### */
1280         if (bDoExpanded)
1281         {
1282             /* perform extended ensemble sampling in lambda - we don't
1283                actually move to the new state before outputting
1284                statistics, but if performing simulated tempering, we
1285                do update the velocities and the tau_t. */
1286             // TODO: Avoid changing inputrec (#3854)
1287             // Simulated tempering updates the reference temperature.
1288             // Expanded ensemble without simulated tempering does not change the inputrec.
1289             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1290             lamnew                 = ExpandedEnsembleDynamics(fplog,
1291                                               nonConstInputrec,
1292                                               enerd,
1293                                               state,
1294                                               &MassQ,
1295                                               state->fep_state,
1296                                               state->dfhist,
1297                                               step,
1298                                               state->v.rvec_array(),
1299                                               md);
1300             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1301             if (MASTER(cr))
1302             {
1303                 copy_df_history(state_global->dfhist, state->dfhist);
1304             }
1305         }
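             // Note that lamnew is only written into state->fep_state after the log and edr
             // output, further down in this step.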
1306
1307         // Copy coordinates from the GPU for the output/checkpointing if the update is offloaded and
1308         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1309         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1310             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1311                 || checkpointHandler->isCheckpointingStep()))
1312         {
1313             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1314             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1315         }
1316         // Copy velocities if needed for the output/checkpointing.
1317         // NOTE: Copy on the search steps is done at the beginning of the step.
1318         if (useGpuForUpdate && !bNS
1319             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1320         {
1321             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1322             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1323         }
1324         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1325         // and the update is offloaded, hence the forces are kept on the GPU for the update and have not
1326         // already been transferred in do_force().
1327         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1328         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1329         //       prior to GPU update.
1330         // TODO: When the output flags are included in the step workload, this copy can be combined with the
1331         //       copy call in do_force(...).
1332         // NOTE: The forces should not be copied here if vsites are present, since they were modified
1333         //       on the host after the D2H copy in do_force(...).
1334         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1335             && do_per_step(step, ir->nstfout))
1336         {
1337             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1338             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1339         }
1340         /* Now we have the energies and forces corresponding to the
1341          * coordinates at time t. We must output all of this before
1342          * the update.
1343          */
1344         do_md_trajectory_writing(fplog,
1345                                  cr,
1346                                  nfile,
1347                                  fnm,
1348                                  step,
1349                                  step_rel,
1350                                  t,
1351                                  ir,
1352                                  state,
1353                                  state_global,
1354                                  observablesHistory,
1355                                  top_global,
1356                                  fr,
1357                                  outf,
1358                                  energyOutput,
1359                                  ekind,
1360                                  f.view().force(),
1361                                  checkpointHandler->isCheckpointingStep(),
1362                                  bRerunMD,
1363                                  bLastStep,
1364                                  mdrunOptions.writeConfout,
1365                                  bSumEkinhOld);
1366         /* Check whether this is an IMD step and do IMD communication, if bIMD is TRUE. */
1367         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1368
1369         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1370         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1371             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1372         {
1373             copy_mat(state->svir_prev, shake_vir);
1374             copy_mat(state->fvir_prev, force_vir);
1375         }
1376
1377         stopHandler->setSignal();
1378         resetHandler->setSignal(walltime_accounting);
1379
1380         if (bGStat || !PAR(cr))
1381         {
1382             /* In parallel we only have to check for checkpointing in steps
1383              * where we do global communication,
1384              *  otherwise the other nodes don't know.
1385              */
1386             checkpointHandler->setSignal(walltime_accounting);
1387         }
1388
1389         /* #########   START SECOND UPDATE STEP ################# */
1390
1391         /* At the start of the step, randomize or scale the velocities (if vv); the restrictions
1392            on using Andersen coupling are controlled in preprocessing. */
1393
1394         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info that needs to be passed */
1395         {
1396             gmx_bool bIfRandomize;
1397             bIfRandomize = update_randomize_velocities(ir, step, cr, md, state->v, &upd, constr);
1398             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1399             if (constr && bIfRandomize)
1400             {
1401                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1402             }
1403         }
1404         /* Box is changed in update() when we do pressure coupling,
1405          * but we should still use the old box for energy corrections and when
1406          * writing it to the energy file, so it matches the trajectory files for
1407          * the same timestep above. Make a copy in a separate array.
1408          */
1409         copy_mat(state->box, lastbox);
1410
1411         dvdl_constr = 0;
1412
1413         if (!useGpuForUpdate)
1414         {
1415             wallcycle_start(wcycle, WallCycleCounter::Update);
1416         }
1417         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1418         if (bTrotter)
1419         {
1420             trotter_update(ir, step, ekind, enerd, state, total_vir, md, &MassQ, trotter_seq, ettTSEQ3);
1421             /* We can only do Berendsen coupling after we have summed
1422              * the kinetic energy or virial. Since that happens
1423              * in global_state after update, we should only do it at
1424              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1425              */
1426         }
1427         else
1428         {
1429             update_tcouple(step, ir, state, ekind, &MassQ, md);
1430             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1431         }
1432
1433         /* With leap-frog type integrators we compute the kinetic energy
1434          * at a whole time step as the average of the half-time step kinetic
1435          * energies of two subsequent steps. Therefore we need to compute the
1436          * half step kinetic energy also if we need energies at the next step.
1437          */
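             // I.e. Ekin(t) is estimated as ( Ekin(t - dt/2) + Ekin(t + dt/2) ) / 2, so the
             // half-step kinetic energy must already be available one step before a
             // global-communication step; that is what the "step + 1" test below checks.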
1438         const bool needHalfStepKineticEnergy =
1439                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1440
1441         // Parrinello-Rahman requires the pressure to be available before the update to compute
1442         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1443         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1444                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
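         // With do_per_step(step, n) being true when step is a multiple of n, the shifted test
         // above holds when step % nstpcouple == 1, e.g. on steps 1, 11, 21, ... for nstpcouple = 10.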
1445
1446         if (EI_VV(ir->eI))
1447         {
1448             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1449
1450             integrateVVSecondStep(step,
1451                                   ir,
1452                                   fr,
1453                                   cr,
1454                                   state,
1455                                   mdAtoms->mdatoms(),
1456                                   fcdata,
1457                                   &MassQ,
1458                                   &vcm,
1459                                   pull_work,
1460                                   enerd,
1461                                   ekind,
1462                                   gstat,
1463                                   &dvdl_constr,
1464                                   bCalcVir,
1465                                   total_vir,
1466                                   shake_vir,
1467                                   force_vir,
1468                                   pres,
1469                                   M,
1470                                   lastbox,
1471                                   do_log,
1472                                   do_ene,
1473                                   bGStat,
1474                                   &bSumEkinhOld,
1475                                   &f,
1476                                   &cbuf,
1477                                   &upd,
1478                                   constr,
1479                                   &nullSignaller,
1480                                   trotter_seq,
1481                                   nrnb,
1482                                   wcycle);
1483         }
1484         else
1485         {
1486             if (useGpuForUpdate)
1487             {
1488                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1489                 {
1490                     integrator->set(stateGpu->getCoordinates(),
1491                                     stateGpu->getVelocities(),
1492                                     stateGpu->getForces(),
1493                                     top.idef,
1494                                     *md);
1495
1496                     // Copy data to the GPU after buffers might have been reinitialized
1497                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1498                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1499                 }
1500
1501                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1502                     && !thisRankHasDuty(cr, DUTY_PME))
1503                 {
1504                     // The PME forces were received on the host, so they have to be copied
1505                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1506                 }
1507                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1508                 {
1509                     // The buffer ops were not offloaded this step, so the forces are on the
1510                     // host and have to be copied
1511                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1512                 }
1513
1514                 const bool doTemperatureScaling =
1515                         (ir->etc != TemperatureCoupling::No
1516                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1517
1518                 // This applies Leap-Frog, LINCS and SETTLE in succession
1519                 integrator->integrate(
1520                         stateGpu->getForcesReadyOnDeviceEvent(
1521                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1522                         ir->delta_t,
1523                         true,
1524                         bCalcVir,
1525                         shake_vir,
1526                         doTemperatureScaling,
1527                         ekind->tcstat,
1528                         doParrinelloRahman,
1529                         ir->nstpcouple * ir->delta_t,
1530                         M);
1531
1532                 // Copy velocities D2H after update if:
1533                 // - Globals are computed this step (includes the energy output steps).
1534                 // - Temperature is needed for the next step.
1535                 if (bGStat || needHalfStepKineticEnergy)
1536                 {
1537                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1538                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1539                 }
1540             }
1541             else
1542             {
1543                 /* With multiple time stepping we need to do an additional normal
1544                  * update step to obtain the virial, as the actual MTS integration
1545                  * uses an acceleration where the slow forces are multiplied by mtsFactor.
1546                  * Using that acceleration would result in a virial where the slow
1547                  * force contribution is a factor mtsFactor too large.
1548                  */
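                     // The constraint virial is therefore taken from this extra update, which uses
                     // the unscaled forces; the constraint call after the MTS coordinate update
                     // further below skips the virial calculation (bCalcVir && !fr->useMts).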
1549                 if (fr->useMts && bCalcVir && constr != nullptr)
1550                 {
1551                     upd.update_for_constraint_virial(*ir,
1552                                                      md->homenr,
1553                                                      md->havePartiallyFrozenAtoms,
1554                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1555                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1556                                                      *state,
1557                                                      f.view().forceWithPadding(),
1558                                                      *ekind);
1559
1560                     constrain_coordinates(constr,
1561                                           do_log,
1562                                           do_ene,
1563                                           step,
1564                                           state,
1565                                           upd.xp()->arrayRefWithPadding(),
1566                                           &dvdl_constr,
1567                                           bCalcVir,
1568                                           shake_vir);
1569                 }
1570
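                     // On steps that are a multiple of the level-1 MTS step factor, the combined
                     // buffer (fast forces plus the slow forces scaled by the MTS factor) is used
                     // to propagate the coordinates; on all other steps only the fast forces are used.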
1571                 ArrayRefWithPadding<const RVec> forceCombined =
1572                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1573                                 ? f.view().forceMtsCombinedWithPadding()
1574                                 : f.view().forceWithPadding();
1575                 upd.update_coords(*ir,
1576                                   step,
1577                                   md->homenr,
1578                                   md->havePartiallyFrozenAtoms,
1579                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1580                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1581                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1582                                   state,
1583                                   forceCombined,
1584                                   fcdata,
1585                                   ekind,
1586                                   M,
1587                                   etrtPOSITION,
1588                                   cr,
1589                                   constr != nullptr);
1590
1591                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1592
1593                 constrain_coordinates(constr,
1594                                       do_log,
1595                                       do_ene,
1596                                       step,
1597                                       state,
1598                                       upd.xp()->arrayRefWithPadding(),
1599                                       &dvdl_constr,
1600                                       bCalcVir && !fr->useMts,
1601                                       shake_vir);
1602
1603                 upd.update_sd_second_half(*ir,
1604                                           step,
1605                                           &dvdl_constr,
1606                                           md->homenr,
1607                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1608                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1609                                           state,
1610                                           cr,
1611                                           nrnb,
1612                                           wcycle,
1613                                           constr,
1614                                           do_log,
1615                                           do_ene);
1616                 upd.finish_update(
1617                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1618             }
1619
1620             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1621             {
1622                 updatePrevStepPullCom(pull_work, state);
1623             }
1624
1625             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1626         }
1627
1628         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1629         /* With Leap-Frog we can skip compute_globals at
1630          * non-communication steps, but we need to calculate
1631          * the kinetic energy one step before communication.
1632          */
1633         {
1634             // Organize to do inter-simulation signalling on steps if
1635             // and when algorithms require it.
1636             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1637
1638             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1639             {
1640                 // Copy coordinates when needed to stop the CM motion.
1641                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1642                 {
1643                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1644                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1645                 }
1646                 // Since we're already communicating at this step, we
1647                 // can propagate intra-simulation signals. Note that
1648                 // check_nstglobalcomm is responsible for choosing
1649                 // the value of nstglobalcomm, which is one way
1650                 // bGStat becomes true, so we can't get into a
1651                 // situation where e.g. checkpointing can't be
1652                 // signalled.
1653                 bool                doIntraSimSignal = true;
1654                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1655
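                     // Note the flag gating below: energy, COM-motion stopping, temperature and
                     // pressure are only requested for non-VV integrators (VV handled those in its
                     // own update steps above), while CGLO_CONSTRAINT is always requested.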
1656                 compute_globals(
1657                         gstat,
1658                         cr,
1659                         ir,
1660                         fr,
1661                         ekind,
1662                         makeConstArrayRef(state->x),
1663                         makeConstArrayRef(state->v),
1664                         state->box,
1665                         md,
1666                         nrnb,
1667                         &vcm,
1668                         wcycle,
1669                         enerd,
1670                         force_vir,
1671                         shake_vir,
1672                         total_vir,
1673                         pres,
1674                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1675                                                                            : gmx::ArrayRef<real>{},
1676                         &signaller,
1677                         lastbox,
1678                         &bSumEkinhOld,
1679                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1680                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1681                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1682                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1683                                 | (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd)
1684                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1685                                            : 0));
1686                 if (DOMAINDECOMP(cr))
1687                 {
1688                     checkNumberOfBondedInteractions(
1689                             mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1690                 }
1691                 if (!EI_VV(ir->eI) && bStopCM)
1692                 {
1693                     process_and_stopcm_grp(
1694                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1695                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1696
1697                     // TODO: The special case of removing CM motion should be dealt with more gracefully
1698                     if (useGpuForUpdate)
1699                     {
1700                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1701                         // Here we block until the H2D copy completes because event sync with the
1702                         // force kernels that use the coordinates on the next steps is not implemented
1703                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1704                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1705                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1706                         if (vcm.mode != ComRemovalAlgorithm::No)
1707                         {
1708                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1709                         }
1710                     }
1711                 }
1712             }
1713         }
1714
1715         /* #############  END CALC EKIN AND PRESSURE ################# */
1716
1717         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1718            the virial that should probably be addressed eventually. state->veta has better properties,
1719            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1720            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1721
1722         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1723         {
1724             /* Sum up the foreign energy and dK/dl terms for md and sd.
1725                Currently done every step so that dH/dl is correct in the .edr */
1726             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1727         }
1728
1729         update_pcouple_after_coordinates(
1730                 fplog, step, ir, md, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1731
1732         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1733                                                   && do_per_step(step, inputrec->nstpcouple));
1734         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1735                                                  && do_per_step(step, inputrec->nstpcouple));
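         // Unlike Parrinello-Rahman above, Berendsen and C-rescale scale on the nstpcouple step itself.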
1736         if (useGpuForUpdate
1737             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1738         {
1739             integrator->scaleCoordinates(pressureCouplingMu);
1740             if (doCRescalePressureCoupling)
1741             {
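                     // C-rescale also rescales the velocities, by the inverse of the coordinate
                     // scaling matrix mu.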
1742                 matrix pressureCouplingInvMu;
1743                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1744                 integrator->scaleVelocities(pressureCouplingInvMu);
1745             }
1746             integrator->setPbc(PbcType::Xyz, state->box);
1747         }
1748
1749         /* ################# END UPDATE STEP 2 ################# */
1750         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1751
1752         /* The coordinates (x) were unshifted in update */
1753         if (!bGStat)
1754         {
1755             /* We will not sum ekinh_old,
1756              * so signal that we still have to do it.
1757              */
1758             bSumEkinhOld = TRUE;
1759         }
1760
1761         if (bCalcEner)
1762         {
1763             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1764
1765             /* use the directly determined last velocity, not actually the averaged half steps */
1766             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1767             {
1768                 enerd->term[F_EKIN] = last_ekin;
1769             }
1770             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1771
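                 // The conserved quantity adds the coupling contribution to the total energy:
                 // for VV it comes from saved_conserved_quantity (passed to integrateVVFirstStep
                 // above), while for leap-frog-type integrators it is evaluated here via NPT_energy().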
1772             if (integratorHasConservedEnergyQuantity(ir))
1773             {
1774                 if (EI_VV(ir->eI))
1775                 {
1776                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1777                 }
1778                 else
1779                 {
1780                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1781                 }
1782             }
1783             /* #########  END PREPARING EDR OUTPUT  ###########  */
1784         }
1785
1786         /* Output stuff */
1787         if (MASTER(cr))
1788         {
1789             if (fplog && do_log && bDoExpanded)
1790             {
1791                 /* only needed if doing expanded ensemble */
1792                 PrintFreeEnergyInfoToFile(fplog,
1793                                           ir->fepvals.get(),
1794                                           ir->expandedvals.get(),
1795                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1796                                           state_global->dfhist,
1797                                           state->fep_state,
1798                                           ir->nstlog,
1799                                           step);
1800             }
1801             if (bCalcEner)
1802             {
1803                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1804                                                  bCalcEnerStep,
1805                                                  t,
1806                                                  md->tmass,
1807                                                  enerd,
1808                                                  ir->fepvals.get(),
1809                                                  ir->expandedvals.get(),
1810                                                  lastbox,
1811                                                  PTCouplingArrays{ state->boxv,
1812                                                                    state->nosehoover_xi,
1813                                                                    state->nosehoover_vxi,
1814                                                                    state->nhpres_xi,
1815                                                                    state->nhpres_vxi },
1816                                                  state->fep_state,
1817                                                  shake_vir,
1818                                                  force_vir,
1819                                                  total_vir,
1820                                                  pres,
1821                                                  ekind,
1822                                                  mu_tot,
1823                                                  constr);
1824             }
1825             else
1826             {
1827                 energyOutput.recordNonEnergyStep();
1828             }
1829
1830             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1831             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1832
1833             if (doSimulatedAnnealing)
1834             {
1835                 gmx::EnergyOutput::printAnnealingTemperatures(
1836                         do_log ? fplog : nullptr, groups, &(ir->opts));
1837             }
1838             if (do_log || do_ene || do_dr || do_or)
1839             {
1840                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1841                                                    do_ene,
1842                                                    do_dr,
1843                                                    do_or,
1844                                                    do_log ? fplog : nullptr,
1845                                                    step,
1846                                                    t,
1847                                                    fr->fcdata.get(),
1848                                                    awh.get());
1849             }
1850             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1851             {
1852                 const bool isInitialOutput = false;
1853                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1854             }
1855
1856             if (ir->bPull)
1857             {
1858                 pull_print_output(pull_work, step, t);
1859             }
1860
1861             if (do_per_step(step, ir->nstlog))
1862             {
1863                 if (fflush(fplog) != 0)
1864                 {
1865                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1866                 }
1867             }
1868         }
1869         if (bDoExpanded)
1870         {
1871             /* Have to do this part _after_ outputting the logfile and the edr file */
1872             /* Gets written into the state at the beginning of the next loop */
1873             state->fep_state = lamnew;
1874         }
1875         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1876         {
1877             state->fep_state = awh->fepLambdaState();
1878         }
1879         /* Print the remaining wall clock time for the run */
1880         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1881         {
1882             if (shellfc)
1883             {
1884                 fprintf(stderr, "\n");
1885             }
1886             print_time(stderr, walltime_accounting, step, ir, cr);
1887         }
1888
1889         /* Ion/water position swapping.
1890          * Not done in last step since trajectory writing happens before this call
1891          * in the MD loop and exchanges would be lost anyway. */
1892         bNeedRepartition = FALSE;
1893         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1894             && do_per_step(step, ir->swap->nstswap))
1895         {
1896             bNeedRepartition = do_swapcoords(cr,
1897                                              step,
1898                                              t,
1899                                              ir,
1900                                              swap,
1901                                              wcycle,
1902                                              as_rvec_array(state->x.data()),
1903                                              state->box,
1904                                              MASTER(cr) && mdrunOptions.verbose,
1905                                              bRerunMD);
1906
1907             if (bNeedRepartition && DOMAINDECOMP(cr))
1908             {
1909                 dd_collect_state(cr->dd, state, state_global);
1910             }
1911         }
1912
1913         /* Replica exchange */
1914         bExchanged = FALSE;
1915         if (bDoReplEx)
1916         {
1917             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1918         }
1919
1920         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1921         {
1922             dd_partition_system(fplog,
1923                                 mdlog,
1924                                 step,
1925                                 cr,
1926                                 TRUE,
1927                                 1,
1928                                 state_global,
1929                                 top_global,
1930                                 *ir,
1931                                 imdSession,
1932                                 pull_work,
1933                                 state,
1934                                 &f,
1935                                 mdAtoms,
1936                                 &top,
1937                                 fr,
1938                                 vsite,
1939                                 constr,
1940                                 nrnb,
1941                                 wcycle,
1942                                 FALSE);
1943             upd.updateAfterPartition(state->natoms,
1944                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1945                                                  : gmx::ArrayRef<const unsigned short>(),
1946                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1947                                              : gmx::ArrayRef<const unsigned short>());
1948         }
1949
1950         bFirstStep = FALSE;
1951         bInitStep  = FALSE;
1952
1953         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1954         /* With all integrators, except VV, we need to retain the pressure
1955          * at the current step for coupling at the next step.
1956          */
1957         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1958             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1959         {
1960             /* Store the pressure in t_state for pressure coupling
1961              * at the next MD step.
1962              */
1963             copy_mat(pres, state->pres_prev);
1964         }
1965
1966         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1967
1968         if ((membed != nullptr) && (!bLastStep))
1969         {
1970             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1971         }
1972
1973         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1974         if (DOMAINDECOMP(cr) && wcycle)
1975         {
1976             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1977         }
1978
1979         /* increase the MD step number */
1980         step++;
1981         step_rel++;
1982
1983 #if GMX_FAHCORE
1984         if (MASTER(cr))
1985         {
1986             fcReportProgress(ir->nsteps + ir->init_step, step);
1987         }
1988 #endif
1989
1990         resetHandler->resetCounters(
1991                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1992
1993         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to the VMD client */
1994         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1995     }
1996     /* End of main MD loop */
1997
1998     /* Closing TNG files can include compressing data. Therefore it is good to do that
1999      * before stopping the time measurements. */
2000     mdoutf_tng_close(outf);
2001
2002     /* Stop measuring walltime */
2003     walltime_accounting_end_time(walltime_accounting);
2004
2005     if (!thisRankHasDuty(cr, DUTY_PME))
2006     {
2007         /* Tell the PME-only node to finish */
2008         gmx_pme_send_finish(cr);
2009     }
2010
2011     if (MASTER(cr))
2012     {
2013         if (ir->nstcalcenergy > 0)
2014         {
2015             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2016
2017             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2018             energyOutput.printAverages(fplog, groups);
2019         }
2020     }
2021     done_mdoutf(outf);
2022
2023     if (bPMETune)
2024     {
2025         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2026     }
2027
2028     done_shellfc(fplog, shellfc, step_rel);
2029
2030     if (useReplicaExchange && MASTER(cr))
2031     {
2032         print_replica_exchange_statistics(fplog, repl_ex);
2033     }
2034
2035     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2036
2037     global_stat_destroy(gstat);
2038 }