Merge remote-tracking branch 'origin/release-2021' into master
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/observablesreducer.h"
130 #include "gromacs/mdtypes/pullhistory.h"
131 #include "gromacs/mdtypes/simulation_workload.h"
132 #include "gromacs/mdtypes/state.h"
133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
134 #include "gromacs/modularsimulator/energydata.h"
135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
136 #include "gromacs/nbnxm/nbnxm.h"
137 #include "gromacs/pbcutil/pbc.h"
138 #include "gromacs/pulling/output.h"
139 #include "gromacs/pulling/pull.h"
140 #include "gromacs/swap/swapcoords.h"
141 #include "gromacs/timing/wallcycle.h"
142 #include "gromacs/timing/walltime_accounting.h"
143 #include "gromacs/topology/atoms.h"
144 #include "gromacs/topology/idef.h"
145 #include "gromacs/topology/mtop_util.h"
146 #include "gromacs/topology/topology.h"
147 #include "gromacs/trajectory/trajectoryframe.h"
148 #include "gromacs/utility/basedefinitions.h"
149 #include "gromacs/utility/cstringutil.h"
150 #include "gromacs/utility/fatalerror.h"
151 #include "gromacs/utility/logger.h"
152 #include "gromacs/utility/real.h"
153 #include "gromacs/utility/smalloc.h"
154
155 #include "legacysimulator.h"
156 #include "replicaexchange.h"
157 #include "shellfc.h"
158
159 using gmx::SimulationSignaller;
160
161 void gmx::LegacySimulator::do_md()
162 {
163     // TODO Historically, the EM and MD "integrators" used different
164     // names for the t_inputrec *parameter, but these must have the
165     // same name, now that it's a member of a struct. We use this ir
166     // alias to avoid a large ripple of nearly useless changes.
167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
168     // will go away eventually.
169     const t_inputrec* ir = inputrec;
170
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t     top(top_global.ffparams);
365     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
366
367     ForceBuffers     f(simulationWork.useMts,
368                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
369                                ? PinningPolicy::PinnedIfSupported
370                                : PinningPolicy::CannotBePinned);
371     const t_mdatoms* md = mdAtoms->mdatoms();
372     if (DOMAINDECOMP(cr))
373     {
374         stateInstance = std::make_unique<t_state>();
375         state         = stateInstance.get();
376         dd_init_local_state(*cr->dd, state_global, state);
377
378         /* Distribute the charge groups over the nodes from the master node */
379         dd_partition_system(fplog,
380                             mdlog,
381                             ir->init_step,
382                             cr,
383                             TRUE,
384                             1,
385                             state_global,
386                             top_global,
387                             *ir,
388                             imdSession,
389                             pull_work,
390                             state,
391                             &f,
392                             mdAtoms,
393                             &top,
394                             fr,
395                             vsite,
396                             constr,
397                             nrnb,
398                             nullptr,
399                             FALSE);
400         upd.updateAfterPartition(state->natoms,
401                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
402                                              : gmx::ArrayRef<const unsigned short>(),
403                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
404                                          : gmx::ArrayRef<const unsigned short>());
405     }
406     else
407     {
408         state_change_natoms(state_global, state_global->natoms);
409         /* Copy the pointer to the global state */
410         state = state_global;
411
412         /* Generate and initialize new topology */
413         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
414
415         upd.updateAfterPartition(state->natoms,
416                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
417                                              : gmx::ArrayRef<const unsigned short>(),
418                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
419                                          : gmx::ArrayRef<const unsigned short>());
420     }
421
422     std::unique_ptr<UpdateConstrainGpu> integrator;
423
424     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
425
426     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
427     if (useGpuForUpdate)
428     {
429         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
430                                    || constr->numConstraintsTotal() == 0,
431                            "Constraints in domain decomposition are only supported with update "
432                            "groups if using GPU update.\n");
433         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
434                                    || constr->numConstraintsTotal() == 0,
435                            "SHAKE is not supported with GPU update.");
436         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
437                            "Either PME or short-ranged non-bonded interaction tasks must run on "
438                            "the GPU to use GPU update.\n");
439         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
440                            "Only the md integrator is supported with the GPU update.\n");
441         GMX_RELEASE_ASSERT(
442                 ir->etc != TemperatureCoupling::NoseHoover,
443                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
444         GMX_RELEASE_ASSERT(
445                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
446                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
447                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
448                 "with the GPU update.\n");
449         GMX_RELEASE_ASSERT(!md->haveVsites,
450                            "Virtual sites are not supported with the GPU update.\n");
451         GMX_RELEASE_ASSERT(ed == nullptr,
452                            "Essential dynamics is not supported with the GPU update.\n");
453         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
454                            "Constraints pulling is not supported with the GPU update.\n");
455         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
456                            "Orientation restraints are not supported with the GPU update.\n");
457         GMX_RELEASE_ASSERT(
458                 ir->efep == FreeEnergyPerturbationType::No
459                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
460                 "Free energy perturbation of masses and constraints are not supported with the GPU "
461                 "update.");
462
463         if (constr != nullptr && constr->numConstraintsTotal() > 0)
464         {
465             GMX_LOG(mdlog.info)
466                     .asParagraph()
467                     .appendText("Updating coordinates and applying constraints on the GPU.");
468         }
469         else
470         {
471             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
472         }
473         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
474                            "Device stream manager should be initialized in order to use GPU "
475                            "update-constraints.");
476         GMX_RELEASE_ASSERT(
477                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
478                 "Update stream should be initialized in order to use GPU "
479                 "update-constraints.");
480         integrator = std::make_unique<UpdateConstrainGpu>(
481                 *ir,
482                 top_global,
483                 ekind->ngtc,
484                 fr->deviceStreamManager->context(),
485                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
486                 wcycle);
487
488         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
489
490         integrator->setPbc(PbcType::Xyz, state->box);
491     }
492
493     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
494     {
495         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
496     }
497     if (useGpuForUpdate)
498     {
499         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
500     }
501
502     // NOTE: The global state is no longer used at this point.
503     // But state_global is still used as temporary storage space for writing
504     // the global state to file and potentially for replica exchange.
505     // (Global topology should persist.)
506
507     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
508
509     if (ir->bExpanded)
510     {
511         /* Check nstexpanded here, because the grompp check was broken */
512         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
513         {
514             gmx_fatal(FARGS,
515                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
516         }
517         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
518     }
519
520     if (MASTER(cr))
521     {
522         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
523     }
524
525     preparePrevStepPullCom(ir,
526                            pull_work,
527                            gmx::arrayRefFromArray(md->massT, md->nr),
528                            state,
529                            state_global,
530                            cr,
531                            startingBehavior != StartingBehavior::NewSimulation);
532
533     // TODO: Remove this by converting AWH into a ForceProvider
534     auto awh = prepareAwhModule(fplog,
535                                 *ir,
536                                 state_global,
537                                 cr,
538                                 ms,
539                                 startingBehavior != StartingBehavior::NewSimulation,
540                                 shellfc != nullptr,
541                                 opt2fn("-awh", nfile, fnm),
542                                 pull_work);
543
544     if (useReplicaExchange && MASTER(cr))
545     {
546         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
547     }
548     /* PME tuning is only supported in the Verlet scheme, with PME for
549      * Coulomb. It is not supported with only LJ PME. */
550     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
551                 && ir->cutoff_scheme != CutoffScheme::Group);
552
553     pme_load_balancing_t* pme_loadbal = nullptr;
554     if (bPMETune)
555     {
556         pme_loadbal_init(
557                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
558     }
559
560     if (!ir->bContinuation)
561     {
562         if (state->flags & enumValueToBitMask(StateEntry::V))
563         {
564             auto v = makeArrayRef(state->v);
565             /* Set the velocities of vsites, shells and frozen atoms to zero */
566             for (i = 0; i < md->homenr; i++)
567             {
568                 if (md->ptype[i] == ParticleType::Shell)
569                 {
570                     clear_rvec(v[i]);
571                 }
572                 else if (md->cFREEZE)
573                 {
574                     for (m = 0; m < DIM; m++)
575                     {
576                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
577                         {
578                             v[i][m] = 0;
579                         }
580                     }
581                 }
582             }
583         }
584
585         if (constr)
586         {
587             /* Constrain the initial coordinates and velocities */
588             do_constrain_first(fplog,
589                                constr,
590                                ir,
591                                md->nr,
592                                md->homenr,
593                                state->x.arrayRefWithPadding(),
594                                state->v.arrayRefWithPadding(),
595                                state->box,
596                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
597         }
598     }
599
600     const int nstfep = computeFepPeriod(*ir, replExParams);
601
602     /* Be REALLY careful about what flags you set here. You CANNOT assume
603      * this is the first step, since we might be restarting from a checkpoint,
604      * and in that case we should not do any modifications to the state.
605      */
606     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
607
608     // When restarting from a checkpoint, it can be appropriate to
609     // initialize ekind from quantities in the checkpoint. Otherwise,
610     // compute_globals must initialize ekind before the simulation
611     // starts/restarts. However, only the master rank knows what was
612     // found in the checkpoint file, so we have to communicate in
613     // order to coordinate the restart.
614     //
615     // TODO Consider removing this communication if/when checkpoint
616     // reading directly follows .tpr reading, because all ranks can
617     // agree on hasReadEkinState at that time.
618     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
619     if (PAR(cr))
620     {
621         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
622     }
623     if (hasReadEkinState)
624     {
625         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
626     }
627
628     unsigned int cglo_flags =
629             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
630              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
631
632     bSumEkinhOld = FALSE;
633
634     t_vcm vcm(top_global.groups, *ir);
635     reportComRemovalInfo(fplog, vcm);
636
637     int64_t step     = ir->init_step;
638     int64_t step_rel = 0;
639
640     /* To minimize communication, compute_globals computes the COM velocity
641      * and the kinetic energy for the velocities without COM motion removed.
642      * Thus to get the kinetic energy without the COM contribution, we need
643      * to call compute_globals twice.
644      */
645     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
646     {
647         unsigned int cglo_flags_iteration = cglo_flags;
648         if (bStopCM && cgloIteration == 0)
649         {
650             cglo_flags_iteration |= CGLO_STOPCM;
651             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
652         }
653         if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
654             && cgloIteration == 0)
655         {
656             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
657         }
658         compute_globals(gstat,
659                         cr,
660                         ir,
661                         fr,
662                         ekind,
663                         makeConstArrayRef(state->x),
664                         makeConstArrayRef(state->v),
665                         state->box,
666                         md,
667                         nrnb,
668                         &vcm,
669                         nullptr,
670                         enerd,
671                         force_vir,
672                         shake_vir,
673                         total_vir,
674                         pres,
675                         gmx::ArrayRef<real>{},
676                         &nullSignaller,
677                         state->box,
678                         &bSumEkinhOld,
679                         cglo_flags_iteration,
680                         step,
681                         &observablesReducer);
682         if (cglo_flags_iteration & CGLO_STOPCM)
683         {
684             /* At initialization, do not pass x with acceleration-correction mode
685              * to avoid (incorrect) correction of the initial coordinates.
686              */
687             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
688                              ? ArrayRef<RVec>()
689                              : makeArrayRef(state->x);
690             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
691             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
692         }
693     }
694     if (DOMAINDECOMP(cr))
695     {
696         dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
697                 &top, makeConstArrayRef(state->x), state->box);
698     }
699     if (ir->eI == IntegrationAlgorithm::VVAK)
700     {
701         /* a second call to get the half step temperature initialized as well */
702         /* we do the same call as above, but turn the pressure off -- internally to
703            compute_globals, this is recognized as a velocity verlet half-step
704            kinetic energy calculation.  This minimized excess variables, but
705            perhaps loses some logic?*/
706
707         compute_globals(gstat,
708                         cr,
709                         ir,
710                         fr,
711                         ekind,
712                         makeConstArrayRef(state->x),
713                         makeConstArrayRef(state->v),
714                         state->box,
715                         md,
716                         nrnb,
717                         &vcm,
718                         nullptr,
719                         enerd,
720                         force_vir,
721                         shake_vir,
722                         total_vir,
723                         pres,
724                         gmx::ArrayRef<real>{},
725                         &nullSignaller,
726                         state->box,
727                         &bSumEkinhOld,
728                         cglo_flags & ~CGLO_PRESSURE,
729                         step,
730                         &observablesReducer);
731     }
732
733     /* Calculate the initial half step temperature, and save the ekinh_old */
734     if (startingBehavior == StartingBehavior::NewSimulation)
735     {
736         for (i = 0; (i < ir->opts.ngtc); i++)
737         {
738             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
739         }
740     }
741
742     /* need to make an initiation call to get the Trotter variables set, as well as other constants
743        for non-trotter temperature control */
744     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
745
746     if (MASTER(cr))
747     {
748         if (!ir->bContinuation)
749         {
750             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
751             {
752                 fprintf(fplog,
753                         "RMS relative constraint deviation after constraining: %.2e\n",
754                         constr->rmsd());
755             }
756             if (EI_STATE_VELOCITY(ir->eI))
757             {
758                 real temp = enerd->term[F_TEMP];
759                 if (ir->eI != IntegrationAlgorithm::VV)
760                 {
761                     /* Result of Ekin averaged over velocities of -half
762                      * and +half step, while we only have -half step here.
763                      */
764                     temp *= 2;
765                 }
766                 fprintf(fplog, "Initial temperature: %g K\n", temp);
767             }
768         }
769
770         char tbuf[20];
771         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
772         if (ir->nsteps >= 0)
773         {
774             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
775         }
776         else
777         {
778             sprintf(tbuf, "%s", "infinite");
779         }
780         if (ir->init_step > 0)
781         {
782             fprintf(stderr,
783                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
784                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
785                     tbuf,
786                     gmx_step_str(ir->init_step, sbuf2),
787                     ir->init_step * ir->delta_t);
788         }
789         else
790         {
791             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
792         }
793         fprintf(fplog, "\n");
794     }
795
796     walltime_accounting_start_time(walltime_accounting);
797     wallcycle_start(wcycle, WallCycleCounter::Run);
798     print_start(fplog, cr, walltime_accounting, "mdrun");
799
800     /***********************************************************
801      *
802      *             Loop over MD steps
803      *
804      ************************************************************/
805
806     bFirstStep = TRUE;
807     /* Skip the first Nose-Hoover integration when we get the state from tpx */
808     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
809     bSumEkinhOld     = FALSE;
810     bExchanged       = FALSE;
811     bNeedRepartition = FALSE;
812
813     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
814             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
815             simulationsShareState,
816             MASTER(cr),
817             ir->nstlist,
818             mdrunOptions.reproducible,
819             nstSignalComm,
820             mdrunOptions.maximumHoursToRun,
821             ir->nstlist == 0,
822             fplog,
823             step,
824             bNS,
825             walltime_accounting);
826
827     auto checkpointHandler = std::make_unique<CheckpointHandler>(
828             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
829             simulationsShareState,
830             ir->nstlist == 0,
831             MASTER(cr),
832             mdrunOptions.writeConfout,
833             mdrunOptions.checkpointOptions.period);
834
835     const bool resetCountersIsLocal = true;
836     auto       resetHandler         = std::make_unique<ResetHandler>(
837             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
838             !resetCountersIsLocal,
839             ir->nsteps,
840             MASTER(cr),
841             mdrunOptions.timingOptions.resetHalfway,
842             mdrunOptions.maximumHoursToRun,
843             mdlog,
844             wcycle,
845             walltime_accounting);
846
847     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
848
849     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
850     {
851         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
852     }
853
854     /* and stop now if we should */
855     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
856     while (!bLastStep)
857     {
858
859         /* Determine if this is a neighbor search step */
860         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
861
862         if (bPMETune && bNStList)
863         {
864             // This has to be here because PME load balancing is called so early.
865             // TODO: Move to after all booleans are defined.
866             if (useGpuForUpdate && !bFirstStep)
867             {
868                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
869                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
870             }
871             /* PME grid + cut-off optimization with GPUs or PME nodes */
872             pme_loadbal_do(pme_loadbal,
873                            cr,
874                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
875                            fplog,
876                            mdlog,
877                            *ir,
878                            fr,
879                            state->box,
880                            state->x,
881                            wcycle,
882                            step,
883                            step_rel,
884                            &bPMETunePrinting,
885                            simulationWork.useGpuPmePpCommunication);
886         }
887
888         wallcycle_start(wcycle, WallCycleCounter::Step);
889
890         bLastStep = (step_rel == ir->nsteps);
891         t         = t0 + step * ir->delta_t;
892
893         // TODO Refactor this, so that nstfep does not need a default value of zero
894         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
895         {
896             /* find and set the current lambdas */
897             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
898
899             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
900                            && (!bFirstStep));
901         }
902
903         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
904                      && do_per_step(step, replExParams.exchangeInterval));
905
906         if (doSimulatedAnnealing)
907         {
908             // TODO: Avoid changing inputrec (#3854)
909             // Simulated annealing updates the reference temperature.
910             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
911             update_annealing_target_temp(nonConstInputrec, t, &upd);
912         }
913
914         /* Stop Center of Mass motion */
915         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
916
917         /* Determine whether or not to do Neighbour Searching */
918         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
919
920         /* Note that the stopHandler will cause termination at nstglobalcomm
921          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
922          * nstpcouple steps, we have computed the half-step kinetic energy
923          * of the previous step and can always output energies at the last step.
924          */
925         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
926
927         /* do_log triggers energy and virial calculation. Because this leads
928          * to different code paths, forces can be different. Thus for exact
929          * continuation we should avoid extra log output.
930          * Note that the || bLastStep can result in non-exact continuation
931          * beyond the last step. But we don't consider that to be an issue.
932          */
933         do_log     = (do_per_step(step, ir->nstlog)
934                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
935         do_verbose = mdrunOptions.verbose
936                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
937
938         if (useGpuForUpdate && !bFirstStep && bNS)
939         {
940             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
941             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
942             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
943             // Copy coordinate from the GPU when needed at the search step.
944             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
945             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
946             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
947             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
948         }
949
950         // We only need to calculate virtual velocities if we are writing them in the current step
951         const bool needVirtualVelocitiesThisStep =
952                 (vsite != nullptr)
953                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
954
955         if (vsite != nullptr)
956         {
957             // Virtual sites need to be updated before domain decomposition and forces are calculated
958             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
959             // md-vv calculates virtual velocities once it has full-step real velocities
960             vsite->construct(state->x,
961                              state->v,
962                              state->box,
963                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
964                                      ? VSiteOperation::PositionsAndVelocities
965                                      : VSiteOperation::Positions);
966             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
967         }
968
969         if (bNS && !(bFirstStep && ir->bContinuation))
970         {
971             bMasterState = FALSE;
972             /* Correct the new box if it is too skewed */
973             if (inputrecDynamicBox(ir))
974             {
975                 if (correct_box(fplog, step, state->box))
976                 {
977                     bMasterState = TRUE;
978                     // If update is offloaded, it should be informed about the box size change
979                     if (useGpuForUpdate)
980                     {
981                         integrator->setPbc(PbcType::Xyz, state->box);
982                     }
983                 }
984             }
985             if (DOMAINDECOMP(cr) && bMasterState)
986             {
987                 dd_collect_state(cr->dd, state, state_global);
988             }
989
990             if (DOMAINDECOMP(cr))
991             {
992                 /* Repartition the domain decomposition */
993                 dd_partition_system(fplog,
994                                     mdlog,
995                                     step,
996                                     cr,
997                                     bMasterState,
998                                     nstglobalcomm,
999                                     state_global,
1000                                     top_global,
1001                                     *ir,
1002                                     imdSession,
1003                                     pull_work,
1004                                     state,
1005                                     &f,
1006                                     mdAtoms,
1007                                     &top,
1008                                     fr,
1009                                     vsite,
1010                                     constr,
1011                                     nrnb,
1012                                     wcycle,
1013                                     do_verbose && !bPMETunePrinting);
1014                 upd.updateAfterPartition(state->natoms,
1015                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1016                                                      : gmx::ArrayRef<const unsigned short>(),
1017                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1018                                                  : gmx::ArrayRef<const unsigned short>());
1019             }
1020         }
1021
1022         // Allocate or re-size GPU halo exchange object, if necessary
1023         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1024         {
1025             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1026                                "GPU device manager has to be initialized to use GPU "
1027                                "version of halo exchange.");
1028             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1029         }
1030
1031         if (MASTER(cr) && do_log)
1032         {
1033             gmx::EnergyOutput::printHeader(
1034                     fplog, step, t); /* can we improve the information printed here? */
1035         }
1036
1037         if (ir->efep != FreeEnergyPerturbationType::No)
1038         {
1039             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1040         }
1041
1042         if (bExchanged)
1043         {
1044             /* We need the kinetic energy at minus the half step for determining
1045              * the full step kinetic energy and possibly for T-coupling.*/
1046             /* This may not be quite working correctly yet . . . . */
1047             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1048             if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions())
1049             {
1050                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1051             }
1052             compute_globals(gstat,
1053                             cr,
1054                             ir,
1055                             fr,
1056                             ekind,
1057                             makeConstArrayRef(state->x),
1058                             makeConstArrayRef(state->v),
1059                             state->box,
1060                             md,
1061                             nrnb,
1062                             &vcm,
1063                             wcycle,
1064                             enerd,
1065                             nullptr,
1066                             nullptr,
1067                             nullptr,
1068                             nullptr,
1069                             gmx::ArrayRef<real>{},
1070                             &nullSignaller,
1071                             state->box,
1072                             &bSumEkinhOld,
1073                             cglo_flags,
1074                             step,
1075                             &observablesReducer);
1076             if (DOMAINDECOMP(cr))
1077             {
1078                 dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1079                         &top, makeConstArrayRef(state->x), state->box);
1080             }
1081         }
1082         clear_mat(force_vir);
1083
1084         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1085
1086         /* Determine the energy and pressure:
1087          * at nstcalcenergy steps and at energy output steps (set below).
1088          */
1089         if (EI_VV(ir->eI) && (!bInitStep))
1090         {
1091             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1092             bCalcVir      = bCalcEnerStep
1093                        || (ir->epc != PressureCoupling::No
1094                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1095         }
1096         else
1097         {
1098             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1099             bCalcVir      = bCalcEnerStep
1100                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1101         }
1102         bCalcEner = bCalcEnerStep;
1103
1104         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1105
1106         if (do_ene || do_log || bDoReplEx)
1107         {
1108             bCalcVir  = TRUE;
1109             bCalcEner = TRUE;
1110         }
1111
1112         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1113         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1114                                   && (do_per_step(step, nstfep) || bCalcEner));
1115
1116         /* Do we need global communication ? */
1117         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1118                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1119
1120         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1121                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1122                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1123         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1124         {
1125             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1126             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1127         }
1128
1129         if (shellfc)
1130         {
1131             /* Now is the time to relax the shells */
1132             relax_shell_flexcon(fplog,
1133                                 cr,
1134                                 ms,
1135                                 mdrunOptions.verbose,
1136                                 enforcedRotation,
1137                                 step,
1138                                 ir,
1139                                 imdSession,
1140                                 pull_work,
1141                                 bNS,
1142                                 force_flags,
1143                                 &top,
1144                                 constr,
1145                                 enerd,
1146                                 state->natoms,
1147                                 state->x.arrayRefWithPadding(),
1148                                 state->v.arrayRefWithPadding(),
1149                                 state->box,
1150                                 state->lambda,
1151                                 &state->hist,
1152                                 &f.view(),
1153                                 force_vir,
1154                                 *md,
1155                                 nrnb,
1156                                 wcycle,
1157                                 shellfc,
1158                                 fr,
1159                                 runScheduleWork,
1160                                 t,
1161                                 mu_tot,
1162                                 vsite,
1163                                 ddBalanceRegionHandler);
1164         }
1165         else
1166         {
1167             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1168                is updated (or the AWH update will be performed twice for one step when continuing).
1169                It would be best to call this update function from do_md_trajectory_writing but that
1170                would occur after do_force. One would have to divide the update_awh function into one
1171                function applying the AWH force and one doing the AWH bias update. The update AWH
1172                bias function could then be called after do_md_trajectory_writing (then containing
1173                update_awh_history). The checkpointing will in the future probably moved to the start
1174                of the md loop which will rid of this issue. */
1175             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1176             {
1177                 awh->updateHistory(state_global->awhHistory.get());
1178             }
1179
1180             /* The coordinates (x) are shifted (to get whole molecules)
1181              * in do_force.
1182              * This is parallellized as well, and does communication too.
1183              * Check comments in sim_util.c
1184              */
1185             do_force(fplog,
1186                      cr,
1187                      ms,
1188                      *ir,
1189                      awh.get(),
1190                      enforcedRotation,
1191                      imdSession,
1192                      pull_work,
1193                      step,
1194                      nrnb,
1195                      wcycle,
1196                      &top,
1197                      state->box,
1198                      state->x.arrayRefWithPadding(),
1199                      &state->hist,
1200                      &f.view(),
1201                      force_vir,
1202                      md,
1203                      enerd,
1204                      state->lambda,
1205                      fr,
1206                      runScheduleWork,
1207                      vsite,
1208                      mu_tot,
1209                      t,
1210                      ed ? ed->getLegacyED() : nullptr,
1211                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1212                      ddBalanceRegionHandler);
1213         }
1214
1215         // VV integrators do not need the following velocity half step
1216         // if it is the first step after starting from a checkpoint.
1217         // That is, the half step is needed on all other steps, and
1218         // also the first step when starting from a .tpr file.
1219         if (EI_VV(ir->eI))
1220         {
1221             integrateVVFirstStep(step,
1222                                  bFirstStep,
1223                                  bInitStep,
1224                                  startingBehavior,
1225                                  nstglobalcomm,
1226                                  ir,
1227                                  fr,
1228                                  cr,
1229                                  state,
1230                                  mdAtoms->mdatoms(),
1231                                  &fcdata,
1232                                  &MassQ,
1233                                  &vcm,
1234                                  top,
1235                                  enerd,
1236                                  &observablesReducer,
1237                                  ekind,
1238                                  gstat,
1239                                  &last_ekin,
1240                                  bCalcVir,
1241                                  total_vir,
1242                                  shake_vir,
1243                                  force_vir,
1244                                  pres,
1245                                  M,
1246                                  do_log,
1247                                  do_ene,
1248                                  bCalcEner,
1249                                  bGStat,
1250                                  bStopCM,
1251                                  bTrotter,
1252                                  bExchanged,
1253                                  &bSumEkinhOld,
1254                                  &saved_conserved_quantity,
1255                                  &f,
1256                                  &upd,
1257                                  constr,
1258                                  &nullSignaller,
1259                                  trotter_seq,
1260                                  nrnb,
1261                                  fplog,
1262                                  wcycle);
1263             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1264             {
1265                 // Positions were calculated earlier
1266                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1267                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1268                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1269             }
1270         }
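             /* Schematic reminder (textbook form, not authoritative for every VV variant): the block
              * above completes the velocities to the full step with the freshly computed forces,
              *     v(t) = v(t - dt/2) + (dt/2) * F(t) / m,
              * which is why the marker below states that we now have v(dt); the first half-kick of
              * the next interval is performed in the second update step further down. */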
1271
1272         /* ########  END FIRST UPDATE STEP  ############## */
1273         /* ########  If doing VV, we now have v(dt) ###### */
1274         if (bDoExpanded)
1275         {
1276             /* Perform extended ensemble sampling in lambda - we don't
1277                actually move to the new state before outputting
1278                statistics, but if performing simulated tempering, we
1279                do update the velocities and tau_t. */
1280             // TODO: Avoid changing inputrec (#3854)
1281             // Simulated tempering updates the reference temperature.
1282             // Expanded ensemble without simulated tempering does not change the inputrec.
1283             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1284             lamnew                 = ExpandedEnsembleDynamics(fplog,
1285                                               nonConstInputrec,
1286                                               enerd,
1287                                               state,
1288                                               &MassQ,
1289                                               state->fep_state,
1290                                               state->dfhist,
1291                                               step,
1292                                               state->v.rvec_array(),
1293                                               md->homenr,
1294                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1295                                                                       : gmx::ArrayRef<const unsigned short>());
1296             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1297             if (MASTER(cr))
1298             {
1299                 copy_df_history(state_global->dfhist, state->dfhist);
1300             }
1301         }
1302
1303         // Copy coordinates from the GPU for output/checkpointing if the update is offloaded and
1304         // the coordinates have not already been copied for i) search or ii) CPU force tasks.
1305         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1306             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1307                 || checkpointHandler->isCheckpointingStep()))
1308         {
1309             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1310             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1311         }
1312         // Copy velocities if needed for the output/checkpointing.
1313         // NOTE: Copy on the search steps is done at the beginning of the step.
1314         if (useGpuForUpdate && !bNS
1315             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1316         {
1317             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1318             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1319         }
1320         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1321         // and the update is offloaded, so the forces are kept on the GPU for the update and have not
1322         // already been transferred in do_force().
1323         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1324         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1325         //       prior to the GPU update.
1326         // TODO: When the output flags are included in the step workload, this copy can be combined with
1327         //       the copy call in do_force(...).
1328         // NOTE: The forces should not be copied here if vsites are present, since they were modified
1329         //       on the host after the D2H copy in do_force(...).
1330         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1331             && do_per_step(step, ir->nstfout))
1332         {
1333             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1334             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1335         }
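             // For reference, a sketch of the step predicate gating the three D2H copies above
             // (assumed to match do_per_step(); illustrative only, not part of the build):
             //
             //     static bool doPerStepSketch(int64_t step, int64_t n)
             //     {
             //         return (n != 0) && (step % n == 0); // true on every n-th step
             //     }
             //
             // e.g. with an assumed nstxout of 1000, the coordinate copy runs only every 1000 steps.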
1336         /* Now we have the energies and forces corresponding to the
1337          * coordinates at time t. We must output all of this before
1338          * the update.
1339          */
1340         do_md_trajectory_writing(fplog,
1341                                  cr,
1342                                  nfile,
1343                                  fnm,
1344                                  step,
1345                                  step_rel,
1346                                  t,
1347                                  ir,
1348                                  state,
1349                                  state_global,
1350                                  observablesHistory,
1351                                  top_global,
1352                                  fr,
1353                                  outf,
1354                                  energyOutput,
1355                                  ekind,
1356                                  f.view().force(),
1357                                  checkpointHandler->isCheckpointingStep(),
1358                                  bRerunMD,
1359                                  bLastStep,
1360                                  mdrunOptions.writeConfout,
1361                                  bSumEkinhOld);
1362         /* Check whether this is an IMD step and, if so, do the IMD communication. */
1363         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1364
1365         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1366         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1367             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1368         {
1369             copy_mat(state->svir_prev, shake_vir);
1370             copy_mat(state->fvir_prev, force_vir);
1371         }
1372
1373         stopHandler->setSignal();
1374         resetHandler->setSignal(walltime_accounting);
1375
1376         if (bGStat || !PAR(cr))
1377         {
1378             /* In parallel we only have to check for checkpointing on steps
1379              * where we do global communication,
1380              * otherwise the other ranks don't know about it.
1381              */
1382             checkpointHandler->setSignal(walltime_accounting);
1383         }
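             // Illustration with assumed numbers: if nstglobalcomm is 100, a checkpoint signal set
             // locally at step 150 is only summed across ranks, and thus acted upon, at step 200,
             // the next step on which bGStat is true.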
1384
1385         /* #########   START SECOND UPDATE STEP ################# */
1386
1387         /* At the start of the step, randomize or scale the velocities (if doing VV).
1388            The restrictions on Andersen coupling are checked during preprocessing. */
1389
1390         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1391         {
1392             gmx_bool bIfRandomize;
1393             bIfRandomize = update_randomize_velocities(ir,
1394                                                        step,
1395                                                        cr,
1396                                                        md->homenr,
1397                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1398                                                                : gmx::ArrayRef<const unsigned short>(),
1399                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1400                                                        state->v,
1401                                                        &upd,
1402                                                        constr);
1403             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1404             if (constr && bIfRandomize)
1405             {
1406                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1407             }
1408         }
1409         /* Box is changed in update() when we do pressure coupling,
1410          * but we should still use the old box for energy corrections and when
1411          * writing it to the energy file, so it matches the trajectory files for
1412          * the same timestep above. Make a copy in a separate array.
1413          */
1414         copy_mat(state->box, lastbox);
1415
1416         dvdl_constr = 0;
1417
1418         if (!useGpuForUpdate)
1419         {
1420             wallcycle_start(wcycle, WallCycleCounter::Update);
1421         }
1422         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1423         if (bTrotter)
1424         {
1425             trotter_update(ir,
1426                            step,
1427                            ekind,
1428                            enerd,
1429                            state,
1430                            total_vir,
1431                            md->homenr,
1432                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1433                                    : gmx::ArrayRef<const unsigned short>(),
1434                            gmx::arrayRefFromArray(md->invmass, md->nr),
1435                            &MassQ,
1436                            trotter_seq,
1437                            TrotterSequence::Three);
1438             /* We can only do Berendsen coupling after we have summed
1439              * the kinetic energy or virial. Since this happens
1440              * in the global-state communication after the update, we should only do it at
1441              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1442              */
1443         }
1444         else
1445         {
1446             update_tcouple(step,
1447                            ir,
1448                            state,
1449                            ekind,
1450                            &MassQ,
1451                            md->homenr,
1452                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1453                                    : gmx::ArrayRef<const unsigned short>());
1454             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1455         }
1456
1457         /* With leap-frog type integrators we compute the kinetic energy
1458          * at a whole time step as the average of the half-step kinetic
1459          * energies of two subsequent steps. Therefore we also need to compute the
1460          * half-step kinetic energy when we need energies at the next step.
1461          */
1462         const bool needHalfStepKineticEnergy =
1463                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
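             /* Schematically, the whole-step kinetic energy for leap-frog is the average
              *     Ekin(t) = 0.5 * ( Ekin(t - dt/2) + Ekin(t + dt/2) ),
              * so the half-step kinetic energy has to be available one step before the energies are needed. */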
1464
1465         // Parrinello-Rahman requires the pressure to be available before the update to compute
1466         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1467         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1468                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
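             /* Worked example with an assumed nstpcouple of 10: do_per_step(step + 9, 10) holds at
              * steps 1, 11, 21, ..., i.e. one step after the steps 0, 10, 20, ... on which the
              * pressure itself becomes available. */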
1469
1470         if (EI_VV(ir->eI))
1471         {
1472             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1473
1474             integrateVVSecondStep(step,
1475                                   ir,
1476                                   fr,
1477                                   cr,
1478                                   state,
1479                                   mdAtoms->mdatoms(),
1480                                   &fcdata,
1481                                   &MassQ,
1482                                   &vcm,
1483                                   pull_work,
1484                                   enerd,
1485                                   &observablesReducer,
1486                                   ekind,
1487                                   gstat,
1488                                   &dvdl_constr,
1489                                   bCalcVir,
1490                                   total_vir,
1491                                   shake_vir,
1492                                   force_vir,
1493                                   pres,
1494                                   M,
1495                                   lastbox,
1496                                   do_log,
1497                                   do_ene,
1498                                   bGStat,
1499                                   &bSumEkinhOld,
1500                                   &f,
1501                                   &cbuf,
1502                                   &upd,
1503                                   constr,
1504                                   &nullSignaller,
1505                                   trotter_seq,
1506                                   nrnb,
1507                                   wcycle);
1508         }
1509         else
1510         {
1511             if (useGpuForUpdate)
1512             {
1513                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1514                 {
1515                     integrator->set(stateGpu->getCoordinates(),
1516                                     stateGpu->getVelocities(),
1517                                     stateGpu->getForces(),
1518                                     top.idef,
1519                                     *md);
1520
1521                     // Copy data to the GPU after the buffers might have been reinitialized.
1522                     /* The velocity copy is redundant if center-of-mass motion was removed on
1523                      * the previous step. We don't check for that here. */
1524                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1525                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1526                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1527                     {
1528                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1529                     }
1530                 }
1531
1532                 if (simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1533                 {
1534                     // The PME forces were received on the host, so they have to be copied
1535                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1536                 }
1537                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1538                 {
1539                     // The buffer ops were not offloaded this step, so the forces are on the
1540                     // host and have to be copied
1541                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1542                 }
1543
1544                 const bool doTemperatureScaling =
1545                         (ir->etc != TemperatureCoupling::No
1546                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1547
1548                 // This applies Leap-Frog, LINCS and SETTLE in succession
1549                 integrator->integrate(
1550                         stateGpu->getForcesReadyOnDeviceEvent(
1551                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1552                         ir->delta_t,
1553                         true,
1554                         bCalcVir,
1555                         shake_vir,
1556                         doTemperatureScaling,
1557                         ekind->tcstat,
1558                         doParrinelloRahman,
1559                         ir->nstpcouple * ir->delta_t,
1560                         M);
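                     /* Schematically (before LINCS/SETTLE constrain the result), the step above is
                      *     v(t + dt/2) = lambda_T * v(t - dt/2) + dt * F(t) / m
                      *     x(t + dt)   = x(t) + dt * v(t + dt/2)
                      * with lambda_T the temperature-coupling scaling factor (only when doTemperatureScaling)
                      * and an additional Parrinello-Rahman velocity correction derived from M
                      * (only when doParrinelloRahman). */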
1561
1562                 // Copy velocities D2H after update if:
1563                 // - Globals are computed this step (includes the energy output steps).
1564                 // - Temperature is needed for the next step.
1565                 if (bGStat || needHalfStepKineticEnergy)
1566                 {
1567                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1568                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1569                 }
1570             }
1571             else
1572             {
1573                 /* With multiple time stepping we need to do an additional normal
1574                  * update step to obtain the virial, as the actual MTS integration
1575                  * uses an acceleration in which the slow forces are multiplied by mtsFactor.
1576                  * Using that acceleration would result in a virial whose slow-force
1577                  * contribution is a factor mtsFactor too large.
1578                  */
1579                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1580                 {
1581                     upd.update_for_constraint_virial(*ir,
1582                                                      md->homenr,
1583                                                      md->havePartiallyFrozenAtoms,
1584                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1585                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1586                                                      *state,
1587                                                      f.view().forceWithPadding(),
1588                                                      *ekind);
1589
1590                     constrain_coordinates(constr,
1591                                           do_log,
1592                                           do_ene,
1593                                           step,
1594                                           state,
1595                                           upd.xp()->arrayRefWithPadding(),
1596                                           &dvdl_constr,
1597                                           bCalcVir,
1598                                           shake_vir);
1599                 }
1600
1601                 ArrayRefWithPadding<const RVec> forceCombined =
1602                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1603                                 ? f.view().forceMtsCombinedWithPadding()
1604                                 : f.view().forceWithPadding();
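                     /* With an assumed mtsFactor of 2 for illustration: on steps where
                      * step % mtsLevels[1].stepFactor == 0 the combined buffer holds f_fast + 2 * f_slow,
                      * while the remaining steps use f_fast only, so each contribution enters the
                      * integration once per step on average. */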
1605                 upd.update_coords(*ir,
1606                                   step,
1607                                   md->homenr,
1608                                   md->havePartiallyFrozenAtoms,
1609                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1610                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1611                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1612                                   state,
1613                                   forceCombined,
1614                                   &fcdata,
1615                                   ekind,
1616                                   M,
1617                                   etrtPOSITION,
1618                                   cr,
1619                                   constr != nullptr);
1620
1621                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1622
1623                 constrain_coordinates(constr,
1624                                       do_log,
1625                                       do_ene,
1626                                       step,
1627                                       state,
1628                                       upd.xp()->arrayRefWithPadding(),
1629                                       &dvdl_constr,
1630                                       bCalcVir && !simulationWork.useMts,
1631                                       shake_vir);
1632
1633                 upd.update_sd_second_half(*ir,
1634                                           step,
1635                                           &dvdl_constr,
1636                                           md->homenr,
1637                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1638                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1639                                           state,
1640                                           cr,
1641                                           nrnb,
1642                                           wcycle,
1643                                           constr,
1644                                           do_log,
1645                                           do_ene);
1646                 upd.finish_update(
1647                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1648             }
1649
1650             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1651             {
1652                 updatePrevStepPullCom(pull_work, state);
1653             }
1654
1655             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1656         }
1657
1658         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1659         /* With Leap-Frog we can skip compute_globals at
1660          * non-communication steps, but we need to calculate
1661          * the kinetic energy one step before communication.
1662          */
1663         {
1664             // Organize to do inter-simulation signalling on steps if
1665             // and when algorithms require it.
1666             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1667
1668             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1669             {
1670                 // Copy coordinates when needed to stop the CM motion.
1671                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1672                 {
1673                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1674                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1675                 }
1676                 // Since we're already communicating at this step, we
1677                 // can propagate intra-simulation signals. Note that
1678                 // check_nstglobalcomm is responsible for choosing the
1679                 // value of nstglobalcomm, which is one way in which
1680                 // bGStat becomes true, so we can't get into a
1681                 // situation where e.g. checkpointing can't be
1682                 // signalled.
1683                 bool                doIntraSimSignal = true;
1684                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1685
1686                 compute_globals(
1687                         gstat,
1688                         cr,
1689                         ir,
1690                         fr,
1691                         ekind,
1692                         makeConstArrayRef(state->x),
1693                         makeConstArrayRef(state->v),
1694                         state->box,
1695                         md,
1696                         nrnb,
1697                         &vcm,
1698                         wcycle,
1699                         enerd,
1700                         force_vir,
1701                         shake_vir,
1702                         total_vir,
1703                         pres,
1704                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1705                                                                            : gmx::ArrayRef<real>{},
1706                         &signaller,
1707                         lastbox,
1708                         &bSumEkinhOld,
1709                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1710                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1711                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1712                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1713                                 | (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
1714                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1715                                            : 0),
1716                         step,
1717                         &observablesReducer);
1718                 if (DOMAINDECOMP(cr))
1719                 {
1720                     dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1721                             &top, makeConstArrayRef(state->x), state->box);
1722                 }
1723                 if (!EI_VV(ir->eI) && bStopCM)
1724                 {
1725                     process_and_stopcm_grp(
1726                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1727                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1728
1729                     // TODO: The special case of removing CM motion should be dealt with more gracefully
1730                     if (useGpuForUpdate)
1731                     {
1732                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1733                         // Here we block until the H2D copy completes because event sync with the
1734                         // force kernels that use the coordinates on the next steps is not implemented
1735                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1736                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1737                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1738                         if (vcm.mode != ComRemovalAlgorithm::No)
1739                         {
1740                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1741                         }
1742                     }
1743                 }
1744             }
1745         }
1746
1747         /* #############  END CALC EKIN AND PRESSURE ################# */
1748
1749         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1750            the virial that should probably be addressed eventually. state->veta has better properties,
1751            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1752            generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
1753
1754         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1755         {
1756             /* Sum up the foreign energy and dK/dl terms for md and sd.
1757                Currently done every step so that dH/dl is correct in the .edr */
1758             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1759         }
1760
1761         update_pcouple_after_coordinates(fplog,
1762                                          step,
1763                                          ir,
1764                                          md->homenr,
1765                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1766                                                      : gmx::ArrayRef<const unsigned short>(),
1767                                          pres,
1768                                          force_vir,
1769                                          shake_vir,
1770                                          pressureCouplingMu,
1771                                          state,
1772                                          nrnb,
1773                                          upd.deform(),
1774                                          !useGpuForUpdate);
1775
1776         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1777                                                   && do_per_step(step, inputrec->nstpcouple));
1778         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1779                                                  && do_per_step(step, inputrec->nstpcouple));
1780         if (useGpuForUpdate
1781             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1782         {
1783             integrator->scaleCoordinates(pressureCouplingMu);
1784             if (doCRescalePressureCoupling)
1785             {
1786                 matrix pressureCouplingInvMu;
1787                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1788                 integrator->scaleVelocities(pressureCouplingInvMu);
1789             }
1790             integrator->setPbc(PbcType::Xyz, state->box);
1791         }
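             /* Schematically, the scaling above is x' = mu * x, matching the box scaling applied in
              * update_pcouple_after_coordinates() just above; for C-rescale the velocities are additionally
              * rescaled as v' = mu^-1 * v, which is why the inverse of pressureCouplingMu is computed. */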
1792
1793         /* ################# END UPDATE STEP 2 ################# */
1794         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1795
1796         /* The coordinates (x) were unshifted in update */
1797         if (!bGStat)
1798         {
1799             /* We will not sum ekinh_old,
1800              * so signal that we still have to do it.
1801              */
1802             bSumEkinhOld = TRUE;
1803         }
1804
1805         if (bCalcEner)
1806         {
1807             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1808
1809             /* use the directly determined last velocity, not the average of the two half steps */
1810             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1811             {
1812                 enerd->term[F_EKIN] = last_ekin;
1813             }
1814             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1815
1816             if (integratorHasConservedEnergyQuantity(ir))
1817             {
1818                 if (EI_VV(ir->eI))
1819                 {
1820                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1821                 }
1822                 else
1823                 {
1824                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1825                 }
1826             }
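                 /* Schematically, the bookkeeping above amounts to
                  *     E_conserved = E_pot + E_kin + E_coupling,
                  * where the coupling term is saved_conserved_quantity on the VV path and
                  * NPT_energy(...) on the leap-frog path. */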
1827             /* #########  END PREPARING EDR OUTPUT  ###########  */
1828         }
1829
1830         /* Output stuff */
1831         if (MASTER(cr))
1832         {
1833             if (fplog && do_log && bDoExpanded)
1834             {
1835                 /* only needed if doing expanded ensemble */
1836                 PrintFreeEnergyInfoToFile(fplog,
1837                                           ir->fepvals.get(),
1838                                           ir->expandedvals.get(),
1839                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1840                                           state_global->dfhist,
1841                                           state->fep_state,
1842                                           ir->nstlog,
1843                                           step);
1844             }
1845             if (bCalcEner)
1846             {
1847                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1848
1849                 energyOutput.addDataAtEnergyStep(outputDHDL,
1850                                                  bCalcEnerStep,
1851                                                  t,
1852                                                  md->tmass,
1853                                                  enerd,
1854                                                  ir->fepvals.get(),
1855                                                  ir->expandedvals.get(),
1856                                                  lastbox,
1857                                                  PTCouplingArrays{ state->boxv,
1858                                                                    state->nosehoover_xi,
1859                                                                    state->nosehoover_vxi,
1860                                                                    state->nhpres_xi,
1861                                                                    state->nhpres_vxi },
1862                                                  state->fep_state,
1863                                                  total_vir,
1864                                                  pres,
1865                                                  ekind,
1866                                                  mu_tot,
1867                                                  constr);
1868             }
1869             else
1870             {
1871                 energyOutput.recordNonEnergyStep();
1872             }
1873
1874             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1875             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1876
1877             if (doSimulatedAnnealing)
1878             {
1879                 gmx::EnergyOutput::printAnnealingTemperatures(
1880                         do_log ? fplog : nullptr, groups, &(ir->opts));
1881             }
1882             if (do_log || do_ene || do_dr || do_or)
1883             {
1884                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1885                                                    do_ene,
1886                                                    do_dr,
1887                                                    do_or,
1888                                                    do_log ? fplog : nullptr,
1889                                                    step,
1890                                                    t,
1891                                                    fr->fcdata.get(),
1892                                                    awh.get());
1893             }
1894             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1895             {
1896                 const bool isInitialOutput = false;
1897                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1898             }
1899
1900             if (ir->bPull)
1901             {
1902                 pull_print_output(pull_work, step, t);
1903             }
1904
1905             if (do_per_step(step, ir->nstlog))
1906             {
1907                 if (fflush(fplog) != 0)
1908                 {
1909                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1910                 }
1911             }
1912         }
1913         if (bDoExpanded)
1914         {
1915             /* Have to do this part _after_ outputting the logfile and the edr file */
1916             /* Gets written into the state at the beginning of the next loop iteration */
1917             state->fep_state = lamnew;
1918         }
1919         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1920         {
1921             state->fep_state = awh->fepLambdaState();
1922         }
1923         /* Print the remaining wall clock time for the run */
1924         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1925         {
1926             if (shellfc)
1927             {
1928                 fprintf(stderr, "\n");
1929             }
1930             print_time(stderr, walltime_accounting, step, ir, cr);
1931         }
1932
1933         /* Ion/water position swapping.
1934          * Not done on the last step since trajectory writing happens before this call
1935          * in the MD loop and exchanges would be lost anyway. */
1936         bNeedRepartition = FALSE;
1937         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1938             && do_per_step(step, ir->swap->nstswap))
1939         {
1940             bNeedRepartition = do_swapcoords(cr,
1941                                              step,
1942                                              t,
1943                                              ir,
1944                                              swap,
1945                                              wcycle,
1946                                              as_rvec_array(state->x.data()),
1947                                              state->box,
1948                                              MASTER(cr) && mdrunOptions.verbose,
1949                                              bRerunMD);
1950
1951             if (bNeedRepartition && DOMAINDECOMP(cr))
1952             {
1953                 dd_collect_state(cr->dd, state, state_global);
1954             }
1955         }
1956
1957         /* Replica exchange */
1958         bExchanged = FALSE;
1959         if (bDoReplEx)
1960         {
1961             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1962         }
1963
1964         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1965         {
1966             dd_partition_system(fplog,
1967                                 mdlog,
1968                                 step,
1969                                 cr,
1970                                 TRUE,
1971                                 1,
1972                                 state_global,
1973                                 top_global,
1974                                 *ir,
1975                                 imdSession,
1976                                 pull_work,
1977                                 state,
1978                                 &f,
1979                                 mdAtoms,
1980                                 &top,
1981                                 fr,
1982                                 vsite,
1983                                 constr,
1984                                 nrnb,
1985                                 wcycle,
1986                                 FALSE);
1987             upd.updateAfterPartition(state->natoms,
1988                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1989                                                  : gmx::ArrayRef<const unsigned short>(),
1990                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1991                                              : gmx::ArrayRef<const unsigned short>());
1992         }
1993
1994         bFirstStep = FALSE;
1995         bInitStep  = FALSE;
1996
1997         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1998         /* With all integrators, except VV, we need to retain the pressure
1999          * at the current step for coupling at the next step.
2000          */
2001         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
2002             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
2003         {
2004             /* Store the pressure in t_state for pressure coupling
2005              * at the next MD step.
2006              */
2007             copy_mat(pres, state->pres_prev);
2008         }
2009
2010         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2011
2012         if ((membed != nullptr) && (!bLastStep))
2013         {
2014             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2015         }
2016
2017         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2018         if (DOMAINDECOMP(cr) && wcycle)
2019         {
2020             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2021         }
2022
2023         /* increase the MD step number */
2024         step++;
2025         step_rel++;
2026
2027 #if GMX_FAHCORE
2028         if (MASTER(cr))
2029         {
2030             fcReportProgress(ir->nsteps + ir->init_step, step);
2031         }
2032 #endif
2033
2034         resetHandler->resetCounters(
2035                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2036
2037         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to the VMD client */
2038         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2039     }
2040     /* End of main MD loop */
2041
2042     /* Closing TNG files can include compressing data. Therefore it is good to do that
2043      * before stopping the time measurements. */
2044     mdoutf_tng_close(outf);
2045
2046     /* Stop measuring walltime */
2047     walltime_accounting_end_time(walltime_accounting);
2048
2049     if (simulationWork.haveSeparatePmeRank)
2050     {
2051         /* Tell the PME-only ranks to finish */
2052         gmx_pme_send_finish(cr);
2053     }
2054
2055     if (MASTER(cr))
2056     {
2057         if (ir->nstcalcenergy > 0)
2058         {
2059             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2060
2061             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2062             energyOutput.printAverages(fplog, groups);
2063         }
2064     }
2065     done_mdoutf(outf);
2066
2067     if (bPMETune)
2068     {
2069         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2070     }
2071
2072     done_shellfc(fplog, shellfc, step_rel);
2073
2074     if (useReplicaExchange && MASTER(cr))
2075     {
2076         print_replica_exchange_statistics(fplog, repl_ex);
2077     }
2078
2079     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2080
2081     global_stat_destroy(gstat);
2082 }