Balance event consumption for GPU update code path
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/observablesreducer.h"
130 #include "gromacs/mdtypes/pullhistory.h"
131 #include "gromacs/mdtypes/simulation_workload.h"
132 #include "gromacs/mdtypes/state.h"
133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
134 #include "gromacs/modularsimulator/energydata.h"
135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
136 #include "gromacs/nbnxm/nbnxm.h"
137 #include "gromacs/pbcutil/pbc.h"
138 #include "gromacs/pulling/output.h"
139 #include "gromacs/pulling/pull.h"
140 #include "gromacs/swap/swapcoords.h"
141 #include "gromacs/timing/wallcycle.h"
142 #include "gromacs/timing/walltime_accounting.h"
143 #include "gromacs/topology/atoms.h"
144 #include "gromacs/topology/idef.h"
145 #include "gromacs/topology/mtop_util.h"
146 #include "gromacs/topology/topology.h"
147 #include "gromacs/trajectory/trajectoryframe.h"
148 #include "gromacs/utility/basedefinitions.h"
149 #include "gromacs/utility/cstringutil.h"
150 #include "gromacs/utility/fatalerror.h"
151 #include "gromacs/utility/logger.h"
152 #include "gromacs/utility/real.h"
153 #include "gromacs/utility/smalloc.h"
154
155 #include "legacysimulator.h"
156 #include "replicaexchange.h"
157 #include "shellfc.h"
158
159 using gmx::SimulationSignaller;
160
161 void gmx::LegacySimulator::do_md()
162 {
163     // TODO Historically, the EM and MD "integrators" used different
164     // names for the t_inputrec *parameter, but these must have the
165     // same name, now that it's a member of a struct. We use this ir
166     // alias to avoid a large ripple of nearly useless changes.
167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
168     // will go away eventually.
169     const t_inputrec* ir = inputrec;
170
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  haveDDAtomOrdering(*cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
361
362     ForceBuffers     f(simulationWork.useMts,
363                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
364                                ? PinningPolicy::PinnedIfSupported
365                                : PinningPolicy::CannotBePinned);
366     const t_mdatoms* md = mdAtoms->mdatoms();
367     if (haveDDAtomOrdering(*cr))
368     {
369         // Local state only becomes valid now.
370         dd_init_local_state(*cr->dd, state_global, state);
371
372         /* Distribute the charge groups over the nodes from the master node */
373         dd_partition_system(fplog,
374                             mdlog,
375                             ir->init_step,
376                             cr,
377                             TRUE,
378                             1,
379                             state_global,
380                             top_global,
381                             *ir,
382                             imdSession,
383                             pull_work,
384                             state,
385                             &f,
386                             mdAtoms,
387                             top,
388                             fr,
389                             vsite,
390                             constr,
391                             nrnb,
392                             nullptr,
393                             FALSE);
394         upd.updateAfterPartition(state->natoms,
395                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
396                                              : gmx::ArrayRef<const unsigned short>(),
397                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
398                                          : gmx::ArrayRef<const unsigned short>());
399         fr->longRangeNonbondeds->updateAfterPartition(*md);
400     }
401     else
402     {
403         state_change_natoms(state_global, state_global->natoms);
404
405         /* Generate and initialize new topology */
406         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
407
408         upd.updateAfterPartition(state->natoms,
409                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
410                                              : gmx::ArrayRef<const unsigned short>(),
411                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
412                                          : gmx::ArrayRef<const unsigned short>());
413         fr->longRangeNonbondeds->updateAfterPartition(*md);
414     }
415
416     std::unique_ptr<UpdateConstrainGpu> integrator;
417
418     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
419
420     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
421     if (useGpuForUpdate)
422     {
423         GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
424                                    || constr == nullptr || constr->numConstraintsTotal() == 0,
425                            "Constraints in domain decomposition are only supported with update "
426                            "groups if using GPU update.\n");
427         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
428                                    || constr->numConstraintsTotal() == 0,
429                            "SHAKE is not supported with GPU update.");
430         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
431                            "Either PME or short-ranged non-bonded interaction tasks must run on "
432                            "the GPU to use GPU update.\n");
433         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
434                            "Only the md integrator is supported with the GPU update.\n");
435         GMX_RELEASE_ASSERT(
436                 ir->etc != TemperatureCoupling::NoseHoover,
437                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
438         GMX_RELEASE_ASSERT(
439                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
440                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
441                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
442                 "with the GPU update.\n");
443         GMX_RELEASE_ASSERT(!md->haveVsites,
444                            "Virtual sites are not supported with the GPU update.\n");
445         GMX_RELEASE_ASSERT(ed == nullptr,
446                            "Essential dynamics is not supported with the GPU update.\n");
447         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
448                            "Constraints pulling is not supported with the GPU update.\n");
449         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
450                            "Orientation restraints are not supported with the GPU update.\n");
451         GMX_RELEASE_ASSERT(
452                 ir->efep == FreeEnergyPerturbationType::No
453                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
454                 "Free energy perturbation of masses and constraints are not supported with the GPU "
455                 "update.");
456
457         if (constr != nullptr && constr->numConstraintsTotal() > 0)
458         {
459             GMX_LOG(mdlog.info)
460                     .asParagraph()
461                     .appendText("Updating coordinates and applying constraints on the GPU.");
462         }
463         else
464         {
465             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
466         }
467         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
468                            "Device stream manager should be initialized in order to use GPU "
469                            "update-constraints.");
470         GMX_RELEASE_ASSERT(
471                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
472                 "Update stream should be initialized in order to use GPU "
473                 "update-constraints.");
474         integrator = std::make_unique<UpdateConstrainGpu>(
475                 *ir,
476                 top_global,
477                 ekind->ngtc,
478                 fr->deviceStreamManager->context(),
479                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
480                 wcycle);
481
482         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
483
484         integrator->setPbc(PbcType::Xyz, state->box);
485     }
486
487     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
488     {
489         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
490     }
491     if (useGpuForUpdate)
492     {
493         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
494     }
495
496     // NOTE: The global state is no longer used at this point.
497     // But state_global is still used as temporary storage space for writing
498     // the global state to file and potentially for replica exchange.
499     // (Global topology should persist.)
500
501     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
502
503     if (ir->bExpanded)
504     {
505         /* Check nstexpanded here, because the grompp check was broken */
506         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
507         {
508             gmx_fatal(FARGS,
509                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
510         }
511         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
512     }
513
514     if (MASTER(cr))
515     {
516         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
517     }
518
519     preparePrevStepPullCom(ir,
520                            pull_work,
521                            gmx::arrayRefFromArray(md->massT, md->nr),
522                            state,
523                            state_global,
524                            cr,
525                            startingBehavior != StartingBehavior::NewSimulation);
526
527     // TODO: Remove this by converting AWH into a ForceProvider
528     auto awh = prepareAwhModule(fplog,
529                                 *ir,
530                                 state_global,
531                                 cr,
532                                 ms,
533                                 startingBehavior != StartingBehavior::NewSimulation,
534                                 shellfc != nullptr,
535                                 opt2fn("-awh", nfile, fnm),
536                                 pull_work);
537
538     if (useReplicaExchange && MASTER(cr))
539     {
540         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
541     }
542     /* PME tuning is only supported in the Verlet scheme, with PME for
543      * Coulomb. It is not supported with only LJ PME. */
544     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
545                 && ir->cutoff_scheme != CutoffScheme::Group);
546
547     pme_load_balancing_t* pme_loadbal = nullptr;
548     if (bPMETune)
549     {
550         pme_loadbal_init(
551                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
552     }
553
554     if (!ir->bContinuation)
555     {
556         if (state->flags & enumValueToBitMask(StateEntry::V))
557         {
558             auto v = makeArrayRef(state->v);
559             /* Set the velocities of vsites, shells and frozen atoms to zero */
560             for (i = 0; i < md->homenr; i++)
561             {
562                 if (md->ptype[i] == ParticleType::Shell)
563                 {
564                     clear_rvec(v[i]);
565                 }
566                 else if (md->cFREEZE)
567                 {
568                     for (m = 0; m < DIM; m++)
569                     {
570                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
571                         {
572                             v[i][m] = 0;
573                         }
574                     }
575                 }
576             }
577         }
578
579         if (constr)
580         {
581             /* Constrain the initial coordinates and velocities */
582             do_constrain_first(fplog,
583                                constr,
584                                ir,
585                                md->nr,
586                                md->homenr,
587                                state->x.arrayRefWithPadding(),
588                                state->v.arrayRefWithPadding(),
589                                state->box,
590                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
591         }
592     }
593
594     const int nstfep = computeFepPeriod(*ir, replExParams);
595
596     /* Be REALLY careful about what flags you set here. You CANNOT assume
597      * this is the first step, since we might be restarting from a checkpoint,
598      * and in that case we should not do any modifications to the state.
599      */
600     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
601
602     // When restarting from a checkpoint, it can be appropriate to
603     // initialize ekind from quantities in the checkpoint. Otherwise,
604     // compute_globals must initialize ekind before the simulation
605     // starts/restarts. However, only the master rank knows what was
606     // found in the checkpoint file, so we have to communicate in
607     // order to coordinate the restart.
608     //
609     // TODO Consider removing this communication if/when checkpoint
610     // reading directly follows .tpr reading, because all ranks can
611     // agree on hasReadEkinState at that time.
612     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
613     if (PAR(cr))
614     {
615         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
616     }
617     if (hasReadEkinState)
618     {
619         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
620     }
621
622     unsigned int cglo_flags =
623             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
624              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
625
626     bSumEkinhOld = FALSE;
627
628     t_vcm vcm(top_global.groups, *ir);
629     reportComRemovalInfo(fplog, vcm);
630
631     int64_t step     = ir->init_step;
632     int64_t step_rel = 0;
633
634     /* To minimize communication, compute_globals computes the COM velocity
635      * and the kinetic energy for the velocities without COM motion removed.
636      * Thus to get the kinetic energy without the COM contribution, we need
637      * to call compute_globals twice.
638      */
639     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
640     {
641         unsigned int cglo_flags_iteration = cglo_flags;
642         if (bStopCM && cgloIteration == 0)
643         {
644             cglo_flags_iteration |= CGLO_STOPCM;
645             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
646         }
647         compute_globals(gstat,
648                         cr,
649                         ir,
650                         fr,
651                         ekind,
652                         makeConstArrayRef(state->x),
653                         makeConstArrayRef(state->v),
654                         state->box,
655                         md,
656                         nrnb,
657                         &vcm,
658                         nullptr,
659                         enerd,
660                         force_vir,
661                         shake_vir,
662                         total_vir,
663                         pres,
664                         &nullSignaller,
665                         state->box,
666                         &bSumEkinhOld,
667                         cglo_flags_iteration,
668                         step,
669                         &observablesReducer);
670         // Clean up after pre-step use of compute_globals()
671         observablesReducer.markAsReadyToReduce();
672
673         if (cglo_flags_iteration & CGLO_STOPCM)
674         {
675             /* At initialization, do not pass x with acceleration-correction mode
676              * to avoid (incorrect) correction of the initial coordinates.
677              */
678             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
679                              ? ArrayRef<RVec>()
680                              : makeArrayRef(state->x);
681             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
682             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
683         }
684     }
685     if (ir->eI == IntegrationAlgorithm::VVAK)
686     {
687         /* a second call to get the half step temperature initialized as well */
688         /* we do the same call as above, but turn the pressure off -- internally to
689            compute_globals, this is recognized as a velocity verlet half-step
690            kinetic energy calculation.  This minimized excess variables, but
691            perhaps loses some logic?*/
692
693         compute_globals(gstat,
694                         cr,
695                         ir,
696                         fr,
697                         ekind,
698                         makeConstArrayRef(state->x),
699                         makeConstArrayRef(state->v),
700                         state->box,
701                         md,
702                         nrnb,
703                         &vcm,
704                         nullptr,
705                         enerd,
706                         force_vir,
707                         shake_vir,
708                         total_vir,
709                         pres,
710                         &nullSignaller,
711                         state->box,
712                         &bSumEkinhOld,
713                         cglo_flags & ~CGLO_PRESSURE,
714                         step,
715                         &observablesReducer);
716         // Clean up after pre-step use of compute_globals()
717         observablesReducer.markAsReadyToReduce();
718     }
719
720     /* Calculate the initial half step temperature, and save the ekinh_old */
721     if (startingBehavior == StartingBehavior::NewSimulation)
722     {
723         for (i = 0; (i < ir->opts.ngtc); i++)
724         {
725             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
726         }
727     }
728
729     /* need to make an initiation call to get the Trotter variables set, as well as other constants
730        for non-trotter temperature control */
731     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
732
733     if (MASTER(cr))
734     {
735         if (!ir->bContinuation)
736         {
737             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
738             {
739                 fprintf(fplog,
740                         "RMS relative constraint deviation after constraining: %.2e\n",
741                         constr->rmsd());
742             }
743             if (EI_STATE_VELOCITY(ir->eI))
744             {
745                 real temp = enerd->term[F_TEMP];
746                 if (ir->eI != IntegrationAlgorithm::VV)
747                 {
748                     /* Result of Ekin averaged over velocities of -half
749                      * and +half step, while we only have -half step here.
750                      */
751                     temp *= 2;
752                 }
753                 fprintf(fplog, "Initial temperature: %g K\n", temp);
754             }
755         }
756
757         char tbuf[20];
758         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
759         if (ir->nsteps >= 0)
760         {
761             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
762         }
763         else
764         {
765             sprintf(tbuf, "%s", "infinite");
766         }
767         if (ir->init_step > 0)
768         {
769             fprintf(stderr,
770                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
771                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
772                     tbuf,
773                     gmx_step_str(ir->init_step, sbuf2),
774                     ir->init_step * ir->delta_t);
775         }
776         else
777         {
778             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
779         }
780         fprintf(fplog, "\n");
781     }
782
783     walltime_accounting_start_time(walltime_accounting);
784     wallcycle_start(wcycle, WallCycleCounter::Run);
785     print_start(fplog, cr, walltime_accounting, "mdrun");
786
787     /***********************************************************
788      *
789      *             Loop over MD steps
790      *
791      ************************************************************/
792
793     bFirstStep = TRUE;
794     /* Skip the first Nose-Hoover integration when we get the state from tpx */
795     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
796     bSumEkinhOld     = FALSE;
797     bExchanged       = FALSE;
798     bNeedRepartition = FALSE;
799
800     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
801             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
802             simulationsShareState,
803             MASTER(cr),
804             ir->nstlist,
805             mdrunOptions.reproducible,
806             nstSignalComm,
807             mdrunOptions.maximumHoursToRun,
808             ir->nstlist == 0,
809             fplog,
810             step,
811             bNS,
812             walltime_accounting);
813
814     auto checkpointHandler = std::make_unique<CheckpointHandler>(
815             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
816             simulationsShareState,
817             ir->nstlist == 0,
818             MASTER(cr),
819             mdrunOptions.writeConfout,
820             mdrunOptions.checkpointOptions.period);
821
822     const bool resetCountersIsLocal = true;
823     auto       resetHandler         = std::make_unique<ResetHandler>(
824             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
825             !resetCountersIsLocal,
826             ir->nsteps,
827             MASTER(cr),
828             mdrunOptions.timingOptions.resetHalfway,
829             mdrunOptions.maximumHoursToRun,
830             mdlog,
831             wcycle,
832             walltime_accounting);
833
834     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
835
836     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
837     {
838         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
839     }
840
841     /* and stop now if we should */
842     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
843     while (!bLastStep)
844     {
845
846         /* Determine if this is a neighbor search step */
847         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
848
849         if (bPMETune && bNStList)
850         {
851             // This has to be here because PME load balancing is called so early.
852             // TODO: Move to after all booleans are defined.
853             if (useGpuForUpdate && !bFirstStep)
854             {
855                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
856                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
857             }
858             /* PME grid + cut-off optimization with GPUs or PME nodes */
859             pme_loadbal_do(pme_loadbal,
860                            cr,
861                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
862                            fplog,
863                            mdlog,
864                            *ir,
865                            fr,
866                            state->box,
867                            state->x,
868                            wcycle,
869                            step,
870                            step_rel,
871                            &bPMETunePrinting,
872                            simulationWork.useGpuPmePpCommunication);
873         }
874
875         wallcycle_start(wcycle, WallCycleCounter::Step);
876
877         bLastStep = (step_rel == ir->nsteps);
878         t         = t0 + step * ir->delta_t;
879
880         // TODO Refactor this, so that nstfep does not need a default value of zero
881         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
882         {
883             /* find and set the current lambdas */
884             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
885
886             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
887                            && (!bFirstStep));
888         }
889
890         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
891                      && do_per_step(step, replExParams.exchangeInterval));
892
893         if (doSimulatedAnnealing)
894         {
895             // TODO: Avoid changing inputrec (#3854)
896             // Simulated annealing updates the reference temperature.
897             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
898             update_annealing_target_temp(nonConstInputrec, t, &upd);
899         }
900
901         /* Stop Center of Mass motion */
902         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
903
904         /* Determine whether or not to do Neighbour Searching */
905         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
906
907         /* Note that the stopHandler will cause termination at nstglobalcomm
908          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
909          * nstpcouple steps, we have computed the half-step kinetic energy
910          * of the previous step and can always output energies at the last step.
911          */
912         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
913
914         /* do_log triggers energy and virial calculation. Because this leads
915          * to different code paths, forces can be different. Thus for exact
916          * continuation we should avoid extra log output.
917          * Note that the || bLastStep can result in non-exact continuation
918          * beyond the last step. But we don't consider that to be an issue.
919          */
920         do_log     = (do_per_step(step, ir->nstlog)
921                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
922         do_verbose = mdrunOptions.verbose
923                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
924
925         // On search steps, when doing the update on the GPU, copy
926         // the coordinates and velocities to the host unless they are
927         // already there (ie on the first step and after replica
928         // exchange).
929         if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
930         {
931             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
932             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
933             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
934             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
935         }
936
937         // We only need to calculate virtual velocities if we are writing them in the current step
938         const bool needVirtualVelocitiesThisStep =
939                 (vsite != nullptr)
940                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
941
942         if (vsite != nullptr)
943         {
944             // Virtual sites need to be updated before domain decomposition and forces are calculated
945             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
946             // md-vv calculates virtual velocities once it has full-step real velocities
947             vsite->construct(state->x,
948                              state->v,
949                              state->box,
950                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
951                                      ? VSiteOperation::PositionsAndVelocities
952                                      : VSiteOperation::Positions);
953             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
954         }
955
956         if (bNS && !(bFirstStep && ir->bContinuation))
957         {
958             bMasterState = FALSE;
959             /* Correct the new box if it is too skewed */
960             if (inputrecDynamicBox(ir))
961             {
962                 if (correct_box(fplog, step, state->box))
963                 {
964                     bMasterState = TRUE;
965                 }
966             }
967             // If update is offloaded, and the box was changed either
968             // above or in a replica exchange on the previous step,
969             // the GPU Update object should be informed
970             if (useGpuForUpdate && (bMasterState || bExchanged))
971             {
972                 integrator->setPbc(PbcType::Xyz, state->box);
973             }
974             if (haveDDAtomOrdering(*cr) && bMasterState)
975             {
976                 dd_collect_state(cr->dd, state, state_global);
977             }
978
979             if (haveDDAtomOrdering(*cr))
980             {
981                 /* Repartition the domain decomposition */
982                 dd_partition_system(fplog,
983                                     mdlog,
984                                     step,
985                                     cr,
986                                     bMasterState,
987                                     nstglobalcomm,
988                                     state_global,
989                                     top_global,
990                                     *ir,
991                                     imdSession,
992                                     pull_work,
993                                     state,
994                                     &f,
995                                     mdAtoms,
996                                     top,
997                                     fr,
998                                     vsite,
999                                     constr,
1000                                     nrnb,
1001                                     wcycle,
1002                                     do_verbose && !bPMETunePrinting);
1003                 upd.updateAfterPartition(state->natoms,
1004                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1005                                                      : gmx::ArrayRef<const unsigned short>(),
1006                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1007                                                  : gmx::ArrayRef<const unsigned short>());
1008                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1009             }
1010         }
1011
1012         // Allocate or re-size GPU halo exchange object, if necessary
1013         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1014         {
1015             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1016                                "GPU device manager has to be initialized to use GPU "
1017                                "version of halo exchange.");
1018             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1019         }
1020
1021         if (MASTER(cr) && do_log)
1022         {
1023             gmx::EnergyOutput::printHeader(
1024                     fplog, step, t); /* can we improve the information printed here? */
1025         }
1026
1027         if (ir->efep != FreeEnergyPerturbationType::No)
1028         {
1029             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1030         }
1031
1032         if (bExchanged)
1033         {
1034             /* We need the kinetic energy at minus the half step for determining
1035              * the full step kinetic energy and possibly for T-coupling.*/
1036             /* This may not be quite working correctly yet . . . . */
1037             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1038             compute_globals(gstat,
1039                             cr,
1040                             ir,
1041                             fr,
1042                             ekind,
1043                             makeConstArrayRef(state->x),
1044                             makeConstArrayRef(state->v),
1045                             state->box,
1046                             md,
1047                             nrnb,
1048                             &vcm,
1049                             wcycle,
1050                             enerd,
1051                             nullptr,
1052                             nullptr,
1053                             nullptr,
1054                             nullptr,
1055                             &nullSignaller,
1056                             state->box,
1057                             &bSumEkinhOld,
1058                             cglo_flags,
1059                             step,
1060                             &observablesReducer);
1061         }
1062         clear_mat(force_vir);
1063
1064         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1065
1066         /* Determine the energy and pressure:
1067          * at nstcalcenergy steps and at energy output steps (set below).
1068          */
1069         if (EI_VV(ir->eI) && (!bInitStep))
1070         {
1071             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1072             bCalcVir      = bCalcEnerStep
1073                        || (ir->epc != PressureCoupling::No
1074                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1075         }
1076         else
1077         {
1078             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1079             bCalcVir      = bCalcEnerStep
1080                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1081         }
1082         bCalcEner = bCalcEnerStep;
1083
1084         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1085
1086         if (do_ene || do_log || bDoReplEx)
1087         {
1088             bCalcVir  = TRUE;
1089             bCalcEner = TRUE;
1090         }
1091
1092         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1093         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1094                                   && (do_per_step(step, nstfep) || bCalcEner));
1095
1096         /* Do we need global communication ? */
1097         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1098                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1099
1100         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1101                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1102                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1103         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1104         {
1105             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1106             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1107         }
1108
1109         if (shellfc)
1110         {
1111             /* Now is the time to relax the shells */
1112             relax_shell_flexcon(fplog,
1113                                 cr,
1114                                 ms,
1115                                 mdrunOptions.verbose,
1116                                 enforcedRotation,
1117                                 step,
1118                                 ir,
1119                                 imdSession,
1120                                 pull_work,
1121                                 bNS,
1122                                 force_flags,
1123                                 top,
1124                                 constr,
1125                                 enerd,
1126                                 state->natoms,
1127                                 state->x.arrayRefWithPadding(),
1128                                 state->v.arrayRefWithPadding(),
1129                                 state->box,
1130                                 state->lambda,
1131                                 &state->hist,
1132                                 &f.view(),
1133                                 force_vir,
1134                                 *md,
1135                                 fr->longRangeNonbondeds.get(),
1136                                 nrnb,
1137                                 wcycle,
1138                                 shellfc,
1139                                 fr,
1140                                 runScheduleWork,
1141                                 t,
1142                                 mu_tot,
1143                                 vsite,
1144                                 ddBalanceRegionHandler);
1145         }
1146         else
1147         {
1148             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1149                is updated (or the AWH update will be performed twice for one step when continuing).
1150                It would be best to call this update function from do_md_trajectory_writing but that
1151                would occur after do_force. One would have to divide the update_awh function into one
1152                function applying the AWH force and one doing the AWH bias update. The update AWH
1153                bias function could then be called after do_md_trajectory_writing (then containing
1154                update_awh_history). The checkpointing will in the future probably moved to the start
1155                of the md loop which will rid of this issue. */
1156             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1157             {
1158                 awh->updateHistory(state_global->awhHistory.get());
1159             }
1160
1161             /* The coordinates (x) are shifted (to get whole molecules)
1162              * in do_force.
1163              * This is parallellized as well, and does communication too.
1164              * Check comments in sim_util.c
1165              */
1166             do_force(fplog,
1167                      cr,
1168                      ms,
1169                      *ir,
1170                      awh.get(),
1171                      enforcedRotation,
1172                      imdSession,
1173                      pull_work,
1174                      step,
1175                      nrnb,
1176                      wcycle,
1177                      top,
1178                      state->box,
1179                      state->x.arrayRefWithPadding(),
1180                      &state->hist,
1181                      &f.view(),
1182                      force_vir,
1183                      md,
1184                      enerd,
1185                      state->lambda,
1186                      fr,
1187                      runScheduleWork,
1188                      vsite,
1189                      mu_tot,
1190                      t,
1191                      ed ? ed->getLegacyED() : nullptr,
1192                      fr->longRangeNonbondeds.get(),
1193                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1194                      ddBalanceRegionHandler);
1195         }
1196
1197         // VV integrators do not need the following velocity half step
1198         // if it is the first step after starting from a checkpoint.
1199         // That is, the half step is needed on all other steps, and
1200         // also the first step when starting from a .tpr file.
1201         if (EI_VV(ir->eI))
1202         {
1203             integrateVVFirstStep(step,
1204                                  bFirstStep,
1205                                  bInitStep,
1206                                  startingBehavior,
1207                                  nstglobalcomm,
1208                                  ir,
1209                                  fr,
1210                                  cr,
1211                                  state,
1212                                  mdAtoms->mdatoms(),
1213                                  &fcdata,
1214                                  &MassQ,
1215                                  &vcm,
1216                                  enerd,
1217                                  &observablesReducer,
1218                                  ekind,
1219                                  gstat,
1220                                  &last_ekin,
1221                                  bCalcVir,
1222                                  total_vir,
1223                                  shake_vir,
1224                                  force_vir,
1225                                  pres,
1226                                  M,
1227                                  do_log,
1228                                  do_ene,
1229                                  bCalcEner,
1230                                  bGStat,
1231                                  bStopCM,
1232                                  bTrotter,
1233                                  bExchanged,
1234                                  &bSumEkinhOld,
1235                                  &saved_conserved_quantity,
1236                                  &f,
1237                                  &upd,
1238                                  constr,
1239                                  &nullSignaller,
1240                                  trotter_seq,
1241                                  nrnb,
1242                                  fplog,
1243                                  wcycle);
1244             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1245             {
1246                 // Positions were calculated earlier
1247                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1248                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1249                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1250             }
1251         }
1252
1253         /* ########  END FIRST UPDATE STEP  ############## */
1254         /* ########  If doing VV, we now have v(dt) ###### */
1255         if (bDoExpanded)
1256         {
1257             /* perform extended ensemble sampling in lambda - we don't
1258                actually move to the new state before outputting
1259                statistics, but if performing simulated tempering, we
1260                do update the velocities and the tau_t. */
1261             // TODO: Avoid changing inputrec (#3854)
1262             // Simulated tempering updates the reference temperature.
1263             // Expanded ensemble without simulated tempering does not change the inputrec.
1264             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1265             lamnew                 = ExpandedEnsembleDynamics(fplog,
1266                                               nonConstInputrec,
1267                                               enerd,
1268                                               state,
1269                                               &MassQ,
1270                                               state->fep_state,
1271                                               state->dfhist,
1272                                               step,
1273                                               state->v.rvec_array(),
1274                                               md->homenr,
1275                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1276                                                                       : gmx::ArrayRef<const unsigned short>());
1277             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1278             if (MASTER(cr))
1279             {
1280                 copy_df_history(state_global->dfhist, state->dfhist);
1281             }
1282         }
1283
1284         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1285         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1286         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1287             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1288                 || checkpointHandler->isCheckpointingStep()))
1289         {
1290             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1291             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1292         }
1293         // Copy velocities if needed for the output/checkpointing.
1294         // NOTE: Copy on the search steps is done at the beginning of the step.
1295         if (useGpuForUpdate && !bNS
1296             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1297         {
1298             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1299             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1300         }
1301         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1302         // and update is offloaded hence forces are kept on the GPU for update and have not been
1303         // already transferred in do_force().
1304         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1305         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1306         //       prior to GPU update.
1307         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1308         //       copy call in do_force(...).
1309         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1310         //       on host after the D2H copy in do_force(...).
1311         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1312             && do_per_step(step, ir->nstfout))
1313         {
1314             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1315             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1316         }
1317         /* Now we have the energies and forces corresponding to the
1318          * coordinates at time t. We must output all of this before
1319          * the update.
1320          */
1321         do_md_trajectory_writing(fplog,
1322                                  cr,
1323                                  nfile,
1324                                  fnm,
1325                                  step,
1326                                  step_rel,
1327                                  t,
1328                                  ir,
1329                                  state,
1330                                  state_global,
1331                                  observablesHistory,
1332                                  top_global,
1333                                  fr,
1334                                  outf,
1335                                  energyOutput,
1336                                  ekind,
1337                                  f.view().force(),
1338                                  checkpointHandler->isCheckpointingStep(),
1339                                  bRerunMD,
1340                                  bLastStep,
1341                                  mdrunOptions.writeConfout,
1342                                  bSumEkinhOld);
1343         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1344         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1345
1346         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1347         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1348             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1349         {
1350             copy_mat(state->svir_prev, shake_vir);
1351             copy_mat(state->fvir_prev, force_vir);
1352         }
1353
1354         stopHandler->setSignal();
1355         resetHandler->setSignal(walltime_accounting);
1356
1357         if (bGStat || !PAR(cr))
1358         {
1359             /* In parallel we only have to check for checkpointing in steps
1360              * where we do global communication,
1361              *  otherwise the other nodes don't know.
1362              */
1363             checkpointHandler->setSignal(walltime_accounting);
1364         }
1365
1366         /* #########   START SECOND UPDATE STEP ################# */
1367
1368         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1369            controlled in preprocessing */
1370
1371         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1372         {
1373             gmx_bool bIfRandomize;
1374             bIfRandomize = update_randomize_velocities(ir,
1375                                                        step,
1376                                                        cr,
1377                                                        md->homenr,
1378                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1379                                                                : gmx::ArrayRef<const unsigned short>(),
1380                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1381                                                        state->v,
1382                                                        &upd,
1383                                                        constr);
1384             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1385             if (constr && bIfRandomize)
1386             {
1387                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1388             }
1389         }
1390         /* Box is changed in update() when we do pressure coupling,
1391          * but we should still use the old box for energy corrections and when
1392          * writing it to the energy file, so it matches the trajectory files for
1393          * the same timestep above. Make a copy in a separate array.
1394          */
1395         copy_mat(state->box, lastbox);
1396
1397         dvdl_constr = 0;
1398
1399         if (!useGpuForUpdate)
1400         {
1401             wallcycle_start(wcycle, WallCycleCounter::Update);
1402         }
1403         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1404         if (bTrotter)
1405         {
1406             trotter_update(ir,
1407                            step,
1408                            ekind,
1409                            enerd,
1410                            state,
1411                            total_vir,
1412                            md->homenr,
1413                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1414                                    : gmx::ArrayRef<const unsigned short>(),
1415                            gmx::arrayRefFromArray(md->invmass, md->nr),
1416                            &MassQ,
1417                            trotter_seq,
1418                            TrotterSequence::Three);
1419             /* We can only do Berendsen coupling after we have summed
1420              * the kinetic energy or virial. Since the happens
1421              * in global_state after update, we should only do it at
1422              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1423              */
1424         }
1425         else
1426         {
1427             update_tcouple(step,
1428                            ir,
1429                            state,
1430                            ekind,
1431                            &MassQ,
1432                            md->homenr,
1433                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1434                                    : gmx::ArrayRef<const unsigned short>());
1435             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1436         }
1437
1438         /* With leap-frog type integrators we compute the kinetic energy
1439          * at a whole time step as the average of the half-time step kinetic
1440          * energies of two subsequent steps. Therefore we need to compute the
1441          * half step kinetic energy also if we need energies at the next step.
1442          */
1443         const bool needHalfStepKineticEnergy =
1444                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1445
1446         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1447         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1448         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1449                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1450
1451         if (EI_VV(ir->eI))
1452         {
1453             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1454
1455             integrateVVSecondStep(step,
1456                                   ir,
1457                                   fr,
1458                                   cr,
1459                                   state,
1460                                   mdAtoms->mdatoms(),
1461                                   &fcdata,
1462                                   &MassQ,
1463                                   &vcm,
1464                                   pull_work,
1465                                   enerd,
1466                                   &observablesReducer,
1467                                   ekind,
1468                                   gstat,
1469                                   &dvdl_constr,
1470                                   bCalcVir,
1471                                   total_vir,
1472                                   shake_vir,
1473                                   force_vir,
1474                                   pres,
1475                                   M,
1476                                   lastbox,
1477                                   do_log,
1478                                   do_ene,
1479                                   bGStat,
1480                                   &bSumEkinhOld,
1481                                   &f,
1482                                   &cbuf,
1483                                   &upd,
1484                                   constr,
1485                                   &nullSignaller,
1486                                   trotter_seq,
1487                                   nrnb,
1488                                   wcycle);
1489         }
1490         else
1491         {
1492             if (useGpuForUpdate)
1493             {
1494                 // On search steps, update handles to device vectors
1495                 if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
1496                 {
1497                     integrator->set(stateGpu->getCoordinates(),
1498                                     stateGpu->getVelocities(),
1499                                     stateGpu->getForces(),
1500                                     top->idef,
1501                                     *md);
1502
1503                     // Copy data to the GPU after buffers might have being reinitialized
1504                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1505                      * the previous step. We don't check that now. */
1506                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1507                     if (bExchanged
1508                         || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1509                             && !runScheduleWork->stepWork.useGpuXBufferOps))
1510                     {
1511                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1512                     }
1513                 }
1514
1515                 if ((simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1516                     || (!runScheduleWork->stepWork.useGpuFBufferOps))
1517                 {
1518                     // The PME forces were recieved to the host, and reduced on the CPU with the
1519                     // rest of the forces computed on the GPU, so the final forces have to be copied
1520                     // back to the GPU. Or the buffer ops were not offloaded this step, so the
1521                     // forces are on the host and have to be copied
1522                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1523                 }
1524                 const bool doTemperatureScaling =
1525                         (ir->etc != TemperatureCoupling::No
1526                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1527
1528                 // This applies Leap-Frog, LINCS and SETTLE in succession
1529                 integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
1530                                               runScheduleWork->stepWork, runScheduleWork->simulationWork),
1531                                       ir->delta_t,
1532                                       true,
1533                                       bCalcVir,
1534                                       shake_vir,
1535                                       doTemperatureScaling,
1536                                       ekind->tcstat,
1537                                       doParrinelloRahman,
1538                                       ir->nstpcouple * ir->delta_t,
1539                                       M);
1540             }
1541             else
1542             {
1543                 /* With multiple time stepping we need to do an additional normal
1544                  * update step to obtain the virial, as the actual MTS integration
1545                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1546                  * Using that acceleration would result in a virial with the slow
1547                  * force contribution would be a factor mtsFactor too large.
1548                  */
1549                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1550                 {
1551                     upd.update_for_constraint_virial(*ir,
1552                                                      md->homenr,
1553                                                      md->havePartiallyFrozenAtoms,
1554                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1555                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1556                                                      *state,
1557                                                      f.view().forceWithPadding(),
1558                                                      *ekind);
1559
1560                     constrain_coordinates(constr,
1561                                           do_log,
1562                                           do_ene,
1563                                           step,
1564                                           state,
1565                                           upd.xp()->arrayRefWithPadding(),
1566                                           &dvdl_constr,
1567                                           bCalcVir,
1568                                           shake_vir);
1569                 }
1570
1571                 ArrayRefWithPadding<const RVec> forceCombined =
1572                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1573                                 ? f.view().forceMtsCombinedWithPadding()
1574                                 : f.view().forceWithPadding();
1575                 upd.update_coords(*ir,
1576                                   step,
1577                                   md->homenr,
1578                                   md->havePartiallyFrozenAtoms,
1579                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1580                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1581                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1582                                   state,
1583                                   forceCombined,
1584                                   &fcdata,
1585                                   ekind,
1586                                   M,
1587                                   etrtPOSITION,
1588                                   cr,
1589                                   constr != nullptr);
1590
1591                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1592
1593                 constrain_coordinates(constr,
1594                                       do_log,
1595                                       do_ene,
1596                                       step,
1597                                       state,
1598                                       upd.xp()->arrayRefWithPadding(),
1599                                       &dvdl_constr,
1600                                       bCalcVir && !simulationWork.useMts,
1601                                       shake_vir);
1602
1603                 upd.update_sd_second_half(*ir,
1604                                           step,
1605                                           &dvdl_constr,
1606                                           md->homenr,
1607                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1608                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1609                                           state,
1610                                           cr,
1611                                           nrnb,
1612                                           wcycle,
1613                                           constr,
1614                                           do_log,
1615                                           do_ene);
1616                 upd.finish_update(
1617                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1618             }
1619
1620             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1621             {
1622                 updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
1623             }
1624
1625             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1626         }
1627
1628         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1629         /* With Leap-Frog we can skip compute_globals at
1630          * non-communication steps, but we need to calculate
1631          * the kinetic energy one step before communication.
1632          */
1633         {
1634             // Organize to do inter-simulation signalling on steps if
1635             // and when algorithms require it.
1636             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1637
1638             if (useGpuForUpdate)
1639             {
1640                 const bool coordinatesRequiredForStopCM =
1641                         bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1642                         && !EI_VV(ir->eI);
1643
1644                 // Copy coordinates when needed to stop the CM motion or for replica exchange
1645                 if (coordinatesRequiredForStopCM || bDoReplEx)
1646                 {
1647                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1648                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1649                 }
1650
1651                 // Copy velocities back to the host if:
1652                 // - Globals are computed this step (includes the energy output steps).
1653                 // - Temperature is needed for the next step.
1654                 // - This is a replica exchange step (even though we will only need
1655                 //     the velocities if an exchange succeeds)
1656                 if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
1657                 {
1658                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1659                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1660                 }
1661             }
1662
1663             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1664             {
1665                 // Since we're already communicating at this step, we
1666                 // can propagate intra-simulation signals. Note that
1667                 // check_nstglobalcomm has the responsibility for
1668                 // choosing the value of nstglobalcomm that is one way
1669                 // bGStat becomes true, so we can't get into a
1670                 // situation where e.g. checkpointing can't be
1671                 // signalled.
1672                 bool                doIntraSimSignal = true;
1673                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1674
1675                 compute_globals(gstat,
1676                                 cr,
1677                                 ir,
1678                                 fr,
1679                                 ekind,
1680                                 makeConstArrayRef(state->x),
1681                                 makeConstArrayRef(state->v),
1682                                 state->box,
1683                                 md,
1684                                 nrnb,
1685                                 &vcm,
1686                                 wcycle,
1687                                 enerd,
1688                                 force_vir,
1689                                 shake_vir,
1690                                 total_vir,
1691                                 pres,
1692                                 &signaller,
1693                                 lastbox,
1694                                 &bSumEkinhOld,
1695                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1696                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1697                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1698                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1699                                 step,
1700                                 &observablesReducer);
1701                 if (!EI_VV(ir->eI) && bStopCM)
1702                 {
1703                     process_and_stopcm_grp(
1704                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1705                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1706
1707                     // TODO: The special case of removing CM motion should be dealt more gracefully
1708                     if (useGpuForUpdate)
1709                     {
1710                         // Issue #3988, #4106.
1711                         stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
1712                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1713                         // Here we block until the H2D copy completes because event sync with the
1714                         // force kernels that use the coordinates on the next steps is not implemented
1715                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1716                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1717                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1718                         if (vcm.mode != ComRemovalAlgorithm::No)
1719                         {
1720                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1721                         }
1722                     }
1723                 }
1724             }
1725         }
1726
1727         /* #############  END CALC EKIN AND PRESSURE ################# */
1728
1729         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1730            the virial that should probably be addressed eventually. state->veta has better properies,
1731            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1732            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1733
1734         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1735         {
1736             /* Sum up the foreign energy and dK/dl terms for md and sd.
1737                Currently done every step so that dH/dl is correct in the .edr */
1738             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1739         }
1740
1741         bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
1742         update_pcouple_after_coordinates(fplog,
1743                                          step,
1744                                          ir,
1745                                          md->homenr,
1746                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1747                                                      : gmx::ArrayRef<const unsigned short>(),
1748                                          pres,
1749                                          force_vir,
1750                                          shake_vir,
1751                                          pressureCouplingMu,
1752                                          state,
1753                                          nrnb,
1754                                          upd.deform(),
1755                                          scaleCoordinates);
1756
1757         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1758                                                   && do_per_step(step, inputrec->nstpcouple));
1759         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1760                                                  && do_per_step(step, inputrec->nstpcouple));
1761         if (useGpuForUpdate
1762             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1763         {
1764             integrator->scaleCoordinates(pressureCouplingMu);
1765             if (doCRescalePressureCoupling)
1766             {
1767                 matrix pressureCouplingInvMu;
1768                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1769                 integrator->scaleVelocities(pressureCouplingInvMu);
1770             }
1771             integrator->setPbc(PbcType::Xyz, state->box);
1772         }
1773
1774         /* ################# END UPDATE STEP 2 ################# */
1775         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1776
1777         /* The coordinates (x) were unshifted in update */
1778         if (!bGStat)
1779         {
1780             /* We will not sum ekinh_old,
1781              * so signal that we still have to do it.
1782              */
1783             bSumEkinhOld = TRUE;
1784         }
1785
1786         if (bCalcEner)
1787         {
1788             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1789
1790             /* use the directly determined last velocity, not actually the averaged half steps */
1791             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1792             {
1793                 enerd->term[F_EKIN] = last_ekin;
1794             }
1795             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1796
1797             if (integratorHasConservedEnergyQuantity(ir))
1798             {
1799                 if (EI_VV(ir->eI))
1800                 {
1801                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1802                 }
1803                 else
1804                 {
1805                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1806                 }
1807             }
1808             /* #########  END PREPARING EDR OUTPUT  ###########  */
1809         }
1810
1811         /* Output stuff */
1812         if (MASTER(cr))
1813         {
1814             if (fplog && do_log && bDoExpanded)
1815             {
1816                 /* only needed if doing expanded ensemble */
1817                 PrintFreeEnergyInfoToFile(fplog,
1818                                           ir->fepvals.get(),
1819                                           ir->expandedvals.get(),
1820                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1821                                           state_global->dfhist,
1822                                           state->fep_state,
1823                                           ir->nstlog,
1824                                           step);
1825             }
1826             if (bCalcEner)
1827             {
1828                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1829
1830                 energyOutput.addDataAtEnergyStep(outputDHDL,
1831                                                  bCalcEnerStep,
1832                                                  t,
1833                                                  md->tmass,
1834                                                  enerd,
1835                                                  ir->fepvals.get(),
1836                                                  lastbox,
1837                                                  PTCouplingArrays{ state->boxv,
1838                                                                    state->nosehoover_xi,
1839                                                                    state->nosehoover_vxi,
1840                                                                    state->nhpres_xi,
1841                                                                    state->nhpres_vxi },
1842                                                  state->fep_state,
1843                                                  total_vir,
1844                                                  pres,
1845                                                  ekind,
1846                                                  mu_tot,
1847                                                  constr);
1848             }
1849             else
1850             {
1851                 energyOutput.recordNonEnergyStep();
1852             }
1853
1854             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1855             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1856
1857             if (doSimulatedAnnealing)
1858             {
1859                 gmx::EnergyOutput::printAnnealingTemperatures(
1860                         do_log ? fplog : nullptr, groups, &(ir->opts));
1861             }
1862             if (do_log || do_ene || do_dr || do_or)
1863             {
1864                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1865                                                    do_ene,
1866                                                    do_dr,
1867                                                    do_or,
1868                                                    do_log ? fplog : nullptr,
1869                                                    step,
1870                                                    t,
1871                                                    fr->fcdata.get(),
1872                                                    awh.get());
1873             }
1874             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1875             {
1876                 const bool isInitialOutput = false;
1877                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1878             }
1879
1880             if (ir->bPull)
1881             {
1882                 pull_print_output(pull_work, step, t);
1883             }
1884
1885             if (do_per_step(step, ir->nstlog))
1886             {
1887                 if (fflush(fplog) != 0)
1888                 {
1889                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1890                 }
1891             }
1892         }
1893         if (bDoExpanded)
1894         {
1895             /* Have to do this part _after_ outputting the logfile and the edr file */
1896             /* Gets written into the state at the beginning of next loop*/
1897             state->fep_state = lamnew;
1898         }
1899         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1900         {
1901             state->fep_state = awh->fepLambdaState();
1902         }
1903         /* Print the remaining wall clock time for the run */
1904         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1905         {
1906             if (shellfc)
1907             {
1908                 fprintf(stderr, "\n");
1909             }
1910             print_time(stderr, walltime_accounting, step, ir, cr);
1911         }
1912
1913         /* Ion/water position swapping.
1914          * Not done in last step since trajectory writing happens before this call
1915          * in the MD loop and exchanges would be lost anyway. */
1916         bNeedRepartition = FALSE;
1917         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1918             && do_per_step(step, ir->swap->nstswap))
1919         {
1920             bNeedRepartition = do_swapcoords(cr,
1921                                              step,
1922                                              t,
1923                                              ir,
1924                                              swap,
1925                                              wcycle,
1926                                              as_rvec_array(state->x.data()),
1927                                              state->box,
1928                                              MASTER(cr) && mdrunOptions.verbose,
1929                                              bRerunMD);
1930
1931             if (bNeedRepartition && haveDDAtomOrdering(*cr))
1932             {
1933                 dd_collect_state(cr->dd, state, state_global);
1934             }
1935         }
1936
1937         /* Replica exchange */
1938         bExchanged = FALSE;
1939         if (bDoReplEx)
1940         {
1941             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1942         }
1943
1944         if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
1945         {
1946             dd_partition_system(fplog,
1947                                 mdlog,
1948                                 step,
1949                                 cr,
1950                                 TRUE,
1951                                 1,
1952                                 state_global,
1953                                 top_global,
1954                                 *ir,
1955                                 imdSession,
1956                                 pull_work,
1957                                 state,
1958                                 &f,
1959                                 mdAtoms,
1960                                 top,
1961                                 fr,
1962                                 vsite,
1963                                 constr,
1964                                 nrnb,
1965                                 wcycle,
1966                                 FALSE);
1967             upd.updateAfterPartition(state->natoms,
1968                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1969                                                  : gmx::ArrayRef<const unsigned short>(),
1970                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1971                                              : gmx::ArrayRef<const unsigned short>());
1972             fr->longRangeNonbondeds->updateAfterPartition(*md);
1973         }
1974
1975         bFirstStep = FALSE;
1976         bInitStep  = FALSE;
1977
1978         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1979         /* With all integrators, except VV, we need to retain the pressure
1980          * at the current step for coupling at the next step.
1981          */
1982         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1983             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1984         {
1985             /* Store the pressure in t_state for pressure coupling
1986              * at the next MD step.
1987              */
1988             copy_mat(pres, state->pres_prev);
1989         }
1990
1991         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1992
1993         if ((membed != nullptr) && (!bLastStep))
1994         {
1995             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1996         }
1997
1998         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1999         if (haveDDAtomOrdering(*cr) && wcycle)
2000         {
2001             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2002         }
2003
2004         /* increase the MD step number */
2005         step++;
2006         step_rel++;
2007         observablesReducer.markAsReadyToReduce();
2008
2009 #if GMX_FAHCORE
2010         if (MASTER(cr))
2011         {
2012             fcReportProgress(ir->nsteps + ir->init_step, step);
2013         }
2014 #endif
2015
2016         resetHandler->resetCounters(
2017                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2018
2019         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2020         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2021     }
2022     /* End of main MD loop */
2023
2024     /* Closing TNG files can include compressing data. Therefore it is good to do that
2025      * before stopping the time measurements. */
2026     mdoutf_tng_close(outf);
2027
2028     /* Stop measuring walltime */
2029     walltime_accounting_end_time(walltime_accounting);
2030
2031     if (simulationWork.haveSeparatePmeRank)
2032     {
2033         /* Tell the PME only node to finish */
2034         gmx_pme_send_finish(cr);
2035     }
2036
2037     if (MASTER(cr))
2038     {
2039         if (ir->nstcalcenergy > 0)
2040         {
2041             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2042
2043             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2044             energyOutput.printAverages(fplog, groups);
2045         }
2046     }
2047     done_mdoutf(outf);
2048
2049     if (bPMETune)
2050     {
2051         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2052     }
2053
2054     done_shellfc(fplog, shellfc, step_rel);
2055
2056     if (useReplicaExchange && MASTER(cr))
2057     {
2058         print_replica_exchange_statistics(fplog, repl_ex);
2059     }
2060
2061     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2062
2063     global_stat_destroy(gstat);
2064 }