Rework GPU halo and state propagator streams and dependencies to get better overlap
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/observablesreducer.h"
130 #include "gromacs/mdtypes/pullhistory.h"
131 #include "gromacs/mdtypes/simulation_workload.h"
132 #include "gromacs/mdtypes/state.h"
133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
134 #include "gromacs/modularsimulator/energydata.h"
135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
136 #include "gromacs/nbnxm/nbnxm.h"
137 #include "gromacs/pbcutil/pbc.h"
138 #include "gromacs/pulling/output.h"
139 #include "gromacs/pulling/pull.h"
140 #include "gromacs/swap/swapcoords.h"
141 #include "gromacs/timing/wallcycle.h"
142 #include "gromacs/timing/walltime_accounting.h"
143 #include "gromacs/topology/atoms.h"
144 #include "gromacs/topology/idef.h"
145 #include "gromacs/topology/mtop_util.h"
146 #include "gromacs/topology/topology.h"
147 #include "gromacs/trajectory/trajectoryframe.h"
148 #include "gromacs/utility/basedefinitions.h"
149 #include "gromacs/utility/cstringutil.h"
150 #include "gromacs/utility/fatalerror.h"
151 #include "gromacs/utility/logger.h"
152 #include "gromacs/utility/real.h"
153 #include "gromacs/utility/smalloc.h"
154
155 #include "legacysimulator.h"
156 #include "replicaexchange.h"
157 #include "shellfc.h"
158
159 using gmx::SimulationSignaller;
160
161 void gmx::LegacySimulator::do_md()
162 {
163     // TODO Historically, the EM and MD "integrators" used different
164     // names for the t_inputrec *parameter, but these must have the
165     // same name, now that it's a member of a struct. We use this ir
166     // alias to avoid a large ripple of nearly useless changes.
167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
168     // will go away eventually.
169     const t_inputrec* ir = inputrec;
170
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  haveDDAtomOrdering(*cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
361
362     ForceBuffers     f(simulationWork.useMts,
363                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
364                                ? PinningPolicy::PinnedIfSupported
365                                : PinningPolicy::CannotBePinned);
366     const t_mdatoms* md = mdAtoms->mdatoms();
367     if (haveDDAtomOrdering(*cr))
368     {
369         // Local state only becomes valid now.
370         dd_init_local_state(*cr->dd, state_global, state);
371
372         /* Distribute the charge groups over the nodes from the master node */
373         dd_partition_system(fplog,
374                             mdlog,
375                             ir->init_step,
376                             cr,
377                             TRUE,
378                             1,
379                             state_global,
380                             top_global,
381                             *ir,
382                             imdSession,
383                             pull_work,
384                             state,
385                             &f,
386                             mdAtoms,
387                             top,
388                             fr,
389                             vsite,
390                             constr,
391                             nrnb,
392                             nullptr,
393                             FALSE);
394         upd.updateAfterPartition(state->natoms,
395                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
396                                              : gmx::ArrayRef<const unsigned short>(),
397                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
398                                          : gmx::ArrayRef<const unsigned short>());
399         fr->longRangeNonbondeds->updateAfterPartition(*md);
400     }
401     else
402     {
403         state_change_natoms(state_global, state_global->natoms);
404
405         /* Generate and initialize new topology */
406         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
407
408         upd.updateAfterPartition(state->natoms,
409                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
410                                              : gmx::ArrayRef<const unsigned short>(),
411                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
412                                          : gmx::ArrayRef<const unsigned short>());
413         fr->longRangeNonbondeds->updateAfterPartition(*md);
414     }
415
416     std::unique_ptr<UpdateConstrainGpu> integrator;
417
418     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
419
420     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
421     if (useGpuForUpdate)
422     {
423         GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
424                                    || constr == nullptr || constr->numConstraintsTotal() == 0,
425                            "Constraints in domain decomposition are only supported with update "
426                            "groups if using GPU update.\n");
427         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
428                                    || constr->numConstraintsTotal() == 0,
429                            "SHAKE is not supported with GPU update.");
430         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
431                            "Either PME or short-ranged non-bonded interaction tasks must run on "
432                            "the GPU to use GPU update.\n");
433         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
434                            "Only the md integrator is supported with the GPU update.\n");
435         GMX_RELEASE_ASSERT(
436                 ir->etc != TemperatureCoupling::NoseHoover,
437                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
438         GMX_RELEASE_ASSERT(
439                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
440                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
441                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
442                 "with the GPU update.\n");
443         GMX_RELEASE_ASSERT(!md->haveVsites,
444                            "Virtual sites are not supported with the GPU update.\n");
445         GMX_RELEASE_ASSERT(ed == nullptr,
446                            "Essential dynamics is not supported with the GPU update.\n");
447         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
448                            "Constraints pulling is not supported with the GPU update.\n");
449         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
450                            "Orientation restraints are not supported with the GPU update.\n");
451         GMX_RELEASE_ASSERT(
452                 ir->efep == FreeEnergyPerturbationType::No
453                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
454                 "Free energy perturbation of masses and constraints are not supported with the GPU "
455                 "update.");
456
457         if (constr != nullptr && constr->numConstraintsTotal() > 0)
458         {
459             GMX_LOG(mdlog.info)
460                     .asParagraph()
461                     .appendText("Updating coordinates and applying constraints on the GPU.");
462         }
463         else
464         {
465             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
466         }
467         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
468                            "Device stream manager should be initialized in order to use GPU "
469                            "update-constraints.");
470         GMX_RELEASE_ASSERT(
471                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
472                 "Update stream should be initialized in order to use GPU "
473                 "update-constraints.");
474         integrator = std::make_unique<UpdateConstrainGpu>(
475                 *ir,
476                 top_global,
477                 ekind->ngtc,
478                 fr->deviceStreamManager->context(),
479                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
480                 wcycle);
481
482         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
483
484         integrator->setPbc(PbcType::Xyz, state->box);
485     }
486
487     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
488     {
489         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
490     }
491     if (useGpuForUpdate)
492     {
493         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
494     }
495
496     // NOTE: The global state is no longer used at this point.
497     // But state_global is still used as temporary storage space for writing
498     // the global state to file and potentially for replica exchange.
499     // (Global topology should persist.)
500
501     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
502
503     if (ir->bExpanded)
504     {
505         /* Check nstexpanded here, because the grompp check was broken */
506         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
507         {
508             gmx_fatal(FARGS,
509                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
510         }
511         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
512     }
513
514     if (MASTER(cr))
515     {
516         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
517     }
518
519     preparePrevStepPullCom(ir,
520                            pull_work,
521                            gmx::arrayRefFromArray(md->massT, md->nr),
522                            state,
523                            state_global,
524                            cr,
525                            startingBehavior != StartingBehavior::NewSimulation);
526
527     // TODO: Remove this by converting AWH into a ForceProvider
528     auto awh = prepareAwhModule(fplog,
529                                 *ir,
530                                 state_global,
531                                 cr,
532                                 ms,
533                                 startingBehavior != StartingBehavior::NewSimulation,
534                                 shellfc != nullptr,
535                                 opt2fn("-awh", nfile, fnm),
536                                 pull_work);
537
538     if (useReplicaExchange && MASTER(cr))
539     {
540         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
541     }
542     /* PME tuning is only supported in the Verlet scheme, with PME for
543      * Coulomb. It is not supported with only LJ PME. */
544     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
545                 && ir->cutoff_scheme != CutoffScheme::Group);
546
547     pme_load_balancing_t* pme_loadbal = nullptr;
548     if (bPMETune)
549     {
550         pme_loadbal_init(
551                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
552     }
553
554     if (!ir->bContinuation)
555     {
556         if (state->flags & enumValueToBitMask(StateEntry::V))
557         {
558             auto v = makeArrayRef(state->v);
559             /* Set the velocities of vsites, shells and frozen atoms to zero */
560             for (i = 0; i < md->homenr; i++)
561             {
562                 if (md->ptype[i] == ParticleType::Shell)
563                 {
564                     clear_rvec(v[i]);
565                 }
566                 else if (md->cFREEZE)
567                 {
568                     for (m = 0; m < DIM; m++)
569                     {
570                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
571                         {
572                             v[i][m] = 0;
573                         }
574                     }
575                 }
576             }
577         }
578
579         if (constr)
580         {
581             /* Constrain the initial coordinates and velocities */
582             do_constrain_first(fplog,
583                                constr,
584                                ir,
585                                md->nr,
586                                md->homenr,
587                                state->x.arrayRefWithPadding(),
588                                state->v.arrayRefWithPadding(),
589                                state->box,
590                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
591         }
592     }
593
594     const int nstfep = computeFepPeriod(*ir, replExParams);
595
596     /* Be REALLY careful about what flags you set here. You CANNOT assume
597      * this is the first step, since we might be restarting from a checkpoint,
598      * and in that case we should not do any modifications to the state.
599      */
600     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
601
602     // When restarting from a checkpoint, it can be appropriate to
603     // initialize ekind from quantities in the checkpoint. Otherwise,
604     // compute_globals must initialize ekind before the simulation
605     // starts/restarts. However, only the master rank knows what was
606     // found in the checkpoint file, so we have to communicate in
607     // order to coordinate the restart.
608     //
609     // TODO Consider removing this communication if/when checkpoint
610     // reading directly follows .tpr reading, because all ranks can
611     // agree on hasReadEkinState at that time.
612     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
613     if (PAR(cr))
614     {
615         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
616     }
617     if (hasReadEkinState)
618     {
619         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
620     }
621
622     unsigned int cglo_flags =
623             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
624              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
625
626     bSumEkinhOld = FALSE;
627
628     t_vcm vcm(top_global.groups, *ir);
629     reportComRemovalInfo(fplog, vcm);
630
631     int64_t step     = ir->init_step;
632     int64_t step_rel = 0;
633
634     /* To minimize communication, compute_globals computes the COM velocity
635      * and the kinetic energy for the velocities without COM motion removed.
636      * Thus to get the kinetic energy without the COM contribution, we need
637      * to call compute_globals twice.
638      */
639     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
640     {
641         unsigned int cglo_flags_iteration = cglo_flags;
642         if (bStopCM && cgloIteration == 0)
643         {
644             cglo_flags_iteration |= CGLO_STOPCM;
645             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
646         }
647         compute_globals(gstat,
648                         cr,
649                         ir,
650                         fr,
651                         ekind,
652                         makeConstArrayRef(state->x),
653                         makeConstArrayRef(state->v),
654                         state->box,
655                         md,
656                         nrnb,
657                         &vcm,
658                         nullptr,
659                         enerd,
660                         force_vir,
661                         shake_vir,
662                         total_vir,
663                         pres,
664                         &nullSignaller,
665                         state->box,
666                         &bSumEkinhOld,
667                         cglo_flags_iteration,
668                         step,
669                         &observablesReducer);
670         // Clean up after pre-step use of compute_globals()
671         observablesReducer.markAsReadyToReduce();
672
673         if (cglo_flags_iteration & CGLO_STOPCM)
674         {
675             /* At initialization, do not pass x with acceleration-correction mode
676              * to avoid (incorrect) correction of the initial coordinates.
677              */
678             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
679                              ? ArrayRef<RVec>()
680                              : makeArrayRef(state->x);
681             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
682             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
683         }
684     }
685     if (ir->eI == IntegrationAlgorithm::VVAK)
686     {
687         /* a second call to get the half step temperature initialized as well */
688         /* we do the same call as above, but turn the pressure off -- internally to
689            compute_globals, this is recognized as a velocity verlet half-step
690            kinetic energy calculation.  This minimized excess variables, but
691            perhaps loses some logic?*/
692
693         compute_globals(gstat,
694                         cr,
695                         ir,
696                         fr,
697                         ekind,
698                         makeConstArrayRef(state->x),
699                         makeConstArrayRef(state->v),
700                         state->box,
701                         md,
702                         nrnb,
703                         &vcm,
704                         nullptr,
705                         enerd,
706                         force_vir,
707                         shake_vir,
708                         total_vir,
709                         pres,
710                         &nullSignaller,
711                         state->box,
712                         &bSumEkinhOld,
713                         cglo_flags & ~CGLO_PRESSURE,
714                         step,
715                         &observablesReducer);
716         // Clean up after pre-step use of compute_globals()
717         observablesReducer.markAsReadyToReduce();
718     }
719
720     /* Calculate the initial half step temperature, and save the ekinh_old */
721     if (startingBehavior == StartingBehavior::NewSimulation)
722     {
723         for (i = 0; (i < ir->opts.ngtc); i++)
724         {
725             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
726         }
727     }
728
729     /* need to make an initiation call to get the Trotter variables set, as well as other constants
730        for non-trotter temperature control */
731     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
732
733     if (MASTER(cr))
734     {
735         if (!ir->bContinuation)
736         {
737             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
738             {
739                 fprintf(fplog,
740                         "RMS relative constraint deviation after constraining: %.2e\n",
741                         constr->rmsd());
742             }
743             if (EI_STATE_VELOCITY(ir->eI))
744             {
745                 real temp = enerd->term[F_TEMP];
746                 if (ir->eI != IntegrationAlgorithm::VV)
747                 {
748                     /* Result of Ekin averaged over velocities of -half
749                      * and +half step, while we only have -half step here.
750                      */
751                     temp *= 2;
752                 }
753                 fprintf(fplog, "Initial temperature: %g K\n", temp);
754             }
755         }
756
757         char tbuf[20];
758         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
759         if (ir->nsteps >= 0)
760         {
761             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
762         }
763         else
764         {
765             sprintf(tbuf, "%s", "infinite");
766         }
767         if (ir->init_step > 0)
768         {
769             fprintf(stderr,
770                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
771                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
772                     tbuf,
773                     gmx_step_str(ir->init_step, sbuf2),
774                     ir->init_step * ir->delta_t);
775         }
776         else
777         {
778             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
779         }
780         fprintf(fplog, "\n");
781     }
782
783     walltime_accounting_start_time(walltime_accounting);
784     wallcycle_start(wcycle, WallCycleCounter::Run);
785     print_start(fplog, cr, walltime_accounting, "mdrun");
786
787     /***********************************************************
788      *
789      *             Loop over MD steps
790      *
791      ************************************************************/
792
793     bFirstStep = TRUE;
794     /* Skip the first Nose-Hoover integration when we get the state from tpx */
795     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
796     bSumEkinhOld     = FALSE;
797     bExchanged       = FALSE;
798     bNeedRepartition = FALSE;
799
800     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
801             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
802             simulationsShareState,
803             MASTER(cr),
804             ir->nstlist,
805             mdrunOptions.reproducible,
806             nstSignalComm,
807             mdrunOptions.maximumHoursToRun,
808             ir->nstlist == 0,
809             fplog,
810             step,
811             bNS,
812             walltime_accounting);
813
814     auto checkpointHandler = std::make_unique<CheckpointHandler>(
815             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
816             simulationsShareState,
817             ir->nstlist == 0,
818             MASTER(cr),
819             mdrunOptions.writeConfout,
820             mdrunOptions.checkpointOptions.period);
821
822     const bool resetCountersIsLocal = true;
823     auto       resetHandler         = std::make_unique<ResetHandler>(
824             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
825             !resetCountersIsLocal,
826             ir->nsteps,
827             MASTER(cr),
828             mdrunOptions.timingOptions.resetHalfway,
829             mdrunOptions.maximumHoursToRun,
830             mdlog,
831             wcycle,
832             walltime_accounting);
833
834     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
835
836     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
837     {
838         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
839     }
840
841     /* and stop now if we should */
842     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
843     while (!bLastStep)
844     {
845
846         /* Determine if this is a neighbor search step */
847         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
848
849         if (bPMETune && bNStList)
850         {
851             // This has to be here because PME load balancing is called so early.
852             // TODO: Move to after all booleans are defined.
853             if (useGpuForUpdate && !bFirstStep)
854             {
855                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
856                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
857             }
858             /* PME grid + cut-off optimization with GPUs or PME nodes */
859             pme_loadbal_do(pme_loadbal,
860                            cr,
861                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
862                            fplog,
863                            mdlog,
864                            *ir,
865                            fr,
866                            state->box,
867                            state->x,
868                            wcycle,
869                            step,
870                            step_rel,
871                            &bPMETunePrinting,
872                            simulationWork.useGpuPmePpCommunication);
873         }
874
875         wallcycle_start(wcycle, WallCycleCounter::Step);
876
877         bLastStep = (step_rel == ir->nsteps);
878         t         = t0 + step * ir->delta_t;
879
880         // TODO Refactor this, so that nstfep does not need a default value of zero
881         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
882         {
883             /* find and set the current lambdas */
884             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
885
886             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
887                            && (!bFirstStep));
888         }
889
890         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
891                      && do_per_step(step, replExParams.exchangeInterval));
892
893         if (doSimulatedAnnealing)
894         {
895             // TODO: Avoid changing inputrec (#3854)
896             // Simulated annealing updates the reference temperature.
897             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
898             update_annealing_target_temp(nonConstInputrec, t, &upd);
899         }
900
901         /* Stop Center of Mass motion */
902         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
903
904         /* Determine whether or not to do Neighbour Searching */
905         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
906
907         /* Note that the stopHandler will cause termination at nstglobalcomm
908          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
909          * nstpcouple steps, we have computed the half-step kinetic energy
910          * of the previous step and can always output energies at the last step.
911          */
912         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
913
914         /* do_log triggers energy and virial calculation. Because this leads
915          * to different code paths, forces can be different. Thus for exact
916          * continuation we should avoid extra log output.
917          * Note that the || bLastStep can result in non-exact continuation
918          * beyond the last step. But we don't consider that to be an issue.
919          */
920         do_log     = (do_per_step(step, ir->nstlog)
921                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
922         do_verbose = mdrunOptions.verbose
923                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
924
925         if (useGpuForUpdate && !bFirstStep && bNS)
926         {
927             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
928             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
929             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
930             // Copy coordinate from the GPU when needed at the search step.
931             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
932             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
933             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
934             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
935         }
936
937         // We only need to calculate virtual velocities if we are writing them in the current step
938         const bool needVirtualVelocitiesThisStep =
939                 (vsite != nullptr)
940                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
941
942         if (vsite != nullptr)
943         {
944             // Virtual sites need to be updated before domain decomposition and forces are calculated
945             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
946             // md-vv calculates virtual velocities once it has full-step real velocities
947             vsite->construct(state->x,
948                              state->v,
949                              state->box,
950                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
951                                      ? VSiteOperation::PositionsAndVelocities
952                                      : VSiteOperation::Positions);
953             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
954         }
955
956         if (bNS && !(bFirstStep && ir->bContinuation))
957         {
958             bMasterState = FALSE;
959             /* Correct the new box if it is too skewed */
960             if (inputrecDynamicBox(ir))
961             {
962                 if (correct_box(fplog, step, state->box))
963                 {
964                     bMasterState = TRUE;
965                     // If update is offloaded, it should be informed about the box size change
966                     if (useGpuForUpdate)
967                     {
968                         integrator->setPbc(PbcType::Xyz, state->box);
969                     }
970                 }
971             }
972             if (haveDDAtomOrdering(*cr) && bMasterState)
973             {
974                 dd_collect_state(cr->dd, state, state_global);
975             }
976
977             if (haveDDAtomOrdering(*cr))
978             {
979                 /* Repartition the domain decomposition */
980                 dd_partition_system(fplog,
981                                     mdlog,
982                                     step,
983                                     cr,
984                                     bMasterState,
985                                     nstglobalcomm,
986                                     state_global,
987                                     top_global,
988                                     *ir,
989                                     imdSession,
990                                     pull_work,
991                                     state,
992                                     &f,
993                                     mdAtoms,
994                                     top,
995                                     fr,
996                                     vsite,
997                                     constr,
998                                     nrnb,
999                                     wcycle,
1000                                     do_verbose && !bPMETunePrinting);
1001                 upd.updateAfterPartition(state->natoms,
1002                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1003                                                      : gmx::ArrayRef<const unsigned short>(),
1004                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1005                                                  : gmx::ArrayRef<const unsigned short>());
1006                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1007             }
1008         }
1009
1010         // Allocate or re-size GPU halo exchange object, if necessary
1011         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1012         {
1013             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1014                                "GPU device manager has to be initialized to use GPU "
1015                                "version of halo exchange.");
1016             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1017         }
1018
1019         if (MASTER(cr) && do_log)
1020         {
1021             gmx::EnergyOutput::printHeader(
1022                     fplog, step, t); /* can we improve the information printed here? */
1023         }
1024
1025         if (ir->efep != FreeEnergyPerturbationType::No)
1026         {
1027             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1028         }
1029
1030         if (bExchanged)
1031         {
1032             /* We need the kinetic energy at minus the half step for determining
1033              * the full step kinetic energy and possibly for T-coupling.*/
1034             /* This may not be quite working correctly yet . . . . */
1035             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1036             compute_globals(gstat,
1037                             cr,
1038                             ir,
1039                             fr,
1040                             ekind,
1041                             makeConstArrayRef(state->x),
1042                             makeConstArrayRef(state->v),
1043                             state->box,
1044                             md,
1045                             nrnb,
1046                             &vcm,
1047                             wcycle,
1048                             enerd,
1049                             nullptr,
1050                             nullptr,
1051                             nullptr,
1052                             nullptr,
1053                             &nullSignaller,
1054                             state->box,
1055                             &bSumEkinhOld,
1056                             cglo_flags,
1057                             step,
1058                             &observablesReducer);
1059         }
1060         clear_mat(force_vir);
1061
1062         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1063
1064         /* Determine the energy and pressure:
1065          * at nstcalcenergy steps and at energy output steps (set below).
1066          */
1067         if (EI_VV(ir->eI) && (!bInitStep))
1068         {
1069             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1070             bCalcVir      = bCalcEnerStep
1071                        || (ir->epc != PressureCoupling::No
1072                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1073         }
1074         else
1075         {
1076             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1077             bCalcVir      = bCalcEnerStep
1078                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1079         }
1080         bCalcEner = bCalcEnerStep;
1081
1082         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1083
1084         if (do_ene || do_log || bDoReplEx)
1085         {
1086             bCalcVir  = TRUE;
1087             bCalcEner = TRUE;
1088         }
1089
1090         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1091         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1092                                   && (do_per_step(step, nstfep) || bCalcEner));
1093
1094         /* Do we need global communication ? */
1095         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1096                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1097
1098         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1099                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1100                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1101         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1102         {
1103             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1104             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1105         }
1106
1107         if (shellfc)
1108         {
1109             /* Now is the time to relax the shells */
1110             relax_shell_flexcon(fplog,
1111                                 cr,
1112                                 ms,
1113                                 mdrunOptions.verbose,
1114                                 enforcedRotation,
1115                                 step,
1116                                 ir,
1117                                 imdSession,
1118                                 pull_work,
1119                                 bNS,
1120                                 force_flags,
1121                                 top,
1122                                 constr,
1123                                 enerd,
1124                                 state->natoms,
1125                                 state->x.arrayRefWithPadding(),
1126                                 state->v.arrayRefWithPadding(),
1127                                 state->box,
1128                                 state->lambda,
1129                                 &state->hist,
1130                                 &f.view(),
1131                                 force_vir,
1132                                 *md,
1133                                 fr->longRangeNonbondeds.get(),
1134                                 nrnb,
1135                                 wcycle,
1136                                 shellfc,
1137                                 fr,
1138                                 runScheduleWork,
1139                                 t,
1140                                 mu_tot,
1141                                 vsite,
1142                                 ddBalanceRegionHandler);
1143         }
1144         else
1145         {
1146             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1147                is updated (or the AWH update will be performed twice for one step when continuing).
1148                It would be best to call this update function from do_md_trajectory_writing but that
1149                would occur after do_force. One would have to divide the update_awh function into one
1150                function applying the AWH force and one doing the AWH bias update. The update AWH
1151                bias function could then be called after do_md_trajectory_writing (then containing
1152                update_awh_history). The checkpointing will in the future probably moved to the start
1153                of the md loop which will rid of this issue. */
1154             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1155             {
1156                 awh->updateHistory(state_global->awhHistory.get());
1157             }
1158
1159             /* The coordinates (x) are shifted (to get whole molecules)
1160              * in do_force.
1161              * This is parallellized as well, and does communication too.
1162              * Check comments in sim_util.c
1163              */
1164             do_force(fplog,
1165                      cr,
1166                      ms,
1167                      *ir,
1168                      awh.get(),
1169                      enforcedRotation,
1170                      imdSession,
1171                      pull_work,
1172                      step,
1173                      nrnb,
1174                      wcycle,
1175                      top,
1176                      state->box,
1177                      state->x.arrayRefWithPadding(),
1178                      &state->hist,
1179                      &f.view(),
1180                      force_vir,
1181                      md,
1182                      enerd,
1183                      state->lambda,
1184                      fr,
1185                      runScheduleWork,
1186                      vsite,
1187                      mu_tot,
1188                      t,
1189                      ed ? ed->getLegacyED() : nullptr,
1190                      fr->longRangeNonbondeds.get(),
1191                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1192                      ddBalanceRegionHandler);
1193         }
1194
1195         // VV integrators do not need the following velocity half step
1196         // if it is the first step after starting from a checkpoint.
1197         // That is, the half step is needed on all other steps, and
1198         // also the first step when starting from a .tpr file.
1199         if (EI_VV(ir->eI))
1200         {
1201             integrateVVFirstStep(step,
1202                                  bFirstStep,
1203                                  bInitStep,
1204                                  startingBehavior,
1205                                  nstglobalcomm,
1206                                  ir,
1207                                  fr,
1208                                  cr,
1209                                  state,
1210                                  mdAtoms->mdatoms(),
1211                                  &fcdata,
1212                                  &MassQ,
1213                                  &vcm,
1214                                  enerd,
1215                                  &observablesReducer,
1216                                  ekind,
1217                                  gstat,
1218                                  &last_ekin,
1219                                  bCalcVir,
1220                                  total_vir,
1221                                  shake_vir,
1222                                  force_vir,
1223                                  pres,
1224                                  M,
1225                                  do_log,
1226                                  do_ene,
1227                                  bCalcEner,
1228                                  bGStat,
1229                                  bStopCM,
1230                                  bTrotter,
1231                                  bExchanged,
1232                                  &bSumEkinhOld,
1233                                  &saved_conserved_quantity,
1234                                  &f,
1235                                  &upd,
1236                                  constr,
1237                                  &nullSignaller,
1238                                  trotter_seq,
1239                                  nrnb,
1240                                  fplog,
1241                                  wcycle);
1242             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1243             {
1244                 // Positions were calculated earlier
1245                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1246                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1247                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1248             }
1249         }
1250
1251         /* ########  END FIRST UPDATE STEP  ############## */
1252         /* ########  If doing VV, we now have v(dt) ###### */
1253         if (bDoExpanded)
1254         {
1255             /* perform extended ensemble sampling in lambda - we don't
1256                actually move to the new state before outputting
1257                statistics, but if performing simulated tempering, we
1258                do update the velocities and the tau_t. */
1259             // TODO: Avoid changing inputrec (#3854)
1260             // Simulated tempering updates the reference temperature.
1261             // Expanded ensemble without simulated tempering does not change the inputrec.
1262             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1263             lamnew                 = ExpandedEnsembleDynamics(fplog,
1264                                               nonConstInputrec,
1265                                               enerd,
1266                                               state,
1267                                               &MassQ,
1268                                               state->fep_state,
1269                                               state->dfhist,
1270                                               step,
1271                                               state->v.rvec_array(),
1272                                               md->homenr,
1273                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1274                                                                       : gmx::ArrayRef<const unsigned short>());
1275             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1276             if (MASTER(cr))
1277             {
1278                 copy_df_history(state_global->dfhist, state->dfhist);
1279             }
1280         }
1281
1282         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1283         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1284         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1285             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1286                 || checkpointHandler->isCheckpointingStep()))
1287         {
1288             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1289             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1290         }
1291         // Copy velocities if needed for the output/checkpointing.
1292         // NOTE: Copy on the search steps is done at the beginning of the step.
1293         if (useGpuForUpdate && !bNS
1294             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1295         {
1296             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1297             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1298         }
1299         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1300         // and update is offloaded hence forces are kept on the GPU for update and have not been
1301         // already transferred in do_force().
1302         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1303         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1304         //       prior to GPU update.
1305         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1306         //       copy call in do_force(...).
1307         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1308         //       on host after the D2H copy in do_force(...).
1309         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1310             && do_per_step(step, ir->nstfout))
1311         {
1312             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1313             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1314         }
1315         /* Now we have the energies and forces corresponding to the
1316          * coordinates at time t. We must output all of this before
1317          * the update.
1318          */
1319         do_md_trajectory_writing(fplog,
1320                                  cr,
1321                                  nfile,
1322                                  fnm,
1323                                  step,
1324                                  step_rel,
1325                                  t,
1326                                  ir,
1327                                  state,
1328                                  state_global,
1329                                  observablesHistory,
1330                                  top_global,
1331                                  fr,
1332                                  outf,
1333                                  energyOutput,
1334                                  ekind,
1335                                  f.view().force(),
1336                                  checkpointHandler->isCheckpointingStep(),
1337                                  bRerunMD,
1338                                  bLastStep,
1339                                  mdrunOptions.writeConfout,
1340                                  bSumEkinhOld);
1341         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1342         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1343
1344         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1345         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1346             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1347         {
1348             copy_mat(state->svir_prev, shake_vir);
1349             copy_mat(state->fvir_prev, force_vir);
1350         }
1351
1352         stopHandler->setSignal();
1353         resetHandler->setSignal(walltime_accounting);
1354
1355         if (bGStat || !PAR(cr))
1356         {
1357             /* In parallel we only have to check for checkpointing in steps
1358              * where we do global communication,
1359              *  otherwise the other nodes don't know.
1360              */
1361             checkpointHandler->setSignal(walltime_accounting);
1362         }
1363
1364         /* #########   START SECOND UPDATE STEP ################# */
1365
1366         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1367            controlled in preprocessing */
1368
1369         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1370         {
1371             gmx_bool bIfRandomize;
1372             bIfRandomize = update_randomize_velocities(ir,
1373                                                        step,
1374                                                        cr,
1375                                                        md->homenr,
1376                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1377                                                                : gmx::ArrayRef<const unsigned short>(),
1378                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1379                                                        state->v,
1380                                                        &upd,
1381                                                        constr);
1382             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1383             if (constr && bIfRandomize)
1384             {
1385                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1386             }
1387         }
1388         /* Box is changed in update() when we do pressure coupling,
1389          * but we should still use the old box for energy corrections and when
1390          * writing it to the energy file, so it matches the trajectory files for
1391          * the same timestep above. Make a copy in a separate array.
1392          */
1393         copy_mat(state->box, lastbox);
1394
1395         dvdl_constr = 0;
1396
1397         if (!useGpuForUpdate)
1398         {
1399             wallcycle_start(wcycle, WallCycleCounter::Update);
1400         }
1401         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1402         if (bTrotter)
1403         {
1404             trotter_update(ir,
1405                            step,
1406                            ekind,
1407                            enerd,
1408                            state,
1409                            total_vir,
1410                            md->homenr,
1411                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1412                                    : gmx::ArrayRef<const unsigned short>(),
1413                            gmx::arrayRefFromArray(md->invmass, md->nr),
1414                            &MassQ,
1415                            trotter_seq,
1416                            TrotterSequence::Three);
1417             /* We can only do Berendsen coupling after we have summed
1418              * the kinetic energy or virial. Since the happens
1419              * in global_state after update, we should only do it at
1420              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1421              */
1422         }
1423         else
1424         {
1425             update_tcouple(step,
1426                            ir,
1427                            state,
1428                            ekind,
1429                            &MassQ,
1430                            md->homenr,
1431                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1432                                    : gmx::ArrayRef<const unsigned short>());
1433             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1434         }
1435
1436         /* With leap-frog type integrators we compute the kinetic energy
1437          * at a whole time step as the average of the half-time step kinetic
1438          * energies of two subsequent steps. Therefore we need to compute the
1439          * half step kinetic energy also if we need energies at the next step.
1440          */
1441         const bool needHalfStepKineticEnergy =
1442                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1443
1444         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1445         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1446         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1447                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1448
1449         if (EI_VV(ir->eI))
1450         {
1451             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1452
1453             integrateVVSecondStep(step,
1454                                   ir,
1455                                   fr,
1456                                   cr,
1457                                   state,
1458                                   mdAtoms->mdatoms(),
1459                                   &fcdata,
1460                                   &MassQ,
1461                                   &vcm,
1462                                   pull_work,
1463                                   enerd,
1464                                   &observablesReducer,
1465                                   ekind,
1466                                   gstat,
1467                                   &dvdl_constr,
1468                                   bCalcVir,
1469                                   total_vir,
1470                                   shake_vir,
1471                                   force_vir,
1472                                   pres,
1473                                   M,
1474                                   lastbox,
1475                                   do_log,
1476                                   do_ene,
1477                                   bGStat,
1478                                   &bSumEkinhOld,
1479                                   &f,
1480                                   &cbuf,
1481                                   &upd,
1482                                   constr,
1483                                   &nullSignaller,
1484                                   trotter_seq,
1485                                   nrnb,
1486                                   wcycle);
1487         }
1488         else
1489         {
1490             if (useGpuForUpdate)
1491             {
1492                 if (bNS && (bFirstStep || haveDDAtomOrdering(*cr)))
1493                 {
1494                     integrator->set(stateGpu->getCoordinates(),
1495                                     stateGpu->getVelocities(),
1496                                     stateGpu->getForces(),
1497                                     top->idef,
1498                                     *md);
1499
1500                     // Copy data to the GPU after buffers might have being reinitialized
1501                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1502                      * the previous step. We don't check that now. */
1503                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1504                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1505                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1506                     {
1507                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1508                     }
1509                 }
1510
1511                 if ((simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1512                     || (!runScheduleWork->stepWork.useGpuFBufferOps))
1513                 {
1514                     // The PME forces were recieved to the host, and reduced on the CPU with the
1515                     // rest of the forces computed on the GPU, so the final forces have to be copied
1516                     // back to the GPU. Or the buffer ops were not offloaded this step, so the
1517                     // forces are on the host and have to be copied
1518                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1519                 }
1520                 const bool doTemperatureScaling =
1521                         (ir->etc != TemperatureCoupling::No
1522                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1523
1524                 // This applies Leap-Frog, LINCS and SETTLE in succession
1525                 integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
1526                                               runScheduleWork->stepWork, runScheduleWork->simulationWork),
1527                                       ir->delta_t,
1528                                       true,
1529                                       bCalcVir,
1530                                       shake_vir,
1531                                       doTemperatureScaling,
1532                                       ekind->tcstat,
1533                                       doParrinelloRahman,
1534                                       ir->nstpcouple * ir->delta_t,
1535                                       M);
1536
1537                 // Copy velocities D2H after update if:
1538                 // - Globals are computed this step (includes the energy output steps).
1539                 // - Temperature is needed for the next step.
1540                 if (bGStat || needHalfStepKineticEnergy)
1541                 {
1542                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1543                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1544                 }
1545             }
1546             else
1547             {
1548                 /* With multiple time stepping we need to do an additional normal
1549                  * update step to obtain the virial, as the actual MTS integration
1550                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1551                  * Using that acceleration would result in a virial with the slow
1552                  * force contribution would be a factor mtsFactor too large.
1553                  */
1554                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1555                 {
1556                     upd.update_for_constraint_virial(*ir,
1557                                                      md->homenr,
1558                                                      md->havePartiallyFrozenAtoms,
1559                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1560                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1561                                                      *state,
1562                                                      f.view().forceWithPadding(),
1563                                                      *ekind);
1564
1565                     constrain_coordinates(constr,
1566                                           do_log,
1567                                           do_ene,
1568                                           step,
1569                                           state,
1570                                           upd.xp()->arrayRefWithPadding(),
1571                                           &dvdl_constr,
1572                                           bCalcVir,
1573                                           shake_vir);
1574                 }
1575
1576                 ArrayRefWithPadding<const RVec> forceCombined =
1577                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1578                                 ? f.view().forceMtsCombinedWithPadding()
1579                                 : f.view().forceWithPadding();
1580                 upd.update_coords(*ir,
1581                                   step,
1582                                   md->homenr,
1583                                   md->havePartiallyFrozenAtoms,
1584                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1585                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1586                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1587                                   state,
1588                                   forceCombined,
1589                                   &fcdata,
1590                                   ekind,
1591                                   M,
1592                                   etrtPOSITION,
1593                                   cr,
1594                                   constr != nullptr);
1595
1596                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1597
1598                 constrain_coordinates(constr,
1599                                       do_log,
1600                                       do_ene,
1601                                       step,
1602                                       state,
1603                                       upd.xp()->arrayRefWithPadding(),
1604                                       &dvdl_constr,
1605                                       bCalcVir && !simulationWork.useMts,
1606                                       shake_vir);
1607
1608                 upd.update_sd_second_half(*ir,
1609                                           step,
1610                                           &dvdl_constr,
1611                                           md->homenr,
1612                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1613                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1614                                           state,
1615                                           cr,
1616                                           nrnb,
1617                                           wcycle,
1618                                           constr,
1619                                           do_log,
1620                                           do_ene);
1621                 upd.finish_update(
1622                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1623             }
1624
1625             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1626             {
1627                 updatePrevStepPullCom(pull_work, state);
1628             }
1629
1630             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1631         }
1632
1633         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1634         /* With Leap-Frog we can skip compute_globals at
1635          * non-communication steps, but we need to calculate
1636          * the kinetic energy one step before communication.
1637          */
1638         {
1639             // Organize to do inter-simulation signalling on steps if
1640             // and when algorithms require it.
1641             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1642
1643             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1644             {
1645                 // Copy coordinates when needed to stop the CM motion.
1646                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1647                 {
1648                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1649                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1650                 }
1651                 // Since we're already communicating at this step, we
1652                 // can propagate intra-simulation signals. Note that
1653                 // check_nstglobalcomm has the responsibility for
1654                 // choosing the value of nstglobalcomm that is one way
1655                 // bGStat becomes true, so we can't get into a
1656                 // situation where e.g. checkpointing can't be
1657                 // signalled.
1658                 bool                doIntraSimSignal = true;
1659                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1660
1661                 compute_globals(gstat,
1662                                 cr,
1663                                 ir,
1664                                 fr,
1665                                 ekind,
1666                                 makeConstArrayRef(state->x),
1667                                 makeConstArrayRef(state->v),
1668                                 state->box,
1669                                 md,
1670                                 nrnb,
1671                                 &vcm,
1672                                 wcycle,
1673                                 enerd,
1674                                 force_vir,
1675                                 shake_vir,
1676                                 total_vir,
1677                                 pres,
1678                                 &signaller,
1679                                 lastbox,
1680                                 &bSumEkinhOld,
1681                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1682                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1683                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1684                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1685                                 step,
1686                                 &observablesReducer);
1687                 if (!EI_VV(ir->eI) && bStopCM)
1688                 {
1689                     process_and_stopcm_grp(
1690                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1691                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1692
1693                     // TODO: The special case of removing CM motion should be dealt more gracefully
1694                     if (useGpuForUpdate)
1695                     {
1696                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1697                         // Here we block until the H2D copy completes because event sync with the
1698                         // force kernels that use the coordinates on the next steps is not implemented
1699                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1700                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1701                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1702                         if (vcm.mode != ComRemovalAlgorithm::No)
1703                         {
1704                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1705                         }
1706                     }
1707                 }
1708             }
1709         }
1710
1711         /* #############  END CALC EKIN AND PRESSURE ################# */
1712
1713         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1714            the virial that should probably be addressed eventually. state->veta has better properies,
1715            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1716            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1717
1718         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1719         {
1720             /* Sum up the foreign energy and dK/dl terms for md and sd.
1721                Currently done every step so that dH/dl is correct in the .edr */
1722             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1723         }
1724
1725         update_pcouple_after_coordinates(fplog,
1726                                          step,
1727                                          ir,
1728                                          md->homenr,
1729                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1730                                                      : gmx::ArrayRef<const unsigned short>(),
1731                                          pres,
1732                                          force_vir,
1733                                          shake_vir,
1734                                          pressureCouplingMu,
1735                                          state,
1736                                          nrnb,
1737                                          upd.deform(),
1738                                          !useGpuForUpdate);
1739
1740         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1741                                                   && do_per_step(step, inputrec->nstpcouple));
1742         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1743                                                  && do_per_step(step, inputrec->nstpcouple));
1744         if (useGpuForUpdate
1745             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1746         {
1747             integrator->scaleCoordinates(pressureCouplingMu);
1748             if (doCRescalePressureCoupling)
1749             {
1750                 matrix pressureCouplingInvMu;
1751                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1752                 integrator->scaleVelocities(pressureCouplingInvMu);
1753             }
1754             integrator->setPbc(PbcType::Xyz, state->box);
1755         }
1756
1757         /* ################# END UPDATE STEP 2 ################# */
1758         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1759
1760         /* The coordinates (x) were unshifted in update */
1761         if (!bGStat)
1762         {
1763             /* We will not sum ekinh_old,
1764              * so signal that we still have to do it.
1765              */
1766             bSumEkinhOld = TRUE;
1767         }
1768
1769         if (bCalcEner)
1770         {
1771             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1772
1773             /* use the directly determined last velocity, not actually the averaged half steps */
1774             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1775             {
1776                 enerd->term[F_EKIN] = last_ekin;
1777             }
1778             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1779
1780             if (integratorHasConservedEnergyQuantity(ir))
1781             {
1782                 if (EI_VV(ir->eI))
1783                 {
1784                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1785                 }
1786                 else
1787                 {
1788                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1789                 }
1790             }
1791             /* #########  END PREPARING EDR OUTPUT  ###########  */
1792         }
1793
1794         /* Output stuff */
1795         if (MASTER(cr))
1796         {
1797             if (fplog && do_log && bDoExpanded)
1798             {
1799                 /* only needed if doing expanded ensemble */
1800                 PrintFreeEnergyInfoToFile(fplog,
1801                                           ir->fepvals.get(),
1802                                           ir->expandedvals.get(),
1803                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1804                                           state_global->dfhist,
1805                                           state->fep_state,
1806                                           ir->nstlog,
1807                                           step);
1808             }
1809             if (bCalcEner)
1810             {
1811                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1812
1813                 energyOutput.addDataAtEnergyStep(outputDHDL,
1814                                                  bCalcEnerStep,
1815                                                  t,
1816                                                  md->tmass,
1817                                                  enerd,
1818                                                  ir->fepvals.get(),
1819                                                  ir->expandedvals.get(),
1820                                                  lastbox,
1821                                                  PTCouplingArrays{ state->boxv,
1822                                                                    state->nosehoover_xi,
1823                                                                    state->nosehoover_vxi,
1824                                                                    state->nhpres_xi,
1825                                                                    state->nhpres_vxi },
1826                                                  state->fep_state,
1827                                                  total_vir,
1828                                                  pres,
1829                                                  ekind,
1830                                                  mu_tot,
1831                                                  constr);
1832             }
1833             else
1834             {
1835                 energyOutput.recordNonEnergyStep();
1836             }
1837
1838             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1839             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1840
1841             if (doSimulatedAnnealing)
1842             {
1843                 gmx::EnergyOutput::printAnnealingTemperatures(
1844                         do_log ? fplog : nullptr, groups, &(ir->opts));
1845             }
1846             if (do_log || do_ene || do_dr || do_or)
1847             {
1848                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1849                                                    do_ene,
1850                                                    do_dr,
1851                                                    do_or,
1852                                                    do_log ? fplog : nullptr,
1853                                                    step,
1854                                                    t,
1855                                                    fr->fcdata.get(),
1856                                                    awh.get());
1857             }
1858             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1859             {
1860                 const bool isInitialOutput = false;
1861                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1862             }
1863
1864             if (ir->bPull)
1865             {
1866                 pull_print_output(pull_work, step, t);
1867             }
1868
1869             if (do_per_step(step, ir->nstlog))
1870             {
1871                 if (fflush(fplog) != 0)
1872                 {
1873                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1874                 }
1875             }
1876         }
1877         if (bDoExpanded)
1878         {
1879             /* Have to do this part _after_ outputting the logfile and the edr file */
1880             /* Gets written into the state at the beginning of next loop*/
1881             state->fep_state = lamnew;
1882         }
1883         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1884         {
1885             state->fep_state = awh->fepLambdaState();
1886         }
1887         /* Print the remaining wall clock time for the run */
1888         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1889         {
1890             if (shellfc)
1891             {
1892                 fprintf(stderr, "\n");
1893             }
1894             print_time(stderr, walltime_accounting, step, ir, cr);
1895         }
1896
1897         /* Ion/water position swapping.
1898          * Not done in last step since trajectory writing happens before this call
1899          * in the MD loop and exchanges would be lost anyway. */
1900         bNeedRepartition = FALSE;
1901         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1902             && do_per_step(step, ir->swap->nstswap))
1903         {
1904             bNeedRepartition = do_swapcoords(cr,
1905                                              step,
1906                                              t,
1907                                              ir,
1908                                              swap,
1909                                              wcycle,
1910                                              as_rvec_array(state->x.data()),
1911                                              state->box,
1912                                              MASTER(cr) && mdrunOptions.verbose,
1913                                              bRerunMD);
1914
1915             if (bNeedRepartition && haveDDAtomOrdering(*cr))
1916             {
1917                 dd_collect_state(cr->dd, state, state_global);
1918             }
1919         }
1920
1921         /* Replica exchange */
1922         bExchanged = FALSE;
1923         if (bDoReplEx)
1924         {
1925             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1926         }
1927
1928         if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
1929         {
1930             dd_partition_system(fplog,
1931                                 mdlog,
1932                                 step,
1933                                 cr,
1934                                 TRUE,
1935                                 1,
1936                                 state_global,
1937                                 top_global,
1938                                 *ir,
1939                                 imdSession,
1940                                 pull_work,
1941                                 state,
1942                                 &f,
1943                                 mdAtoms,
1944                                 top,
1945                                 fr,
1946                                 vsite,
1947                                 constr,
1948                                 nrnb,
1949                                 wcycle,
1950                                 FALSE);
1951             upd.updateAfterPartition(state->natoms,
1952                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1953                                                  : gmx::ArrayRef<const unsigned short>(),
1954                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1955                                              : gmx::ArrayRef<const unsigned short>());
1956             fr->longRangeNonbondeds->updateAfterPartition(*md);
1957         }
1958
1959         bFirstStep = FALSE;
1960         bInitStep  = FALSE;
1961
1962         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1963         /* With all integrators, except VV, we need to retain the pressure
1964          * at the current step for coupling at the next step.
1965          */
1966         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1967             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1968         {
1969             /* Store the pressure in t_state for pressure coupling
1970              * at the next MD step.
1971              */
1972             copy_mat(pres, state->pres_prev);
1973         }
1974
1975         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1976
1977         if ((membed != nullptr) && (!bLastStep))
1978         {
1979             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1980         }
1981
1982         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1983         if (haveDDAtomOrdering(*cr) && wcycle)
1984         {
1985             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1986         }
1987
1988         /* increase the MD step number */
1989         step++;
1990         step_rel++;
1991         observablesReducer.markAsReadyToReduce();
1992
1993 #if GMX_FAHCORE
1994         if (MASTER(cr))
1995         {
1996             fcReportProgress(ir->nsteps + ir->init_step, step);
1997         }
1998 #endif
1999
2000         resetHandler->resetCounters(
2001                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2002
2003         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2004         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2005     }
2006     /* End of main MD loop */
2007
2008     /* Closing TNG files can include compressing data. Therefore it is good to do that
2009      * before stopping the time measurements. */
2010     mdoutf_tng_close(outf);
2011
2012     /* Stop measuring walltime */
2013     walltime_accounting_end_time(walltime_accounting);
2014
2015     if (simulationWork.haveSeparatePmeRank)
2016     {
2017         /* Tell the PME only node to finish */
2018         gmx_pme_send_finish(cr);
2019     }
2020
2021     if (MASTER(cr))
2022     {
2023         if (ir->nstcalcenergy > 0)
2024         {
2025             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2026
2027             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2028             energyOutput.printAverages(fplog, groups);
2029         }
2030     }
2031     done_mdoutf(outf);
2032
2033     if (bPMETune)
2034     {
2035         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2036     }
2037
2038     done_shellfc(fplog, shellfc, step_rel);
2039
2040     if (useReplicaExchange && MASTER(cr))
2041     {
2042         print_replica_exchange_statistics(fplog, repl_ex);
2043     }
2044
2045     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2046
2047     global_stat_destroy(gstat);
2048 }