Fix incorrect event dependency of GPU update
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/observablesreducer.h"
130 #include "gromacs/mdtypes/pullhistory.h"
131 #include "gromacs/mdtypes/simulation_workload.h"
132 #include "gromacs/mdtypes/state.h"
133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
134 #include "gromacs/modularsimulator/energydata.h"
135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
136 #include "gromacs/nbnxm/nbnxm.h"
137 #include "gromacs/pbcutil/pbc.h"
138 #include "gromacs/pulling/output.h"
139 #include "gromacs/pulling/pull.h"
140 #include "gromacs/swap/swapcoords.h"
141 #include "gromacs/timing/wallcycle.h"
142 #include "gromacs/timing/walltime_accounting.h"
143 #include "gromacs/topology/atoms.h"
144 #include "gromacs/topology/idef.h"
145 #include "gromacs/topology/mtop_util.h"
146 #include "gromacs/topology/topology.h"
147 #include "gromacs/trajectory/trajectoryframe.h"
148 #include "gromacs/utility/basedefinitions.h"
149 #include "gromacs/utility/cstringutil.h"
150 #include "gromacs/utility/fatalerror.h"
151 #include "gromacs/utility/logger.h"
152 #include "gromacs/utility/real.h"
153 #include "gromacs/utility/smalloc.h"
154
155 #include "legacysimulator.h"
156 #include "replicaexchange.h"
157 #include "shellfc.h"
158
159 using gmx::SimulationSignaller;
160
161 void gmx::LegacySimulator::do_md()
162 {
163     // TODO Historically, the EM and MD "integrators" used different
164     // names for the t_inputrec *parameter, but these must have the
165     // same name, now that it's a member of a struct. We use this ir
166     // alias to avoid a large ripple of nearly useless changes.
167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
168     // will go away eventually.
169     const t_inputrec* ir = inputrec;
170
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
361
362     ForceBuffers     f(simulationWork.useMts,
363                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
364                                ? PinningPolicy::PinnedIfSupported
365                                : PinningPolicy::CannotBePinned);
366     const t_mdatoms* md = mdAtoms->mdatoms();
367     if (DOMAINDECOMP(cr))
368     {
369         // Local state only becomes valid now.
370         dd_init_local_state(*cr->dd, state_global, state);
371
372         /* Distribute the charge groups over the nodes from the master node */
373         dd_partition_system(fplog,
374                             mdlog,
375                             ir->init_step,
376                             cr,
377                             TRUE,
378                             1,
379                             state_global,
380                             top_global,
381                             *ir,
382                             imdSession,
383                             pull_work,
384                             state,
385                             &f,
386                             mdAtoms,
387                             top,
388                             fr,
389                             vsite,
390                             constr,
391                             nrnb,
392                             nullptr,
393                             FALSE);
394         upd.updateAfterPartition(state->natoms,
395                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
396                                              : gmx::ArrayRef<const unsigned short>(),
397                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
398                                          : gmx::ArrayRef<const unsigned short>());
399         fr->longRangeNonbondeds->updateAfterPartition(*md);
400     }
401     else
402     {
403         state_change_natoms(state_global, state_global->natoms);
404
405         /* Generate and initialize new topology */
406         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
407
408         upd.updateAfterPartition(state->natoms,
409                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
410                                              : gmx::ArrayRef<const unsigned short>(),
411                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
412                                          : gmx::ArrayRef<const unsigned short>());
413         fr->longRangeNonbondeds->updateAfterPartition(*md);
414     }
415
416     std::unique_ptr<UpdateConstrainGpu> integrator;
417
418     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
419
420     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
421     if (useGpuForUpdate)
422     {
423         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
424                                    || constr->numConstraintsTotal() == 0,
425                            "Constraints in domain decomposition are only supported with update "
426                            "groups if using GPU update.\n");
427         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
428                                    || constr->numConstraintsTotal() == 0,
429                            "SHAKE is not supported with GPU update.");
430         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
431                            "Either PME or short-ranged non-bonded interaction tasks must run on "
432                            "the GPU to use GPU update.\n");
433         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
434                            "Only the md integrator is supported with the GPU update.\n");
435         GMX_RELEASE_ASSERT(
436                 ir->etc != TemperatureCoupling::NoseHoover,
437                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
438         GMX_RELEASE_ASSERT(
439                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
440                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
441                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
442                 "with the GPU update.\n");
443         GMX_RELEASE_ASSERT(!md->haveVsites,
444                            "Virtual sites are not supported with the GPU update.\n");
445         GMX_RELEASE_ASSERT(ed == nullptr,
446                            "Essential dynamics is not supported with the GPU update.\n");
447         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
448                            "Constraints pulling is not supported with the GPU update.\n");
449         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
450                            "Orientation restraints are not supported with the GPU update.\n");
451         GMX_RELEASE_ASSERT(
452                 ir->efep == FreeEnergyPerturbationType::No
453                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
454                 "Free energy perturbation of masses and constraints are not supported with the GPU "
455                 "update.");
456
457         if (constr != nullptr && constr->numConstraintsTotal() > 0)
458         {
459             GMX_LOG(mdlog.info)
460                     .asParagraph()
461                     .appendText("Updating coordinates and applying constraints on the GPU.");
462         }
463         else
464         {
465             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
466         }
467         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
468                            "Device stream manager should be initialized in order to use GPU "
469                            "update-constraints.");
470         GMX_RELEASE_ASSERT(
471                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
472                 "Update stream should be initialized in order to use GPU "
473                 "update-constraints.");
474         integrator = std::make_unique<UpdateConstrainGpu>(
475                 *ir,
476                 top_global,
477                 ekind->ngtc,
478                 fr->deviceStreamManager->context(),
479                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
480                 wcycle);
481
482         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
483
484         integrator->setPbc(PbcType::Xyz, state->box);
485     }
486
487     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
488     {
489         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
490     }
491     if (useGpuForUpdate)
492     {
493         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
494     }
495
496     // NOTE: The global state is no longer used at this point.
497     // But state_global is still used as temporary storage space for writing
498     // the global state to file and potentially for replica exchange.
499     // (Global topology should persist.)
500
501     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
502
503     if (ir->bExpanded)
504     {
505         /* Check nstexpanded here, because the grompp check was broken */
506         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
507         {
508             gmx_fatal(FARGS,
509                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
510         }
511         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
512     }
513
514     if (MASTER(cr))
515     {
516         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
517     }
518
519     preparePrevStepPullCom(ir,
520                            pull_work,
521                            gmx::arrayRefFromArray(md->massT, md->nr),
522                            state,
523                            state_global,
524                            cr,
525                            startingBehavior != StartingBehavior::NewSimulation);
526
527     // TODO: Remove this by converting AWH into a ForceProvider
528     auto awh = prepareAwhModule(fplog,
529                                 *ir,
530                                 state_global,
531                                 cr,
532                                 ms,
533                                 startingBehavior != StartingBehavior::NewSimulation,
534                                 shellfc != nullptr,
535                                 opt2fn("-awh", nfile, fnm),
536                                 pull_work);
537
538     if (useReplicaExchange && MASTER(cr))
539     {
540         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
541     }
542     /* PME tuning is only supported in the Verlet scheme, with PME for
543      * Coulomb. It is not supported with only LJ PME. */
544     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
545                 && ir->cutoff_scheme != CutoffScheme::Group);
546
547     pme_load_balancing_t* pme_loadbal = nullptr;
548     if (bPMETune)
549     {
550         pme_loadbal_init(
551                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
552     }
553
554     if (!ir->bContinuation)
555     {
556         if (state->flags & enumValueToBitMask(StateEntry::V))
557         {
558             auto v = makeArrayRef(state->v);
559             /* Set the velocities of vsites, shells and frozen atoms to zero */
560             for (i = 0; i < md->homenr; i++)
561             {
562                 if (md->ptype[i] == ParticleType::Shell)
563                 {
564                     clear_rvec(v[i]);
565                 }
566                 else if (md->cFREEZE)
567                 {
568                     for (m = 0; m < DIM; m++)
569                     {
570                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
571                         {
572                             v[i][m] = 0;
573                         }
574                     }
575                 }
576             }
577         }
578
579         if (constr)
580         {
581             /* Constrain the initial coordinates and velocities */
582             do_constrain_first(fplog,
583                                constr,
584                                ir,
585                                md->nr,
586                                md->homenr,
587                                state->x.arrayRefWithPadding(),
588                                state->v.arrayRefWithPadding(),
589                                state->box,
590                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
591         }
592     }
593
594     const int nstfep = computeFepPeriod(*ir, replExParams);
595
596     /* Be REALLY careful about what flags you set here. You CANNOT assume
597      * this is the first step, since we might be restarting from a checkpoint,
598      * and in that case we should not do any modifications to the state.
599      */
600     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
601
602     // When restarting from a checkpoint, it can be appropriate to
603     // initialize ekind from quantities in the checkpoint. Otherwise,
604     // compute_globals must initialize ekind before the simulation
605     // starts/restarts. However, only the master rank knows what was
606     // found in the checkpoint file, so we have to communicate in
607     // order to coordinate the restart.
608     //
609     // TODO Consider removing this communication if/when checkpoint
610     // reading directly follows .tpr reading, because all ranks can
611     // agree on hasReadEkinState at that time.
612     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
613     if (PAR(cr))
614     {
615         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
616     }
617     if (hasReadEkinState)
618     {
619         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
620     }
621
622     unsigned int cglo_flags =
623             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
624              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
625
626     bSumEkinhOld = FALSE;
627
628     t_vcm vcm(top_global.groups, *ir);
629     reportComRemovalInfo(fplog, vcm);
630
631     int64_t step     = ir->init_step;
632     int64_t step_rel = 0;
633
634     /* To minimize communication, compute_globals computes the COM velocity
635      * and the kinetic energy for the velocities without COM motion removed.
636      * Thus to get the kinetic energy without the COM contribution, we need
637      * to call compute_globals twice.
638      */
639     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
640     {
641         unsigned int cglo_flags_iteration = cglo_flags;
642         if (bStopCM && cgloIteration == 0)
643         {
644             cglo_flags_iteration |= CGLO_STOPCM;
645             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
646         }
647         compute_globals(gstat,
648                         cr,
649                         ir,
650                         fr,
651                         ekind,
652                         makeConstArrayRef(state->x),
653                         makeConstArrayRef(state->v),
654                         state->box,
655                         md,
656                         nrnb,
657                         &vcm,
658                         nullptr,
659                         enerd,
660                         force_vir,
661                         shake_vir,
662                         total_vir,
663                         pres,
664                         gmx::ArrayRef<real>{},
665                         &nullSignaller,
666                         state->box,
667                         &bSumEkinhOld,
668                         cglo_flags_iteration,
669                         step,
670                         &observablesReducer);
671         if (cglo_flags_iteration & CGLO_STOPCM)
672         {
673             /* At initialization, do not pass x with acceleration-correction mode
674              * to avoid (incorrect) correction of the initial coordinates.
675              */
676             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
677                              ? ArrayRef<RVec>()
678                              : makeArrayRef(state->x);
679             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
680             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
681         }
682     }
683     if (ir->eI == IntegrationAlgorithm::VVAK)
684     {
685         /* a second call to get the half step temperature initialized as well */
686         /* we do the same call as above, but turn the pressure off -- internally to
687            compute_globals, this is recognized as a velocity verlet half-step
688            kinetic energy calculation.  This minimized excess variables, but
689            perhaps loses some logic?*/
690
691         compute_globals(gstat,
692                         cr,
693                         ir,
694                         fr,
695                         ekind,
696                         makeConstArrayRef(state->x),
697                         makeConstArrayRef(state->v),
698                         state->box,
699                         md,
700                         nrnb,
701                         &vcm,
702                         nullptr,
703                         enerd,
704                         force_vir,
705                         shake_vir,
706                         total_vir,
707                         pres,
708                         gmx::ArrayRef<real>{},
709                         &nullSignaller,
710                         state->box,
711                         &bSumEkinhOld,
712                         cglo_flags & ~CGLO_PRESSURE,
713                         step,
714                         &observablesReducer);
715     }
716
717     /* Calculate the initial half step temperature, and save the ekinh_old */
718     if (startingBehavior == StartingBehavior::NewSimulation)
719     {
720         for (i = 0; (i < ir->opts.ngtc); i++)
721         {
722             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
723         }
724     }
725
726     /* need to make an initiation call to get the Trotter variables set, as well as other constants
727        for non-trotter temperature control */
728     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
729
730     if (MASTER(cr))
731     {
732         if (!ir->bContinuation)
733         {
734             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
735             {
736                 fprintf(fplog,
737                         "RMS relative constraint deviation after constraining: %.2e\n",
738                         constr->rmsd());
739             }
740             if (EI_STATE_VELOCITY(ir->eI))
741             {
742                 real temp = enerd->term[F_TEMP];
743                 if (ir->eI != IntegrationAlgorithm::VV)
744                 {
745                     /* Result of Ekin averaged over velocities of -half
746                      * and +half step, while we only have -half step here.
747                      */
748                     temp *= 2;
749                 }
750                 fprintf(fplog, "Initial temperature: %g K\n", temp);
751             }
752         }
753
754         char tbuf[20];
755         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
756         if (ir->nsteps >= 0)
757         {
758             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
759         }
760         else
761         {
762             sprintf(tbuf, "%s", "infinite");
763         }
764         if (ir->init_step > 0)
765         {
766             fprintf(stderr,
767                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
768                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
769                     tbuf,
770                     gmx_step_str(ir->init_step, sbuf2),
771                     ir->init_step * ir->delta_t);
772         }
773         else
774         {
775             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
776         }
777         fprintf(fplog, "\n");
778     }
779
780     walltime_accounting_start_time(walltime_accounting);
781     wallcycle_start(wcycle, WallCycleCounter::Run);
782     print_start(fplog, cr, walltime_accounting, "mdrun");
783
784     /***********************************************************
785      *
786      *             Loop over MD steps
787      *
788      ************************************************************/
789
790     bFirstStep = TRUE;
791     /* Skip the first Nose-Hoover integration when we get the state from tpx */
792     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
793     bSumEkinhOld     = FALSE;
794     bExchanged       = FALSE;
795     bNeedRepartition = FALSE;
796
797     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
798             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
799             simulationsShareState,
800             MASTER(cr),
801             ir->nstlist,
802             mdrunOptions.reproducible,
803             nstSignalComm,
804             mdrunOptions.maximumHoursToRun,
805             ir->nstlist == 0,
806             fplog,
807             step,
808             bNS,
809             walltime_accounting);
810
811     auto checkpointHandler = std::make_unique<CheckpointHandler>(
812             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
813             simulationsShareState,
814             ir->nstlist == 0,
815             MASTER(cr),
816             mdrunOptions.writeConfout,
817             mdrunOptions.checkpointOptions.period);
818
819     const bool resetCountersIsLocal = true;
820     auto       resetHandler         = std::make_unique<ResetHandler>(
821             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
822             !resetCountersIsLocal,
823             ir->nsteps,
824             MASTER(cr),
825             mdrunOptions.timingOptions.resetHalfway,
826             mdrunOptions.maximumHoursToRun,
827             mdlog,
828             wcycle,
829             walltime_accounting);
830
831     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
832
833     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
834     {
835         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
836     }
837
838     /* and stop now if we should */
839     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
840     while (!bLastStep)
841     {
842
843         /* Determine if this is a neighbor search step */
844         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
845
846         if (bPMETune && bNStList)
847         {
848             // This has to be here because PME load balancing is called so early.
849             // TODO: Move to after all booleans are defined.
850             if (useGpuForUpdate && !bFirstStep)
851             {
852                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
853                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
854             }
855             /* PME grid + cut-off optimization with GPUs or PME nodes */
856             pme_loadbal_do(pme_loadbal,
857                            cr,
858                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
859                            fplog,
860                            mdlog,
861                            *ir,
862                            fr,
863                            state->box,
864                            state->x,
865                            wcycle,
866                            step,
867                            step_rel,
868                            &bPMETunePrinting,
869                            simulationWork.useGpuPmePpCommunication);
870         }
871
872         wallcycle_start(wcycle, WallCycleCounter::Step);
873
874         bLastStep = (step_rel == ir->nsteps);
875         t         = t0 + step * ir->delta_t;
876
877         // TODO Refactor this, so that nstfep does not need a default value of zero
878         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
879         {
880             /* find and set the current lambdas */
881             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
882
883             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
884                            && (!bFirstStep));
885         }
886
887         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
888                      && do_per_step(step, replExParams.exchangeInterval));
889
890         if (doSimulatedAnnealing)
891         {
892             // TODO: Avoid changing inputrec (#3854)
893             // Simulated annealing updates the reference temperature.
894             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
895             update_annealing_target_temp(nonConstInputrec, t, &upd);
896         }
897
898         /* Stop Center of Mass motion */
899         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
900
901         /* Determine whether or not to do Neighbour Searching */
902         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
903
904         /* Note that the stopHandler will cause termination at nstglobalcomm
905          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
906          * nstpcouple steps, we have computed the half-step kinetic energy
907          * of the previous step and can always output energies at the last step.
908          */
909         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
910
911         /* do_log triggers energy and virial calculation. Because this leads
912          * to different code paths, forces can be different. Thus for exact
913          * continuation we should avoid extra log output.
914          * Note that the || bLastStep can result in non-exact continuation
915          * beyond the last step. But we don't consider that to be an issue.
916          */
917         do_log     = (do_per_step(step, ir->nstlog)
918                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
919         do_verbose = mdrunOptions.verbose
920                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
921
922         if (useGpuForUpdate && !bFirstStep && bNS)
923         {
924             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
925             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
926             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
927             // Copy coordinate from the GPU when needed at the search step.
928             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
929             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
930             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
931             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
932         }
933
934         // We only need to calculate virtual velocities if we are writing them in the current step
935         const bool needVirtualVelocitiesThisStep =
936                 (vsite != nullptr)
937                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
938
939         if (vsite != nullptr)
940         {
941             // Virtual sites need to be updated before domain decomposition and forces are calculated
942             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
943             // md-vv calculates virtual velocities once it has full-step real velocities
944             vsite->construct(state->x,
945                              state->v,
946                              state->box,
947                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
948                                      ? VSiteOperation::PositionsAndVelocities
949                                      : VSiteOperation::Positions);
950             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
951         }
952
953         if (bNS && !(bFirstStep && ir->bContinuation))
954         {
955             bMasterState = FALSE;
956             /* Correct the new box if it is too skewed */
957             if (inputrecDynamicBox(ir))
958             {
959                 if (correct_box(fplog, step, state->box))
960                 {
961                     bMasterState = TRUE;
962                     // If update is offloaded, it should be informed about the box size change
963                     if (useGpuForUpdate)
964                     {
965                         integrator->setPbc(PbcType::Xyz, state->box);
966                     }
967                 }
968             }
969             if (DOMAINDECOMP(cr) && bMasterState)
970             {
971                 dd_collect_state(cr->dd, state, state_global);
972             }
973
974             if (DOMAINDECOMP(cr))
975             {
976                 /* Repartition the domain decomposition */
977                 dd_partition_system(fplog,
978                                     mdlog,
979                                     step,
980                                     cr,
981                                     bMasterState,
982                                     nstglobalcomm,
983                                     state_global,
984                                     top_global,
985                                     *ir,
986                                     imdSession,
987                                     pull_work,
988                                     state,
989                                     &f,
990                                     mdAtoms,
991                                     top,
992                                     fr,
993                                     vsite,
994                                     constr,
995                                     nrnb,
996                                     wcycle,
997                                     do_verbose && !bPMETunePrinting);
998                 upd.updateAfterPartition(state->natoms,
999                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1000                                                      : gmx::ArrayRef<const unsigned short>(),
1001                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1002                                                  : gmx::ArrayRef<const unsigned short>());
1003                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1004             }
1005         }
1006
1007         // Allocate or re-size GPU halo exchange object, if necessary
1008         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1009         {
1010             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1011                                "GPU device manager has to be initialized to use GPU "
1012                                "version of halo exchange.");
1013             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1014         }
1015
1016         if (MASTER(cr) && do_log)
1017         {
1018             gmx::EnergyOutput::printHeader(
1019                     fplog, step, t); /* can we improve the information printed here? */
1020         }
1021
1022         if (ir->efep != FreeEnergyPerturbationType::No)
1023         {
1024             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1025         }
1026
1027         if (bExchanged)
1028         {
1029             /* We need the kinetic energy at minus the half step for determining
1030              * the full step kinetic energy and possibly for T-coupling.*/
1031             /* This may not be quite working correctly yet . . . . */
1032             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1033             compute_globals(gstat,
1034                             cr,
1035                             ir,
1036                             fr,
1037                             ekind,
1038                             makeConstArrayRef(state->x),
1039                             makeConstArrayRef(state->v),
1040                             state->box,
1041                             md,
1042                             nrnb,
1043                             &vcm,
1044                             wcycle,
1045                             enerd,
1046                             nullptr,
1047                             nullptr,
1048                             nullptr,
1049                             nullptr,
1050                             gmx::ArrayRef<real>{},
1051                             &nullSignaller,
1052                             state->box,
1053                             &bSumEkinhOld,
1054                             cglo_flags,
1055                             step,
1056                             &observablesReducer);
1057         }
1058         clear_mat(force_vir);
1059
1060         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1061
1062         /* Determine the energy and pressure:
1063          * at nstcalcenergy steps and at energy output steps (set below).
1064          */
1065         if (EI_VV(ir->eI) && (!bInitStep))
1066         {
1067             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1068             bCalcVir      = bCalcEnerStep
1069                        || (ir->epc != PressureCoupling::No
1070                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1071         }
1072         else
1073         {
1074             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1075             bCalcVir      = bCalcEnerStep
1076                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1077         }
1078         bCalcEner = bCalcEnerStep;
1079
1080         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1081
1082         if (do_ene || do_log || bDoReplEx)
1083         {
1084             bCalcVir  = TRUE;
1085             bCalcEner = TRUE;
1086         }
1087
1088         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1089         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1090                                   && (do_per_step(step, nstfep) || bCalcEner));
1091
1092         /* Do we need global communication ? */
1093         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1094                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1095
1096         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1097                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1098                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1099         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1100         {
1101             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1102             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1103         }
1104
1105         if (shellfc)
1106         {
1107             /* Now is the time to relax the shells */
1108             relax_shell_flexcon(fplog,
1109                                 cr,
1110                                 ms,
1111                                 mdrunOptions.verbose,
1112                                 enforcedRotation,
1113                                 step,
1114                                 ir,
1115                                 imdSession,
1116                                 pull_work,
1117                                 bNS,
1118                                 force_flags,
1119                                 top,
1120                                 constr,
1121                                 enerd,
1122                                 state->natoms,
1123                                 state->x.arrayRefWithPadding(),
1124                                 state->v.arrayRefWithPadding(),
1125                                 state->box,
1126                                 state->lambda,
1127                                 &state->hist,
1128                                 &f.view(),
1129                                 force_vir,
1130                                 *md,
1131                                 fr->longRangeNonbondeds.get(),
1132                                 nrnb,
1133                                 wcycle,
1134                                 shellfc,
1135                                 fr,
1136                                 runScheduleWork,
1137                                 t,
1138                                 mu_tot,
1139                                 vsite,
1140                                 ddBalanceRegionHandler);
1141         }
1142         else
1143         {
1144             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1145                is updated (or the AWH update will be performed twice for one step when continuing).
1146                It would be best to call this update function from do_md_trajectory_writing but that
1147                would occur after do_force. One would have to divide the update_awh function into one
1148                function applying the AWH force and one doing the AWH bias update. The update AWH
1149                bias function could then be called after do_md_trajectory_writing (then containing
1150                update_awh_history). The checkpointing will in the future probably moved to the start
1151                of the md loop which will rid of this issue. */
1152             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1153             {
1154                 awh->updateHistory(state_global->awhHistory.get());
1155             }
1156
1157             /* The coordinates (x) are shifted (to get whole molecules)
1158              * in do_force.
1159              * This is parallellized as well, and does communication too.
1160              * Check comments in sim_util.c
1161              */
1162             do_force(fplog,
1163                      cr,
1164                      ms,
1165                      *ir,
1166                      awh.get(),
1167                      enforcedRotation,
1168                      imdSession,
1169                      pull_work,
1170                      step,
1171                      nrnb,
1172                      wcycle,
1173                      top,
1174                      state->box,
1175                      state->x.arrayRefWithPadding(),
1176                      &state->hist,
1177                      &f.view(),
1178                      force_vir,
1179                      md,
1180                      enerd,
1181                      state->lambda,
1182                      fr,
1183                      runScheduleWork,
1184                      vsite,
1185                      mu_tot,
1186                      t,
1187                      ed ? ed->getLegacyED() : nullptr,
1188                      fr->longRangeNonbondeds.get(),
1189                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1190                      ddBalanceRegionHandler);
1191         }
1192
1193         // VV integrators do not need the following velocity half step
1194         // if it is the first step after starting from a checkpoint.
1195         // That is, the half step is needed on all other steps, and
1196         // also the first step when starting from a .tpr file.
1197         if (EI_VV(ir->eI))
1198         {
1199             integrateVVFirstStep(step,
1200                                  bFirstStep,
1201                                  bInitStep,
1202                                  startingBehavior,
1203                                  nstglobalcomm,
1204                                  ir,
1205                                  fr,
1206                                  cr,
1207                                  state,
1208                                  mdAtoms->mdatoms(),
1209                                  &fcdata,
1210                                  &MassQ,
1211                                  &vcm,
1212                                  enerd,
1213                                  &observablesReducer,
1214                                  ekind,
1215                                  gstat,
1216                                  &last_ekin,
1217                                  bCalcVir,
1218                                  total_vir,
1219                                  shake_vir,
1220                                  force_vir,
1221                                  pres,
1222                                  M,
1223                                  do_log,
1224                                  do_ene,
1225                                  bCalcEner,
1226                                  bGStat,
1227                                  bStopCM,
1228                                  bTrotter,
1229                                  bExchanged,
1230                                  &bSumEkinhOld,
1231                                  &saved_conserved_quantity,
1232                                  &f,
1233                                  &upd,
1234                                  constr,
1235                                  &nullSignaller,
1236                                  trotter_seq,
1237                                  nrnb,
1238                                  fplog,
1239                                  wcycle);
1240             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1241             {
1242                 // Positions were calculated earlier
1243                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1244                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1245                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1246             }
1247         }
1248
1249         /* ########  END FIRST UPDATE STEP  ############## */
1250         /* ########  If doing VV, we now have v(dt) ###### */
1251         if (bDoExpanded)
1252         {
1253             /* perform extended ensemble sampling in lambda - we don't
1254                actually move to the new state before outputting
1255                statistics, but if performing simulated tempering, we
1256                do update the velocities and the tau_t. */
1257             // TODO: Avoid changing inputrec (#3854)
1258             // Simulated tempering updates the reference temperature.
1259             // Expanded ensemble without simulated tempering does not change the inputrec.
1260             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1261             lamnew                 = ExpandedEnsembleDynamics(fplog,
1262                                               nonConstInputrec,
1263                                               enerd,
1264                                               state,
1265                                               &MassQ,
1266                                               state->fep_state,
1267                                               state->dfhist,
1268                                               step,
1269                                               state->v.rvec_array(),
1270                                               md->homenr,
1271                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1272                                                                       : gmx::ArrayRef<const unsigned short>());
1273             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1274             if (MASTER(cr))
1275             {
1276                 copy_df_history(state_global->dfhist, state->dfhist);
1277             }
1278         }
1279
1280         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1281         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1282         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1283             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1284                 || checkpointHandler->isCheckpointingStep()))
1285         {
1286             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1287             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1288         }
1289         // Copy velocities if needed for the output/checkpointing.
1290         // NOTE: Copy on the search steps is done at the beginning of the step.
1291         if (useGpuForUpdate && !bNS
1292             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1293         {
1294             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1295             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1296         }
1297         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1298         // and update is offloaded hence forces are kept on the GPU for update and have not been
1299         // already transferred in do_force().
1300         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1301         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1302         //       prior to GPU update.
1303         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1304         //       copy call in do_force(...).
1305         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1306         //       on host after the D2H copy in do_force(...).
1307         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1308             && do_per_step(step, ir->nstfout))
1309         {
1310             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1311             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1312         }
1313         /* Now we have the energies and forces corresponding to the
1314          * coordinates at time t. We must output all of this before
1315          * the update.
1316          */
1317         do_md_trajectory_writing(fplog,
1318                                  cr,
1319                                  nfile,
1320                                  fnm,
1321                                  step,
1322                                  step_rel,
1323                                  t,
1324                                  ir,
1325                                  state,
1326                                  state_global,
1327                                  observablesHistory,
1328                                  top_global,
1329                                  fr,
1330                                  outf,
1331                                  energyOutput,
1332                                  ekind,
1333                                  f.view().force(),
1334                                  checkpointHandler->isCheckpointingStep(),
1335                                  bRerunMD,
1336                                  bLastStep,
1337                                  mdrunOptions.writeConfout,
1338                                  bSumEkinhOld);
1339         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1340         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1341
1342         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1343         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1344             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1345         {
1346             copy_mat(state->svir_prev, shake_vir);
1347             copy_mat(state->fvir_prev, force_vir);
1348         }
1349
1350         stopHandler->setSignal();
1351         resetHandler->setSignal(walltime_accounting);
1352
1353         if (bGStat || !PAR(cr))
1354         {
1355             /* In parallel we only have to check for checkpointing in steps
1356              * where we do global communication,
1357              *  otherwise the other nodes don't know.
1358              */
1359             checkpointHandler->setSignal(walltime_accounting);
1360         }
1361
1362         /* #########   START SECOND UPDATE STEP ################# */
1363
1364         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1365            controlled in preprocessing */
1366
1367         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1368         {
1369             gmx_bool bIfRandomize;
1370             bIfRandomize = update_randomize_velocities(ir,
1371                                                        step,
1372                                                        cr,
1373                                                        md->homenr,
1374                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1375                                                                : gmx::ArrayRef<const unsigned short>(),
1376                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1377                                                        state->v,
1378                                                        &upd,
1379                                                        constr);
1380             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1381             if (constr && bIfRandomize)
1382             {
1383                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1384             }
1385         }
1386         /* Box is changed in update() when we do pressure coupling,
1387          * but we should still use the old box for energy corrections and when
1388          * writing it to the energy file, so it matches the trajectory files for
1389          * the same timestep above. Make a copy in a separate array.
1390          */
1391         copy_mat(state->box, lastbox);
1392
1393         dvdl_constr = 0;
1394
1395         if (!useGpuForUpdate)
1396         {
1397             wallcycle_start(wcycle, WallCycleCounter::Update);
1398         }
1399         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1400         if (bTrotter)
1401         {
1402             trotter_update(ir,
1403                            step,
1404                            ekind,
1405                            enerd,
1406                            state,
1407                            total_vir,
1408                            md->homenr,
1409                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1410                                    : gmx::ArrayRef<const unsigned short>(),
1411                            gmx::arrayRefFromArray(md->invmass, md->nr),
1412                            &MassQ,
1413                            trotter_seq,
1414                            TrotterSequence::Three);
1415             /* We can only do Berendsen coupling after we have summed
1416              * the kinetic energy or virial. Since the happens
1417              * in global_state after update, we should only do it at
1418              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1419              */
1420         }
1421         else
1422         {
1423             update_tcouple(step,
1424                            ir,
1425                            state,
1426                            ekind,
1427                            &MassQ,
1428                            md->homenr,
1429                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1430                                    : gmx::ArrayRef<const unsigned short>());
1431             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1432         }
1433
1434         /* With leap-frog type integrators we compute the kinetic energy
1435          * at a whole time step as the average of the half-time step kinetic
1436          * energies of two subsequent steps. Therefore we need to compute the
1437          * half step kinetic energy also if we need energies at the next step.
1438          */
1439         const bool needHalfStepKineticEnergy =
1440                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1441
1442         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1443         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1444         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1445                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1446
1447         if (EI_VV(ir->eI))
1448         {
1449             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1450
1451             integrateVVSecondStep(step,
1452                                   ir,
1453                                   fr,
1454                                   cr,
1455                                   state,
1456                                   mdAtoms->mdatoms(),
1457                                   &fcdata,
1458                                   &MassQ,
1459                                   &vcm,
1460                                   pull_work,
1461                                   enerd,
1462                                   &observablesReducer,
1463                                   ekind,
1464                                   gstat,
1465                                   &dvdl_constr,
1466                                   bCalcVir,
1467                                   total_vir,
1468                                   shake_vir,
1469                                   force_vir,
1470                                   pres,
1471                                   M,
1472                                   lastbox,
1473                                   do_log,
1474                                   do_ene,
1475                                   bGStat,
1476                                   &bSumEkinhOld,
1477                                   &f,
1478                                   &cbuf,
1479                                   &upd,
1480                                   constr,
1481                                   &nullSignaller,
1482                                   trotter_seq,
1483                                   nrnb,
1484                                   wcycle);
1485         }
1486         else
1487         {
1488             if (useGpuForUpdate)
1489             {
1490                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1491                 {
1492                     integrator->set(stateGpu->getCoordinates(),
1493                                     stateGpu->getVelocities(),
1494                                     stateGpu->getForces(),
1495                                     top->idef,
1496                                     *md);
1497
1498                     // Copy data to the GPU after buffers might have being reinitialized
1499                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1500                      * the previous step. We don't check that now. */
1501                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1502                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1503                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1504                     {
1505                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1506                     }
1507                 }
1508
1509                 if (simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1510                 {
1511                     // The PME forces were recieved to the host, and reduced on the CPU with the rest of the
1512                     // forces computed on the GPU, so the final forces have to be copied back to the GPU
1513                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1514                 }
1515                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1516                 {
1517                     // The buffer ops were not offloaded this step, so the forces are on the
1518                     // host and have to be copied
1519                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1520                 }
1521                 const bool doTemperatureScaling =
1522                         (ir->etc != TemperatureCoupling::No
1523                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1524
1525                 // This applies Leap-Frog, LINCS and SETTLE in succession
1526                 integrator->integrate(
1527                         stateGpu->getForcesReadyOnDeviceEvent(
1528                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1529                         ir->delta_t,
1530                         true,
1531                         bCalcVir,
1532                         shake_vir,
1533                         doTemperatureScaling,
1534                         ekind->tcstat,
1535                         doParrinelloRahman,
1536                         ir->nstpcouple * ir->delta_t,
1537                         M);
1538
1539                 // Copy velocities D2H after update if:
1540                 // - Globals are computed this step (includes the energy output steps).
1541                 // - Temperature is needed for the next step.
1542                 if (bGStat || needHalfStepKineticEnergy)
1543                 {
1544                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1545                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1546                 }
1547             }
1548             else
1549             {
1550                 /* With multiple time stepping we need to do an additional normal
1551                  * update step to obtain the virial, as the actual MTS integration
1552                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1553                  * Using that acceleration would result in a virial with the slow
1554                  * force contribution would be a factor mtsFactor too large.
1555                  */
1556                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1557                 {
1558                     upd.update_for_constraint_virial(*ir,
1559                                                      md->homenr,
1560                                                      md->havePartiallyFrozenAtoms,
1561                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1562                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1563                                                      *state,
1564                                                      f.view().forceWithPadding(),
1565                                                      *ekind);
1566
1567                     constrain_coordinates(constr,
1568                                           do_log,
1569                                           do_ene,
1570                                           step,
1571                                           state,
1572                                           upd.xp()->arrayRefWithPadding(),
1573                                           &dvdl_constr,
1574                                           bCalcVir,
1575                                           shake_vir);
1576                 }
1577
1578                 ArrayRefWithPadding<const RVec> forceCombined =
1579                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1580                                 ? f.view().forceMtsCombinedWithPadding()
1581                                 : f.view().forceWithPadding();
1582                 upd.update_coords(*ir,
1583                                   step,
1584                                   md->homenr,
1585                                   md->havePartiallyFrozenAtoms,
1586                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1587                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1588                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1589                                   state,
1590                                   forceCombined,
1591                                   &fcdata,
1592                                   ekind,
1593                                   M,
1594                                   etrtPOSITION,
1595                                   cr,
1596                                   constr != nullptr);
1597
1598                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1599
1600                 constrain_coordinates(constr,
1601                                       do_log,
1602                                       do_ene,
1603                                       step,
1604                                       state,
1605                                       upd.xp()->arrayRefWithPadding(),
1606                                       &dvdl_constr,
1607                                       bCalcVir && !simulationWork.useMts,
1608                                       shake_vir);
1609
1610                 upd.update_sd_second_half(*ir,
1611                                           step,
1612                                           &dvdl_constr,
1613                                           md->homenr,
1614                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1615                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1616                                           state,
1617                                           cr,
1618                                           nrnb,
1619                                           wcycle,
1620                                           constr,
1621                                           do_log,
1622                                           do_ene);
1623                 upd.finish_update(
1624                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1625             }
1626
1627             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1628             {
1629                 updatePrevStepPullCom(pull_work, state);
1630             }
1631
1632             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1633         }
1634
1635         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1636         /* With Leap-Frog we can skip compute_globals at
1637          * non-communication steps, but we need to calculate
1638          * the kinetic energy one step before communication.
1639          */
1640         {
1641             // Organize to do inter-simulation signalling on steps if
1642             // and when algorithms require it.
1643             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1644
1645             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1646             {
1647                 // Copy coordinates when needed to stop the CM motion.
1648                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1649                 {
1650                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1651                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1652                 }
1653                 // Since we're already communicating at this step, we
1654                 // can propagate intra-simulation signals. Note that
1655                 // check_nstglobalcomm has the responsibility for
1656                 // choosing the value of nstglobalcomm that is one way
1657                 // bGStat becomes true, so we can't get into a
1658                 // situation where e.g. checkpointing can't be
1659                 // signalled.
1660                 bool                doIntraSimSignal = true;
1661                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1662
1663                 compute_globals(
1664                         gstat,
1665                         cr,
1666                         ir,
1667                         fr,
1668                         ekind,
1669                         makeConstArrayRef(state->x),
1670                         makeConstArrayRef(state->v),
1671                         state->box,
1672                         md,
1673                         nrnb,
1674                         &vcm,
1675                         wcycle,
1676                         enerd,
1677                         force_vir,
1678                         shake_vir,
1679                         total_vir,
1680                         pres,
1681                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1682                                                                            : gmx::ArrayRef<real>{},
1683                         &signaller,
1684                         lastbox,
1685                         &bSumEkinhOld,
1686                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1687                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1688                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1689                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1690                         step,
1691                         &observablesReducer);
1692                 if (!EI_VV(ir->eI) && bStopCM)
1693                 {
1694                     process_and_stopcm_grp(
1695                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1696                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1697
1698                     // TODO: The special case of removing CM motion should be dealt more gracefully
1699                     if (useGpuForUpdate)
1700                     {
1701                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1702                         // Here we block until the H2D copy completes because event sync with the
1703                         // force kernels that use the coordinates on the next steps is not implemented
1704                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1705                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1706                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1707                         if (vcm.mode != ComRemovalAlgorithm::No)
1708                         {
1709                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1710                         }
1711                     }
1712                 }
1713             }
1714         }
1715
1716         /* #############  END CALC EKIN AND PRESSURE ################# */
1717
1718         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1719            the virial that should probably be addressed eventually. state->veta has better properies,
1720            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1721            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1722
1723         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1724         {
1725             /* Sum up the foreign energy and dK/dl terms for md and sd.
1726                Currently done every step so that dH/dl is correct in the .edr */
1727             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1728         }
1729
1730         update_pcouple_after_coordinates(fplog,
1731                                          step,
1732                                          ir,
1733                                          md->homenr,
1734                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1735                                                      : gmx::ArrayRef<const unsigned short>(),
1736                                          pres,
1737                                          force_vir,
1738                                          shake_vir,
1739                                          pressureCouplingMu,
1740                                          state,
1741                                          nrnb,
1742                                          upd.deform(),
1743                                          !useGpuForUpdate);
1744
1745         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1746                                                   && do_per_step(step, inputrec->nstpcouple));
1747         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1748                                                  && do_per_step(step, inputrec->nstpcouple));
1749         if (useGpuForUpdate
1750             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1751         {
1752             integrator->scaleCoordinates(pressureCouplingMu);
1753             if (doCRescalePressureCoupling)
1754             {
1755                 matrix pressureCouplingInvMu;
1756                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1757                 integrator->scaleVelocities(pressureCouplingInvMu);
1758             }
1759             integrator->setPbc(PbcType::Xyz, state->box);
1760         }
1761
1762         /* ################# END UPDATE STEP 2 ################# */
1763         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1764
1765         /* The coordinates (x) were unshifted in update */
1766         if (!bGStat)
1767         {
1768             /* We will not sum ekinh_old,
1769              * so signal that we still have to do it.
1770              */
1771             bSumEkinhOld = TRUE;
1772         }
1773
1774         if (bCalcEner)
1775         {
1776             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1777
1778             /* use the directly determined last velocity, not actually the averaged half steps */
1779             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1780             {
1781                 enerd->term[F_EKIN] = last_ekin;
1782             }
1783             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1784
1785             if (integratorHasConservedEnergyQuantity(ir))
1786             {
1787                 if (EI_VV(ir->eI))
1788                 {
1789                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1790                 }
1791                 else
1792                 {
1793                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1794                 }
1795             }
1796             /* #########  END PREPARING EDR OUTPUT  ###########  */
1797         }
1798
1799         /* Output stuff */
1800         if (MASTER(cr))
1801         {
1802             if (fplog && do_log && bDoExpanded)
1803             {
1804                 /* only needed if doing expanded ensemble */
1805                 PrintFreeEnergyInfoToFile(fplog,
1806                                           ir->fepvals.get(),
1807                                           ir->expandedvals.get(),
1808                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1809                                           state_global->dfhist,
1810                                           state->fep_state,
1811                                           ir->nstlog,
1812                                           step);
1813             }
1814             if (bCalcEner)
1815             {
1816                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1817
1818                 energyOutput.addDataAtEnergyStep(outputDHDL,
1819                                                  bCalcEnerStep,
1820                                                  t,
1821                                                  md->tmass,
1822                                                  enerd,
1823                                                  ir->fepvals.get(),
1824                                                  ir->expandedvals.get(),
1825                                                  lastbox,
1826                                                  PTCouplingArrays{ state->boxv,
1827                                                                    state->nosehoover_xi,
1828                                                                    state->nosehoover_vxi,
1829                                                                    state->nhpres_xi,
1830                                                                    state->nhpres_vxi },
1831                                                  state->fep_state,
1832                                                  total_vir,
1833                                                  pres,
1834                                                  ekind,
1835                                                  mu_tot,
1836                                                  constr);
1837             }
1838             else
1839             {
1840                 energyOutput.recordNonEnergyStep();
1841             }
1842
1843             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1844             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1845
1846             if (doSimulatedAnnealing)
1847             {
1848                 gmx::EnergyOutput::printAnnealingTemperatures(
1849                         do_log ? fplog : nullptr, groups, &(ir->opts));
1850             }
1851             if (do_log || do_ene || do_dr || do_or)
1852             {
1853                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1854                                                    do_ene,
1855                                                    do_dr,
1856                                                    do_or,
1857                                                    do_log ? fplog : nullptr,
1858                                                    step,
1859                                                    t,
1860                                                    fr->fcdata.get(),
1861                                                    awh.get());
1862             }
1863             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1864             {
1865                 const bool isInitialOutput = false;
1866                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1867             }
1868
1869             if (ir->bPull)
1870             {
1871                 pull_print_output(pull_work, step, t);
1872             }
1873
1874             if (do_per_step(step, ir->nstlog))
1875             {
1876                 if (fflush(fplog) != 0)
1877                 {
1878                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1879                 }
1880             }
1881         }
1882         if (bDoExpanded)
1883         {
1884             /* Have to do this part _after_ outputting the logfile and the edr file */
1885             /* Gets written into the state at the beginning of next loop*/
1886             state->fep_state = lamnew;
1887         }
1888         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1889         {
1890             state->fep_state = awh->fepLambdaState();
1891         }
1892         /* Print the remaining wall clock time for the run */
1893         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1894         {
1895             if (shellfc)
1896             {
1897                 fprintf(stderr, "\n");
1898             }
1899             print_time(stderr, walltime_accounting, step, ir, cr);
1900         }
1901
1902         /* Ion/water position swapping.
1903          * Not done in last step since trajectory writing happens before this call
1904          * in the MD loop and exchanges would be lost anyway. */
1905         bNeedRepartition = FALSE;
1906         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1907             && do_per_step(step, ir->swap->nstswap))
1908         {
1909             bNeedRepartition = do_swapcoords(cr,
1910                                              step,
1911                                              t,
1912                                              ir,
1913                                              swap,
1914                                              wcycle,
1915                                              as_rvec_array(state->x.data()),
1916                                              state->box,
1917                                              MASTER(cr) && mdrunOptions.verbose,
1918                                              bRerunMD);
1919
1920             if (bNeedRepartition && DOMAINDECOMP(cr))
1921             {
1922                 dd_collect_state(cr->dd, state, state_global);
1923             }
1924         }
1925
1926         /* Replica exchange */
1927         bExchanged = FALSE;
1928         if (bDoReplEx)
1929         {
1930             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1931         }
1932
1933         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1934         {
1935             dd_partition_system(fplog,
1936                                 mdlog,
1937                                 step,
1938                                 cr,
1939                                 TRUE,
1940                                 1,
1941                                 state_global,
1942                                 top_global,
1943                                 *ir,
1944                                 imdSession,
1945                                 pull_work,
1946                                 state,
1947                                 &f,
1948                                 mdAtoms,
1949                                 top,
1950                                 fr,
1951                                 vsite,
1952                                 constr,
1953                                 nrnb,
1954                                 wcycle,
1955                                 FALSE);
1956             upd.updateAfterPartition(state->natoms,
1957                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1958                                                  : gmx::ArrayRef<const unsigned short>(),
1959                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1960                                              : gmx::ArrayRef<const unsigned short>());
1961             fr->longRangeNonbondeds->updateAfterPartition(*md);
1962         }
1963
1964         bFirstStep = FALSE;
1965         bInitStep  = FALSE;
1966
1967         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1968         /* With all integrators, except VV, we need to retain the pressure
1969          * at the current step for coupling at the next step.
1970          */
1971         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1972             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1973         {
1974             /* Store the pressure in t_state for pressure coupling
1975              * at the next MD step.
1976              */
1977             copy_mat(pres, state->pres_prev);
1978         }
1979
1980         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1981
1982         if ((membed != nullptr) && (!bLastStep))
1983         {
1984             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1985         }
1986
1987         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
1988         if (DOMAINDECOMP(cr) && wcycle)
1989         {
1990             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1991         }
1992
1993         /* increase the MD step number */
1994         step++;
1995         step_rel++;
1996
1997 #if GMX_FAHCORE
1998         if (MASTER(cr))
1999         {
2000             fcReportProgress(ir->nsteps + ir->init_step, step);
2001         }
2002 #endif
2003
2004         resetHandler->resetCounters(
2005                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2006
2007         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2008         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2009     }
2010     /* End of main MD loop */
2011
2012     /* Closing TNG files can include compressing data. Therefore it is good to do that
2013      * before stopping the time measurements. */
2014     mdoutf_tng_close(outf);
2015
2016     /* Stop measuring walltime */
2017     walltime_accounting_end_time(walltime_accounting);
2018
2019     if (simulationWork.haveSeparatePmeRank)
2020     {
2021         /* Tell the PME only node to finish */
2022         gmx_pme_send_finish(cr);
2023     }
2024
2025     if (MASTER(cr))
2026     {
2027         if (ir->nstcalcenergy > 0)
2028         {
2029             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2030
2031             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2032             energyOutput.printAverages(fplog, groups);
2033         }
2034     }
2035     done_mdoutf(outf);
2036
2037     if (bPMETune)
2038     {
2039         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2040     }
2041
2042     done_shellfc(fplog, shellfc, step_rel);
2043
2044     if (useReplicaExchange && MASTER(cr))
2045     {
2046         print_replica_exchange_statistics(fplog, repl_ex);
2047     }
2048
2049     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2050
2051     global_stat_destroy(gstat);
2052 }