Fix clang-format
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/mdsetup.h"
65 #include "gromacs/domdec/partition.h"
66 #include "gromacs/essentialdynamics/edsam.h"
67 #include "gromacs/ewald/pme_load_balancing.h"
68 #include "gromacs/ewald/pme_pp.h"
69 #include "gromacs/fileio/trxio.h"
70 #include "gromacs/gmxlib/network.h"
71 #include "gromacs/gmxlib/nrnb.h"
72 #include "gromacs/gpu_utils/device_stream_manager.h"
73 #include "gromacs/gpu_utils/gpu_utils.h"
74 #include "gromacs/imd/imd.h"
75 #include "gromacs/listed_forces/listed_forces.h"
76 #include "gromacs/math/functions.h"
77 #include "gromacs/math/invertmatrix.h"
78 #include "gromacs/math/vec.h"
79 #include "gromacs/math/vectypes.h"
80 #include "gromacs/mdlib/checkpointhandler.h"
81 #include "gromacs/mdlib/compute_io.h"
82 #include "gromacs/mdlib/constr.h"
83 #include "gromacs/mdlib/coupling.h"
84 #include "gromacs/mdlib/ebin.h"
85 #include "gromacs/mdlib/enerdata_utils.h"
86 #include "gromacs/mdlib/energyoutput.h"
87 #include "gromacs/mdlib/expanded.h"
88 #include "gromacs/mdlib/force.h"
89 #include "gromacs/mdlib/force_flags.h"
90 #include "gromacs/mdlib/forcerec.h"
91 #include "gromacs/mdlib/freeenergyparameters.h"
92 #include "gromacs/mdlib/md_support.h"
93 #include "gromacs/mdlib/mdatoms.h"
94 #include "gromacs/mdlib/mdoutf.h"
95 #include "gromacs/mdlib/membed.h"
96 #include "gromacs/mdlib/resethandler.h"
97 #include "gromacs/mdlib/sighandler.h"
98 #include "gromacs/mdlib/simulationsignal.h"
99 #include "gromacs/mdlib/stat.h"
100 #include "gromacs/mdlib/stophandler.h"
101 #include "gromacs/mdlib/tgroup.h"
102 #include "gromacs/mdlib/trajectory_writing.h"
103 #include "gromacs/mdlib/update.h"
104 #include "gromacs/mdlib/update_constrain_gpu.h"
105 #include "gromacs/mdlib/update_vv.h"
106 #include "gromacs/mdlib/vcm.h"
107 #include "gromacs/mdlib/vsite.h"
108 #include "gromacs/mdrunutility/handlerestart.h"
109 #include "gromacs/mdrunutility/multisim.h"
110 #include "gromacs/mdrunutility/printtime.h"
111 #include "gromacs/mdtypes/awh_history.h"
112 #include "gromacs/mdtypes/awh_params.h"
113 #include "gromacs/mdtypes/commrec.h"
114 #include "gromacs/mdtypes/df_history.h"
115 #include "gromacs/mdtypes/energyhistory.h"
116 #include "gromacs/mdtypes/fcdata.h"
117 #include "gromacs/mdtypes/forcebuffers.h"
118 #include "gromacs/mdtypes/forcerec.h"
119 #include "gromacs/mdtypes/group.h"
120 #include "gromacs/mdtypes/inputrec.h"
121 #include "gromacs/mdtypes/interaction_const.h"
122 #include "gromacs/mdtypes/md_enums.h"
123 #include "gromacs/mdtypes/mdatom.h"
124 #include "gromacs/mdtypes/mdrunoptions.h"
125 #include "gromacs/mdtypes/multipletimestepping.h"
126 #include "gromacs/mdtypes/observableshistory.h"
127 #include "gromacs/mdtypes/pullhistory.h"
128 #include "gromacs/mdtypes/simulation_workload.h"
129 #include "gromacs/mdtypes/state.h"
130 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
131 #include "gromacs/modularsimulator/energydata.h"
132 #include "gromacs/nbnxm/gpu_data_mgmt.h"
133 #include "gromacs/nbnxm/nbnxm.h"
134 #include "gromacs/pbcutil/pbc.h"
135 #include "gromacs/pulling/output.h"
136 #include "gromacs/pulling/pull.h"
137 #include "gromacs/swap/swapcoords.h"
138 #include "gromacs/timing/wallcycle.h"
139 #include "gromacs/timing/walltime_accounting.h"
140 #include "gromacs/topology/atoms.h"
141 #include "gromacs/topology/idef.h"
142 #include "gromacs/topology/mtop_util.h"
143 #include "gromacs/topology/topology.h"
144 #include "gromacs/trajectory/trajectoryframe.h"
145 #include "gromacs/utility/basedefinitions.h"
146 #include "gromacs/utility/cstringutil.h"
147 #include "gromacs/utility/fatalerror.h"
148 #include "gromacs/utility/logger.h"
149 #include "gromacs/utility/real.h"
150 #include "gromacs/utility/smalloc.h"
151
152 #include "legacysimulator.h"
153 #include "replicaexchange.h"
154 #include "shellfc.h"
155
156 using gmx::SimulationSignaller;
157
158 void gmx::LegacySimulator::do_md()
159 {
160     // TODO Historically, the EM and MD "integrators" used different
161     // names for the t_inputrec *parameter, but these must have the
162     // same name, now that it's a member of a struct. We use this ir
163     // alias to avoid a large ripple of nearly useless changes.
164     // t_inputrec is being replaced by IMdpOptionsProvider, so this
165     // will go away eventually.
166     const t_inputrec* ir = inputrec;
167
168     int64_t      step, step_rel;
169     double       t, t0 = ir->init_t;
170     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
171     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
172     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
173     gmx_bool     do_ene, do_log, do_verbose;
174     gmx_bool     bMasterState;
175     unsigned int force_flags;
176     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
177     int    i, m;
178     rvec   mu_tot;
179     matrix pressureCouplingMu, M;
180     gmx_repl_ex_t     repl_ex = nullptr;
181     gmx_global_stat_t gstat;
182     gmx_shellfc_t*    shellfc;
183     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
184     gmx_bool          bTrotter;
185     real              dvdl_constr;
186     std::vector<RVec> cbuf;
187     matrix            lastbox;
188     int               lamnew = 0;
189     /* for FEP */
190     int       nstfep = 0;
191     double    cycles;
192     real      saved_conserved_quantity = 0;
193     real      last_ekin                = 0;
194     t_extmass MassQ;
195     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
196
197     /* PME load balancing data for GPU kernels */
198     gmx_bool bPMETune         = FALSE;
199     gmx_bool bPMETunePrinting = FALSE;
200
201     bool bInteractiveMDstep = false;
202
203     SimulationSignals signals;
204     // Most global communnication stages don't propagate mdrun
205     // signals, and will use this object to achieve that.
206     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
207
208     if (!mdrunOptions.writeConfout)
209     {
210         // This is on by default, and the main known use case for
211         // turning it off is for convenience in benchmarking, which is
212         // something that should not show up in the general user
213         // interface.
214         GMX_LOG(mdlog.info)
215                 .asParagraph()
216                 .appendText(
217                         "The -noconfout functionality is deprecated, and may be removed in a "
218                         "future version.");
219     }
220
221     /* md-vv uses averaged full step velocities for T-control
222        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
223        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
224     bTrotter = (EI_VV(ir->eI)
225                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
226
227     const bool bRerunMD = false;
228
229     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
230     bGStatEveryStep   = (nstglobalcomm == 1);
231
232     const SimulationGroups* groups = &top_global.groups;
233
234     std::unique_ptr<EssentialDynamics> ed = nullptr;
235     if (opt2bSet("-ei", nfile, fnm))
236     {
237         /* Initialize essential dynamics sampling */
238         ed = init_edsam(mdlog,
239                         opt2fn_null("-ei", nfile, fnm),
240                         opt2fn("-eo", nfile, fnm),
241                         top_global,
242                         *ir,
243                         cr,
244                         constr,
245                         state_global,
246                         observablesHistory,
247                         oenv,
248                         startingBehavior);
249     }
250     else if (observablesHistory->edsamHistory)
251     {
252         gmx_fatal(FARGS,
253                   "The checkpoint is from a run with essential dynamics sampling, "
254                   "but the current run did not specify the -ei option. "
255                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
256     }
257
258     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
259     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
260     initialize_lambdas(fplog,
261                        ir->efep,
262                        ir->bSimTemp,
263                        *ir->fepvals,
264                        ir->simtempvals->temperatures,
265                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
266                        MASTER(cr),
267                        fep_state,
268                        lambda);
269     Update upd(*ir, deform);
270     bool   doSimulatedAnnealing = false;
271     {
272         // TODO: Avoid changing inputrec (#3854)
273         // Simulated annealing updates the reference temperature.
274         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
275         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
276     }
277     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
278
279     const t_fcdata& fcdata = *fr->fcdata;
280
281     bool simulationsShareState = false;
282     int  nstSignalComm         = nstglobalcomm;
283     {
284         // TODO This implementation of ensemble orientation restraints is nasty because
285         // a user can't just do multi-sim with single-sim orientation restraints.
286         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
287         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
288
289         // Replica exchange, ensemble restraints and AWH need all
290         // simulations to remain synchronized, so they need
291         // checkpoints and stop conditions to act on the same step, so
292         // the propagation of such signals must take place between
293         // simulations, not just within simulations.
294         // TODO: Make algorithm initializers set these flags.
295         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
296
297         if (simulationsShareState)
298         {
299             // Inter-simulation signal communication does not need to happen
300             // often, so we use a minimum of 200 steps to reduce overhead.
301             const int c_minimumInterSimulationSignallingInterval = 200;
302             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
303                             * nstglobalcomm;
304         }
305     }
306
307     if (startingBehavior != StartingBehavior::RestartWithAppending)
308     {
309         pleaseCiteCouplingAlgorithms(fplog, *ir);
310     }
311     gmx_mdoutf*       outf = init_mdoutf(fplog,
312                                    nfile,
313                                    fnm,
314                                    mdrunOptions,
315                                    cr,
316                                    outputProvider,
317                                    mdModulesNotifiers,
318                                    ir,
319                                    top_global,
320                                    oenv,
321                                    wcycle,
322                                    startingBehavior,
323                                    simulationsShareState,
324                                    ms);
325     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
326                                    top_global,
327                                    *ir,
328                                    pull_work,
329                                    mdoutf_get_fp_dhdl(outf),
330                                    false,
331                                    startingBehavior,
332                                    simulationsShareState,
333                                    mdModulesNotifiers);
334
335     gstat = global_stat_init(ir);
336
337     const auto& simulationWork     = runScheduleWork->simulationWork;
338     const bool  useGpuForPme       = simulationWork.useGpuPme;
339     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
340     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
341     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
342
343     /* Check for polarizable models and flexible constraints */
344     shellfc = init_shell_flexcon(fplog,
345                                  top_global,
346                                  constr ? constr->numFlexibleConstraints() : 0,
347                                  ir->nstcalcenergy,
348                                  DOMAINDECOMP(cr),
349                                  useGpuForPme);
350
351     {
352         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
353         if ((io > 2000) && MASTER(cr))
354         {
355             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
356         }
357     }
358
359     // Local state only becomes valid now.
360     std::unique_ptr<t_state> stateInstance;
361     t_state*                 state;
362
363     gmx_localtop_t top(top_global.ffparams);
364
365     ForceBuffers     f(fr->useMts,
366                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
367                                ? PinningPolicy::PinnedIfSupported
368                                : PinningPolicy::CannotBePinned);
369     const t_mdatoms* md = mdAtoms->mdatoms();
370     if (DOMAINDECOMP(cr))
371     {
372         stateInstance = std::make_unique<t_state>();
373         state         = stateInstance.get();
374         dd_init_local_state(*cr->dd, state_global, state);
375
376         /* Distribute the charge groups over the nodes from the master node */
377         dd_partition_system(fplog,
378                             mdlog,
379                             ir->init_step,
380                             cr,
381                             TRUE,
382                             1,
383                             state_global,
384                             top_global,
385                             *ir,
386                             imdSession,
387                             pull_work,
388                             state,
389                             &f,
390                             mdAtoms,
391                             &top,
392                             fr,
393                             vsite,
394                             constr,
395                             nrnb,
396                             nullptr,
397                             FALSE);
398         upd.updateAfterPartition(state->natoms,
399                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
400                                              : gmx::ArrayRef<const unsigned short>(),
401                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
402                                          : gmx::ArrayRef<const unsigned short>());
403     }
404     else
405     {
406         state_change_natoms(state_global, state_global->natoms);
407         /* Copy the pointer to the global state */
408         state = state_global;
409
410         /* Generate and initialize new topology */
411         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
412
413         upd.updateAfterPartition(state->natoms,
414                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
415                                              : gmx::ArrayRef<const unsigned short>(),
416                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
417                                          : gmx::ArrayRef<const unsigned short>());
418     }
419
420     std::unique_ptr<UpdateConstrainGpu> integrator;
421
422     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
423
424     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
425     if (useGpuForUpdate)
426     {
427         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
428                                    || constr->numConstraintsTotal() == 0,
429                            "Constraints in domain decomposition are only supported with update "
430                            "groups if using GPU update.\n");
431         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
432                                    || constr->numConstraintsTotal() == 0,
433                            "SHAKE is not supported with GPU update.");
434         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
435                            "Either PME or short-ranged non-bonded interaction tasks must run on "
436                            "the GPU to use GPU update.\n");
437         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
438                            "Only the md integrator is supported with the GPU update.\n");
439         GMX_RELEASE_ASSERT(
440                 ir->etc != TemperatureCoupling::NoseHoover,
441                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
442         GMX_RELEASE_ASSERT(
443                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
444                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
445                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
446                 "with the GPU update.\n");
447         GMX_RELEASE_ASSERT(!md->haveVsites,
448                            "Virtual sites are not supported with the GPU update.\n");
449         GMX_RELEASE_ASSERT(ed == nullptr,
450                            "Essential dynamics is not supported with the GPU update.\n");
451         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
452                            "Constraints pulling is not supported with the GPU update.\n");
453         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
454                            "Orientation restraints are not supported with the GPU update.\n");
455         GMX_RELEASE_ASSERT(
456                 ir->efep == FreeEnergyPerturbationType::No
457                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
458                 "Free energy perturbation of masses and constraints are not supported with the GPU "
459                 "update.");
460
461         if (constr != nullptr && constr->numConstraintsTotal() > 0)
462         {
463             GMX_LOG(mdlog.info)
464                     .asParagraph()
465                     .appendText("Updating coordinates and applying constraints on the GPU.");
466         }
467         else
468         {
469             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
470         }
471         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
472                            "Device stream manager should be initialized in order to use GPU "
473                            "update-constraints.");
474         GMX_RELEASE_ASSERT(
475                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
476                 "Update stream should be initialized in order to use GPU "
477                 "update-constraints.");
478         integrator = std::make_unique<UpdateConstrainGpu>(
479                 *ir,
480                 top_global,
481                 ekind->ngtc,
482                 fr->deviceStreamManager->context(),
483                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
484                 stateGpu->xUpdatedOnDevice(),
485                 wcycle);
486
487         integrator->setPbc(PbcType::Xyz, state->box);
488     }
489
490     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
491     {
492         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
493     }
494     if (useGpuForUpdate)
495     {
496         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
497     }
498
499     // NOTE: The global state is no longer used at this point.
500     // But state_global is still used as temporary storage space for writing
501     // the global state to file and potentially for replica exchange.
502     // (Global topology should persist.)
503
504     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
505
506     if (ir->bExpanded)
507     {
508         /* Check nstexpanded here, because the grompp check was broken */
509         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
510         {
511             gmx_fatal(FARGS,
512                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
513         }
514         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
515     }
516
517     if (MASTER(cr))
518     {
519         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
520     }
521
522     preparePrevStepPullCom(ir,
523                            pull_work,
524                            gmx::arrayRefFromArray(md->massT, md->nr),
525                            state,
526                            state_global,
527                            cr,
528                            startingBehavior != StartingBehavior::NewSimulation);
529
530     // TODO: Remove this by converting AWH into a ForceProvider
531     auto awh = prepareAwhModule(fplog,
532                                 *ir,
533                                 state_global,
534                                 cr,
535                                 ms,
536                                 startingBehavior != StartingBehavior::NewSimulation,
537                                 shellfc != nullptr,
538                                 opt2fn("-awh", nfile, fnm),
539                                 pull_work);
540
541     if (useReplicaExchange && MASTER(cr))
542     {
543         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
544     }
545     /* PME tuning is only supported in the Verlet scheme, with PME for
546      * Coulomb. It is not supported with only LJ PME. */
547     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
548                 && ir->cutoff_scheme != CutoffScheme::Group);
549
550     pme_load_balancing_t* pme_loadbal = nullptr;
551     if (bPMETune)
552     {
553         pme_loadbal_init(
554                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
555     }
556
557     if (!ir->bContinuation)
558     {
559         if (state->flags & enumValueToBitMask(StateEntry::V))
560         {
561             auto v = makeArrayRef(state->v);
562             /* Set the velocities of vsites, shells and frozen atoms to zero */
563             for (i = 0; i < md->homenr; i++)
564             {
565                 if (md->ptype[i] == ParticleType::Shell)
566                 {
567                     clear_rvec(v[i]);
568                 }
569                 else if (md->cFREEZE)
570                 {
571                     for (m = 0; m < DIM; m++)
572                     {
573                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
574                         {
575                             v[i][m] = 0;
576                         }
577                     }
578                 }
579             }
580         }
581
582         if (constr)
583         {
584             /* Constrain the initial coordinates and velocities */
585             do_constrain_first(fplog,
586                                constr,
587                                ir,
588                                md->nr,
589                                md->homenr,
590                                state->x.arrayRefWithPadding(),
591                                state->v.arrayRefWithPadding(),
592                                state->box,
593                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
594         }
595     }
596
597     if (ir->efep != FreeEnergyPerturbationType::No)
598     {
599         /* Set free energy calculation frequency as the greatest common
600          * denominator of nstdhdl and repl_ex_nst. */
601         nstfep = ir->fepvals->nstdhdl;
602         if (ir->bExpanded)
603         {
604             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
605         }
606         if (useReplicaExchange)
607         {
608             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
609         }
610         if (ir->bDoAwh)
611         {
612             nstfep = std::gcd(ir->awhParams->nstSampleCoord(), nstfep);
613         }
614     }
615
616     /* Be REALLY careful about what flags you set here. You CANNOT assume
617      * this is the first step, since we might be restarting from a checkpoint,
618      * and in that case we should not do any modifications to the state.
619      */
620     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
621
622     // When restarting from a checkpoint, it can be appropriate to
623     // initialize ekind from quantities in the checkpoint. Otherwise,
624     // compute_globals must initialize ekind before the simulation
625     // starts/restarts. However, only the master rank knows what was
626     // found in the checkpoint file, so we have to communicate in
627     // order to coordinate the restart.
628     //
629     // TODO Consider removing this communication if/when checkpoint
630     // reading directly follows .tpr reading, because all ranks can
631     // agree on hasReadEkinState at that time.
632     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
633     if (PAR(cr))
634     {
635         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
636     }
637     if (hasReadEkinState)
638     {
639         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
640     }
641
642     unsigned int cglo_flags =
643             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
644              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
645
646     bSumEkinhOld = FALSE;
647
648     t_vcm vcm(top_global.groups, *ir);
649     reportComRemovalInfo(fplog, vcm);
650
651     /* To minimize communication, compute_globals computes the COM velocity
652      * and the kinetic energy for the velocities without COM motion removed.
653      * Thus to get the kinetic energy without the COM contribution, we need
654      * to call compute_globals twice.
655      */
656     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
657     {
658         unsigned int cglo_flags_iteration = cglo_flags;
659         if (bStopCM && cgloIteration == 0)
660         {
661             cglo_flags_iteration |= CGLO_STOPCM;
662             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
663         }
664         if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd) && cgloIteration == 0)
665         {
666             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
667         }
668         compute_globals(gstat,
669                         cr,
670                         ir,
671                         fr,
672                         ekind,
673                         makeConstArrayRef(state->x),
674                         makeConstArrayRef(state->v),
675                         state->box,
676                         md,
677                         nrnb,
678                         &vcm,
679                         nullptr,
680                         enerd,
681                         force_vir,
682                         shake_vir,
683                         total_vir,
684                         pres,
685                         gmx::ArrayRef<real>{},
686                         &nullSignaller,
687                         state->box,
688                         &bSumEkinhOld,
689                         cglo_flags_iteration);
690         if (cglo_flags_iteration & CGLO_STOPCM)
691         {
692             /* At initialization, do not pass x with acceleration-correction mode
693              * to avoid (incorrect) correction of the initial coordinates.
694              */
695             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
696                              ? ArrayRef<RVec>()
697                              : makeArrayRef(state->x);
698             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
699             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
700         }
701     }
702     if (DOMAINDECOMP(cr))
703     {
704         checkNumberOfBondedInteractions(
705                 mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
706     }
707     if (ir->eI == IntegrationAlgorithm::VVAK)
708     {
709         /* a second call to get the half step temperature initialized as well */
710         /* we do the same call as above, but turn the pressure off -- internally to
711            compute_globals, this is recognized as a velocity verlet half-step
712            kinetic energy calculation.  This minimized excess variables, but
713            perhaps loses some logic?*/
714
715         compute_globals(gstat,
716                         cr,
717                         ir,
718                         fr,
719                         ekind,
720                         makeConstArrayRef(state->x),
721                         makeConstArrayRef(state->v),
722                         state->box,
723                         md,
724                         nrnb,
725                         &vcm,
726                         nullptr,
727                         enerd,
728                         force_vir,
729                         shake_vir,
730                         total_vir,
731                         pres,
732                         gmx::ArrayRef<real>{},
733                         &nullSignaller,
734                         state->box,
735                         &bSumEkinhOld,
736                         cglo_flags & ~CGLO_PRESSURE);
737     }
738
739     /* Calculate the initial half step temperature, and save the ekinh_old */
740     if (startingBehavior == StartingBehavior::NewSimulation)
741     {
742         for (i = 0; (i < ir->opts.ngtc); i++)
743         {
744             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
745         }
746     }
747
748     /* need to make an initiation call to get the Trotter variables set, as well as other constants
749        for non-trotter temperature control */
750     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
751
752     if (MASTER(cr))
753     {
754         if (!ir->bContinuation)
755         {
756             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
757             {
758                 fprintf(fplog,
759                         "RMS relative constraint deviation after constraining: %.2e\n",
760                         constr->rmsd());
761             }
762             if (EI_STATE_VELOCITY(ir->eI))
763             {
764                 real temp = enerd->term[F_TEMP];
765                 if (ir->eI != IntegrationAlgorithm::VV)
766                 {
767                     /* Result of Ekin averaged over velocities of -half
768                      * and +half step, while we only have -half step here.
769                      */
770                     temp *= 2;
771                 }
772                 fprintf(fplog, "Initial temperature: %g K\n", temp);
773             }
774         }
775
776         char tbuf[20];
777         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
778         if (ir->nsteps >= 0)
779         {
780             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
781         }
782         else
783         {
784             sprintf(tbuf, "%s", "infinite");
785         }
786         if (ir->init_step > 0)
787         {
788             fprintf(stderr,
789                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
790                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
791                     tbuf,
792                     gmx_step_str(ir->init_step, sbuf2),
793                     ir->init_step * ir->delta_t);
794         }
795         else
796         {
797             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
798         }
799         fprintf(fplog, "\n");
800     }
801
802     walltime_accounting_start_time(walltime_accounting);
803     wallcycle_start(wcycle, WallCycleCounter::Run);
804     print_start(fplog, cr, walltime_accounting, "mdrun");
805
806     /***********************************************************
807      *
808      *             Loop over MD steps
809      *
810      ************************************************************/
811
812     bFirstStep = TRUE;
813     /* Skip the first Nose-Hoover integration when we get the state from tpx */
814     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
815     bSumEkinhOld     = FALSE;
816     bExchanged       = FALSE;
817     bNeedRepartition = FALSE;
818
819     step     = ir->init_step;
820     step_rel = 0;
821
822     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
823             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
824             simulationsShareState,
825             MASTER(cr),
826             ir->nstlist,
827             mdrunOptions.reproducible,
828             nstSignalComm,
829             mdrunOptions.maximumHoursToRun,
830             ir->nstlist == 0,
831             fplog,
832             step,
833             bNS,
834             walltime_accounting);
835
836     auto checkpointHandler = std::make_unique<CheckpointHandler>(
837             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
838             simulationsShareState,
839             ir->nstlist == 0,
840             MASTER(cr),
841             mdrunOptions.writeConfout,
842             mdrunOptions.checkpointOptions.period);
843
844     const bool resetCountersIsLocal = true;
845     auto       resetHandler         = std::make_unique<ResetHandler>(
846             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
847             !resetCountersIsLocal,
848             ir->nsteps,
849             MASTER(cr),
850             mdrunOptions.timingOptions.resetHalfway,
851             mdrunOptions.maximumHoursToRun,
852             mdlog,
853             wcycle,
854             walltime_accounting);
855
856     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
857
858     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
859     {
860         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
861     }
862
863     /* and stop now if we should */
864     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
865     while (!bLastStep)
866     {
867
868         /* Determine if this is a neighbor search step */
869         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
870
871         if (bPMETune && bNStList)
872         {
873             // This has to be here because PME load balancing is called so early.
874             // TODO: Move to after all booleans are defined.
875             if (useGpuForUpdate && !bFirstStep)
876             {
877                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
878                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
879             }
880             /* PME grid + cut-off optimization with GPUs or PME nodes */
881             pme_loadbal_do(pme_loadbal,
882                            cr,
883                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
884                            fplog,
885                            mdlog,
886                            *ir,
887                            fr,
888                            state->box,
889                            state->x,
890                            wcycle,
891                            step,
892                            step_rel,
893                            &bPMETunePrinting,
894                            simulationWork.useGpuPmePpCommunication);
895         }
896
897         wallcycle_start(wcycle, WallCycleCounter::Step);
898
899         bLastStep = (step_rel == ir->nsteps);
900         t         = t0 + step * ir->delta_t;
901
902         // TODO Refactor this, so that nstfep does not need a default value of zero
903         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
904         {
905             /* find and set the current lambdas */
906             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
907
908             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
909             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
910             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
911                            && (!bFirstStep));
912         }
913
914         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
915                      && do_per_step(step, replExParams.exchangeInterval));
916
917         if (doSimulatedAnnealing)
918         {
919             // TODO: Avoid changing inputrec (#3854)
920             // Simulated annealing updates the reference temperature.
921             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
922             update_annealing_target_temp(nonConstInputrec, t, &upd);
923         }
924
925         /* Stop Center of Mass motion */
926         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
927
928         /* Determine whether or not to do Neighbour Searching */
929         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
930
931         /* Note that the stopHandler will cause termination at nstglobalcomm
932          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
933          * nstpcouple steps, we have computed the half-step kinetic energy
934          * of the previous step and can always output energies at the last step.
935          */
936         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
937
938         /* do_log triggers energy and virial calculation. Because this leads
939          * to different code paths, forces can be different. Thus for exact
940          * continuation we should avoid extra log output.
941          * Note that the || bLastStep can result in non-exact continuation
942          * beyond the last step. But we don't consider that to be an issue.
943          */
944         do_log     = (do_per_step(step, ir->nstlog)
945                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
946         do_verbose = mdrunOptions.verbose
947                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
948
949         if (useGpuForUpdate && !bFirstStep && bNS)
950         {
951             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
952             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
953             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
954             // Copy coordinate from the GPU when needed at the search step.
955             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
956             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
957             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
958             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
959         }
960
961         // We only need to calculate virtual velocities if we are writing them in the current step
962         const bool needVirtualVelocitiesThisStep =
963                 (vsite != nullptr)
964                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
965
966         if (vsite != nullptr)
967         {
968             // Virtual sites need to be updated before domain decomposition and forces are calculated
969             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
970             // md-vv calculates virtual velocities once it has full-step real velocities
971             vsite->construct(state->x,
972                              state->v,
973                              state->box,
974                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
975                                      ? VSiteOperation::PositionsAndVelocities
976                                      : VSiteOperation::Positions);
977             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
978         }
979
980         if (bNS && !(bFirstStep && ir->bContinuation))
981         {
982             bMasterState = FALSE;
983             /* Correct the new box if it is too skewed */
984             if (inputrecDynamicBox(ir))
985             {
986                 if (correct_box(fplog, step, state->box))
987                 {
988                     bMasterState = TRUE;
989                     // If update is offloaded, it should be informed about the box size change
990                     if (useGpuForUpdate)
991                     {
992                         integrator->setPbc(PbcType::Xyz, state->box);
993                     }
994                 }
995             }
996             if (DOMAINDECOMP(cr) && bMasterState)
997             {
998                 dd_collect_state(cr->dd, state, state_global);
999             }
1000
1001             if (DOMAINDECOMP(cr))
1002             {
1003                 /* Repartition the domain decomposition */
1004                 dd_partition_system(fplog,
1005                                     mdlog,
1006                                     step,
1007                                     cr,
1008                                     bMasterState,
1009                                     nstglobalcomm,
1010                                     state_global,
1011                                     top_global,
1012                                     *ir,
1013                                     imdSession,
1014                                     pull_work,
1015                                     state,
1016                                     &f,
1017                                     mdAtoms,
1018                                     &top,
1019                                     fr,
1020                                     vsite,
1021                                     constr,
1022                                     nrnb,
1023                                     wcycle,
1024                                     do_verbose && !bPMETunePrinting);
1025                 upd.updateAfterPartition(state->natoms,
1026                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1027                                                      : gmx::ArrayRef<const unsigned short>(),
1028                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1029                                                  : gmx::ArrayRef<const unsigned short>());
1030             }
1031         }
1032
1033         // Allocate or re-size GPU halo exchange object, if necessary
1034         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1035         {
1036             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1037                                "GPU device manager has to be initialized to use GPU "
1038                                "version of halo exchange.");
1039             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1040         }
1041
1042         if (MASTER(cr) && do_log)
1043         {
1044             gmx::EnergyOutput::printHeader(
1045                     fplog, step, t); /* can we improve the information printed here? */
1046         }
1047
1048         if (ir->efep != FreeEnergyPerturbationType::No)
1049         {
1050             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1051         }
1052
1053         if (bExchanged)
1054         {
1055             /* We need the kinetic energy at minus the half step for determining
1056              * the full step kinetic energy and possibly for T-coupling.*/
1057             /* This may not be quite working correctly yet . . . . */
1058             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1059             if (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd))
1060             {
1061                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1062             }
1063             compute_globals(gstat,
1064                             cr,
1065                             ir,
1066                             fr,
1067                             ekind,
1068                             makeConstArrayRef(state->x),
1069                             makeConstArrayRef(state->v),
1070                             state->box,
1071                             md,
1072                             nrnb,
1073                             &vcm,
1074                             wcycle,
1075                             enerd,
1076                             nullptr,
1077                             nullptr,
1078                             nullptr,
1079                             nullptr,
1080                             gmx::ArrayRef<real>{},
1081                             &nullSignaller,
1082                             state->box,
1083                             &bSumEkinhOld,
1084                             cglo_flags);
1085             if (DOMAINDECOMP(cr))
1086             {
1087                 checkNumberOfBondedInteractions(
1088                         mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1089             }
1090         }
1091         clear_mat(force_vir);
1092
1093         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1094
1095         /* Determine the energy and pressure:
1096          * at nstcalcenergy steps and at energy output steps (set below).
1097          */
1098         if (EI_VV(ir->eI) && (!bInitStep))
1099         {
1100             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1101             bCalcVir      = bCalcEnerStep
1102                        || (ir->epc != PressureCoupling::No
1103                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1104         }
1105         else
1106         {
1107             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1108             bCalcVir      = bCalcEnerStep
1109                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1110         }
1111         bCalcEner = bCalcEnerStep;
1112
1113         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1114
1115         if (do_ene || do_log || bDoReplEx)
1116         {
1117             bCalcVir  = TRUE;
1118             bCalcEner = TRUE;
1119         }
1120
1121         /* Do we need global communication ? */
1122         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1123                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1124
1125         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1126                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1127                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1128         if (fr->useMts && !do_per_step(step, ir->nstfout))
1129         {
1130             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1131         }
1132
1133         if (shellfc)
1134         {
1135             /* Now is the time to relax the shells */
1136             relax_shell_flexcon(fplog,
1137                                 cr,
1138                                 ms,
1139                                 mdrunOptions.verbose,
1140                                 enforcedRotation,
1141                                 step,
1142                                 ir,
1143                                 imdSession,
1144                                 pull_work,
1145                                 bNS,
1146                                 force_flags,
1147                                 &top,
1148                                 constr,
1149                                 enerd,
1150                                 state->natoms,
1151                                 state->x.arrayRefWithPadding(),
1152                                 state->v.arrayRefWithPadding(),
1153                                 state->box,
1154                                 state->lambda,
1155                                 &state->hist,
1156                                 &f.view(),
1157                                 force_vir,
1158                                 *md,
1159                                 nrnb,
1160                                 wcycle,
1161                                 shellfc,
1162                                 fr,
1163                                 runScheduleWork,
1164                                 t,
1165                                 mu_tot,
1166                                 vsite,
1167                                 ddBalanceRegionHandler);
1168         }
1169         else
1170         {
1171             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1172                is updated (or the AWH update will be performed twice for one step when continuing).
1173                It would be best to call this update function from do_md_trajectory_writing but that
1174                would occur after do_force. One would have to divide the update_awh function into one
1175                function applying the AWH force and one doing the AWH bias update. The update AWH
1176                bias function could then be called after do_md_trajectory_writing (then containing
1177                update_awh_history). The checkpointing will in the future probably moved to the start
1178                of the md loop which will rid of this issue. */
1179             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1180             {
1181                 awh->updateHistory(state_global->awhHistory.get());
1182             }
1183
1184             /* The coordinates (x) are shifted (to get whole molecules)
1185              * in do_force.
1186              * This is parallellized as well, and does communication too.
1187              * Check comments in sim_util.c
1188              */
1189             do_force(fplog,
1190                      cr,
1191                      ms,
1192                      *ir,
1193                      awh.get(),
1194                      enforcedRotation,
1195                      imdSession,
1196                      pull_work,
1197                      step,
1198                      nrnb,
1199                      wcycle,
1200                      &top,
1201                      state->box,
1202                      state->x.arrayRefWithPadding(),
1203                      &state->hist,
1204                      &f.view(),
1205                      force_vir,
1206                      md,
1207                      enerd,
1208                      state->lambda,
1209                      fr,
1210                      runScheduleWork,
1211                      vsite,
1212                      mu_tot,
1213                      t,
1214                      ed ? ed->getLegacyED() : nullptr,
1215                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1216                      ddBalanceRegionHandler);
1217         }
1218
1219         // VV integrators do not need the following velocity half step
1220         // if it is the first step after starting from a checkpoint.
1221         // That is, the half step is needed on all other steps, and
1222         // also the first step when starting from a .tpr file.
1223         if (EI_VV(ir->eI))
1224         {
1225             integrateVVFirstStep(step,
1226                                  bFirstStep,
1227                                  bInitStep,
1228                                  startingBehavior,
1229                                  nstglobalcomm,
1230                                  ir,
1231                                  fr,
1232                                  cr,
1233                                  state,
1234                                  mdAtoms->mdatoms(),
1235                                  fcdata,
1236                                  &MassQ,
1237                                  &vcm,
1238                                  top_global,
1239                                  top,
1240                                  enerd,
1241                                  ekind,
1242                                  gstat,
1243                                  &last_ekin,
1244                                  bCalcVir,
1245                                  total_vir,
1246                                  shake_vir,
1247                                  force_vir,
1248                                  pres,
1249                                  M,
1250                                  do_log,
1251                                  do_ene,
1252                                  bCalcEner,
1253                                  bGStat,
1254                                  bStopCM,
1255                                  bTrotter,
1256                                  bExchanged,
1257                                  &bSumEkinhOld,
1258                                  &saved_conserved_quantity,
1259                                  &f,
1260                                  &upd,
1261                                  constr,
1262                                  &nullSignaller,
1263                                  trotter_seq,
1264                                  nrnb,
1265                                  mdlog,
1266                                  fplog,
1267                                  wcycle);
1268             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1269             {
1270                 // Positions were calculated earlier
1271                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1272                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1273                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1274             }
1275         }
1276
1277         /* ########  END FIRST UPDATE STEP  ############## */
1278         /* ########  If doing VV, we now have v(dt) ###### */
1279         if (bDoExpanded)
1280         {
1281             /* perform extended ensemble sampling in lambda - we don't
1282                actually move to the new state before outputting
1283                statistics, but if performing simulated tempering, we
1284                do update the velocities and the tau_t. */
1285             // TODO: Avoid changing inputrec (#3854)
1286             // Simulated tempering updates the reference temperature.
1287             // Expanded ensemble without simulated tempering does not change the inputrec.
1288             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1289             lamnew                 = ExpandedEnsembleDynamics(fplog,
1290                                               nonConstInputrec,
1291                                               enerd,
1292                                               state,
1293                                               &MassQ,
1294                                               state->fep_state,
1295                                               state->dfhist,
1296                                               step,
1297                                               state->v.rvec_array(),
1298                                               md->homenr,
1299                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1300                                                                       : gmx::ArrayRef<const unsigned short>());
1301             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1302             if (MASTER(cr))
1303             {
1304                 copy_df_history(state_global->dfhist, state->dfhist);
1305             }
1306         }
1307
1308         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1309         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1310         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1311             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1312                 || checkpointHandler->isCheckpointingStep()))
1313         {
1314             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1315             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1316         }
1317         // Copy velocities if needed for the output/checkpointing.
1318         // NOTE: Copy on the search steps is done at the beginning of the step.
1319         if (useGpuForUpdate && !bNS
1320             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1321         {
1322             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1323             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1324         }
1325         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1326         // and update is offloaded hence forces are kept on the GPU for update and have not been
1327         // already transferred in do_force().
1328         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1329         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1330         //       prior to GPU update.
1331         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1332         //       copy call in do_force(...).
1333         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1334         //       on host after the D2H copy in do_force(...).
1335         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1336             && do_per_step(step, ir->nstfout))
1337         {
1338             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1339             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1340         }
1341         /* Now we have the energies and forces corresponding to the
1342          * coordinates at time t. We must output all of this before
1343          * the update.
1344          */
1345         do_md_trajectory_writing(fplog,
1346                                  cr,
1347                                  nfile,
1348                                  fnm,
1349                                  step,
1350                                  step_rel,
1351                                  t,
1352                                  ir,
1353                                  state,
1354                                  state_global,
1355                                  observablesHistory,
1356                                  top_global,
1357                                  fr,
1358                                  outf,
1359                                  energyOutput,
1360                                  ekind,
1361                                  f.view().force(),
1362                                  checkpointHandler->isCheckpointingStep(),
1363                                  bRerunMD,
1364                                  bLastStep,
1365                                  mdrunOptions.writeConfout,
1366                                  bSumEkinhOld);
1367         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1368         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1369
1370         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1371         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1372             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1373         {
1374             copy_mat(state->svir_prev, shake_vir);
1375             copy_mat(state->fvir_prev, force_vir);
1376         }
1377
1378         stopHandler->setSignal();
1379         resetHandler->setSignal(walltime_accounting);
1380
1381         if (bGStat || !PAR(cr))
1382         {
1383             /* In parallel we only have to check for checkpointing in steps
1384              * where we do global communication,
1385              *  otherwise the other nodes don't know.
1386              */
1387             checkpointHandler->setSignal(walltime_accounting);
1388         }
1389
1390         /* #########   START SECOND UPDATE STEP ################# */
1391
1392         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1393            controlled in preprocessing */
1394
1395         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1396         {
1397             gmx_bool bIfRandomize;
1398             bIfRandomize = update_randomize_velocities(ir,
1399                                                        step,
1400                                                        cr,
1401                                                        md->homenr,
1402                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1403                                                                : gmx::ArrayRef<const unsigned short>(),
1404                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1405                                                        state->v,
1406                                                        &upd,
1407                                                        constr);
1408             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1409             if (constr && bIfRandomize)
1410             {
1411                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1412             }
1413         }
1414         /* Box is changed in update() when we do pressure coupling,
1415          * but we should still use the old box for energy corrections and when
1416          * writing it to the energy file, so it matches the trajectory files for
1417          * the same timestep above. Make a copy in a separate array.
1418          */
1419         copy_mat(state->box, lastbox);
1420
1421         dvdl_constr = 0;
1422
1423         if (!useGpuForUpdate)
1424         {
1425             wallcycle_start(wcycle, WallCycleCounter::Update);
1426         }
1427         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1428         if (bTrotter)
1429         {
1430             trotter_update(ir,
1431                            step,
1432                            ekind,
1433                            enerd,
1434                            state,
1435                            total_vir,
1436                            md->homenr,
1437                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1438                                    : gmx::ArrayRef<const unsigned short>(),
1439                            gmx::arrayRefFromArray(md->invmass, md->nr),
1440                            &MassQ,
1441                            trotter_seq,
1442                            ettTSEQ3);
1443             /* We can only do Berendsen coupling after we have summed
1444              * the kinetic energy or virial. Since the happens
1445              * in global_state after update, we should only do it at
1446              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1447              */
1448         }
1449         else
1450         {
1451             update_tcouple(step,
1452                            ir,
1453                            state,
1454                            ekind,
1455                            &MassQ,
1456                            md->homenr,
1457                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1458                                    : gmx::ArrayRef<const unsigned short>());
1459             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1460         }
1461
1462         /* With leap-frog type integrators we compute the kinetic energy
1463          * at a whole time step as the average of the half-time step kinetic
1464          * energies of two subsequent steps. Therefore we need to compute the
1465          * half step kinetic energy also if we need energies at the next step.
1466          */
1467         const bool needHalfStepKineticEnergy =
1468                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1469
1470         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1471         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1472         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1473                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1474
1475         if (EI_VV(ir->eI))
1476         {
1477             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1478
1479             integrateVVSecondStep(step,
1480                                   ir,
1481                                   fr,
1482                                   cr,
1483                                   state,
1484                                   mdAtoms->mdatoms(),
1485                                   fcdata,
1486                                   &MassQ,
1487                                   &vcm,
1488                                   pull_work,
1489                                   enerd,
1490                                   ekind,
1491                                   gstat,
1492                                   &dvdl_constr,
1493                                   bCalcVir,
1494                                   total_vir,
1495                                   shake_vir,
1496                                   force_vir,
1497                                   pres,
1498                                   M,
1499                                   lastbox,
1500                                   do_log,
1501                                   do_ene,
1502                                   bGStat,
1503                                   &bSumEkinhOld,
1504                                   &f,
1505                                   &cbuf,
1506                                   &upd,
1507                                   constr,
1508                                   &nullSignaller,
1509                                   trotter_seq,
1510                                   nrnb,
1511                                   wcycle);
1512         }
1513         else
1514         {
1515             if (useGpuForUpdate)
1516             {
1517                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1518                 {
1519                     integrator->set(stateGpu->getCoordinates(),
1520                                     stateGpu->getVelocities(),
1521                                     stateGpu->getForces(),
1522                                     top.idef,
1523                                     *md);
1524
1525                     // Copy data to the GPU after buffers might have being reinitialized
1526                     // coordinates have been copied already if PME or buffer ops has not needed it this step.
1527                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1528                     const bool useGpuPmeOnThisRank = runScheduleWork->simulationWork.useGpuPme
1529                                                      && thisRankHasDuty(cr, DUTY_PME)
1530                                                      && runScheduleWork->stepWork.computeSlowForces;
1531                     if (!useGpuPmeOnThisRank && !runScheduleWork->stepWork.useGpuXBufferOps)
1532                     {
1533                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1534                     }
1535                 }
1536
1537                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1538                     && !thisRankHasDuty(cr, DUTY_PME))
1539                 {
1540                     // The PME forces were recieved to the host, so have to be copied
1541                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1542                 }
1543                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1544                 {
1545                     // The buffer ops were not offloaded this step, so the forces are on the
1546                     // host and have to be copied
1547                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1548                 }
1549
1550                 const bool doTemperatureScaling =
1551                         (ir->etc != TemperatureCoupling::No
1552                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1553
1554                 // This applies Leap-Frog, LINCS and SETTLE in succession
1555                 integrator->integrate(
1556                         stateGpu->getForcesReadyOnDeviceEvent(
1557                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1558                         ir->delta_t,
1559                         true,
1560                         bCalcVir,
1561                         shake_vir,
1562                         doTemperatureScaling,
1563                         ekind->tcstat,
1564                         doParrinelloRahman,
1565                         ir->nstpcouple * ir->delta_t,
1566                         M);
1567
1568                 // Copy velocities D2H after update if:
1569                 // - Globals are computed this step (includes the energy output steps).
1570                 // - Temperature is needed for the next step.
1571                 if (bGStat || needHalfStepKineticEnergy)
1572                 {
1573                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1574                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1575                 }
1576             }
1577             else
1578             {
1579                 /* With multiple time stepping we need to do an additional normal
1580                  * update step to obtain the virial, as the actual MTS integration
1581                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1582                  * Using that acceleration would result in a virial with the slow
1583                  * force contribution would be a factor mtsFactor too large.
1584                  */
1585                 if (fr->useMts && bCalcVir && constr != nullptr)
1586                 {
1587                     upd.update_for_constraint_virial(*ir,
1588                                                      md->homenr,
1589                                                      md->havePartiallyFrozenAtoms,
1590                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1591                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1592                                                      *state,
1593                                                      f.view().forceWithPadding(),
1594                                                      *ekind);
1595
1596                     constrain_coordinates(constr,
1597                                           do_log,
1598                                           do_ene,
1599                                           step,
1600                                           state,
1601                                           upd.xp()->arrayRefWithPadding(),
1602                                           &dvdl_constr,
1603                                           bCalcVir,
1604                                           shake_vir);
1605                 }
1606
1607                 ArrayRefWithPadding<const RVec> forceCombined =
1608                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1609                                 ? f.view().forceMtsCombinedWithPadding()
1610                                 : f.view().forceWithPadding();
1611                 upd.update_coords(*ir,
1612                                   step,
1613                                   md->homenr,
1614                                   md->havePartiallyFrozenAtoms,
1615                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1616                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1617                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1618                                   state,
1619                                   forceCombined,
1620                                   fcdata,
1621                                   ekind,
1622                                   M,
1623                                   etrtPOSITION,
1624                                   cr,
1625                                   constr != nullptr);
1626
1627                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1628
1629                 constrain_coordinates(constr,
1630                                       do_log,
1631                                       do_ene,
1632                                       step,
1633                                       state,
1634                                       upd.xp()->arrayRefWithPadding(),
1635                                       &dvdl_constr,
1636                                       bCalcVir && !fr->useMts,
1637                                       shake_vir);
1638
1639                 upd.update_sd_second_half(*ir,
1640                                           step,
1641                                           &dvdl_constr,
1642                                           md->homenr,
1643                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1644                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1645                                           state,
1646                                           cr,
1647                                           nrnb,
1648                                           wcycle,
1649                                           constr,
1650                                           do_log,
1651                                           do_ene);
1652                 upd.finish_update(
1653                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1654             }
1655
1656             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1657             {
1658                 updatePrevStepPullCom(pull_work, state);
1659             }
1660
1661             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1662         }
1663
1664         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1665         /* With Leap-Frog we can skip compute_globals at
1666          * non-communication steps, but we need to calculate
1667          * the kinetic energy one step before communication.
1668          */
1669         {
1670             // Organize to do inter-simulation signalling on steps if
1671             // and when algorithms require it.
1672             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1673
1674             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1675             {
1676                 // Copy coordinates when needed to stop the CM motion.
1677                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1678                 {
1679                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1680                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1681                 }
1682                 // Since we're already communicating at this step, we
1683                 // can propagate intra-simulation signals. Note that
1684                 // check_nstglobalcomm has the responsibility for
1685                 // choosing the value of nstglobalcomm that is one way
1686                 // bGStat becomes true, so we can't get into a
1687                 // situation where e.g. checkpointing can't be
1688                 // signalled.
1689                 bool                doIntraSimSignal = true;
1690                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1691
1692                 compute_globals(
1693                         gstat,
1694                         cr,
1695                         ir,
1696                         fr,
1697                         ekind,
1698                         makeConstArrayRef(state->x),
1699                         makeConstArrayRef(state->v),
1700                         state->box,
1701                         md,
1702                         nrnb,
1703                         &vcm,
1704                         wcycle,
1705                         enerd,
1706                         force_vir,
1707                         shake_vir,
1708                         total_vir,
1709                         pres,
1710                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1711                                                                            : gmx::ArrayRef<real>{},
1712                         &signaller,
1713                         lastbox,
1714                         &bSumEkinhOld,
1715                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1716                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1717                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1718                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1719                                 | (DOMAINDECOMP(cr) && shouldCheckNumberOfBondedInteractions(*cr->dd)
1720                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1721                                            : 0));
1722                 if (DOMAINDECOMP(cr))
1723                 {
1724                     checkNumberOfBondedInteractions(
1725                             mdlog, cr, top_global, &top, makeConstArrayRef(state->x), state->box);
1726                 }
1727                 if (!EI_VV(ir->eI) && bStopCM)
1728                 {
1729                     process_and_stopcm_grp(
1730                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1731                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1732
1733                     // TODO: The special case of removing CM motion should be dealt more gracefully
1734                     if (useGpuForUpdate)
1735                     {
1736                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1737                         // Here we block until the H2D copy completes because event sync with the
1738                         // force kernels that use the coordinates on the next steps is not implemented
1739                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1740                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1741                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1742                         if (vcm.mode != ComRemovalAlgorithm::No)
1743                         {
1744                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1745                         }
1746                     }
1747                 }
1748             }
1749         }
1750
1751         /* #############  END CALC EKIN AND PRESSURE ################# */
1752
1753         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1754            the virial that should probably be addressed eventually. state->veta has better properies,
1755            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1756            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1757
1758         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1759         {
1760             /* Sum up the foreign energy and dK/dl terms for md and sd.
1761                Currently done every step so that dH/dl is correct in the .edr */
1762             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1763         }
1764
1765         update_pcouple_after_coordinates(fplog,
1766                                          step,
1767                                          ir,
1768                                          md->homenr,
1769                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1770                                                      : gmx::ArrayRef<const unsigned short>(),
1771                                          pres,
1772                                          force_vir,
1773                                          shake_vir,
1774                                          pressureCouplingMu,
1775                                          state,
1776                                          nrnb,
1777                                          upd.deform(),
1778                                          !useGpuForUpdate);
1779
1780         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1781                                                   && do_per_step(step, inputrec->nstpcouple));
1782         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1783                                                  && do_per_step(step, inputrec->nstpcouple));
1784         if (useGpuForUpdate
1785             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1786         {
1787             integrator->scaleCoordinates(pressureCouplingMu);
1788             if (doCRescalePressureCoupling)
1789             {
1790                 matrix pressureCouplingInvMu;
1791                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1792                 integrator->scaleVelocities(pressureCouplingInvMu);
1793             }
1794             integrator->setPbc(PbcType::Xyz, state->box);
1795         }
1796
1797         /* ################# END UPDATE STEP 2 ################# */
1798         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1799
1800         /* The coordinates (x) were unshifted in update */
1801         if (!bGStat)
1802         {
1803             /* We will not sum ekinh_old,
1804              * so signal that we still have to do it.
1805              */
1806             bSumEkinhOld = TRUE;
1807         }
1808
1809         if (bCalcEner)
1810         {
1811             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1812
1813             /* use the directly determined last velocity, not actually the averaged half steps */
1814             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1815             {
1816                 enerd->term[F_EKIN] = last_ekin;
1817             }
1818             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1819
1820             if (integratorHasConservedEnergyQuantity(ir))
1821             {
1822                 if (EI_VV(ir->eI))
1823                 {
1824                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1825                 }
1826                 else
1827                 {
1828                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1829                 }
1830             }
1831             /* #########  END PREPARING EDR OUTPUT  ###########  */
1832         }
1833
1834         /* Output stuff */
1835         if (MASTER(cr))
1836         {
1837             if (fplog && do_log && bDoExpanded)
1838             {
1839                 /* only needed if doing expanded ensemble */
1840                 PrintFreeEnergyInfoToFile(fplog,
1841                                           ir->fepvals.get(),
1842                                           ir->expandedvals.get(),
1843                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1844                                           state_global->dfhist,
1845                                           state->fep_state,
1846                                           ir->nstlog,
1847                                           step);
1848             }
1849             if (bCalcEner)
1850             {
1851                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1852                                                  bCalcEnerStep,
1853                                                  t,
1854                                                  md->tmass,
1855                                                  enerd,
1856                                                  ir->fepvals.get(),
1857                                                  ir->expandedvals.get(),
1858                                                  lastbox,
1859                                                  PTCouplingArrays{ state->boxv,
1860                                                                    state->nosehoover_xi,
1861                                                                    state->nosehoover_vxi,
1862                                                                    state->nhpres_xi,
1863                                                                    state->nhpres_vxi },
1864                                                  state->fep_state,
1865                                                  shake_vir,
1866                                                  force_vir,
1867                                                  total_vir,
1868                                                  pres,
1869                                                  ekind,
1870                                                  mu_tot,
1871                                                  constr);
1872             }
1873             else
1874             {
1875                 energyOutput.recordNonEnergyStep();
1876             }
1877
1878             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1879             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1880
1881             if (doSimulatedAnnealing)
1882             {
1883                 gmx::EnergyOutput::printAnnealingTemperatures(
1884                         do_log ? fplog : nullptr, groups, &(ir->opts));
1885             }
1886             if (do_log || do_ene || do_dr || do_or)
1887             {
1888                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1889                                                    do_ene,
1890                                                    do_dr,
1891                                                    do_or,
1892                                                    do_log ? fplog : nullptr,
1893                                                    step,
1894                                                    t,
1895                                                    fr->fcdata.get(),
1896                                                    awh.get());
1897             }
1898             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1899             {
1900                 const bool isInitialOutput = false;
1901                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1902             }
1903
1904             if (ir->bPull)
1905             {
1906                 pull_print_output(pull_work, step, t);
1907             }
1908
1909             if (do_per_step(step, ir->nstlog))
1910             {
1911                 if (fflush(fplog) != 0)
1912                 {
1913                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1914                 }
1915             }
1916         }
1917         if (bDoExpanded)
1918         {
1919             /* Have to do this part _after_ outputting the logfile and the edr file */
1920             /* Gets written into the state at the beginning of next loop*/
1921             state->fep_state = lamnew;
1922         }
1923         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1924         {
1925             state->fep_state = awh->fepLambdaState();
1926         }
1927         /* Print the remaining wall clock time for the run */
1928         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1929         {
1930             if (shellfc)
1931             {
1932                 fprintf(stderr, "\n");
1933             }
1934             print_time(stderr, walltime_accounting, step, ir, cr);
1935         }
1936
1937         /* Ion/water position swapping.
1938          * Not done in last step since trajectory writing happens before this call
1939          * in the MD loop and exchanges would be lost anyway. */
1940         bNeedRepartition = FALSE;
1941         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1942             && do_per_step(step, ir->swap->nstswap))
1943         {
1944             bNeedRepartition = do_swapcoords(cr,
1945                                              step,
1946                                              t,
1947                                              ir,
1948                                              swap,
1949                                              wcycle,
1950                                              as_rvec_array(state->x.data()),
1951                                              state->box,
1952                                              MASTER(cr) && mdrunOptions.verbose,
1953                                              bRerunMD);
1954
1955             if (bNeedRepartition && DOMAINDECOMP(cr))
1956             {
1957                 dd_collect_state(cr->dd, state, state_global);
1958             }
1959         }
1960
1961         /* Replica exchange */
1962         bExchanged = FALSE;
1963         if (bDoReplEx)
1964         {
1965             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1966         }
1967
1968         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1969         {
1970             dd_partition_system(fplog,
1971                                 mdlog,
1972                                 step,
1973                                 cr,
1974                                 TRUE,
1975                                 1,
1976                                 state_global,
1977                                 top_global,
1978                                 *ir,
1979                                 imdSession,
1980                                 pull_work,
1981                                 state,
1982                                 &f,
1983                                 mdAtoms,
1984                                 &top,
1985                                 fr,
1986                                 vsite,
1987                                 constr,
1988                                 nrnb,
1989                                 wcycle,
1990                                 FALSE);
1991             upd.updateAfterPartition(state->natoms,
1992                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1993                                                  : gmx::ArrayRef<const unsigned short>(),
1994                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1995                                              : gmx::ArrayRef<const unsigned short>());
1996         }
1997
1998         bFirstStep = FALSE;
1999         bInitStep  = FALSE;
2000
2001         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
2002         /* With all integrators, except VV, we need to retain the pressure
2003          * at the current step for coupling at the next step.
2004          */
2005         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
2006             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
2007         {
2008             /* Store the pressure in t_state for pressure coupling
2009              * at the next MD step.
2010              */
2011             copy_mat(pres, state->pres_prev);
2012         }
2013
2014         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2015
2016         if ((membed != nullptr) && (!bLastStep))
2017         {
2018             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2019         }
2020
2021         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2022         if (DOMAINDECOMP(cr) && wcycle)
2023         {
2024             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2025         }
2026
2027         /* increase the MD step number */
2028         step++;
2029         step_rel++;
2030
2031 #if GMX_FAHCORE
2032         if (MASTER(cr))
2033         {
2034             fcReportProgress(ir->nsteps + ir->init_step, step);
2035         }
2036 #endif
2037
2038         resetHandler->resetCounters(
2039                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2040
2041         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2042         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2043     }
2044     /* End of main MD loop */
2045
2046     /* Closing TNG files can include compressing data. Therefore it is good to do that
2047      * before stopping the time measurements. */
2048     mdoutf_tng_close(outf);
2049
2050     /* Stop measuring walltime */
2051     walltime_accounting_end_time(walltime_accounting);
2052
2053     if (!thisRankHasDuty(cr, DUTY_PME))
2054     {
2055         /* Tell the PME only node to finish */
2056         gmx_pme_send_finish(cr);
2057     }
2058
2059     if (MASTER(cr))
2060     {
2061         if (ir->nstcalcenergy > 0)
2062         {
2063             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2064
2065             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2066             energyOutput.printAverages(fplog, groups);
2067         }
2068     }
2069     done_mdoutf(outf);
2070
2071     if (bPMETune)
2072     {
2073         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2074     }
2075
2076     done_shellfc(fplog, shellfc, step_rel);
2077
2078     if (useReplicaExchange && MASTER(cr))
2079     {
2080         print_replica_exchange_statistics(fplog, repl_ex);
2081     }
2082
2083     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2084
2085     global_stat_destroy(gstat);
2086 }