80a820784d720b296678b3e88201f2c6e5daad6f
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/handlerestart.h"
110 #include "gromacs/mdrunutility/multisim.h"
111 #include "gromacs/mdrunutility/printtime.h"
112 #include "gromacs/mdtypes/awh_history.h"
113 #include "gromacs/mdtypes/awh_params.h"
114 #include "gromacs/mdtypes/commrec.h"
115 #include "gromacs/mdtypes/df_history.h"
116 #include "gromacs/mdtypes/energyhistory.h"
117 #include "gromacs/mdtypes/fcdata.h"
118 #include "gromacs/mdtypes/forcebuffers.h"
119 #include "gromacs/mdtypes/forcerec.h"
120 #include "gromacs/mdtypes/group.h"
121 #include "gromacs/mdtypes/inputrec.h"
122 #include "gromacs/mdtypes/interaction_const.h"
123 #include "gromacs/mdtypes/md_enums.h"
124 #include "gromacs/mdtypes/mdatom.h"
125 #include "gromacs/mdtypes/mdrunoptions.h"
126 #include "gromacs/mdtypes/multipletimestepping.h"
127 #include "gromacs/mdtypes/observableshistory.h"
128 #include "gromacs/mdtypes/pullhistory.h"
129 #include "gromacs/mdtypes/simulation_workload.h"
130 #include "gromacs/mdtypes/state.h"
131 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
132 #include "gromacs/modularsimulator/energydata.h"
133 #include "gromacs/nbnxm/gpu_data_mgmt.h"
134 #include "gromacs/nbnxm/nbnxm.h"
135 #include "gromacs/pbcutil/pbc.h"
136 #include "gromacs/pulling/output.h"
137 #include "gromacs/pulling/pull.h"
138 #include "gromacs/swap/swapcoords.h"
139 #include "gromacs/timing/wallcycle.h"
140 #include "gromacs/timing/walltime_accounting.h"
141 #include "gromacs/topology/atoms.h"
142 #include "gromacs/topology/idef.h"
143 #include "gromacs/topology/mtop_util.h"
144 #include "gromacs/topology/topology.h"
145 #include "gromacs/trajectory/trajectoryframe.h"
146 #include "gromacs/utility/basedefinitions.h"
147 #include "gromacs/utility/cstringutil.h"
148 #include "gromacs/utility/fatalerror.h"
149 #include "gromacs/utility/logger.h"
150 #include "gromacs/utility/real.h"
151 #include "gromacs/utility/smalloc.h"
152
153 #include "legacysimulator.h"
154 #include "replicaexchange.h"
155 #include "shellfc.h"
156
157 using gmx::SimulationSignaller;
158
159 void gmx::LegacySimulator::do_md()
160 {
161     // TODO Historically, the EM and MD "integrators" used different
162     // names for the t_inputrec *parameter, but these must have the
163     // same name, now that it's a member of a struct. We use this ir
164     // alias to avoid a large ripple of nearly useless changes.
165     // t_inputrec is being replaced by IMdpOptionsProvider, so this
166     // will go away eventually.
167     const t_inputrec* ir = inputrec;
168
169     int64_t      step, step_rel;
170     double       t, t0 = ir->init_t;
171     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
172     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
173     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
174     gmx_bool     do_ene, do_log, do_verbose;
175     gmx_bool     bMasterState;
176     unsigned int force_flags;
177     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
178     int    i, m;
179     rvec   mu_tot;
180     matrix pressureCouplingMu, M;
181     gmx_repl_ex_t     repl_ex = nullptr;
182     gmx_global_stat_t gstat;
183     gmx_shellfc_t*    shellfc;
184     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
185     gmx_bool          bTrotter;
186     real              dvdl_constr;
187     std::vector<RVec> cbuf;
188     matrix            lastbox;
189     int               lamnew = 0;
190     /* for FEP */
191     int       nstfep = 0;
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     const t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t top(top_global.ffparams);
365
366     ForceBuffers     f(simulationWork.useMts,
367                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
368                                ? PinningPolicy::PinnedIfSupported
369                                : PinningPolicy::CannotBePinned);
370     const t_mdatoms* md = mdAtoms->mdatoms();
371     if (DOMAINDECOMP(cr))
372     {
373         stateInstance = std::make_unique<t_state>();
374         state         = stateInstance.get();
375         dd_init_local_state(*cr->dd, state_global, state);
376
377         /* Distribute the charge groups over the nodes from the master node */
378         dd_partition_system(fplog,
379                             mdlog,
380                             ir->init_step,
381                             cr,
382                             TRUE,
383                             1,
384                             state_global,
385                             top_global,
386                             *ir,
387                             imdSession,
388                             pull_work,
389                             state,
390                             &f,
391                             mdAtoms,
392                             &top,
393                             fr,
394                             vsite,
395                             constr,
396                             nrnb,
397                             nullptr,
398                             FALSE);
399         upd.updateAfterPartition(state->natoms,
400                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
401                                              : gmx::ArrayRef<const unsigned short>(),
402                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
403                                          : gmx::ArrayRef<const unsigned short>());
404     }
405     else
406     {
407         state_change_natoms(state_global, state_global->natoms);
408         /* Copy the pointer to the global state */
409         state = state_global;
410
411         /* Generate and initialize new topology */
412         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
413
414         upd.updateAfterPartition(state->natoms,
415                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
416                                              : gmx::ArrayRef<const unsigned short>(),
417                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
418                                          : gmx::ArrayRef<const unsigned short>());
419     }
420
421     std::unique_ptr<UpdateConstrainGpu> integrator;
422
423     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
424
425     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
426     if (useGpuForUpdate)
427     {
428         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
429                                    || constr->numConstraintsTotal() == 0,
430                            "Constraints in domain decomposition are only supported with update "
431                            "groups if using GPU update.\n");
432         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
433                                    || constr->numConstraintsTotal() == 0,
434                            "SHAKE is not supported with GPU update.");
435         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
436                            "Either PME or short-ranged non-bonded interaction tasks must run on "
437                            "the GPU to use GPU update.\n");
438         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
439                            "Only the md integrator is supported with the GPU update.\n");
440         GMX_RELEASE_ASSERT(
441                 ir->etc != TemperatureCoupling::NoseHoover,
442                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
443         GMX_RELEASE_ASSERT(
444                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
445                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
446                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
447                 "with the GPU update.\n");
448         GMX_RELEASE_ASSERT(!md->haveVsites,
449                            "Virtual sites are not supported with the GPU update.\n");
450         GMX_RELEASE_ASSERT(ed == nullptr,
451                            "Essential dynamics is not supported with the GPU update.\n");
452         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
453                            "Constraints pulling is not supported with the GPU update.\n");
454         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
455                            "Orientation restraints are not supported with the GPU update.\n");
456         GMX_RELEASE_ASSERT(
457                 ir->efep == FreeEnergyPerturbationType::No
458                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
459                 "Free energy perturbation of masses and constraints are not supported with the GPU "
460                 "update.");
461
462         if (constr != nullptr && constr->numConstraintsTotal() > 0)
463         {
464             GMX_LOG(mdlog.info)
465                     .asParagraph()
466                     .appendText("Updating coordinates and applying constraints on the GPU.");
467         }
468         else
469         {
470             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
471         }
472         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
473                            "Device stream manager should be initialized in order to use GPU "
474                            "update-constraints.");
475         GMX_RELEASE_ASSERT(
476                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
477                 "Update stream should be initialized in order to use GPU "
478                 "update-constraints.");
479         integrator = std::make_unique<UpdateConstrainGpu>(
480                 *ir,
481                 top_global,
482                 ekind->ngtc,
483                 fr->deviceStreamManager->context(),
484                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
485                 stateGpu->xUpdatedOnDevice(),
486                 wcycle);
487
488         integrator->setPbc(PbcType::Xyz, state->box);
489     }
490
491     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
492     {
493         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
494     }
495     if (useGpuForUpdate)
496     {
497         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
498     }
499
500     // NOTE: The global state is no longer used at this point.
501     // But state_global is still used as temporary storage space for writing
502     // the global state to file and potentially for replica exchange.
503     // (Global topology should persist.)
504
505     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
506
507     if (ir->bExpanded)
508     {
509         /* Check nstexpanded here, because the grompp check was broken */
510         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
511         {
512             gmx_fatal(FARGS,
513                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
514         }
515         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
516     }
517
518     if (MASTER(cr))
519     {
520         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
521     }
522
523     preparePrevStepPullCom(ir,
524                            pull_work,
525                            gmx::arrayRefFromArray(md->massT, md->nr),
526                            state,
527                            state_global,
528                            cr,
529                            startingBehavior != StartingBehavior::NewSimulation);
530
531     // TODO: Remove this by converting AWH into a ForceProvider
532     auto awh = prepareAwhModule(fplog,
533                                 *ir,
534                                 state_global,
535                                 cr,
536                                 ms,
537                                 startingBehavior != StartingBehavior::NewSimulation,
538                                 shellfc != nullptr,
539                                 opt2fn("-awh", nfile, fnm),
540                                 pull_work);
541
542     if (useReplicaExchange && MASTER(cr))
543     {
544         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
545     }
546     /* PME tuning is only supported in the Verlet scheme, with PME for
547      * Coulomb. It is not supported with only LJ PME. */
548     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
549                 && ir->cutoff_scheme != CutoffScheme::Group);
550
551     pme_load_balancing_t* pme_loadbal = nullptr;
552     if (bPMETune)
553     {
554         pme_loadbal_init(
555                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
556     }
557
558     if (!ir->bContinuation)
559     {
560         if (state->flags & enumValueToBitMask(StateEntry::V))
561         {
562             auto v = makeArrayRef(state->v);
563             /* Set the velocities of vsites, shells and frozen atoms to zero */
564             for (i = 0; i < md->homenr; i++)
565             {
566                 if (md->ptype[i] == ParticleType::Shell)
567                 {
568                     clear_rvec(v[i]);
569                 }
570                 else if (md->cFREEZE)
571                 {
572                     for (m = 0; m < DIM; m++)
573                     {
574                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
575                         {
576                             v[i][m] = 0;
577                         }
578                     }
579                 }
580             }
581         }
582
583         if (constr)
584         {
585             /* Constrain the initial coordinates and velocities */
586             do_constrain_first(fplog,
587                                constr,
588                                ir,
589                                md->nr,
590                                md->homenr,
591                                state->x.arrayRefWithPadding(),
592                                state->v.arrayRefWithPadding(),
593                                state->box,
594                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
595         }
596     }
597
598     if (ir->efep != FreeEnergyPerturbationType::No)
599     {
600         /* Set free energy calculation frequency as the greatest common
601          * denominator of nstdhdl and repl_ex_nst. */
602         nstfep = ir->fepvals->nstdhdl;
603         if (ir->bExpanded)
604         {
605             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
606         }
607         if (useReplicaExchange)
608         {
609             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
610         }
611         if (ir->bDoAwh)
612         {
613             nstfep = std::gcd(ir->awhParams->nstSampleCoord(), nstfep);
614         }
615     }
616
617     /* Be REALLY careful about what flags you set here. You CANNOT assume
618      * this is the first step, since we might be restarting from a checkpoint,
619      * and in that case we should not do any modifications to the state.
620      */
621     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
622
623     // When restarting from a checkpoint, it can be appropriate to
624     // initialize ekind from quantities in the checkpoint. Otherwise,
625     // compute_globals must initialize ekind before the simulation
626     // starts/restarts. However, only the master rank knows what was
627     // found in the checkpoint file, so we have to communicate in
628     // order to coordinate the restart.
629     //
630     // TODO Consider removing this communication if/when checkpoint
631     // reading directly follows .tpr reading, because all ranks can
632     // agree on hasReadEkinState at that time.
633     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
634     if (PAR(cr))
635     {
636         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
637     }
638     if (hasReadEkinState)
639     {
640         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
641     }
642
643     unsigned int cglo_flags =
644             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
645              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
646
647     bSumEkinhOld = FALSE;
648
649     t_vcm vcm(top_global.groups, *ir);
650     reportComRemovalInfo(fplog, vcm);
651
652     /* To minimize communication, compute_globals computes the COM velocity
653      * and the kinetic energy for the velocities without COM motion removed.
654      * Thus to get the kinetic energy without the COM contribution, we need
655      * to call compute_globals twice.
656      */
657     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
658     {
659         unsigned int cglo_flags_iteration = cglo_flags;
660         if (bStopCM && cgloIteration == 0)
661         {
662             cglo_flags_iteration |= CGLO_STOPCM;
663             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
664         }
665         if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
666             && cgloIteration == 0)
667         {
668             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
669         }
670         compute_globals(gstat,
671                         cr,
672                         ir,
673                         fr,
674                         ekind,
675                         makeConstArrayRef(state->x),
676                         makeConstArrayRef(state->v),
677                         state->box,
678                         md,
679                         nrnb,
680                         &vcm,
681                         nullptr,
682                         enerd,
683                         force_vir,
684                         shake_vir,
685                         total_vir,
686                         pres,
687                         gmx::ArrayRef<real>{},
688                         &nullSignaller,
689                         state->box,
690                         &bSumEkinhOld,
691                         cglo_flags_iteration);
692         if (cglo_flags_iteration & CGLO_STOPCM)
693         {
694             /* At initialization, do not pass x with acceleration-correction mode
695              * to avoid (incorrect) correction of the initial coordinates.
696              */
697             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
698                              ? ArrayRef<RVec>()
699                              : makeArrayRef(state->x);
700             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
701             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
702         }
703     }
704     if (DOMAINDECOMP(cr))
705     {
706         dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
707                 &top, makeConstArrayRef(state->x), state->box);
708     }
709     if (ir->eI == IntegrationAlgorithm::VVAK)
710     {
711         /* a second call to get the half step temperature initialized as well */
712         /* we do the same call as above, but turn the pressure off -- internally to
713            compute_globals, this is recognized as a velocity verlet half-step
714            kinetic energy calculation.  This minimized excess variables, but
715            perhaps loses some logic?*/
716
717         compute_globals(gstat,
718                         cr,
719                         ir,
720                         fr,
721                         ekind,
722                         makeConstArrayRef(state->x),
723                         makeConstArrayRef(state->v),
724                         state->box,
725                         md,
726                         nrnb,
727                         &vcm,
728                         nullptr,
729                         enerd,
730                         force_vir,
731                         shake_vir,
732                         total_vir,
733                         pres,
734                         gmx::ArrayRef<real>{},
735                         &nullSignaller,
736                         state->box,
737                         &bSumEkinhOld,
738                         cglo_flags & ~CGLO_PRESSURE);
739     }
740
741     /* Calculate the initial half step temperature, and save the ekinh_old */
742     if (startingBehavior == StartingBehavior::NewSimulation)
743     {
744         for (i = 0; (i < ir->opts.ngtc); i++)
745         {
746             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
747         }
748     }
749
750     /* need to make an initiation call to get the Trotter variables set, as well as other constants
751        for non-trotter temperature control */
752     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
753
754     if (MASTER(cr))
755     {
756         if (!ir->bContinuation)
757         {
758             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
759             {
760                 fprintf(fplog,
761                         "RMS relative constraint deviation after constraining: %.2e\n",
762                         constr->rmsd());
763             }
764             if (EI_STATE_VELOCITY(ir->eI))
765             {
766                 real temp = enerd->term[F_TEMP];
767                 if (ir->eI != IntegrationAlgorithm::VV)
768                 {
769                     /* Result of Ekin averaged over velocities of -half
770                      * and +half step, while we only have -half step here.
771                      */
772                     temp *= 2;
773                 }
774                 fprintf(fplog, "Initial temperature: %g K\n", temp);
775             }
776         }
777
778         char tbuf[20];
779         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
780         if (ir->nsteps >= 0)
781         {
782             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
783         }
784         else
785         {
786             sprintf(tbuf, "%s", "infinite");
787         }
788         if (ir->init_step > 0)
789         {
790             fprintf(stderr,
791                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
792                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
793                     tbuf,
794                     gmx_step_str(ir->init_step, sbuf2),
795                     ir->init_step * ir->delta_t);
796         }
797         else
798         {
799             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
800         }
801         fprintf(fplog, "\n");
802     }
803
804     walltime_accounting_start_time(walltime_accounting);
805     wallcycle_start(wcycle, WallCycleCounter::Run);
806     print_start(fplog, cr, walltime_accounting, "mdrun");
807
808     /***********************************************************
809      *
810      *             Loop over MD steps
811      *
812      ************************************************************/
813
814     bFirstStep = TRUE;
815     /* Skip the first Nose-Hoover integration when we get the state from tpx */
816     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
817     bSumEkinhOld     = FALSE;
818     bExchanged       = FALSE;
819     bNeedRepartition = FALSE;
820
821     step     = ir->init_step;
822     step_rel = 0;
823
824     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
825             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
826             simulationsShareState,
827             MASTER(cr),
828             ir->nstlist,
829             mdrunOptions.reproducible,
830             nstSignalComm,
831             mdrunOptions.maximumHoursToRun,
832             ir->nstlist == 0,
833             fplog,
834             step,
835             bNS,
836             walltime_accounting);
837
838     auto checkpointHandler = std::make_unique<CheckpointHandler>(
839             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
840             simulationsShareState,
841             ir->nstlist == 0,
842             MASTER(cr),
843             mdrunOptions.writeConfout,
844             mdrunOptions.checkpointOptions.period);
845
846     const bool resetCountersIsLocal = true;
847     auto       resetHandler         = std::make_unique<ResetHandler>(
848             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
849             !resetCountersIsLocal,
850             ir->nsteps,
851             MASTER(cr),
852             mdrunOptions.timingOptions.resetHalfway,
853             mdrunOptions.maximumHoursToRun,
854             mdlog,
855             wcycle,
856             walltime_accounting);
857
858     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
859
860     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
861     {
862         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
863     }
864
865     /* and stop now if we should */
866     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
867     while (!bLastStep)
868     {
869
870         /* Determine if this is a neighbor search step */
871         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
872
873         if (bPMETune && bNStList)
874         {
875             // This has to be here because PME load balancing is called so early.
876             // TODO: Move to after all booleans are defined.
877             if (useGpuForUpdate && !bFirstStep)
878             {
879                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
880                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
881             }
882             /* PME grid + cut-off optimization with GPUs or PME nodes */
883             pme_loadbal_do(pme_loadbal,
884                            cr,
885                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
886                            fplog,
887                            mdlog,
888                            *ir,
889                            fr,
890                            state->box,
891                            state->x,
892                            wcycle,
893                            step,
894                            step_rel,
895                            &bPMETunePrinting,
896                            simulationWork.useGpuPmePpCommunication);
897         }
898
899         wallcycle_start(wcycle, WallCycleCounter::Step);
900
901         bLastStep = (step_rel == ir->nsteps);
902         t         = t0 + step * ir->delta_t;
903
904         // TODO Refactor this, so that nstfep does not need a default value of zero
905         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
906         {
907             /* find and set the current lambdas */
908             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
909
910             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
911             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
912             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
913                            && (!bFirstStep));
914         }
915
916         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
917                      && do_per_step(step, replExParams.exchangeInterval));
918
919         if (doSimulatedAnnealing)
920         {
921             // TODO: Avoid changing inputrec (#3854)
922             // Simulated annealing updates the reference temperature.
923             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
924             update_annealing_target_temp(nonConstInputrec, t, &upd);
925         }
926
927         /* Stop Center of Mass motion */
928         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
929
930         /* Determine whether or not to do Neighbour Searching */
931         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
932
933         /* Note that the stopHandler will cause termination at nstglobalcomm
934          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
935          * nstpcouple steps, we have computed the half-step kinetic energy
936          * of the previous step and can always output energies at the last step.
937          */
938         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
939
940         /* do_log triggers energy and virial calculation. Because this leads
941          * to different code paths, forces can be different. Thus for exact
942          * continuation we should avoid extra log output.
943          * Note that the || bLastStep can result in non-exact continuation
944          * beyond the last step. But we don't consider that to be an issue.
945          */
946         do_log     = (do_per_step(step, ir->nstlog)
947                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
948         do_verbose = mdrunOptions.verbose
949                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
950
951         if (useGpuForUpdate && !bFirstStep && bNS)
952         {
953             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
954             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
955             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
956             // Copy coordinate from the GPU when needed at the search step.
957             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
958             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
959             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
960             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
961         }
962
963         // We only need to calculate virtual velocities if we are writing them in the current step
964         const bool needVirtualVelocitiesThisStep =
965                 (vsite != nullptr)
966                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
967
968         if (vsite != nullptr)
969         {
970             // Virtual sites need to be updated before domain decomposition and forces are calculated
971             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
972             // md-vv calculates virtual velocities once it has full-step real velocities
973             vsite->construct(state->x,
974                              state->v,
975                              state->box,
976                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
977                                      ? VSiteOperation::PositionsAndVelocities
978                                      : VSiteOperation::Positions);
979             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
980         }
981
982         if (bNS && !(bFirstStep && ir->bContinuation))
983         {
984             bMasterState = FALSE;
985             /* Correct the new box if it is too skewed */
986             if (inputrecDynamicBox(ir))
987             {
988                 if (correct_box(fplog, step, state->box))
989                 {
990                     bMasterState = TRUE;
991                     // If update is offloaded, it should be informed about the box size change
992                     if (useGpuForUpdate)
993                     {
994                         integrator->setPbc(PbcType::Xyz, state->box);
995                     }
996                 }
997             }
998             if (DOMAINDECOMP(cr) && bMasterState)
999             {
1000                 dd_collect_state(cr->dd, state, state_global);
1001             }
1002
1003             if (DOMAINDECOMP(cr))
1004             {
1005                 /* Repartition the domain decomposition */
1006                 dd_partition_system(fplog,
1007                                     mdlog,
1008                                     step,
1009                                     cr,
1010                                     bMasterState,
1011                                     nstglobalcomm,
1012                                     state_global,
1013                                     top_global,
1014                                     *ir,
1015                                     imdSession,
1016                                     pull_work,
1017                                     state,
1018                                     &f,
1019                                     mdAtoms,
1020                                     &top,
1021                                     fr,
1022                                     vsite,
1023                                     constr,
1024                                     nrnb,
1025                                     wcycle,
1026                                     do_verbose && !bPMETunePrinting);
1027                 upd.updateAfterPartition(state->natoms,
1028                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1029                                                      : gmx::ArrayRef<const unsigned short>(),
1030                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1031                                                  : gmx::ArrayRef<const unsigned short>());
1032             }
1033         }
1034
1035         // Allocate or re-size GPU halo exchange object, if necessary
1036         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1037         {
1038             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1039                                "GPU device manager has to be initialized to use GPU "
1040                                "version of halo exchange.");
1041             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1042         }
1043
1044         if (MASTER(cr) && do_log)
1045         {
1046             gmx::EnergyOutput::printHeader(
1047                     fplog, step, t); /* can we improve the information printed here? */
1048         }
1049
1050         if (ir->efep != FreeEnergyPerturbationType::No)
1051         {
1052             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1053         }
1054
1055         if (bExchanged)
1056         {
1057             /* We need the kinetic energy at minus the half step for determining
1058              * the full step kinetic energy and possibly for T-coupling.*/
1059             /* This may not be quite working correctly yet . . . . */
1060             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1061             if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions())
1062             {
1063                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1064             }
1065             compute_globals(gstat,
1066                             cr,
1067                             ir,
1068                             fr,
1069                             ekind,
1070                             makeConstArrayRef(state->x),
1071                             makeConstArrayRef(state->v),
1072                             state->box,
1073                             md,
1074                             nrnb,
1075                             &vcm,
1076                             wcycle,
1077                             enerd,
1078                             nullptr,
1079                             nullptr,
1080                             nullptr,
1081                             nullptr,
1082                             gmx::ArrayRef<real>{},
1083                             &nullSignaller,
1084                             state->box,
1085                             &bSumEkinhOld,
1086                             cglo_flags);
1087             if (DOMAINDECOMP(cr))
1088             {
1089                 dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1090                         &top, makeConstArrayRef(state->x), state->box);
1091             }
1092         }
1093         clear_mat(force_vir);
1094
1095         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1096
1097         /* Determine the energy and pressure:
1098          * at nstcalcenergy steps and at energy output steps (set below).
1099          */
1100         if (EI_VV(ir->eI) && (!bInitStep))
1101         {
1102             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1103             bCalcVir      = bCalcEnerStep
1104                        || (ir->epc != PressureCoupling::No
1105                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1106         }
1107         else
1108         {
1109             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1110             bCalcVir      = bCalcEnerStep
1111                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1112         }
1113         bCalcEner = bCalcEnerStep;
1114
1115         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1116
1117         if (do_ene || do_log || bDoReplEx)
1118         {
1119             bCalcVir  = TRUE;
1120             bCalcEner = TRUE;
1121         }
1122
1123         /* Do we need global communication ? */
1124         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1125                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1126
1127         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1128                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1129                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1130         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1131         {
1132             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1133             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1134         }
1135
1136         if (shellfc)
1137         {
1138             /* Now is the time to relax the shells */
1139             relax_shell_flexcon(fplog,
1140                                 cr,
1141                                 ms,
1142                                 mdrunOptions.verbose,
1143                                 enforcedRotation,
1144                                 step,
1145                                 ir,
1146                                 imdSession,
1147                                 pull_work,
1148                                 bNS,
1149                                 force_flags,
1150                                 &top,
1151                                 constr,
1152                                 enerd,
1153                                 state->natoms,
1154                                 state->x.arrayRefWithPadding(),
1155                                 state->v.arrayRefWithPadding(),
1156                                 state->box,
1157                                 state->lambda,
1158                                 &state->hist,
1159                                 &f.view(),
1160                                 force_vir,
1161                                 *md,
1162                                 nrnb,
1163                                 wcycle,
1164                                 shellfc,
1165                                 fr,
1166                                 runScheduleWork,
1167                                 t,
1168                                 mu_tot,
1169                                 vsite,
1170                                 ddBalanceRegionHandler);
1171         }
1172         else
1173         {
1174             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1175                is updated (or the AWH update will be performed twice for one step when continuing).
1176                It would be best to call this update function from do_md_trajectory_writing but that
1177                would occur after do_force. One would have to divide the update_awh function into one
1178                function applying the AWH force and one doing the AWH bias update. The update AWH
1179                bias function could then be called after do_md_trajectory_writing (then containing
1180                update_awh_history). The checkpointing will in the future probably moved to the start
1181                of the md loop which will rid of this issue. */
1182             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1183             {
1184                 awh->updateHistory(state_global->awhHistory.get());
1185             }
1186
1187             /* The coordinates (x) are shifted (to get whole molecules)
1188              * in do_force.
1189              * This is parallellized as well, and does communication too.
1190              * Check comments in sim_util.c
1191              */
1192             do_force(fplog,
1193                      cr,
1194                      ms,
1195                      *ir,
1196                      awh.get(),
1197                      enforcedRotation,
1198                      imdSession,
1199                      pull_work,
1200                      step,
1201                      nrnb,
1202                      wcycle,
1203                      &top,
1204                      state->box,
1205                      state->x.arrayRefWithPadding(),
1206                      &state->hist,
1207                      &f.view(),
1208                      force_vir,
1209                      md,
1210                      enerd,
1211                      state->lambda,
1212                      fr,
1213                      runScheduleWork,
1214                      vsite,
1215                      mu_tot,
1216                      t,
1217                      ed ? ed->getLegacyED() : nullptr,
1218                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1219                      ddBalanceRegionHandler);
1220         }
1221
1222         // VV integrators do not need the following velocity half step
1223         // if it is the first step after starting from a checkpoint.
1224         // That is, the half step is needed on all other steps, and
1225         // also the first step when starting from a .tpr file.
1226         if (EI_VV(ir->eI))
1227         {
1228             integrateVVFirstStep(step,
1229                                  bFirstStep,
1230                                  bInitStep,
1231                                  startingBehavior,
1232                                  nstglobalcomm,
1233                                  ir,
1234                                  fr,
1235                                  cr,
1236                                  state,
1237                                  mdAtoms->mdatoms(),
1238                                  fcdata,
1239                                  &MassQ,
1240                                  &vcm,
1241                                  top,
1242                                  enerd,
1243                                  ekind,
1244                                  gstat,
1245                                  &last_ekin,
1246                                  bCalcVir,
1247                                  total_vir,
1248                                  shake_vir,
1249                                  force_vir,
1250                                  pres,
1251                                  M,
1252                                  do_log,
1253                                  do_ene,
1254                                  bCalcEner,
1255                                  bGStat,
1256                                  bStopCM,
1257                                  bTrotter,
1258                                  bExchanged,
1259                                  &bSumEkinhOld,
1260                                  &saved_conserved_quantity,
1261                                  &f,
1262                                  &upd,
1263                                  constr,
1264                                  &nullSignaller,
1265                                  trotter_seq,
1266                                  nrnb,
1267                                  fplog,
1268                                  wcycle);
1269             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1270             {
1271                 // Positions were calculated earlier
1272                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1273                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1274                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1275             }
1276         }
1277
1278         /* ########  END FIRST UPDATE STEP  ############## */
1279         /* ########  If doing VV, we now have v(dt) ###### */
1280         if (bDoExpanded)
1281         {
1282             /* perform extended ensemble sampling in lambda - we don't
1283                actually move to the new state before outputting
1284                statistics, but if performing simulated tempering, we
1285                do update the velocities and the tau_t. */
1286             // TODO: Avoid changing inputrec (#3854)
1287             // Simulated tempering updates the reference temperature.
1288             // Expanded ensemble without simulated tempering does not change the inputrec.
1289             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1290             lamnew                 = ExpandedEnsembleDynamics(fplog,
1291                                               nonConstInputrec,
1292                                               enerd,
1293                                               state,
1294                                               &MassQ,
1295                                               state->fep_state,
1296                                               state->dfhist,
1297                                               step,
1298                                               state->v.rvec_array(),
1299                                               md->homenr,
1300                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1301                                                                       : gmx::ArrayRef<const unsigned short>());
1302             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1303             if (MASTER(cr))
1304             {
1305                 copy_df_history(state_global->dfhist, state->dfhist);
1306             }
1307         }
1308
1309         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1310         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1311         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1312             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1313                 || checkpointHandler->isCheckpointingStep()))
1314         {
1315             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1316             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1317         }
1318         // Copy velocities if needed for the output/checkpointing.
1319         // NOTE: Copy on the search steps is done at the beginning of the step.
1320         if (useGpuForUpdate && !bNS
1321             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1322         {
1323             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1324             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1325         }
1326         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1327         // and update is offloaded hence forces are kept on the GPU for update and have not been
1328         // already transferred in do_force().
1329         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1330         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1331         //       prior to GPU update.
1332         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1333         //       copy call in do_force(...).
1334         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1335         //       on host after the D2H copy in do_force(...).
1336         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1337             && do_per_step(step, ir->nstfout))
1338         {
1339             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1340             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1341         }
1342         /* Now we have the energies and forces corresponding to the
1343          * coordinates at time t. We must output all of this before
1344          * the update.
1345          */
1346         do_md_trajectory_writing(fplog,
1347                                  cr,
1348                                  nfile,
1349                                  fnm,
1350                                  step,
1351                                  step_rel,
1352                                  t,
1353                                  ir,
1354                                  state,
1355                                  state_global,
1356                                  observablesHistory,
1357                                  top_global,
1358                                  fr,
1359                                  outf,
1360                                  energyOutput,
1361                                  ekind,
1362                                  f.view().force(),
1363                                  checkpointHandler->isCheckpointingStep(),
1364                                  bRerunMD,
1365                                  bLastStep,
1366                                  mdrunOptions.writeConfout,
1367                                  bSumEkinhOld);
1368         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1369         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1370
1371         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1372         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1373             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1374         {
1375             copy_mat(state->svir_prev, shake_vir);
1376             copy_mat(state->fvir_prev, force_vir);
1377         }
1378
1379         stopHandler->setSignal();
1380         resetHandler->setSignal(walltime_accounting);
1381
1382         if (bGStat || !PAR(cr))
1383         {
1384             /* In parallel we only have to check for checkpointing in steps
1385              * where we do global communication,
1386              *  otherwise the other nodes don't know.
1387              */
1388             checkpointHandler->setSignal(walltime_accounting);
1389         }
1390
1391         /* #########   START SECOND UPDATE STEP ################# */
1392
1393         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1394            controlled in preprocessing */
1395
1396         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1397         {
1398             gmx_bool bIfRandomize;
1399             bIfRandomize = update_randomize_velocities(ir,
1400                                                        step,
1401                                                        cr,
1402                                                        md->homenr,
1403                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1404                                                                : gmx::ArrayRef<const unsigned short>(),
1405                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1406                                                        state->v,
1407                                                        &upd,
1408                                                        constr);
1409             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1410             if (constr && bIfRandomize)
1411             {
1412                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1413             }
1414         }
1415         /* Box is changed in update() when we do pressure coupling,
1416          * but we should still use the old box for energy corrections and when
1417          * writing it to the energy file, so it matches the trajectory files for
1418          * the same timestep above. Make a copy in a separate array.
1419          */
1420         copy_mat(state->box, lastbox);
1421
1422         dvdl_constr = 0;
1423
1424         if (!useGpuForUpdate)
1425         {
1426             wallcycle_start(wcycle, WallCycleCounter::Update);
1427         }
1428         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1429         if (bTrotter)
1430         {
1431             trotter_update(ir,
1432                            step,
1433                            ekind,
1434                            enerd,
1435                            state,
1436                            total_vir,
1437                            md->homenr,
1438                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1439                                    : gmx::ArrayRef<const unsigned short>(),
1440                            gmx::arrayRefFromArray(md->invmass, md->nr),
1441                            &MassQ,
1442                            trotter_seq,
1443                            TrotterSequence::Three);
1444             /* We can only do Berendsen coupling after we have summed
1445              * the kinetic energy or virial. Since the happens
1446              * in global_state after update, we should only do it at
1447              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1448              */
1449         }
1450         else
1451         {
1452             update_tcouple(step,
1453                            ir,
1454                            state,
1455                            ekind,
1456                            &MassQ,
1457                            md->homenr,
1458                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1459                                    : gmx::ArrayRef<const unsigned short>());
1460             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1461         }
1462
1463         /* With leap-frog type integrators we compute the kinetic energy
1464          * at a whole time step as the average of the half-time step kinetic
1465          * energies of two subsequent steps. Therefore we need to compute the
1466          * half step kinetic energy also if we need energies at the next step.
1467          */
1468         const bool needHalfStepKineticEnergy =
1469                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1470
1471         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1472         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1473         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1474                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1475
1476         if (EI_VV(ir->eI))
1477         {
1478             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1479
1480             integrateVVSecondStep(step,
1481                                   ir,
1482                                   fr,
1483                                   cr,
1484                                   state,
1485                                   mdAtoms->mdatoms(),
1486                                   fcdata,
1487                                   &MassQ,
1488                                   &vcm,
1489                                   pull_work,
1490                                   enerd,
1491                                   ekind,
1492                                   gstat,
1493                                   &dvdl_constr,
1494                                   bCalcVir,
1495                                   total_vir,
1496                                   shake_vir,
1497                                   force_vir,
1498                                   pres,
1499                                   M,
1500                                   lastbox,
1501                                   do_log,
1502                                   do_ene,
1503                                   bGStat,
1504                                   &bSumEkinhOld,
1505                                   &f,
1506                                   &cbuf,
1507                                   &upd,
1508                                   constr,
1509                                   &nullSignaller,
1510                                   trotter_seq,
1511                                   nrnb,
1512                                   wcycle);
1513         }
1514         else
1515         {
1516             if (useGpuForUpdate)
1517             {
1518                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1519                 {
1520                     integrator->set(stateGpu->getCoordinates(),
1521                                     stateGpu->getVelocities(),
1522                                     stateGpu->getForces(),
1523                                     top.idef,
1524                                     *md);
1525
1526                     // Copy data to the GPU after buffers might have being reinitialized
1527                     // coordinates have been copied already if PME or buffer ops has not needed it this step.
1528                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1529                     const bool useGpuPmeOnThisRank = runScheduleWork->simulationWork.useGpuPme
1530                                                      && thisRankHasDuty(cr, DUTY_PME)
1531                                                      && runScheduleWork->stepWork.computeSlowForces;
1532                     if (!useGpuPmeOnThisRank && !runScheduleWork->stepWork.useGpuXBufferOps)
1533                     {
1534                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1535                     }
1536                 }
1537
1538                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1539                     && !thisRankHasDuty(cr, DUTY_PME))
1540                 {
1541                     // The PME forces were recieved to the host, so have to be copied
1542                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1543                 }
1544                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1545                 {
1546                     // The buffer ops were not offloaded this step, so the forces are on the
1547                     // host and have to be copied
1548                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1549                 }
1550
1551                 const bool doTemperatureScaling =
1552                         (ir->etc != TemperatureCoupling::No
1553                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1554
1555                 // This applies Leap-Frog, LINCS and SETTLE in succession
1556                 integrator->integrate(
1557                         stateGpu->getForcesReadyOnDeviceEvent(
1558                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1559                         ir->delta_t,
1560                         true,
1561                         bCalcVir,
1562                         shake_vir,
1563                         doTemperatureScaling,
1564                         ekind->tcstat,
1565                         doParrinelloRahman,
1566                         ir->nstpcouple * ir->delta_t,
1567                         M);
1568
1569                 // Copy velocities D2H after update if:
1570                 // - Globals are computed this step (includes the energy output steps).
1571                 // - Temperature is needed for the next step.
1572                 if (bGStat || needHalfStepKineticEnergy)
1573                 {
1574                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1575                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1576                 }
1577             }
1578             else
1579             {
1580                 /* With multiple time stepping we need to do an additional normal
1581                  * update step to obtain the virial, as the actual MTS integration
1582                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1583                  * Using that acceleration would result in a virial with the slow
1584                  * force contribution would be a factor mtsFactor too large.
1585                  */
1586                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1587                 {
1588                     upd.update_for_constraint_virial(*ir,
1589                                                      md->homenr,
1590                                                      md->havePartiallyFrozenAtoms,
1591                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1592                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1593                                                      *state,
1594                                                      f.view().forceWithPadding(),
1595                                                      *ekind);
1596
1597                     constrain_coordinates(constr,
1598                                           do_log,
1599                                           do_ene,
1600                                           step,
1601                                           state,
1602                                           upd.xp()->arrayRefWithPadding(),
1603                                           &dvdl_constr,
1604                                           bCalcVir,
1605                                           shake_vir);
1606                 }
1607
1608                 ArrayRefWithPadding<const RVec> forceCombined =
1609                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1610                                 ? f.view().forceMtsCombinedWithPadding()
1611                                 : f.view().forceWithPadding();
1612                 upd.update_coords(*ir,
1613                                   step,
1614                                   md->homenr,
1615                                   md->havePartiallyFrozenAtoms,
1616                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1617                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1618                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1619                                   state,
1620                                   forceCombined,
1621                                   fcdata,
1622                                   ekind,
1623                                   M,
1624                                   etrtPOSITION,
1625                                   cr,
1626                                   constr != nullptr);
1627
1628                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1629
1630                 constrain_coordinates(constr,
1631                                       do_log,
1632                                       do_ene,
1633                                       step,
1634                                       state,
1635                                       upd.xp()->arrayRefWithPadding(),
1636                                       &dvdl_constr,
1637                                       bCalcVir && !simulationWork.useMts,
1638                                       shake_vir);
1639
1640                 upd.update_sd_second_half(*ir,
1641                                           step,
1642                                           &dvdl_constr,
1643                                           md->homenr,
1644                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1645                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1646                                           state,
1647                                           cr,
1648                                           nrnb,
1649                                           wcycle,
1650                                           constr,
1651                                           do_log,
1652                                           do_ene);
1653                 upd.finish_update(
1654                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1655             }
1656
1657             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1658             {
1659                 updatePrevStepPullCom(pull_work, state);
1660             }
1661
1662             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1663         }
1664
1665         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1666         /* With Leap-Frog we can skip compute_globals at
1667          * non-communication steps, but we need to calculate
1668          * the kinetic energy one step before communication.
1669          */
1670         {
1671             // Organize to do inter-simulation signalling on steps if
1672             // and when algorithms require it.
1673             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1674
1675             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1676             {
1677                 // Copy coordinates when needed to stop the CM motion.
1678                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1679                 {
1680                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1681                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1682                 }
1683                 // Since we're already communicating at this step, we
1684                 // can propagate intra-simulation signals. Note that
1685                 // check_nstglobalcomm has the responsibility for
1686                 // choosing the value of nstglobalcomm that is one way
1687                 // bGStat becomes true, so we can't get into a
1688                 // situation where e.g. checkpointing can't be
1689                 // signalled.
1690                 bool                doIntraSimSignal = true;
1691                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1692
1693                 compute_globals(
1694                         gstat,
1695                         cr,
1696                         ir,
1697                         fr,
1698                         ekind,
1699                         makeConstArrayRef(state->x),
1700                         makeConstArrayRef(state->v),
1701                         state->box,
1702                         md,
1703                         nrnb,
1704                         &vcm,
1705                         wcycle,
1706                         enerd,
1707                         force_vir,
1708                         shake_vir,
1709                         total_vir,
1710                         pres,
1711                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1712                                                                            : gmx::ArrayRef<real>{},
1713                         &signaller,
1714                         lastbox,
1715                         &bSumEkinhOld,
1716                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1717                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1718                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1719                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1720                                 | (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
1721                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1722                                            : 0));
1723                 if (DOMAINDECOMP(cr))
1724                 {
1725                     dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1726                             &top, makeConstArrayRef(state->x), state->box);
1727                 }
1728                 if (!EI_VV(ir->eI) && bStopCM)
1729                 {
1730                     process_and_stopcm_grp(
1731                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1732                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1733
1734                     // TODO: The special case of removing CM motion should be dealt more gracefully
1735                     if (useGpuForUpdate)
1736                     {
1737                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1738                         // Here we block until the H2D copy completes because event sync with the
1739                         // force kernels that use the coordinates on the next steps is not implemented
1740                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1741                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1742                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1743                         if (vcm.mode != ComRemovalAlgorithm::No)
1744                         {
1745                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1746                         }
1747                     }
1748                 }
1749             }
1750         }
1751
1752         /* #############  END CALC EKIN AND PRESSURE ################# */
1753
1754         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1755            the virial that should probably be addressed eventually. state->veta has better properies,
1756            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1757            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1758
1759         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1760         {
1761             /* Sum up the foreign energy and dK/dl terms for md and sd.
1762                Currently done every step so that dH/dl is correct in the .edr */
1763             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1764         }
1765
1766         update_pcouple_after_coordinates(fplog,
1767                                          step,
1768                                          ir,
1769                                          md->homenr,
1770                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1771                                                      : gmx::ArrayRef<const unsigned short>(),
1772                                          pres,
1773                                          force_vir,
1774                                          shake_vir,
1775                                          pressureCouplingMu,
1776                                          state,
1777                                          nrnb,
1778                                          upd.deform(),
1779                                          !useGpuForUpdate);
1780
1781         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1782                                                   && do_per_step(step, inputrec->nstpcouple));
1783         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1784                                                  && do_per_step(step, inputrec->nstpcouple));
1785         if (useGpuForUpdate
1786             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1787         {
1788             integrator->scaleCoordinates(pressureCouplingMu);
1789             if (doCRescalePressureCoupling)
1790             {
1791                 matrix pressureCouplingInvMu;
1792                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1793                 integrator->scaleVelocities(pressureCouplingInvMu);
1794             }
1795             integrator->setPbc(PbcType::Xyz, state->box);
1796         }
1797
1798         /* ################# END UPDATE STEP 2 ################# */
1799         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1800
1801         /* The coordinates (x) were unshifted in update */
1802         if (!bGStat)
1803         {
1804             /* We will not sum ekinh_old,
1805              * so signal that we still have to do it.
1806              */
1807             bSumEkinhOld = TRUE;
1808         }
1809
1810         if (bCalcEner)
1811         {
1812             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1813
1814             /* use the directly determined last velocity, not actually the averaged half steps */
1815             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1816             {
1817                 enerd->term[F_EKIN] = last_ekin;
1818             }
1819             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1820
1821             if (integratorHasConservedEnergyQuantity(ir))
1822             {
1823                 if (EI_VV(ir->eI))
1824                 {
1825                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1826                 }
1827                 else
1828                 {
1829                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1830                 }
1831             }
1832             /* #########  END PREPARING EDR OUTPUT  ###########  */
1833         }
1834
1835         /* Output stuff */
1836         if (MASTER(cr))
1837         {
1838             if (fplog && do_log && bDoExpanded)
1839             {
1840                 /* only needed if doing expanded ensemble */
1841                 PrintFreeEnergyInfoToFile(fplog,
1842                                           ir->fepvals.get(),
1843                                           ir->expandedvals.get(),
1844                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1845                                           state_global->dfhist,
1846                                           state->fep_state,
1847                                           ir->nstlog,
1848                                           step);
1849             }
1850             if (bCalcEner)
1851             {
1852                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1853                                                  bCalcEnerStep,
1854                                                  t,
1855                                                  md->tmass,
1856                                                  enerd,
1857                                                  ir->fepvals.get(),
1858                                                  ir->expandedvals.get(),
1859                                                  lastbox,
1860                                                  PTCouplingArrays{ state->boxv,
1861                                                                    state->nosehoover_xi,
1862                                                                    state->nosehoover_vxi,
1863                                                                    state->nhpres_xi,
1864                                                                    state->nhpres_vxi },
1865                                                  state->fep_state,
1866                                                  total_vir,
1867                                                  pres,
1868                                                  ekind,
1869                                                  mu_tot,
1870                                                  constr);
1871             }
1872             else
1873             {
1874                 energyOutput.recordNonEnergyStep();
1875             }
1876
1877             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1878             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1879
1880             if (doSimulatedAnnealing)
1881             {
1882                 gmx::EnergyOutput::printAnnealingTemperatures(
1883                         do_log ? fplog : nullptr, groups, &(ir->opts));
1884             }
1885             if (do_log || do_ene || do_dr || do_or)
1886             {
1887                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1888                                                    do_ene,
1889                                                    do_dr,
1890                                                    do_or,
1891                                                    do_log ? fplog : nullptr,
1892                                                    step,
1893                                                    t,
1894                                                    fr->fcdata.get(),
1895                                                    awh.get());
1896             }
1897             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1898             {
1899                 const bool isInitialOutput = false;
1900                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1901             }
1902
1903             if (ir->bPull)
1904             {
1905                 pull_print_output(pull_work, step, t);
1906             }
1907
1908             if (do_per_step(step, ir->nstlog))
1909             {
1910                 if (fflush(fplog) != 0)
1911                 {
1912                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1913                 }
1914             }
1915         }
1916         if (bDoExpanded)
1917         {
1918             /* Have to do this part _after_ outputting the logfile and the edr file */
1919             /* Gets written into the state at the beginning of next loop*/
1920             state->fep_state = lamnew;
1921         }
1922         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1923         {
1924             state->fep_state = awh->fepLambdaState();
1925         }
1926         /* Print the remaining wall clock time for the run */
1927         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1928         {
1929             if (shellfc)
1930             {
1931                 fprintf(stderr, "\n");
1932             }
1933             print_time(stderr, walltime_accounting, step, ir, cr);
1934         }
1935
1936         /* Ion/water position swapping.
1937          * Not done in last step since trajectory writing happens before this call
1938          * in the MD loop and exchanges would be lost anyway. */
1939         bNeedRepartition = FALSE;
1940         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1941             && do_per_step(step, ir->swap->nstswap))
1942         {
1943             bNeedRepartition = do_swapcoords(cr,
1944                                              step,
1945                                              t,
1946                                              ir,
1947                                              swap,
1948                                              wcycle,
1949                                              as_rvec_array(state->x.data()),
1950                                              state->box,
1951                                              MASTER(cr) && mdrunOptions.verbose,
1952                                              bRerunMD);
1953
1954             if (bNeedRepartition && DOMAINDECOMP(cr))
1955             {
1956                 dd_collect_state(cr->dd, state, state_global);
1957             }
1958         }
1959
1960         /* Replica exchange */
1961         bExchanged = FALSE;
1962         if (bDoReplEx)
1963         {
1964             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1965         }
1966
1967         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1968         {
1969             dd_partition_system(fplog,
1970                                 mdlog,
1971                                 step,
1972                                 cr,
1973                                 TRUE,
1974                                 1,
1975                                 state_global,
1976                                 top_global,
1977                                 *ir,
1978                                 imdSession,
1979                                 pull_work,
1980                                 state,
1981                                 &f,
1982                                 mdAtoms,
1983                                 &top,
1984                                 fr,
1985                                 vsite,
1986                                 constr,
1987                                 nrnb,
1988                                 wcycle,
1989                                 FALSE);
1990             upd.updateAfterPartition(state->natoms,
1991                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1992                                                  : gmx::ArrayRef<const unsigned short>(),
1993                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1994                                              : gmx::ArrayRef<const unsigned short>());
1995         }
1996
1997         bFirstStep = FALSE;
1998         bInitStep  = FALSE;
1999
2000         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
2001         /* With all integrators, except VV, we need to retain the pressure
2002          * at the current step for coupling at the next step.
2003          */
2004         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
2005             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
2006         {
2007             /* Store the pressure in t_state for pressure coupling
2008              * at the next MD step.
2009              */
2010             copy_mat(pres, state->pres_prev);
2011         }
2012
2013         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2014
2015         if ((membed != nullptr) && (!bLastStep))
2016         {
2017             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2018         }
2019
2020         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2021         if (DOMAINDECOMP(cr) && wcycle)
2022         {
2023             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2024         }
2025
2026         /* increase the MD step number */
2027         step++;
2028         step_rel++;
2029
2030 #if GMX_FAHCORE
2031         if (MASTER(cr))
2032         {
2033             fcReportProgress(ir->nsteps + ir->init_step, step);
2034         }
2035 #endif
2036
2037         resetHandler->resetCounters(
2038                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2039
2040         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2041         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2042     }
2043     /* End of main MD loop */
2044
2045     /* Closing TNG files can include compressing data. Therefore it is good to do that
2046      * before stopping the time measurements. */
2047     mdoutf_tng_close(outf);
2048
2049     /* Stop measuring walltime */
2050     walltime_accounting_end_time(walltime_accounting);
2051
2052     if (!thisRankHasDuty(cr, DUTY_PME))
2053     {
2054         /* Tell the PME only node to finish */
2055         gmx_pme_send_finish(cr);
2056     }
2057
2058     if (MASTER(cr))
2059     {
2060         if (ir->nstcalcenergy > 0)
2061         {
2062             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2063
2064             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2065             energyOutput.printAverages(fplog, groups);
2066         }
2067     }
2068     done_mdoutf(outf);
2069
2070     if (bPMETune)
2071     {
2072         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2073     }
2074
2075     done_shellfc(fplog, shellfc, step_rel);
2076
2077     if (useReplicaExchange && MASTER(cr))
2078     {
2079         print_replica_exchange_statistics(fplog, repl_ex);
2080     }
2081
2082     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2083
2084     global_stat_destroy(gstat);
2085 }