Split simulationWork.useGpuBufferOps into separate x and f flags
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/observablesreducer.h"
130 #include "gromacs/mdtypes/pullhistory.h"
131 #include "gromacs/mdtypes/simulation_workload.h"
132 #include "gromacs/mdtypes/state.h"
133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
134 #include "gromacs/modularsimulator/energydata.h"
135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
136 #include "gromacs/nbnxm/nbnxm.h"
137 #include "gromacs/pbcutil/pbc.h"
138 #include "gromacs/pulling/output.h"
139 #include "gromacs/pulling/pull.h"
140 #include "gromacs/swap/swapcoords.h"
141 #include "gromacs/timing/wallcycle.h"
142 #include "gromacs/timing/walltime_accounting.h"
143 #include "gromacs/topology/atoms.h"
144 #include "gromacs/topology/idef.h"
145 #include "gromacs/topology/mtop_util.h"
146 #include "gromacs/topology/topology.h"
147 #include "gromacs/trajectory/trajectoryframe.h"
148 #include "gromacs/utility/basedefinitions.h"
149 #include "gromacs/utility/cstringutil.h"
150 #include "gromacs/utility/fatalerror.h"
151 #include "gromacs/utility/logger.h"
152 #include "gromacs/utility/real.h"
153 #include "gromacs/utility/smalloc.h"
154
155 #include "legacysimulator.h"
156 #include "replicaexchange.h"
157 #include "shellfc.h"
158
159 using gmx::SimulationSignaller;
160
161 void gmx::LegacySimulator::do_md()
162 {
163     // TODO Historically, the EM and MD "integrators" used different
164     // names for the t_inputrec *parameter, but these must have the
165     // same name, now that it's a member of a struct. We use this ir
166     // alias to avoid a large ripple of nearly useless changes.
167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
168     // will go away eventually.
169     const t_inputrec* ir = inputrec;
170
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
342
343     /* Check for polarizable models and flexible constraints */
344     shellfc = init_shell_flexcon(fplog,
345                                  top_global,
346                                  constr ? constr->numFlexibleConstraints() : 0,
347                                  ir->nstcalcenergy,
348                                  haveDDAtomOrdering(*cr),
349                                  useGpuForPme);
350
351     {
352         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
353         if ((io > 2000) && MASTER(cr))
354         {
355             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
356         }
357     }
358
359     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
360
361     ForceBuffers     f(simulationWork.useMts,
362                    (simulationWork.useGpuFBufferOps || useGpuForUpdate) ? PinningPolicy::PinnedIfSupported
363                                                                             : PinningPolicy::CannotBePinned);
364     const t_mdatoms* md = mdAtoms->mdatoms();
365     if (haveDDAtomOrdering(*cr))
366     {
367         // Local state only becomes valid now.
368         dd_init_local_state(*cr->dd, state_global, state);
369
370         /* Distribute the charge groups over the nodes from the master node */
371         dd_partition_system(fplog,
372                             mdlog,
373                             ir->init_step,
374                             cr,
375                             TRUE,
376                             1,
377                             state_global,
378                             top_global,
379                             *ir,
380                             imdSession,
381                             pull_work,
382                             state,
383                             &f,
384                             mdAtoms,
385                             top,
386                             fr,
387                             vsite,
388                             constr,
389                             nrnb,
390                             nullptr,
391                             FALSE);
392         upd.updateAfterPartition(state->natoms,
393                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
394                                              : gmx::ArrayRef<const unsigned short>(),
395                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
396                                          : gmx::ArrayRef<const unsigned short>(),
397                                  md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
398                                           : gmx::ArrayRef<const unsigned short>());
399         fr->longRangeNonbondeds->updateAfterPartition(*md);
400     }
401     else
402     {
403         state_change_natoms(state_global, state_global->natoms);
404
405         /* Generate and initialize new topology */
406         mdAlgorithmsSetupAtomData(cr, *ir, top_global, top, fr, &f, mdAtoms, constr, vsite, shellfc);
407
408         upd.updateAfterPartition(state->natoms,
409                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
410                                              : gmx::ArrayRef<const unsigned short>(),
411                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
412                                          : gmx::ArrayRef<const unsigned short>(),
413                                  md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
414                                           : gmx::ArrayRef<const unsigned short>());
415         fr->longRangeNonbondeds->updateAfterPartition(*md);
416     }
417
418     std::unique_ptr<UpdateConstrainGpu> integrator;
419
420     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
421
422     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
423     if (useGpuForUpdate)
424     {
425         GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
426                                    || constr == nullptr || constr->numConstraintsTotal() == 0,
427                            "Constraints in domain decomposition are only supported with update "
428                            "groups if using GPU update.\n");
429         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
430                                    || constr->numConstraintsTotal() == 0,
431                            "SHAKE is not supported with GPU update.");
432         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuXBufferOps),
433                            "Either PME or short-ranged non-bonded interaction tasks must run on "
434                            "the GPU to use GPU update.\n");
435         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
436                            "Only the md integrator is supported with the GPU update.\n");
437         GMX_RELEASE_ASSERT(
438                 ir->etc != TemperatureCoupling::NoseHoover,
439                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
440         GMX_RELEASE_ASSERT(
441                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
442                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
443                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
444                 "with the GPU update.\n");
445         GMX_RELEASE_ASSERT(!md->haveVsites,
446                            "Virtual sites are not supported with the GPU update.\n");
447         GMX_RELEASE_ASSERT(ed == nullptr,
448                            "Essential dynamics is not supported with the GPU update.\n");
449         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
450                            "Constraints pulling is not supported with the GPU update.\n");
451         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
452                            "Orientation restraints are not supported with the GPU update.\n");
453         GMX_RELEASE_ASSERT(
454                 ir->efep == FreeEnergyPerturbationType::No
455                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
456                 "Free energy perturbation of masses and constraints are not supported with the GPU "
457                 "update.");
458
459         if (constr != nullptr && constr->numConstraintsTotal() > 0)
460         {
461             GMX_LOG(mdlog.info)
462                     .asParagraph()
463                     .appendText("Updating coordinates and applying constraints on the GPU.");
464         }
465         else
466         {
467             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
468         }
469         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
470                            "Device stream manager should be initialized in order to use GPU "
471                            "update-constraints.");
472         GMX_RELEASE_ASSERT(
473                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
474                 "Update stream should be initialized in order to use GPU "
475                 "update-constraints.");
476         integrator = std::make_unique<UpdateConstrainGpu>(
477                 *ir,
478                 top_global,
479                 ekind->ngtc,
480                 fr->deviceStreamManager->context(),
481                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
482                 wcycle);
483
484         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
485
486         integrator->setPbc(PbcType::Xyz, state->box);
487     }
488
489     if (useGpuForPme || simulationWork.useGpuXBufferOps || useGpuForUpdate)
490     {
491         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
492     }
493     if (useGpuForUpdate)
494     {
495         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
496     }
497
498     // NOTE: The global state is no longer used at this point.
499     // But state_global is still used as temporary storage space for writing
500     // the global state to file and potentially for replica exchange.
501     // (Global topology should persist.)
502
503     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
504
505     if (ir->bExpanded)
506     {
507         /* Check nstexpanded here, because the grompp check was broken */
508         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
509         {
510             gmx_fatal(FARGS,
511                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
512         }
513         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
514     }
515
516     if (MASTER(cr))
517     {
518         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
519     }
520
521     preparePrevStepPullCom(ir,
522                            pull_work,
523                            gmx::arrayRefFromArray(md->massT, md->nr),
524                            state,
525                            state_global,
526                            cr,
527                            startingBehavior != StartingBehavior::NewSimulation);
528
529     // TODO: Remove this by converting AWH into a ForceProvider
530     auto awh = prepareAwhModule(fplog,
531                                 *ir,
532                                 state_global,
533                                 cr,
534                                 ms,
535                                 startingBehavior != StartingBehavior::NewSimulation,
536                                 shellfc != nullptr,
537                                 opt2fn("-awh", nfile, fnm),
538                                 pull_work);
539
540     if (useReplicaExchange && MASTER(cr))
541     {
542         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
543     }
544     /* PME tuning is only supported in the Verlet scheme, with PME for
545      * Coulomb. It is not supported with only LJ PME. */
546     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
547                 && ir->cutoff_scheme != CutoffScheme::Group);
548
549     pme_load_balancing_t* pme_loadbal = nullptr;
550     if (bPMETune)
551     {
552         pme_loadbal_init(
553                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
554     }
555
556     if (!ir->bContinuation)
557     {
558         if (state->flags & enumValueToBitMask(StateEntry::V))
559         {
560             auto v = makeArrayRef(state->v);
561             /* Set the velocities of vsites, shells and frozen atoms to zero */
562             for (i = 0; i < md->homenr; i++)
563             {
564                 if (md->ptype[i] == ParticleType::Shell)
565                 {
566                     clear_rvec(v[i]);
567                 }
568                 else if (md->cFREEZE)
569                 {
570                     for (m = 0; m < DIM; m++)
571                     {
572                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
573                         {
574                             v[i][m] = 0;
575                         }
576                     }
577                 }
578             }
579         }
580
581         if (constr)
582         {
583             /* Constrain the initial coordinates and velocities */
584             do_constrain_first(fplog,
585                                constr,
586                                ir,
587                                md->nr,
588                                md->homenr,
589                                state->x.arrayRefWithPadding(),
590                                state->v.arrayRefWithPadding(),
591                                state->box,
592                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
593         }
594     }
595
596     const int nstfep = computeFepPeriod(*ir, replExParams);
597
598     /* Be REALLY careful about what flags you set here. You CANNOT assume
599      * this is the first step, since we might be restarting from a checkpoint,
600      * and in that case we should not do any modifications to the state.
601      */
602     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
603
604     // When restarting from a checkpoint, it can be appropriate to
605     // initialize ekind from quantities in the checkpoint. Otherwise,
606     // compute_globals must initialize ekind before the simulation
607     // starts/restarts. However, only the master rank knows what was
608     // found in the checkpoint file, so we have to communicate in
609     // order to coordinate the restart.
610     //
611     // TODO Consider removing this communication if/when checkpoint
612     // reading directly follows .tpr reading, because all ranks can
613     // agree on hasReadEkinState at that time.
614     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
615     if (PAR(cr))
616     {
617         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
618     }
619     if (hasReadEkinState)
620     {
621         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
622     }
623
624     unsigned int cglo_flags =
625             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
626              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
627
628     bSumEkinhOld = FALSE;
629
630     t_vcm vcm(top_global.groups, *ir);
631     reportComRemovalInfo(fplog, vcm);
632
633     int64_t step     = ir->init_step;
634     int64_t step_rel = 0;
635
636     /* To minimize communication, compute_globals computes the COM velocity
637      * and the kinetic energy for the velocities without COM motion removed.
638      * Thus to get the kinetic energy without the COM contribution, we need
639      * to call compute_globals twice.
640      */
641     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
642     {
643         unsigned int cglo_flags_iteration = cglo_flags;
644         if (bStopCM && cgloIteration == 0)
645         {
646             cglo_flags_iteration |= CGLO_STOPCM;
647             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
648         }
649         compute_globals(gstat,
650                         cr,
651                         ir,
652                         fr,
653                         ekind,
654                         makeConstArrayRef(state->x),
655                         makeConstArrayRef(state->v),
656                         state->box,
657                         md,
658                         nrnb,
659                         &vcm,
660                         nullptr,
661                         enerd,
662                         force_vir,
663                         shake_vir,
664                         total_vir,
665                         pres,
666                         &nullSignaller,
667                         state->box,
668                         &bSumEkinhOld,
669                         cglo_flags_iteration,
670                         step,
671                         &observablesReducer);
672         // Clean up after pre-step use of compute_globals()
673         observablesReducer.markAsReadyToReduce();
674
675         if (cglo_flags_iteration & CGLO_STOPCM)
676         {
677             /* At initialization, do not pass x with acceleration-correction mode
678              * to avoid (incorrect) correction of the initial coordinates.
679              */
680             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
681                              ? ArrayRef<RVec>()
682                              : makeArrayRef(state->x);
683             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
684             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
685         }
686     }
687     if (ir->eI == IntegrationAlgorithm::VVAK)
688     {
689         /* a second call to get the half step temperature initialized as well */
690         /* we do the same call as above, but turn the pressure off -- internally to
691            compute_globals, this is recognized as a velocity verlet half-step
692            kinetic energy calculation.  This minimized excess variables, but
693            perhaps loses some logic?*/
694
695         compute_globals(gstat,
696                         cr,
697                         ir,
698                         fr,
699                         ekind,
700                         makeConstArrayRef(state->x),
701                         makeConstArrayRef(state->v),
702                         state->box,
703                         md,
704                         nrnb,
705                         &vcm,
706                         nullptr,
707                         enerd,
708                         force_vir,
709                         shake_vir,
710                         total_vir,
711                         pres,
712                         &nullSignaller,
713                         state->box,
714                         &bSumEkinhOld,
715                         cglo_flags & ~CGLO_PRESSURE,
716                         step,
717                         &observablesReducer);
718         // Clean up after pre-step use of compute_globals()
719         observablesReducer.markAsReadyToReduce();
720     }
721
722     /* Calculate the initial half step temperature, and save the ekinh_old */
723     if (startingBehavior == StartingBehavior::NewSimulation)
724     {
725         for (i = 0; (i < ir->opts.ngtc); i++)
726         {
727             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
728         }
729     }
730
731     /* need to make an initiation call to get the Trotter variables set, as well as other constants
732        for non-trotter temperature control */
733     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
734
735     if (MASTER(cr))
736     {
737         if (!ir->bContinuation)
738         {
739             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
740             {
741                 fprintf(fplog,
742                         "RMS relative constraint deviation after constraining: %.2e\n",
743                         constr->rmsd());
744             }
745             if (EI_STATE_VELOCITY(ir->eI))
746             {
747                 real temp = enerd->term[F_TEMP];
748                 if (ir->eI != IntegrationAlgorithm::VV)
749                 {
750                     /* Result of Ekin averaged over velocities of -half
751                      * and +half step, while we only have -half step here.
752                      */
753                     temp *= 2;
754                 }
755                 fprintf(fplog, "Initial temperature: %g K\n", temp);
756             }
757         }
758
759         char tbuf[20];
760         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
761         if (ir->nsteps >= 0)
762         {
763             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
764         }
765         else
766         {
767             sprintf(tbuf, "%s", "infinite");
768         }
769         if (ir->init_step > 0)
770         {
771             fprintf(stderr,
772                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
773                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
774                     tbuf,
775                     gmx_step_str(ir->init_step, sbuf2),
776                     ir->init_step * ir->delta_t);
777         }
778         else
779         {
780             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
781         }
782         fprintf(fplog, "\n");
783     }
784
785     walltime_accounting_start_time(walltime_accounting);
786     wallcycle_start(wcycle, WallCycleCounter::Run);
787     print_start(fplog, cr, walltime_accounting, "mdrun");
788
789     /***********************************************************
790      *
791      *             Loop over MD steps
792      *
793      ************************************************************/
794
795     bFirstStep = TRUE;
796     /* Skip the first Nose-Hoover integration when we get the state from tpx */
797     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
798     bSumEkinhOld     = FALSE;
799     bExchanged       = FALSE;
800     bNeedRepartition = FALSE;
801
802     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
803             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
804             simulationsShareState,
805             MASTER(cr),
806             ir->nstlist,
807             mdrunOptions.reproducible,
808             nstSignalComm,
809             mdrunOptions.maximumHoursToRun,
810             ir->nstlist == 0,
811             fplog,
812             step,
813             bNS,
814             walltime_accounting);
815
816     auto checkpointHandler = std::make_unique<CheckpointHandler>(
817             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
818             simulationsShareState,
819             ir->nstlist == 0,
820             MASTER(cr),
821             mdrunOptions.writeConfout,
822             mdrunOptions.checkpointOptions.period);
823
824     const bool resetCountersIsLocal = true;
825     auto       resetHandler         = std::make_unique<ResetHandler>(
826             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
827             !resetCountersIsLocal,
828             ir->nsteps,
829             MASTER(cr),
830             mdrunOptions.timingOptions.resetHalfway,
831             mdrunOptions.maximumHoursToRun,
832             mdlog,
833             wcycle,
834             walltime_accounting);
835
836     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
837
838     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
839     {
840         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
841     }
842
843     /* and stop now if we should */
844     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
845     while (!bLastStep)
846     {
847
848         /* Determine if this is a neighbor search step */
849         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
850
851         if (bPMETune && bNStList)
852         {
853             // This has to be here because PME load balancing is called so early.
854             // TODO: Move to after all booleans are defined.
855             if (useGpuForUpdate && !bFirstStep)
856             {
857                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
858                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
859             }
860             /* PME grid + cut-off optimization with GPUs or PME nodes */
861             pme_loadbal_do(pme_loadbal,
862                            cr,
863                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
864                            fplog,
865                            mdlog,
866                            *ir,
867                            fr,
868                            state->box,
869                            state->x,
870                            wcycle,
871                            step,
872                            step_rel,
873                            &bPMETunePrinting,
874                            simulationWork.useGpuPmePpCommunication);
875         }
876
877         wallcycle_start(wcycle, WallCycleCounter::Step);
878
879         bLastStep = (step_rel == ir->nsteps);
880         t         = t0 + step * ir->delta_t;
881
882         // TODO Refactor this, so that nstfep does not need a default value of zero
883         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
884         {
885             /* find and set the current lambdas */
886             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
887
888             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
889                            && (!bFirstStep));
890         }
891
892         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
893                      && do_per_step(step, replExParams.exchangeInterval));
894
895         if (doSimulatedAnnealing)
896         {
897             // TODO: Avoid changing inputrec (#3854)
898             // Simulated annealing updates the reference temperature.
899             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
900             update_annealing_target_temp(nonConstInputrec, t, &upd);
901         }
902
903         /* Stop Center of Mass motion */
904         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
905
906         /* Determine whether or not to do Neighbour Searching */
907         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
908
909         /* Note that the stopHandler will cause termination at nstglobalcomm
910          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
911          * nstpcouple steps, we have computed the half-step kinetic energy
912          * of the previous step and can always output energies at the last step.
913          */
914         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
915
916         /* do_log triggers energy and virial calculation. Because this leads
917          * to different code paths, forces can be different. Thus for exact
918          * continuation we should avoid extra log output.
919          * Note that the || bLastStep can result in non-exact continuation
920          * beyond the last step. But we don't consider that to be an issue.
921          */
922         do_log     = (do_per_step(step, ir->nstlog)
923                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
924         do_verbose = mdrunOptions.verbose
925                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
926
927         // On search steps, when doing the update on the GPU, copy
928         // the coordinates and velocities to the host unless they are
929         // already there (ie on the first step and after replica
930         // exchange).
931         if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
932         {
933             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
934             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
935             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
936             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
937         }
938
939         // We only need to calculate virtual velocities if we are writing them in the current step
940         const bool needVirtualVelocitiesThisStep =
941                 (vsite != nullptr)
942                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
943
944         if (vsite != nullptr)
945         {
946             // Virtual sites need to be updated before domain decomposition and forces are calculated
947             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
948             // md-vv calculates virtual velocities once it has full-step real velocities
949             vsite->construct(state->x,
950                              state->v,
951                              state->box,
952                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
953                                      ? VSiteOperation::PositionsAndVelocities
954                                      : VSiteOperation::Positions);
955             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
956         }
957
958         if (bNS && !(bFirstStep && ir->bContinuation))
959         {
960             bMasterState = FALSE;
961             /* Correct the new box if it is too skewed */
962             if (inputrecDynamicBox(ir))
963             {
964                 if (correct_box(fplog, step, state->box))
965                 {
966                     bMasterState = TRUE;
967                 }
968             }
969             // If update is offloaded, and the box was changed either
970             // above or in a replica exchange on the previous step,
971             // the GPU Update object should be informed
972             if (useGpuForUpdate && (bMasterState || bExchanged))
973             {
974                 integrator->setPbc(PbcType::Xyz, state->box);
975             }
976             if (haveDDAtomOrdering(*cr) && bMasterState)
977             {
978                 dd_collect_state(cr->dd, state, state_global);
979             }
980
981             if (haveDDAtomOrdering(*cr))
982             {
983                 /* Repartition the domain decomposition */
984                 dd_partition_system(fplog,
985                                     mdlog,
986                                     step,
987                                     cr,
988                                     bMasterState,
989                                     nstglobalcomm,
990                                     state_global,
991                                     top_global,
992                                     *ir,
993                                     imdSession,
994                                     pull_work,
995                                     state,
996                                     &f,
997                                     mdAtoms,
998                                     top,
999                                     fr,
1000                                     vsite,
1001                                     constr,
1002                                     nrnb,
1003                                     wcycle,
1004                                     do_verbose && !bPMETunePrinting);
1005                 upd.updateAfterPartition(state->natoms,
1006                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1007                                                      : gmx::ArrayRef<const unsigned short>(),
1008                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1009                                                  : gmx::ArrayRef<const unsigned short>(),
1010                                          md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
1011                                                   : gmx::ArrayRef<const unsigned short>());
1012                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1013             }
1014         }
1015
1016         // Allocate or re-size GPU halo exchange object, if necessary
1017         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1018         {
1019             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1020                                "GPU device manager has to be initialized to use GPU "
1021                                "version of halo exchange.");
1022             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1023         }
1024
1025         if (MASTER(cr) && do_log)
1026         {
1027             gmx::EnergyOutput::printHeader(
1028                     fplog, step, t); /* can we improve the information printed here? */
1029         }
1030
1031         if (ir->efep != FreeEnergyPerturbationType::No)
1032         {
1033             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1034         }
1035
1036         if (bExchanged)
1037         {
1038             /* We need the kinetic energy at minus the half step for determining
1039              * the full step kinetic energy and possibly for T-coupling.*/
1040             /* This may not be quite working correctly yet . . . . */
1041             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1042             compute_globals(gstat,
1043                             cr,
1044                             ir,
1045                             fr,
1046                             ekind,
1047                             makeConstArrayRef(state->x),
1048                             makeConstArrayRef(state->v),
1049                             state->box,
1050                             md,
1051                             nrnb,
1052                             &vcm,
1053                             wcycle,
1054                             enerd,
1055                             nullptr,
1056                             nullptr,
1057                             nullptr,
1058                             nullptr,
1059                             &nullSignaller,
1060                             state->box,
1061                             &bSumEkinhOld,
1062                             cglo_flags,
1063                             step,
1064                             &observablesReducer);
1065         }
1066         clear_mat(force_vir);
1067
1068         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1069
1070         /* Determine the energy and pressure:
1071          * at nstcalcenergy steps and at energy output steps (set below).
1072          */
1073         if (EI_VV(ir->eI) && (!bInitStep))
1074         {
1075             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1076             bCalcVir      = bCalcEnerStep
1077                        || (ir->epc != PressureCoupling::No
1078                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1079         }
1080         else
1081         {
1082             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1083             bCalcVir      = bCalcEnerStep
1084                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1085         }
1086         bCalcEner = bCalcEnerStep;
1087
1088         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1089
1090         if (do_ene || do_log || bDoReplEx)
1091         {
1092             bCalcVir  = TRUE;
1093             bCalcEner = TRUE;
1094         }
1095
1096         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1097         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1098                                   && (do_per_step(step, nstfep) || bCalcEner));
1099
1100         /* Do we need global communication ? */
1101         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1102                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1103
1104         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1105                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1106                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1107         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1108         {
1109             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1110             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1111         }
1112
1113         if (shellfc)
1114         {
1115             /* Now is the time to relax the shells */
1116             relax_shell_flexcon(fplog,
1117                                 cr,
1118                                 ms,
1119                                 mdrunOptions.verbose,
1120                                 enforcedRotation,
1121                                 step,
1122                                 ir,
1123                                 imdSession,
1124                                 pull_work,
1125                                 bNS,
1126                                 force_flags,
1127                                 top,
1128                                 constr,
1129                                 enerd,
1130                                 state->natoms,
1131                                 state->x.arrayRefWithPadding(),
1132                                 state->v.arrayRefWithPadding(),
1133                                 state->box,
1134                                 state->lambda,
1135                                 &state->hist,
1136                                 &f.view(),
1137                                 force_vir,
1138                                 *md,
1139                                 fr->longRangeNonbondeds.get(),
1140                                 nrnb,
1141                                 wcycle,
1142                                 shellfc,
1143                                 fr,
1144                                 runScheduleWork,
1145                                 t,
1146                                 mu_tot,
1147                                 vsite,
1148                                 ddBalanceRegionHandler);
1149         }
1150         else
1151         {
1152             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1153                is updated (or the AWH update will be performed twice for one step when continuing).
1154                It would be best to call this update function from do_md_trajectory_writing but that
1155                would occur after do_force. One would have to divide the update_awh function into one
1156                function applying the AWH force and one doing the AWH bias update. The update AWH
1157                bias function could then be called after do_md_trajectory_writing (then containing
1158                update_awh_history). The checkpointing will in the future probably moved to the start
1159                of the md loop which will rid of this issue. */
1160             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1161             {
1162                 awh->updateHistory(state_global->awhHistory.get());
1163             }
1164
1165             /* The coordinates (x) are shifted (to get whole molecules)
1166              * in do_force.
1167              * This is parallellized as well, and does communication too.
1168              * Check comments in sim_util.c
1169              */
1170             do_force(fplog,
1171                      cr,
1172                      ms,
1173                      *ir,
1174                      awh.get(),
1175                      enforcedRotation,
1176                      imdSession,
1177                      pull_work,
1178                      step,
1179                      nrnb,
1180                      wcycle,
1181                      top,
1182                      state->box,
1183                      state->x.arrayRefWithPadding(),
1184                      &state->hist,
1185                      &f.view(),
1186                      force_vir,
1187                      md,
1188                      enerd,
1189                      state->lambda,
1190                      fr,
1191                      runScheduleWork,
1192                      vsite,
1193                      mu_tot,
1194                      t,
1195                      ed ? ed->getLegacyED() : nullptr,
1196                      fr->longRangeNonbondeds.get(),
1197                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1198                      ddBalanceRegionHandler);
1199         }
1200
1201         // VV integrators do not need the following velocity half step
1202         // if it is the first step after starting from a checkpoint.
1203         // That is, the half step is needed on all other steps, and
1204         // also the first step when starting from a .tpr file.
1205         if (EI_VV(ir->eI))
1206         {
1207             integrateVVFirstStep(step,
1208                                  bFirstStep,
1209                                  bInitStep,
1210                                  startingBehavior,
1211                                  nstglobalcomm,
1212                                  ir,
1213                                  fr,
1214                                  cr,
1215                                  state,
1216                                  mdAtoms->mdatoms(),
1217                                  &fcdata,
1218                                  &MassQ,
1219                                  &vcm,
1220                                  enerd,
1221                                  &observablesReducer,
1222                                  ekind,
1223                                  gstat,
1224                                  &last_ekin,
1225                                  bCalcVir,
1226                                  total_vir,
1227                                  shake_vir,
1228                                  force_vir,
1229                                  pres,
1230                                  M,
1231                                  do_log,
1232                                  do_ene,
1233                                  bCalcEner,
1234                                  bGStat,
1235                                  bStopCM,
1236                                  bTrotter,
1237                                  bExchanged,
1238                                  &bSumEkinhOld,
1239                                  &saved_conserved_quantity,
1240                                  &f,
1241                                  &upd,
1242                                  constr,
1243                                  &nullSignaller,
1244                                  trotter_seq,
1245                                  nrnb,
1246                                  fplog,
1247                                  wcycle);
1248             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1249             {
1250                 // Positions were calculated earlier
1251                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1252                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1253                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1254             }
1255         }
1256
1257         /* ########  END FIRST UPDATE STEP  ############## */
1258         /* ########  If doing VV, we now have v(dt) ###### */
1259         if (bDoExpanded)
1260         {
1261             /* perform extended ensemble sampling in lambda - we don't
1262                actually move to the new state before outputting
1263                statistics, but if performing simulated tempering, we
1264                do update the velocities and the tau_t. */
1265             // TODO: Avoid changing inputrec (#3854)
1266             // Simulated tempering updates the reference temperature.
1267             // Expanded ensemble without simulated tempering does not change the inputrec.
1268             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1269             lamnew                 = ExpandedEnsembleDynamics(fplog,
1270                                               nonConstInputrec,
1271                                               enerd,
1272                                               state,
1273                                               &MassQ,
1274                                               state->fep_state,
1275                                               state->dfhist,
1276                                               step,
1277                                               state->v.rvec_array(),
1278                                               md->homenr,
1279                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1280                                                                       : gmx::ArrayRef<const unsigned short>());
1281             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1282             if (MASTER(cr))
1283             {
1284                 copy_df_history(state_global->dfhist, state->dfhist);
1285             }
1286         }
1287
1288         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1289         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1290         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1291             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1292                 || checkpointHandler->isCheckpointingStep()))
1293         {
1294             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1295             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1296         }
1297         // Copy velocities if needed for the output/checkpointing.
1298         // NOTE: Copy on the search steps is done at the beginning of the step.
1299         if (useGpuForUpdate && !bNS
1300             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1301         {
1302             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1303             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1304         }
1305         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1306         // and update is offloaded hence forces are kept on the GPU for update and have not been
1307         // already transferred in do_force().
1308         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1309         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1310         //       prior to GPU update.
1311         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1312         //       copy call in do_force(...).
1313         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1314         //       on host after the D2H copy in do_force(...).
1315         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1316             && do_per_step(step, ir->nstfout))
1317         {
1318             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1319             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1320         }
1321         /* Now we have the energies and forces corresponding to the
1322          * coordinates at time t. We must output all of this before
1323          * the update.
1324          */
1325         do_md_trajectory_writing(fplog,
1326                                  cr,
1327                                  nfile,
1328                                  fnm,
1329                                  step,
1330                                  step_rel,
1331                                  t,
1332                                  ir,
1333                                  state,
1334                                  state_global,
1335                                  observablesHistory,
1336                                  top_global,
1337                                  fr,
1338                                  outf,
1339                                  energyOutput,
1340                                  ekind,
1341                                  f.view().force(),
1342                                  checkpointHandler->isCheckpointingStep(),
1343                                  bRerunMD,
1344                                  bLastStep,
1345                                  mdrunOptions.writeConfout,
1346                                  bSumEkinhOld);
1347         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1348         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1349
1350         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1351         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1352             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1353         {
1354             copy_mat(state->svir_prev, shake_vir);
1355             copy_mat(state->fvir_prev, force_vir);
1356         }
1357
1358         stopHandler->setSignal();
1359         resetHandler->setSignal(walltime_accounting);
1360
1361         if (bGStat || !PAR(cr))
1362         {
1363             /* In parallel we only have to check for checkpointing in steps
1364              * where we do global communication,
1365              *  otherwise the other nodes don't know.
1366              */
1367             checkpointHandler->setSignal(walltime_accounting);
1368         }
1369
1370         /* #########   START SECOND UPDATE STEP ################# */
1371
1372         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1373            controlled in preprocessing */
1374
1375         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1376         {
1377             gmx_bool bIfRandomize;
1378             bIfRandomize = update_randomize_velocities(ir,
1379                                                        step,
1380                                                        cr,
1381                                                        md->homenr,
1382                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1383                                                                : gmx::ArrayRef<const unsigned short>(),
1384                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1385                                                        state->v,
1386                                                        &upd,
1387                                                        constr);
1388             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1389             if (constr && bIfRandomize)
1390             {
1391                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1392             }
1393         }
1394         /* Box is changed in update() when we do pressure coupling,
1395          * but we should still use the old box for energy corrections and when
1396          * writing it to the energy file, so it matches the trajectory files for
1397          * the same timestep above. Make a copy in a separate array.
1398          */
1399         copy_mat(state->box, lastbox);
1400
1401         dvdl_constr = 0;
1402
1403         if (!useGpuForUpdate)
1404         {
1405             wallcycle_start(wcycle, WallCycleCounter::Update);
1406         }
1407         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1408         if (bTrotter)
1409         {
1410             trotter_update(ir,
1411                            step,
1412                            ekind,
1413                            enerd,
1414                            state,
1415                            total_vir,
1416                            md->homenr,
1417                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1418                                    : gmx::ArrayRef<const unsigned short>(),
1419                            gmx::arrayRefFromArray(md->invmass, md->nr),
1420                            &MassQ,
1421                            trotter_seq,
1422                            TrotterSequence::Three);
1423             /* We can only do Berendsen coupling after we have summed
1424              * the kinetic energy or virial. Since the happens
1425              * in global_state after update, we should only do it at
1426              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1427              */
1428         }
1429         else
1430         {
1431             update_tcouple(step,
1432                            ir,
1433                            state,
1434                            ekind,
1435                            &MassQ,
1436                            md->homenr,
1437                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1438                                    : gmx::ArrayRef<const unsigned short>());
1439             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1440         }
1441
1442         /* With leap-frog type integrators we compute the kinetic energy
1443          * at a whole time step as the average of the half-time step kinetic
1444          * energies of two subsequent steps. Therefore we need to compute the
1445          * half step kinetic energy also if we need energies at the next step.
1446          */
1447         const bool needHalfStepKineticEnergy =
1448                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1449
1450         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1451         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1452         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1453                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1454
1455         if (EI_VV(ir->eI))
1456         {
1457             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1458
1459             integrateVVSecondStep(step,
1460                                   ir,
1461                                   fr,
1462                                   cr,
1463                                   state,
1464                                   mdAtoms->mdatoms(),
1465                                   &fcdata,
1466                                   &MassQ,
1467                                   &vcm,
1468                                   pull_work,
1469                                   enerd,
1470                                   &observablesReducer,
1471                                   ekind,
1472                                   gstat,
1473                                   &dvdl_constr,
1474                                   bCalcVir,
1475                                   total_vir,
1476                                   shake_vir,
1477                                   force_vir,
1478                                   pres,
1479                                   M,
1480                                   lastbox,
1481                                   do_log,
1482                                   do_ene,
1483                                   bGStat,
1484                                   &bSumEkinhOld,
1485                                   &f,
1486                                   &cbuf,
1487                                   &upd,
1488                                   constr,
1489                                   &nullSignaller,
1490                                   trotter_seq,
1491                                   nrnb,
1492                                   wcycle);
1493         }
1494         else
1495         {
1496             if (useGpuForUpdate)
1497             {
1498                 // On search steps, update handles to device vectors
1499                 if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
1500                 {
1501                     integrator->set(stateGpu->getCoordinates(),
1502                                     stateGpu->getVelocities(),
1503                                     stateGpu->getForces(),
1504                                     top->idef,
1505                                     *md);
1506
1507                     // Copy data to the GPU after buffers might have being reinitialized
1508                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1509                      * the previous step. We don't check that now. */
1510                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1511                     if (bExchanged
1512                         || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1513                             && !runScheduleWork->stepWork.useGpuXBufferOps))
1514                     {
1515                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1516                     }
1517                 }
1518
1519                 if ((simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1520                     || (!runScheduleWork->stepWork.useGpuFBufferOps))
1521                 {
1522                     // The PME forces were recieved to the host, and reduced on the CPU with the
1523                     // rest of the forces computed on the GPU, so the final forces have to be copied
1524                     // back to the GPU. Or the buffer ops were not offloaded this step, so the
1525                     // forces are on the host and have to be copied
1526                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1527                 }
1528                 const bool doTemperatureScaling =
1529                         (ir->etc != TemperatureCoupling::No
1530                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1531
1532                 // This applies Leap-Frog, LINCS and SETTLE in succession
1533                 integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
1534                                               runScheduleWork->stepWork, runScheduleWork->simulationWork),
1535                                       ir->delta_t,
1536                                       true,
1537                                       bCalcVir,
1538                                       shake_vir,
1539                                       doTemperatureScaling,
1540                                       ekind->tcstat,
1541                                       doParrinelloRahman,
1542                                       ir->nstpcouple * ir->delta_t,
1543                                       M);
1544             }
1545             else
1546             {
1547                 /* With multiple time stepping we need to do an additional normal
1548                  * update step to obtain the virial, as the actual MTS integration
1549                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1550                  * Using that acceleration would result in a virial with the slow
1551                  * force contribution would be a factor mtsFactor too large.
1552                  */
1553                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1554                 {
1555                     upd.update_for_constraint_virial(*ir,
1556                                                      md->homenr,
1557                                                      md->havePartiallyFrozenAtoms,
1558                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1559                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1560                                                      *state,
1561                                                      f.view().forceWithPadding(),
1562                                                      *ekind);
1563
1564                     constrain_coordinates(constr,
1565                                           do_log,
1566                                           do_ene,
1567                                           step,
1568                                           state,
1569                                           upd.xp()->arrayRefWithPadding(),
1570                                           &dvdl_constr,
1571                                           bCalcVir,
1572                                           shake_vir);
1573                 }
1574
1575                 ArrayRefWithPadding<const RVec> forceCombined =
1576                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1577                                 ? f.view().forceMtsCombinedWithPadding()
1578                                 : f.view().forceWithPadding();
1579                 upd.update_coords(*ir,
1580                                   step,
1581                                   md->homenr,
1582                                   md->havePartiallyFrozenAtoms,
1583                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1584                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1585                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1586                                   state,
1587                                   forceCombined,
1588                                   &fcdata,
1589                                   ekind,
1590                                   M,
1591                                   etrtPOSITION,
1592                                   cr,
1593                                   constr != nullptr);
1594
1595                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1596
1597                 constrain_coordinates(constr,
1598                                       do_log,
1599                                       do_ene,
1600                                       step,
1601                                       state,
1602                                       upd.xp()->arrayRefWithPadding(),
1603                                       &dvdl_constr,
1604                                       bCalcVir && !simulationWork.useMts,
1605                                       shake_vir);
1606
1607                 upd.update_sd_second_half(*ir,
1608                                           step,
1609                                           &dvdl_constr,
1610                                           md->homenr,
1611                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1612                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1613                                           state,
1614                                           cr,
1615                                           nrnb,
1616                                           wcycle,
1617                                           constr,
1618                                           do_log,
1619                                           do_ene);
1620                 upd.finish_update(
1621                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1622             }
1623
1624             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1625             {
1626                 updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
1627             }
1628
1629             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1630         }
1631
1632         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1633         /* With Leap-Frog we can skip compute_globals at
1634          * non-communication steps, but we need to calculate
1635          * the kinetic energy one step before communication.
1636          */
1637         {
1638             // Organize to do inter-simulation signalling on steps if
1639             // and when algorithms require it.
1640             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1641
1642             if (useGpuForUpdate)
1643             {
1644                 const bool coordinatesRequiredForStopCM =
1645                         bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1646                         && !EI_VV(ir->eI);
1647
1648                 // Copy coordinates when needed to stop the CM motion or for replica exchange
1649                 if (coordinatesRequiredForStopCM || bDoReplEx)
1650                 {
1651                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1652                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1653                 }
1654
1655                 // Copy velocities back to the host if:
1656                 // - Globals are computed this step (includes the energy output steps).
1657                 // - Temperature is needed for the next step.
1658                 // - This is a replica exchange step (even though we will only need
1659                 //     the velocities if an exchange succeeds)
1660                 if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
1661                 {
1662                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1663                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1664                 }
1665             }
1666
1667             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1668             {
1669                 // Since we're already communicating at this step, we
1670                 // can propagate intra-simulation signals. Note that
1671                 // check_nstglobalcomm has the responsibility for
1672                 // choosing the value of nstglobalcomm that is one way
1673                 // bGStat becomes true, so we can't get into a
1674                 // situation where e.g. checkpointing can't be
1675                 // signalled.
1676                 bool                doIntraSimSignal = true;
1677                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1678
1679                 compute_globals(gstat,
1680                                 cr,
1681                                 ir,
1682                                 fr,
1683                                 ekind,
1684                                 makeConstArrayRef(state->x),
1685                                 makeConstArrayRef(state->v),
1686                                 state->box,
1687                                 md,
1688                                 nrnb,
1689                                 &vcm,
1690                                 wcycle,
1691                                 enerd,
1692                                 force_vir,
1693                                 shake_vir,
1694                                 total_vir,
1695                                 pres,
1696                                 &signaller,
1697                                 lastbox,
1698                                 &bSumEkinhOld,
1699                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1700                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1701                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1702                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
1703                                 step,
1704                                 &observablesReducer);
1705                 if (!EI_VV(ir->eI) && bStopCM)
1706                 {
1707                     process_and_stopcm_grp(
1708                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1709                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1710
1711                     // TODO: The special case of removing CM motion should be dealt more gracefully
1712                     if (useGpuForUpdate)
1713                     {
1714                         // Issue #3988, #4106.
1715                         stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
1716                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1717                         // Here we block until the H2D copy completes because event sync with the
1718                         // force kernels that use the coordinates on the next steps is not implemented
1719                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1720                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1721                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1722                         if (vcm.mode != ComRemovalAlgorithm::No)
1723                         {
1724                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1725                         }
1726                     }
1727                 }
1728             }
1729         }
1730
1731         /* #############  END CALC EKIN AND PRESSURE ################# */
1732
1733         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1734            the virial that should probably be addressed eventually. state->veta has better properies,
1735            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1736            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1737
1738         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1739         {
1740             /* Sum up the foreign energy and dK/dl terms for md and sd.
1741                Currently done every step so that dH/dl is correct in the .edr */
1742             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1743         }
1744
1745         bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
1746         update_pcouple_after_coordinates(fplog,
1747                                          step,
1748                                          ir,
1749                                          md->homenr,
1750                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1751                                                      : gmx::ArrayRef<const unsigned short>(),
1752                                          pres,
1753                                          force_vir,
1754                                          shake_vir,
1755                                          pressureCouplingMu,
1756                                          state,
1757                                          nrnb,
1758                                          upd.deform(),
1759                                          scaleCoordinates);
1760
1761         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1762                                                   && do_per_step(step, inputrec->nstpcouple));
1763         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1764                                                  && do_per_step(step, inputrec->nstpcouple));
1765         if (useGpuForUpdate
1766             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1767         {
1768             integrator->scaleCoordinates(pressureCouplingMu);
1769             if (doCRescalePressureCoupling)
1770             {
1771                 matrix pressureCouplingInvMu;
1772                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1773                 integrator->scaleVelocities(pressureCouplingInvMu);
1774             }
1775             integrator->setPbc(PbcType::Xyz, state->box);
1776         }
1777
1778         /* ################# END UPDATE STEP 2 ################# */
1779         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1780
1781         /* The coordinates (x) were unshifted in update */
1782         if (!bGStat)
1783         {
1784             /* We will not sum ekinh_old,
1785              * so signal that we still have to do it.
1786              */
1787             bSumEkinhOld = TRUE;
1788         }
1789
1790         if (bCalcEner)
1791         {
1792             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1793
1794             /* use the directly determined last velocity, not actually the averaged half steps */
1795             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1796             {
1797                 enerd->term[F_EKIN] = last_ekin;
1798             }
1799             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1800
1801             if (integratorHasConservedEnergyQuantity(ir))
1802             {
1803                 if (EI_VV(ir->eI))
1804                 {
1805                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1806                 }
1807                 else
1808                 {
1809                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1810                 }
1811             }
1812             /* #########  END PREPARING EDR OUTPUT  ###########  */
1813         }
1814
1815         /* Output stuff */
1816         if (MASTER(cr))
1817         {
1818             if (fplog && do_log && bDoExpanded)
1819             {
1820                 /* only needed if doing expanded ensemble */
1821                 PrintFreeEnergyInfoToFile(fplog,
1822                                           ir->fepvals.get(),
1823                                           ir->expandedvals.get(),
1824                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1825                                           state_global->dfhist,
1826                                           state->fep_state,
1827                                           ir->nstlog,
1828                                           step);
1829             }
1830             if (bCalcEner)
1831             {
1832                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1833
1834                 energyOutput.addDataAtEnergyStep(outputDHDL,
1835                                                  bCalcEnerStep,
1836                                                  t,
1837                                                  md->tmass,
1838                                                  enerd,
1839                                                  ir->fepvals.get(),
1840                                                  lastbox,
1841                                                  PTCouplingArrays{ state->boxv,
1842                                                                    state->nosehoover_xi,
1843                                                                    state->nosehoover_vxi,
1844                                                                    state->nhpres_xi,
1845                                                                    state->nhpres_vxi },
1846                                                  state->fep_state,
1847                                                  total_vir,
1848                                                  pres,
1849                                                  ekind,
1850                                                  mu_tot,
1851                                                  constr);
1852             }
1853             else
1854             {
1855                 energyOutput.recordNonEnergyStep();
1856             }
1857
1858             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1859             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1860
1861             if (doSimulatedAnnealing)
1862             {
1863                 gmx::EnergyOutput::printAnnealingTemperatures(
1864                         do_log ? fplog : nullptr, groups, &(ir->opts));
1865             }
1866             if (do_log || do_ene || do_dr || do_or)
1867             {
1868                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1869                                                    do_ene,
1870                                                    do_dr,
1871                                                    do_or,
1872                                                    do_log ? fplog : nullptr,
1873                                                    step,
1874                                                    t,
1875                                                    fr->fcdata.get(),
1876                                                    awh.get());
1877             }
1878             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1879             {
1880                 const bool isInitialOutput = false;
1881                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1882             }
1883
1884             if (ir->bPull)
1885             {
1886                 pull_print_output(pull_work, step, t);
1887             }
1888
1889             if (do_per_step(step, ir->nstlog))
1890             {
1891                 if (fflush(fplog) != 0)
1892                 {
1893                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1894                 }
1895             }
1896         }
1897         if (bDoExpanded)
1898         {
1899             /* Have to do this part _after_ outputting the logfile and the edr file */
1900             /* Gets written into the state at the beginning of next loop*/
1901             state->fep_state = lamnew;
1902         }
1903         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1904         {
1905             state->fep_state = awh->fepLambdaState();
1906         }
1907         /* Print the remaining wall clock time for the run */
1908         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1909         {
1910             if (shellfc)
1911             {
1912                 fprintf(stderr, "\n");
1913             }
1914             print_time(stderr, walltime_accounting, step, ir, cr);
1915         }
1916
1917         /* Ion/water position swapping.
1918          * Not done in last step since trajectory writing happens before this call
1919          * in the MD loop and exchanges would be lost anyway. */
1920         bNeedRepartition = FALSE;
1921         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1922             && do_per_step(step, ir->swap->nstswap))
1923         {
1924             bNeedRepartition = do_swapcoords(cr,
1925                                              step,
1926                                              t,
1927                                              ir,
1928                                              swap,
1929                                              wcycle,
1930                                              as_rvec_array(state->x.data()),
1931                                              state->box,
1932                                              MASTER(cr) && mdrunOptions.verbose,
1933                                              bRerunMD);
1934
1935             if (bNeedRepartition && haveDDAtomOrdering(*cr))
1936             {
1937                 dd_collect_state(cr->dd, state, state_global);
1938             }
1939         }
1940
1941         /* Replica exchange */
1942         bExchanged = FALSE;
1943         if (bDoReplEx)
1944         {
1945             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1946         }
1947
1948         if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
1949         {
1950             dd_partition_system(fplog,
1951                                 mdlog,
1952                                 step,
1953                                 cr,
1954                                 TRUE,
1955                                 1,
1956                                 state_global,
1957                                 top_global,
1958                                 *ir,
1959                                 imdSession,
1960                                 pull_work,
1961                                 state,
1962                                 &f,
1963                                 mdAtoms,
1964                                 top,
1965                                 fr,
1966                                 vsite,
1967                                 constr,
1968                                 nrnb,
1969                                 wcycle,
1970                                 FALSE);
1971             upd.updateAfterPartition(state->natoms,
1972                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1973                                                  : gmx::ArrayRef<const unsigned short>(),
1974                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1975                                              : gmx::ArrayRef<const unsigned short>(),
1976                                      md->cACC ? gmx::arrayRefFromArray(md->cACC, md->nr)
1977                                               : gmx::ArrayRef<const unsigned short>());
1978             fr->longRangeNonbondeds->updateAfterPartition(*md);
1979         }
1980
1981         bFirstStep = FALSE;
1982         bInitStep  = FALSE;
1983
1984         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1985         /* With all integrators, except VV, we need to retain the pressure
1986          * at the current step for coupling at the next step.
1987          */
1988         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1989             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1990         {
1991             /* Store the pressure in t_state for pressure coupling
1992              * at the next MD step.
1993              */
1994             copy_mat(pres, state->pres_prev);
1995         }
1996
1997         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1998
1999         if ((membed != nullptr) && (!bLastStep))
2000         {
2001             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2002         }
2003
2004         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2005         if (haveDDAtomOrdering(*cr) && wcycle)
2006         {
2007             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2008         }
2009
2010         /* increase the MD step number */
2011         step++;
2012         step_rel++;
2013         observablesReducer.markAsReadyToReduce();
2014
2015 #if GMX_FAHCORE
2016         if (MASTER(cr))
2017         {
2018             fcReportProgress(ir->nsteps + ir->init_step, step);
2019         }
2020 #endif
2021
2022         resetHandler->resetCounters(
2023                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2024
2025         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2026         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2027     }
2028     /* End of main MD loop */
2029
2030     /* Closing TNG files can include compressing data. Therefore it is good to do that
2031      * before stopping the time measurements. */
2032     mdoutf_tng_close(outf);
2033
2034     /* Stop measuring walltime */
2035     walltime_accounting_end_time(walltime_accounting);
2036
2037     if (simulationWork.haveSeparatePmeRank)
2038     {
2039         /* Tell the PME only node to finish */
2040         gmx_pme_send_finish(cr);
2041     }
2042
2043     if (MASTER(cr))
2044     {
2045         if (ir->nstcalcenergy > 0)
2046         {
2047             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2048
2049             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2050             energyOutput.printAverages(fplog, groups);
2051         }
2052     }
2053     done_mdoutf(outf);
2054
2055     if (bPMETune)
2056     {
2057         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2058     }
2059
2060     done_shellfc(fplog, shellfc, step_rel);
2061
2062     if (useReplicaExchange && MASTER(cr))
2063     {
2064         print_replica_exchange_statistics(fplog, repl_ex);
2065     }
2066
2067     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2068
2069     global_stat_destroy(gstat);
2070 }