Merge branch release-2021
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/commandline/filenm.h"
57 #include "gromacs/domdec/collect.h"
58 #include "gromacs/domdec/dlbtiming.h"
59 #include "gromacs/domdec/domdec.h"
60 #include "gromacs/domdec/domdec_network.h"
61 #include "gromacs/domdec/domdec_struct.h"
62 #include "gromacs/domdec/gpuhaloexchange.h"
63 #include "gromacs/domdec/mdsetup.h"
64 #include "gromacs/domdec/partition.h"
65 #include "gromacs/essentialdynamics/edsam.h"
66 #include "gromacs/ewald/pme_load_balancing.h"
67 #include "gromacs/ewald/pme_pp.h"
68 #include "gromacs/fileio/trxio.h"
69 #include "gromacs/gmxlib/network.h"
70 #include "gromacs/gmxlib/nrnb.h"
71 #include "gromacs/gpu_utils/device_stream_manager.h"
72 #include "gromacs/gpu_utils/gpu_utils.h"
73 #include "gromacs/imd/imd.h"
74 #include "gromacs/listed_forces/listed_forces.h"
75 #include "gromacs/math/functions.h"
76 #include "gromacs/math/invertmatrix.h"
77 #include "gromacs/math/vec.h"
78 #include "gromacs/math/vectypes.h"
79 #include "gromacs/mdlib/checkpointhandler.h"
80 #include "gromacs/mdlib/compute_io.h"
81 #include "gromacs/mdlib/constr.h"
82 #include "gromacs/mdlib/coupling.h"
83 #include "gromacs/mdlib/ebin.h"
84 #include "gromacs/mdlib/enerdata_utils.h"
85 #include "gromacs/mdlib/energyoutput.h"
86 #include "gromacs/mdlib/expanded.h"
87 #include "gromacs/mdlib/force.h"
88 #include "gromacs/mdlib/force_flags.h"
89 #include "gromacs/mdlib/forcerec.h"
90 #include "gromacs/mdlib/freeenergyparameters.h"
91 #include "gromacs/mdlib/md_support.h"
92 #include "gromacs/mdlib/mdatoms.h"
93 #include "gromacs/mdlib/mdoutf.h"
94 #include "gromacs/mdlib/membed.h"
95 #include "gromacs/mdlib/resethandler.h"
96 #include "gromacs/mdlib/sighandler.h"
97 #include "gromacs/mdlib/simulationsignal.h"
98 #include "gromacs/mdlib/stat.h"
99 #include "gromacs/mdlib/stophandler.h"
100 #include "gromacs/mdlib/tgroup.h"
101 #include "gromacs/mdlib/trajectory_writing.h"
102 #include "gromacs/mdlib/update.h"
103 #include "gromacs/mdlib/update_constrain_gpu.h"
104 #include "gromacs/mdlib/update_vv.h"
105 #include "gromacs/mdlib/vcm.h"
106 #include "gromacs/mdlib/vsite.h"
107 #include "gromacs/mdrunutility/handlerestart.h"
108 #include "gromacs/mdrunutility/multisim.h"
109 #include "gromacs/mdrunutility/printtime.h"
110 #include "gromacs/mdtypes/awh_history.h"
111 #include "gromacs/mdtypes/awh_params.h"
112 #include "gromacs/mdtypes/commrec.h"
113 #include "gromacs/mdtypes/df_history.h"
114 #include "gromacs/mdtypes/energyhistory.h"
115 #include "gromacs/mdtypes/fcdata.h"
116 #include "gromacs/mdtypes/forcebuffers.h"
117 #include "gromacs/mdtypes/forcerec.h"
118 #include "gromacs/mdtypes/group.h"
119 #include "gromacs/mdtypes/inputrec.h"
120 #include "gromacs/mdtypes/interaction_const.h"
121 #include "gromacs/mdtypes/md_enums.h"
122 #include "gromacs/mdtypes/mdatom.h"
123 #include "gromacs/mdtypes/mdrunoptions.h"
124 #include "gromacs/mdtypes/multipletimestepping.h"
125 #include "gromacs/mdtypes/observableshistory.h"
126 #include "gromacs/mdtypes/pullhistory.h"
127 #include "gromacs/mdtypes/simulation_workload.h"
128 #include "gromacs/mdtypes/state.h"
129 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
130 #include "gromacs/modularsimulator/energydata.h"
131 #include "gromacs/nbnxm/gpu_data_mgmt.h"
132 #include "gromacs/nbnxm/nbnxm.h"
133 #include "gromacs/pbcutil/pbc.h"
134 #include "gromacs/pulling/output.h"
135 #include "gromacs/pulling/pull.h"
136 #include "gromacs/swap/swapcoords.h"
137 #include "gromacs/timing/wallcycle.h"
138 #include "gromacs/timing/walltime_accounting.h"
139 #include "gromacs/topology/atoms.h"
140 #include "gromacs/topology/idef.h"
141 #include "gromacs/topology/mtop_util.h"
142 #include "gromacs/topology/topology.h"
143 #include "gromacs/trajectory/trajectoryframe.h"
144 #include "gromacs/utility/basedefinitions.h"
145 #include "gromacs/utility/cstringutil.h"
146 #include "gromacs/utility/fatalerror.h"
147 #include "gromacs/utility/logger.h"
148 #include "gromacs/utility/real.h"
149 #include "gromacs/utility/smalloc.h"
150
151 #include "legacysimulator.h"
152 #include "replicaexchange.h"
153 #include "shellfc.h"
154
155 using gmx::SimulationSignaller;
156
157 void gmx::LegacySimulator::do_md()
158 {
159     // TODO Historically, the EM and MD "integrators" used different
160     // names for the t_inputrec *parameter, but these must have the
161     // same name, now that it's a member of a struct. We use this ir
162     // alias to avoid a large ripple of nearly useless changes.
163     // t_inputrec is being replaced by IMdpOptionsProvider, so this
164     // will go away eventually.
165     t_inputrec*  ir = inputrec;
166     int64_t      step, step_rel;
167     double       t, t0 = ir->init_t;
168     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
169     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
170     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
171     gmx_bool     do_ene, do_log, do_verbose;
172     gmx_bool     bMasterState;
173     unsigned int force_flags;
174     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
175     int    i, m;
176     rvec   mu_tot;
177     matrix pressureCouplingMu, M;
178     gmx_repl_ex_t     repl_ex = nullptr;
179     gmx_global_stat_t gstat;
180     gmx_shellfc_t*    shellfc;
181     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
182     gmx_bool          bTrotter;
183     real              dvdl_constr;
184     std::vector<RVec> cbuf;
185     matrix            lastbox;
186     int               lamnew = 0;
187     /* for FEP */
188     int       nstfep = 0;
189     double    cycles;
190     real      saved_conserved_quantity = 0;
191     real      last_ekin                = 0;
192     t_extmass MassQ;
193     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
194
195     /* PME load balancing data for GPU kernels */
196     gmx_bool bPMETune         = FALSE;
197     gmx_bool bPMETunePrinting = FALSE;
198
199     bool bInteractiveMDstep = false;
200
201     /* Domain decomposition could incorrectly miss a bonded
202        interaction, but checking for that requires a global
203        communication stage, which does not otherwise happen in DD
204        code. So we do that alongside the first global energy reduction
205        after a new DD is made. These variables handle whether the
206        check happens, and the result it returns. */
207     bool shouldCheckNumberOfBondedInteractions = false;
208     int  totalNumberOfBondedInteractions       = -1;
209
210     SimulationSignals signals;
211     // Most global communnication stages don't propagate mdrun
212     // signals, and will use this object to achieve that.
213     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
214
215     if (!mdrunOptions.writeConfout)
216     {
217         // This is on by default, and the main known use case for
218         // turning it off is for convenience in benchmarking, which is
219         // something that should not show up in the general user
220         // interface.
221         GMX_LOG(mdlog.info)
222                 .asParagraph()
223                 .appendText(
224                         "The -noconfout functionality is deprecated, and may be removed in a "
225                         "future version.");
226     }
227
228     /* md-vv uses averaged full step velocities for T-control
229        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
230        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
231     bTrotter = (EI_VV(ir->eI)
232                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
233
234     const bool bRerunMD = false;
235
236     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
237     bGStatEveryStep   = (nstglobalcomm == 1);
238
239     const SimulationGroups* groups = &top_global->groups;
240
241     std::unique_ptr<EssentialDynamics> ed = nullptr;
242     if (opt2bSet("-ei", nfile, fnm))
243     {
244         /* Initialize essential dynamics sampling */
245         ed = init_edsam(mdlog,
246                         opt2fn_null("-ei", nfile, fnm),
247                         opt2fn("-eo", nfile, fnm),
248                         top_global,
249                         ir,
250                         cr,
251                         constr,
252                         state_global,
253                         observablesHistory,
254                         oenv,
255                         startingBehavior);
256     }
257     else if (observablesHistory->edsamHistory)
258     {
259         gmx_fatal(FARGS,
260                   "The checkpoint is from a run with essential dynamics sampling, "
261                   "but the current run did not specify the -ei option. "
262                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
263     }
264
265     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
266     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
267     initialize_lambdas(fplog, *ir, MASTER(cr), fep_state, lambda);
268     Update     upd(*ir, deform);
269     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
270     const bool useReplicaExchange   = (replExParams.exchangeInterval > 0);
271
272     const t_fcdata& fcdata = *fr->fcdata;
273
274     bool simulationsShareState = false;
275     int  nstSignalComm         = nstglobalcomm;
276     {
277         // TODO This implementation of ensemble orientation restraints is nasty because
278         // a user can't just do multi-sim with single-sim orientation restraints.
279         bool usingEnsembleRestraints =
280                 (fcdata.disres->nsystems > 1) || ((ms != nullptr) && (fcdata.orires->nr != 0));
281         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
282
283         // Replica exchange, ensemble restraints and AWH need all
284         // simulations to remain synchronized, so they need
285         // checkpoints and stop conditions to act on the same step, so
286         // the propagation of such signals must take place between
287         // simulations, not just within simulations.
288         // TODO: Make algorithm initializers set these flags.
289         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
290
291         if (simulationsShareState)
292         {
293             // Inter-simulation signal communication does not need to happen
294             // often, so we use a minimum of 200 steps to reduce overhead.
295             const int c_minimumInterSimulationSignallingInterval = 200;
296             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
297                             * nstglobalcomm;
298         }
299     }
300
301     if (startingBehavior != StartingBehavior::RestartWithAppending)
302     {
303         pleaseCiteCouplingAlgorithms(fplog, *ir);
304     }
305     gmx_mdoutf*       outf = init_mdoutf(fplog,
306                                    nfile,
307                                    fnm,
308                                    mdrunOptions,
309                                    cr,
310                                    outputProvider,
311                                    mdModulesNotifier,
312                                    ir,
313                                    top_global,
314                                    oenv,
315                                    wcycle,
316                                    startingBehavior,
317                                    simulationsShareState,
318                                    ms);
319     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
320                                    top_global,
321                                    ir,
322                                    pull_work,
323                                    mdoutf_get_fp_dhdl(outf),
324                                    false,
325                                    startingBehavior,
326                                    simulationsShareState,
327                                    mdModulesNotifier);
328
329     gstat = global_stat_init(ir);
330
331     const auto& simulationWork     = runScheduleWork->simulationWork;
332     const bool  useGpuForPme       = simulationWork.useGpuPme;
333     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
334     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
335     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
336
337     /* Check for polarizable models and flexible constraints */
338     shellfc = init_shell_flexcon(fplog,
339                                  top_global,
340                                  constr ? constr->numFlexibleConstraints() : 0,
341                                  ir->nstcalcenergy,
342                                  DOMAINDECOMP(cr),
343                                  useGpuForPme);
344
345     {
346         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
347         if ((io > 2000) && MASTER(cr))
348         {
349             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
350         }
351     }
352
353     // Local state only becomes valid now.
354     std::unique_ptr<t_state> stateInstance;
355     t_state*                 state;
356
357     gmx_localtop_t top(top_global->ffparams);
358
359     auto mdatoms = mdAtoms->mdatoms();
360
361     ForceBuffers f(fr->useMts,
362                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
363                            ? PinningPolicy::PinnedIfSupported
364                            : PinningPolicy::CannotBePinned);
365     if (DOMAINDECOMP(cr))
366     {
367         stateInstance = std::make_unique<t_state>();
368         state         = stateInstance.get();
369         dd_init_local_state(cr->dd, state_global, state);
370
371         /* Distribute the charge groups over the nodes from the master node */
372         dd_partition_system(fplog,
373                             mdlog,
374                             ir->init_step,
375                             cr,
376                             TRUE,
377                             1,
378                             state_global,
379                             *top_global,
380                             ir,
381                             imdSession,
382                             pull_work,
383                             state,
384                             &f,
385                             mdAtoms,
386                             &top,
387                             fr,
388                             vsite,
389                             constr,
390                             nrnb,
391                             nullptr,
392                             FALSE);
393         shouldCheckNumberOfBondedInteractions = true;
394         upd.setNumAtoms(state->natoms);
395     }
396     else
397     {
398         state_change_natoms(state_global, state_global->natoms);
399         /* Copy the pointer to the global state */
400         state = state_global;
401
402         /* Generate and initialize new topology */
403         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
404
405         upd.setNumAtoms(state->natoms);
406     }
407
408     std::unique_ptr<UpdateConstrainGpu> integrator;
409
410     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
411
412     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
413     if (useGpuForUpdate)
414     {
415         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
416                                    || constr->numConstraintsTotal() == 0,
417                            "Constraints in domain decomposition are only supported with update "
418                            "groups if using GPU update.\n");
419         GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr
420                                    || constr->numConstraintsTotal() == 0,
421                            "SHAKE is not supported with GPU update.");
422         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
423                            "Either PME or short-ranged non-bonded interaction tasks must run on "
424                            "the GPU to use GPU update.\n");
425         GMX_RELEASE_ASSERT(ir->eI == eiMD,
426                            "Only the md integrator is supported with the GPU update.\n");
427         GMX_RELEASE_ASSERT(
428                 ir->etc != etcNOSEHOOVER,
429                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
430         GMX_RELEASE_ASSERT(
431                 ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN
432                         || ir->epc == epcCRESCALE,
433                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
434                 "with the GPU update.\n");
435         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
436                            "Virtual sites are not supported with the GPU update.\n");
437         GMX_RELEASE_ASSERT(ed == nullptr,
438                            "Essential dynamics is not supported with the GPU update.\n");
439         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
440                            "Constraints pulling is not supported with the GPU update.\n");
441         GMX_RELEASE_ASSERT(fcdata.orires->nr == 0,
442                            "Orientation restraints are not supported with the GPU update.\n");
443         GMX_RELEASE_ASSERT(
444                 ir->efep == efepNO
445                         || (!haveFepPerturbedMasses(*top_global) && !havePerturbedConstraints(*top_global)),
446                 "Free energy perturbation of masses and constraints are not supported with the GPU "
447                 "update.");
448
449         if (constr != nullptr && constr->numConstraintsTotal() > 0)
450         {
451             GMX_LOG(mdlog.info)
452                     .asParagraph()
453                     .appendText("Updating coordinates and applying constraints on the GPU.");
454         }
455         else
456         {
457             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
458         }
459         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
460                            "Device stream manager should be initialized in order to use GPU "
461                            "update-constraints.");
462         GMX_RELEASE_ASSERT(
463                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
464                 "Update stream should be initialized in order to use GPU "
465                 "update-constraints.");
466         integrator = std::make_unique<UpdateConstrainGpu>(
467                 *ir,
468                 *top_global,
469                 fr->deviceStreamManager->context(),
470                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
471                 stateGpu->xUpdatedOnDevice(),
472                 wcycle);
473
474         integrator->setPbc(PbcType::Xyz, state->box);
475     }
476
477     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
478     {
479         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
480     }
481     if (useGpuForUpdate)
482     {
483         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
484     }
485
486     // NOTE: The global state is no longer used at this point.
487     // But state_global is still used as temporary storage space for writing
488     // the global state to file and potentially for replica exchange.
489     // (Global topology should persist.)
490
491     update_mdatoms(mdatoms, state->lambda[efptMASS]);
492
493     if (ir->bExpanded)
494     {
495         /* Check nstexpanded here, because the grompp check was broken */
496         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
497         {
498             gmx_fatal(FARGS,
499                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
500         }
501         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
502     }
503
504     if (MASTER(cr))
505     {
506         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
507     }
508
509     preparePrevStepPullCom(
510             ir, pull_work, mdatoms->massT, state, state_global, cr, startingBehavior != StartingBehavior::NewSimulation);
511
512     // TODO: Remove this by converting AWH into a ForceProvider
513     auto awh = prepareAwhModule(fplog,
514                                 *ir,
515                                 state_global,
516                                 cr,
517                                 ms,
518                                 startingBehavior != StartingBehavior::NewSimulation,
519                                 shellfc != nullptr,
520                                 opt2fn("-awh", nfile, fnm),
521                                 pull_work);
522
523     if (useReplicaExchange && MASTER(cr))
524     {
525         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir, replExParams);
526     }
527     /* PME tuning is only supported in the Verlet scheme, with PME for
528      * Coulomb. It is not supported with only LJ PME. */
529     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
530                 && ir->cutoff_scheme != ecutsGROUP);
531
532     pme_load_balancing_t* pme_loadbal = nullptr;
533     if (bPMETune)
534     {
535         pme_loadbal_init(
536                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
537     }
538
539     if (!ir->bContinuation)
540     {
541         if (state->flags & (1U << estV))
542         {
543             auto v = makeArrayRef(state->v);
544             /* Set the velocities of vsites, shells and frozen atoms to zero */
545             for (i = 0; i < mdatoms->homenr; i++)
546             {
547                 if (mdatoms->ptype[i] == eptVSite || mdatoms->ptype[i] == eptShell)
548                 {
549                     clear_rvec(v[i]);
550                 }
551                 else if (mdatoms->cFREEZE)
552                 {
553                     for (m = 0; m < DIM; m++)
554                     {
555                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
556                         {
557                             v[i][m] = 0;
558                         }
559                     }
560                 }
561             }
562         }
563
564         if (constr)
565         {
566             /* Constrain the initial coordinates and velocities */
567             do_constrain_first(fplog,
568                                constr,
569                                ir,
570                                mdatoms->nr,
571                                mdatoms->homenr,
572                                state->x.arrayRefWithPadding(),
573                                state->v.arrayRefWithPadding(),
574                                state->box,
575                                state->lambda[efptBONDED]);
576         }
577         if (vsite)
578         {
579             /* Construct the virtual sites for the initial configuration */
580             vsite->construct(state->x, ir->delta_t, {}, state->box);
581         }
582     }
583
584     if (ir->efep != efepNO)
585     {
586         /* Set free energy calculation frequency as the greatest common
587          * denominator of nstdhdl and repl_ex_nst. */
588         nstfep = ir->fepvals->nstdhdl;
589         if (ir->bExpanded)
590         {
591             nstfep = std::gcd(ir->expandedvals->nstexpanded, nstfep);
592         }
593         if (useReplicaExchange)
594         {
595             nstfep = std::gcd(replExParams.exchangeInterval, nstfep);
596         }
597         if (ir->bDoAwh)
598         {
599             nstfep = std::gcd(ir->awhParams->nstSampleCoord, nstfep);
600         }
601     }
602
603     /* Be REALLY careful about what flags you set here. You CANNOT assume
604      * this is the first step, since we might be restarting from a checkpoint,
605      * and in that case we should not do any modifications to the state.
606      */
607     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
608
609     // When restarting from a checkpoint, it can be appropriate to
610     // initialize ekind from quantities in the checkpoint. Otherwise,
611     // compute_globals must initialize ekind before the simulation
612     // starts/restarts. However, only the master rank knows what was
613     // found in the checkpoint file, so we have to communicate in
614     // order to coordinate the restart.
615     //
616     // TODO Consider removing this communication if/when checkpoint
617     // reading directly follows .tpr reading, because all ranks can
618     // agree on hasReadEkinState at that time.
619     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
620     if (PAR(cr))
621     {
622         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
623     }
624     if (hasReadEkinState)
625     {
626         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
627     }
628
629     unsigned int cglo_flags =
630             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
631              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
632
633     bSumEkinhOld = FALSE;
634
635     t_vcm vcm(top_global->groups, *ir);
636     reportComRemovalInfo(fplog, vcm);
637
638     /* To minimize communication, compute_globals computes the COM velocity
639      * and the kinetic energy for the velocities without COM motion removed.
640      * Thus to get the kinetic energy without the COM contribution, we need
641      * to call compute_globals twice.
642      */
643     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
644     {
645         unsigned int cglo_flags_iteration = cglo_flags;
646         if (bStopCM && cgloIteration == 0)
647         {
648             cglo_flags_iteration |= CGLO_STOPCM;
649             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
650         }
651         compute_globals(gstat,
652                         cr,
653                         ir,
654                         fr,
655                         ekind,
656                         makeConstArrayRef(state->x),
657                         makeConstArrayRef(state->v),
658                         state->box,
659                         mdatoms,
660                         nrnb,
661                         &vcm,
662                         nullptr,
663                         enerd,
664                         force_vir,
665                         shake_vir,
666                         total_vir,
667                         pres,
668                         constr,
669                         &nullSignaller,
670                         state->box,
671                         &totalNumberOfBondedInteractions,
672                         &bSumEkinhOld,
673                         cglo_flags_iteration
674                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
675                                                                          : 0));
676         if (cglo_flags_iteration & CGLO_STOPCM)
677         {
678             /* At initialization, do not pass x with acceleration-correction mode
679              * to avoid (incorrect) correction of the initial coordinates.
680              */
681             auto x = (vcm.mode == ecmLINEAR_ACCELERATION_CORRECTION) ? ArrayRef<RVec>()
682                                                                      : makeArrayRef(state->x);
683             process_and_stopcm_grp(fplog, &vcm, *mdatoms, x, makeArrayRef(state->v));
684             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
685         }
686     }
687     checkNumberOfBondedInteractions(mdlog,
688                                     cr,
689                                     totalNumberOfBondedInteractions,
690                                     top_global,
691                                     &top,
692                                     makeConstArrayRef(state->x),
693                                     state->box,
694                                     &shouldCheckNumberOfBondedInteractions);
695     if (ir->eI == eiVVAK)
696     {
697         /* a second call to get the half step temperature initialized as well */
698         /* we do the same call as above, but turn the pressure off -- internally to
699            compute_globals, this is recognized as a velocity verlet half-step
700            kinetic energy calculation.  This minimized excess variables, but
701            perhaps loses some logic?*/
702
703         compute_globals(gstat,
704                         cr,
705                         ir,
706                         fr,
707                         ekind,
708                         makeConstArrayRef(state->x),
709                         makeConstArrayRef(state->v),
710                         state->box,
711                         mdatoms,
712                         nrnb,
713                         &vcm,
714                         nullptr,
715                         enerd,
716                         force_vir,
717                         shake_vir,
718                         total_vir,
719                         pres,
720                         constr,
721                         &nullSignaller,
722                         state->box,
723                         nullptr,
724                         &bSumEkinhOld,
725                         cglo_flags & ~CGLO_PRESSURE);
726     }
727
728     /* Calculate the initial half step temperature, and save the ekinh_old */
729     if (startingBehavior == StartingBehavior::NewSimulation)
730     {
731         for (i = 0; (i < ir->opts.ngtc); i++)
732         {
733             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
734         }
735     }
736
737     /* need to make an initiation call to get the Trotter variables set, as well as other constants
738        for non-trotter temperature control */
739     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
740
741     if (MASTER(cr))
742     {
743         if (!ir->bContinuation)
744         {
745             if (constr && ir->eConstrAlg == econtLINCS)
746             {
747                 fprintf(fplog,
748                         "RMS relative constraint deviation after constraining: %.2e\n",
749                         constr->rmsd());
750             }
751             if (EI_STATE_VELOCITY(ir->eI))
752             {
753                 real temp = enerd->term[F_TEMP];
754                 if (ir->eI != eiVV)
755                 {
756                     /* Result of Ekin averaged over velocities of -half
757                      * and +half step, while we only have -half step here.
758                      */
759                     temp *= 2;
760                 }
761                 fprintf(fplog, "Initial temperature: %g K\n", temp);
762             }
763         }
764
765         char tbuf[20];
766         fprintf(stderr, "starting mdrun '%s'\n", *(top_global->name));
767         if (ir->nsteps >= 0)
768         {
769             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
770         }
771         else
772         {
773             sprintf(tbuf, "%s", "infinite");
774         }
775         if (ir->init_step > 0)
776         {
777             fprintf(stderr,
778                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
779                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
780                     tbuf,
781                     gmx_step_str(ir->init_step, sbuf2),
782                     ir->init_step * ir->delta_t);
783         }
784         else
785         {
786             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
787         }
788         fprintf(fplog, "\n");
789     }
790
791     walltime_accounting_start_time(walltime_accounting);
792     wallcycle_start(wcycle, ewcRUN);
793     print_start(fplog, cr, walltime_accounting, "mdrun");
794
795     /***********************************************************
796      *
797      *             Loop over MD steps
798      *
799      ************************************************************/
800
801     bFirstStep = TRUE;
802     /* Skip the first Nose-Hoover integration when we get the state from tpx */
803     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
804     bSumEkinhOld     = FALSE;
805     bExchanged       = FALSE;
806     bNeedRepartition = FALSE;
807
808     step     = ir->init_step;
809     step_rel = 0;
810
811     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
812             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
813             simulationsShareState,
814             MASTER(cr),
815             ir->nstlist,
816             mdrunOptions.reproducible,
817             nstSignalComm,
818             mdrunOptions.maximumHoursToRun,
819             ir->nstlist == 0,
820             fplog,
821             step,
822             bNS,
823             walltime_accounting);
824
825     auto checkpointHandler = std::make_unique<CheckpointHandler>(
826             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
827             simulationsShareState,
828             ir->nstlist == 0,
829             MASTER(cr),
830             mdrunOptions.writeConfout,
831             mdrunOptions.checkpointOptions.period);
832
833     const bool resetCountersIsLocal = true;
834     auto       resetHandler         = std::make_unique<ResetHandler>(
835             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
836             !resetCountersIsLocal,
837             ir->nsteps,
838             MASTER(cr),
839             mdrunOptions.timingOptions.resetHalfway,
840             mdrunOptions.maximumHoursToRun,
841             mdlog,
842             wcycle,
843             walltime_accounting);
844
845     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
846
847     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
848     {
849         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
850     }
851
852     /* and stop now if we should */
853     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
854     while (!bLastStep)
855     {
856
857         /* Determine if this is a neighbor search step */
858         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
859
860         if (bPMETune && bNStList)
861         {
862             // This has to be here because PME load balancing is called so early.
863             // TODO: Move to after all booleans are defined.
864             if (useGpuForUpdate && !bFirstStep)
865             {
866                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
867                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
868             }
869             /* PME grid + cut-off optimization with GPUs or PME nodes */
870             pme_loadbal_do(pme_loadbal,
871                            cr,
872                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
873                            fplog,
874                            mdlog,
875                            *ir,
876                            fr,
877                            state->box,
878                            state->x,
879                            wcycle,
880                            step,
881                            step_rel,
882                            &bPMETunePrinting,
883                            simulationWork.useGpuPmePpCommunication);
884         }
885
886         wallcycle_start(wcycle, ewcSTEP);
887
888         bLastStep = (step_rel == ir->nsteps);
889         t         = t0 + step * ir->delta_t;
890
891         // TODO Refactor this, so that nstfep does not need a default value of zero
892         if (ir->efep != efepNO || ir->bSimTemp)
893         {
894             /* find and set the current lambdas */
895             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
896
897             bDoDHDL     = do_per_step(step, ir->fepvals->nstdhdl);
898             bDoFEP      = ((ir->efep != efepNO) && do_per_step(step, nstfep));
899             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
900                            && (!bFirstStep));
901         }
902
903         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
904                      && do_per_step(step, replExParams.exchangeInterval));
905
906         if (doSimulatedAnnealing)
907         {
908             update_annealing_target_temp(ir, t, &upd);
909         }
910
911         /* Stop Center of Mass motion */
912         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
913
914         /* Determine whether or not to do Neighbour Searching */
915         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
916
917         /* Note that the stopHandler will cause termination at nstglobalcomm
918          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
919          * nstpcouple steps, we have computed the half-step kinetic energy
920          * of the previous step and can always output energies at the last step.
921          */
922         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
923
924         /* do_log triggers energy and virial calculation. Because this leads
925          * to different code paths, forces can be different. Thus for exact
926          * continuation we should avoid extra log output.
927          * Note that the || bLastStep can result in non-exact continuation
928          * beyond the last step. But we don't consider that to be an issue.
929          */
930         do_log     = (do_per_step(step, ir->nstlog)
931                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
932         do_verbose = mdrunOptions.verbose
933                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
934
935         if (useGpuForUpdate && !bFirstStep && bNS)
936         {
937             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
938             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
939             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
940             // Copy coordinate from the GPU when needed at the search step.
941             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
942             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
943             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
944             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
945         }
946
947         if (bNS && !(bFirstStep && ir->bContinuation))
948         {
949             bMasterState = FALSE;
950             /* Correct the new box if it is too skewed */
951             if (inputrecDynamicBox(ir))
952             {
953                 if (correct_box(fplog, step, state->box))
954                 {
955                     bMasterState = TRUE;
956                     // If update is offloaded, it should be informed about the box size change
957                     if (useGpuForUpdate)
958                     {
959                         integrator->setPbc(PbcType::Xyz, state->box);
960                     }
961                 }
962             }
963             if (DOMAINDECOMP(cr) && bMasterState)
964             {
965                 dd_collect_state(cr->dd, state, state_global);
966             }
967
968             if (DOMAINDECOMP(cr))
969             {
970                 /* Repartition the domain decomposition */
971                 dd_partition_system(fplog,
972                                     mdlog,
973                                     step,
974                                     cr,
975                                     bMasterState,
976                                     nstglobalcomm,
977                                     state_global,
978                                     *top_global,
979                                     ir,
980                                     imdSession,
981                                     pull_work,
982                                     state,
983                                     &f,
984                                     mdAtoms,
985                                     &top,
986                                     fr,
987                                     vsite,
988                                     constr,
989                                     nrnb,
990                                     wcycle,
991                                     do_verbose && !bPMETunePrinting);
992                 shouldCheckNumberOfBondedInteractions = true;
993                 upd.setNumAtoms(state->natoms);
994             }
995         }
996
997         // Allocate or re-size GPU halo exchange object, if necessary
998         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
999         {
1000             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1001                                "GPU device manager has to be initialized to use GPU "
1002                                "version of halo exchange.");
1003             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1004         }
1005
1006         if (MASTER(cr) && do_log)
1007         {
1008             gmx::EnergyOutput::printHeader(
1009                     fplog, step, t); /* can we improve the information printed here? */
1010         }
1011
1012         if (ir->efep != efepNO)
1013         {
1014             update_mdatoms(mdatoms, state->lambda[efptMASS]);
1015         }
1016
1017         if (bExchanged)
1018         {
1019             /* We need the kinetic energy at minus the half step for determining
1020              * the full step kinetic energy and possibly for T-coupling.*/
1021             /* This may not be quite working correctly yet . . . . */
1022             compute_globals(gstat,
1023                             cr,
1024                             ir,
1025                             fr,
1026                             ekind,
1027                             makeConstArrayRef(state->x),
1028                             makeConstArrayRef(state->v),
1029                             state->box,
1030                             mdatoms,
1031                             nrnb,
1032                             &vcm,
1033                             wcycle,
1034                             enerd,
1035                             nullptr,
1036                             nullptr,
1037                             nullptr,
1038                             nullptr,
1039                             constr,
1040                             &nullSignaller,
1041                             state->box,
1042                             &totalNumberOfBondedInteractions,
1043                             &bSumEkinhOld,
1044                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
1045             checkNumberOfBondedInteractions(mdlog,
1046                                             cr,
1047                                             totalNumberOfBondedInteractions,
1048                                             top_global,
1049                                             &top,
1050                                             makeConstArrayRef(state->x),
1051                                             state->box,
1052                                             &shouldCheckNumberOfBondedInteractions);
1053         }
1054         clear_mat(force_vir);
1055
1056         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1057
1058         /* Determine the energy and pressure:
1059          * at nstcalcenergy steps and at energy output steps (set below).
1060          */
1061         if (EI_VV(ir->eI) && (!bInitStep))
1062         {
1063             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1064             bCalcVir      = bCalcEnerStep
1065                        || (ir->epc != epcNO
1066                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1067         }
1068         else
1069         {
1070             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1071             bCalcVir = bCalcEnerStep || (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
1072         }
1073         bCalcEner = bCalcEnerStep;
1074
1075         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1076
1077         if (do_ene || do_log || bDoReplEx)
1078         {
1079             bCalcVir  = TRUE;
1080             bCalcEner = TRUE;
1081         }
1082
1083         /* Do we need global communication ? */
1084         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1085                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1086
1087         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1088                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1089                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1090         if (fr->useMts && !do_per_step(step, ir->nstfout))
1091         {
1092             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1093         }
1094
1095         if (shellfc)
1096         {
1097             /* Now is the time to relax the shells */
1098             relax_shell_flexcon(fplog,
1099                                 cr,
1100                                 ms,
1101                                 mdrunOptions.verbose,
1102                                 enforcedRotation,
1103                                 step,
1104                                 ir,
1105                                 imdSession,
1106                                 pull_work,
1107                                 bNS,
1108                                 force_flags,
1109                                 &top,
1110                                 constr,
1111                                 enerd,
1112                                 state->natoms,
1113                                 state->x.arrayRefWithPadding(),
1114                                 state->v.arrayRefWithPadding(),
1115                                 state->box,
1116                                 state->lambda,
1117                                 &state->hist,
1118                                 &f.view(),
1119                                 force_vir,
1120                                 mdatoms,
1121                                 nrnb,
1122                                 wcycle,
1123                                 shellfc,
1124                                 fr,
1125                                 runScheduleWork,
1126                                 t,
1127                                 mu_tot,
1128                                 vsite,
1129                                 ddBalanceRegionHandler);
1130         }
1131         else
1132         {
1133             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1134                is updated (or the AWH update will be performed twice for one step when continuing).
1135                It would be best to call this update function from do_md_trajectory_writing but that
1136                would occur after do_force. One would have to divide the update_awh function into one
1137                function applying the AWH force and one doing the AWH bias update. The update AWH
1138                bias function could then be called after do_md_trajectory_writing (then containing
1139                update_awh_history). The checkpointing will in the future probably moved to the start
1140                of the md loop which will rid of this issue. */
1141             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1142             {
1143                 awh->updateHistory(state_global->awhHistory.get());
1144             }
1145
1146             /* The coordinates (x) are shifted (to get whole molecules)
1147              * in do_force.
1148              * This is parallellized as well, and does communication too.
1149              * Check comments in sim_util.c
1150              */
1151             do_force(fplog,
1152                      cr,
1153                      ms,
1154                      ir,
1155                      awh.get(),
1156                      enforcedRotation,
1157                      imdSession,
1158                      pull_work,
1159                      step,
1160                      nrnb,
1161                      wcycle,
1162                      &top,
1163                      state->box,
1164                      state->x.arrayRefWithPadding(),
1165                      &state->hist,
1166                      &f.view(),
1167                      force_vir,
1168                      mdatoms,
1169                      enerd,
1170                      state->lambda,
1171                      fr,
1172                      runScheduleWork,
1173                      vsite,
1174                      mu_tot,
1175                      t,
1176                      ed ? ed->getLegacyED() : nullptr,
1177                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1178                      ddBalanceRegionHandler);
1179         }
1180
1181         // VV integrators do not need the following velocity half step
1182         // if it is the first step after starting from a checkpoint.
1183         // That is, the half step is needed on all other steps, and
1184         // also the first step when starting from a .tpr file.
1185         if (EI_VV(ir->eI))
1186         {
1187             integrateVVFirstStep(step,
1188                                  bFirstStep,
1189                                  bInitStep,
1190                                  startingBehavior,
1191                                  nstglobalcomm,
1192                                  ir,
1193                                  fr,
1194                                  cr,
1195                                  state,
1196                                  mdatoms,
1197                                  fcdata,
1198                                  &MassQ,
1199                                  &vcm,
1200                                  top_global,
1201                                  top,
1202                                  enerd,
1203                                  ekind,
1204                                  gstat,
1205                                  &last_ekin,
1206                                  bCalcVir,
1207                                  total_vir,
1208                                  shake_vir,
1209                                  force_vir,
1210                                  pres,
1211                                  M,
1212                                  do_log,
1213                                  do_ene,
1214                                  bCalcEner,
1215                                  bGStat,
1216                                  bStopCM,
1217                                  bTrotter,
1218                                  bExchanged,
1219                                  &bSumEkinhOld,
1220                                  &shouldCheckNumberOfBondedInteractions,
1221                                  &saved_conserved_quantity,
1222                                  &f,
1223                                  &upd,
1224                                  constr,
1225                                  &nullSignaller,
1226                                  trotter_seq,
1227                                  nrnb,
1228                                  mdlog,
1229                                  fplog,
1230                                  wcycle);
1231         }
1232
1233         /* ########  END FIRST UPDATE STEP  ############## */
1234         /* ########  If doing VV, we now have v(dt) ###### */
1235         if (bDoExpanded)
1236         {
1237             /* perform extended ensemble sampling in lambda - we don't
1238                actually move to the new state before outputting
1239                statistics, but if performing simulated tempering, we
1240                do update the velocities and the tau_t. */
1241
1242             lamnew = ExpandedEnsembleDynamics(
1243                     fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
1244             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1245             if (MASTER(cr))
1246             {
1247                 copy_df_history(state_global->dfhist, state->dfhist);
1248             }
1249         }
1250
1251         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1252         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1253         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1254             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1255                 || checkpointHandler->isCheckpointingStep()))
1256         {
1257             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1258             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1259         }
1260         // Copy velocities if needed for the output/checkpointing.
1261         // NOTE: Copy on the search steps is done at the beginning of the step.
1262         if (useGpuForUpdate && !bNS
1263             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1264         {
1265             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1266             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1267         }
1268         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1269         // and update is offloaded hence forces are kept on the GPU for update and have not been
1270         // already transferred in do_force().
1271         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1272         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1273         //       prior to GPU update.
1274         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1275         //       copy call in do_force(...).
1276         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1277         //       on host after the D2H copy in do_force(...).
1278         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1279             && do_per_step(step, ir->nstfout))
1280         {
1281             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1282             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1283         }
1284         /* Now we have the energies and forces corresponding to the
1285          * coordinates at time t. We must output all of this before
1286          * the update.
1287          */
1288         do_md_trajectory_writing(fplog,
1289                                  cr,
1290                                  nfile,
1291                                  fnm,
1292                                  step,
1293                                  step_rel,
1294                                  t,
1295                                  ir,
1296                                  state,
1297                                  state_global,
1298                                  observablesHistory,
1299                                  top_global,
1300                                  fr,
1301                                  outf,
1302                                  energyOutput,
1303                                  ekind,
1304                                  f.view().force(),
1305                                  checkpointHandler->isCheckpointingStep(),
1306                                  bRerunMD,
1307                                  bLastStep,
1308                                  mdrunOptions.writeConfout,
1309                                  bSumEkinhOld);
1310         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1311         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1312
1313         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1314         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1315             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1316         {
1317             copy_mat(state->svir_prev, shake_vir);
1318             copy_mat(state->fvir_prev, force_vir);
1319         }
1320
1321         stopHandler->setSignal();
1322         resetHandler->setSignal(walltime_accounting);
1323
1324         if (bGStat || !PAR(cr))
1325         {
1326             /* In parallel we only have to check for checkpointing in steps
1327              * where we do global communication,
1328              *  otherwise the other nodes don't know.
1329              */
1330             checkpointHandler->setSignal(walltime_accounting);
1331         }
1332
1333         /* #########   START SECOND UPDATE STEP ################# */
1334
1335         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1336            controlled in preprocessing */
1337
1338         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1339         {
1340             gmx_bool bIfRandomize;
1341             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1342             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1343             if (constr && bIfRandomize)
1344             {
1345                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1346             }
1347         }
1348         /* Box is changed in update() when we do pressure coupling,
1349          * but we should still use the old box for energy corrections and when
1350          * writing it to the energy file, so it matches the trajectory files for
1351          * the same timestep above. Make a copy in a separate array.
1352          */
1353         copy_mat(state->box, lastbox);
1354
1355         dvdl_constr = 0;
1356
1357         if (!useGpuForUpdate)
1358         {
1359             wallcycle_start(wcycle, ewcUPDATE);
1360         }
1361         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1362         if (bTrotter)
1363         {
1364             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1365             /* We can only do Berendsen coupling after we have summed
1366              * the kinetic energy or virial. Since the happens
1367              * in global_state after update, we should only do it at
1368              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1369              */
1370         }
1371         else
1372         {
1373             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1374             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1375         }
1376
1377         /* With leap-frog type integrators we compute the kinetic energy
1378          * at a whole time step as the average of the half-time step kinetic
1379          * energies of two subsequent steps. Therefore we need to compute the
1380          * half step kinetic energy also if we need energies at the next step.
1381          */
1382         const bool needHalfStepKineticEnergy =
1383                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1384
1385         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1386         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1387         const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
1388                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1389
1390         if (EI_VV(ir->eI))
1391         {
1392             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1393
1394             integrateVVSecondStep(step,
1395                                   ir,
1396                                   fr,
1397                                   cr,
1398                                   state,
1399                                   mdatoms,
1400                                   fcdata,
1401                                   &MassQ,
1402                                   &vcm,
1403                                   pull_work,
1404                                   enerd,
1405                                   ekind,
1406                                   gstat,
1407                                   &dvdl_constr,
1408                                   bCalcVir,
1409                                   total_vir,
1410                                   shake_vir,
1411                                   force_vir,
1412                                   pres,
1413                                   M,
1414                                   lastbox,
1415                                   do_log,
1416                                   do_ene,
1417                                   bGStat,
1418                                   &bSumEkinhOld,
1419                                   &f,
1420                                   &cbuf,
1421                                   &upd,
1422                                   constr,
1423                                   &nullSignaller,
1424                                   trotter_seq,
1425                                   nrnb,
1426                                   wcycle);
1427         }
1428         else
1429         {
1430             if (useGpuForUpdate)
1431             {
1432
1433                 wallcycle_stop(wcycle, ewcUPDATE);
1434
1435                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1436                 {
1437                     integrator->set(stateGpu->getCoordinates(),
1438                                     stateGpu->getVelocities(),
1439                                     stateGpu->getForces(),
1440                                     top.idef,
1441                                     *mdatoms,
1442                                     ekind->ngtc);
1443
1444                     // Copy data to the GPU after buffers might have being reinitialized
1445                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1446                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1447                 }
1448
1449                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1450                     && !thisRankHasDuty(cr, DUTY_PME))
1451                 {
1452                     // The PME forces were recieved to the host, so have to be copied
1453                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1454                 }
1455                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1456                 {
1457                     // The buffer ops were not offloaded this step, so the forces are on the
1458                     // host and have to be copied
1459                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1460                 }
1461
1462                 const bool doTemperatureScaling =
1463                         (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1464
1465                 // This applies Leap-Frog, LINCS and SETTLE in succession
1466                 integrator->integrate(
1467                         stateGpu->getForcesReadyOnDeviceEvent(
1468                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1469                         ir->delta_t,
1470                         true,
1471                         bCalcVir,
1472                         shake_vir,
1473                         doTemperatureScaling,
1474                         ekind->tcstat,
1475                         doParrinelloRahman,
1476                         ir->nstpcouple * ir->delta_t,
1477                         M);
1478
1479                 // Copy velocities D2H after update if:
1480                 // - Globals are computed this step (includes the energy output steps).
1481                 // - Temperature is needed for the next step.
1482                 if (bGStat || needHalfStepKineticEnergy)
1483                 {
1484                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1485                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1486                 }
1487             }
1488             else
1489             {
1490                 /* With multiple time stepping we need to do an additional normal
1491                  * update step to obtain the virial, as the actual MTS integration
1492                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1493                  * Using that acceleration would result in a virial with the slow
1494                  * force contribution would be a factor mtsFactor too large.
1495                  */
1496                 if (fr->useMts && bCalcVir && constr != nullptr)
1497                 {
1498                     upd.update_for_constraint_virial(
1499                             *ir, *mdatoms, *state, f.view().forceWithPadding(), *ekind);
1500
1501                     constrain_coordinates(constr,
1502                                           do_log,
1503                                           do_ene,
1504                                           step,
1505                                           state,
1506                                           upd.xp()->arrayRefWithPadding(),
1507                                           &dvdl_constr,
1508                                           bCalcVir,
1509                                           shake_vir);
1510                 }
1511
1512                 ArrayRefWithPadding<const RVec> forceCombined =
1513                         (fr->useMts && step % ir->mtsLevels[1].stepFactor == 0)
1514                                 ? f.view().forceMtsCombinedWithPadding()
1515                                 : f.view().forceWithPadding();
1516                 upd.update_coords(
1517                         *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
1518
1519                 wallcycle_stop(wcycle, ewcUPDATE);
1520
1521                 constrain_coordinates(constr,
1522                                       do_log,
1523                                       do_ene,
1524                                       step,
1525                                       state,
1526                                       upd.xp()->arrayRefWithPadding(),
1527                                       &dvdl_constr,
1528                                       bCalcVir && !fr->useMts,
1529                                       shake_vir);
1530
1531                 upd.update_sd_second_half(
1532                         *ir, step, &dvdl_constr, mdatoms, state, cr, nrnb, wcycle, constr, do_log, do_ene);
1533                 upd.finish_update(*ir, mdatoms, state, wcycle, constr != nullptr);
1534             }
1535
1536             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1537             {
1538                 updatePrevStepPullCom(pull_work, state);
1539             }
1540
1541             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1542         }
1543
1544         if (vsite != nullptr)
1545         {
1546             wallcycle_start(wcycle, ewcVSITECONSTR);
1547             vsite->construct(state->x, ir->delta_t, state->v, state->box);
1548             wallcycle_stop(wcycle, ewcVSITECONSTR);
1549         }
1550
1551         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1552         /* With Leap-Frog we can skip compute_globals at
1553          * non-communication steps, but we need to calculate
1554          * the kinetic energy one step before communication.
1555          */
1556         {
1557             // Organize to do inter-simulation signalling on steps if
1558             // and when algorithms require it.
1559             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1560
1561             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1562             {
1563                 // Copy coordinates when needed to stop the CM motion.
1564                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1565                 {
1566                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1567                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1568                 }
1569                 // Since we're already communicating at this step, we
1570                 // can propagate intra-simulation signals. Note that
1571                 // check_nstglobalcomm has the responsibility for
1572                 // choosing the value of nstglobalcomm that is one way
1573                 // bGStat becomes true, so we can't get into a
1574                 // situation where e.g. checkpointing can't be
1575                 // signalled.
1576                 bool                doIntraSimSignal = true;
1577                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1578
1579                 compute_globals(gstat,
1580                                 cr,
1581                                 ir,
1582                                 fr,
1583                                 ekind,
1584                                 makeConstArrayRef(state->x),
1585                                 makeConstArrayRef(state->v),
1586                                 state->box,
1587                                 mdatoms,
1588                                 nrnb,
1589                                 &vcm,
1590                                 wcycle,
1591                                 enerd,
1592                                 force_vir,
1593                                 shake_vir,
1594                                 total_vir,
1595                                 pres,
1596                                 constr,
1597                                 &signaller,
1598                                 lastbox,
1599                                 &totalNumberOfBondedInteractions,
1600                                 &bSumEkinhOld,
1601                                 (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1602                                         | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1603                                         | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1604                                         | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1605                                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1606                                                                                  : 0));
1607                 checkNumberOfBondedInteractions(mdlog,
1608                                                 cr,
1609                                                 totalNumberOfBondedInteractions,
1610                                                 top_global,
1611                                                 &top,
1612                                                 makeConstArrayRef(state->x),
1613                                                 state->box,
1614                                                 &shouldCheckNumberOfBondedInteractions);
1615                 if (!EI_VV(ir->eI) && bStopCM)
1616                 {
1617                     process_and_stopcm_grp(
1618                             fplog, &vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
1619                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1620
1621                     // TODO: The special case of removing CM motion should be dealt more gracefully
1622                     if (useGpuForUpdate)
1623                     {
1624                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1625                         // Here we block until the H2D copy completes because event sync with the
1626                         // force kernels that use the coordinates on the next steps is not implemented
1627                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1628                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1629                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1630                         if (vcm.mode != ecmNO)
1631                         {
1632                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1633                         }
1634                     }
1635                 }
1636             }
1637         }
1638
1639         /* #############  END CALC EKIN AND PRESSURE ################# */
1640
1641         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1642            the virial that should probably be addressed eventually. state->veta has better properies,
1643            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1644            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1645
1646         if (ir->efep != efepNO && !EI_VV(ir->eI))
1647         {
1648             /* Sum up the foreign energy and dK/dl terms for md and sd.
1649                Currently done every step so that dH/dl is correct in the .edr */
1650             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1651         }
1652
1653         update_pcouple_after_coordinates(
1654                 fplog, step, ir, mdatoms, pres, force_vir, shake_vir, pressureCouplingMu, state, nrnb, upd.deform(), !useGpuForUpdate);
1655
1656         const bool doBerendsenPressureCoupling =
1657                 (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
1658         const bool doCRescalePressureCoupling =
1659                 (inputrec->epc == epcCRESCALE && do_per_step(step, inputrec->nstpcouple));
1660         if (useGpuForUpdate
1661             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1662         {
1663             integrator->scaleCoordinates(pressureCouplingMu);
1664             if (doCRescalePressureCoupling)
1665             {
1666                 matrix pressureCouplingInvMu;
1667                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1668                 integrator->scaleVelocities(pressureCouplingInvMu);
1669             }
1670             integrator->setPbc(PbcType::Xyz, state->box);
1671         }
1672
1673         /* ################# END UPDATE STEP 2 ################# */
1674         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1675
1676         /* The coordinates (x) were unshifted in update */
1677         if (!bGStat)
1678         {
1679             /* We will not sum ekinh_old,
1680              * so signal that we still have to do it.
1681              */
1682             bSumEkinhOld = TRUE;
1683         }
1684
1685         if (bCalcEner)
1686         {
1687             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1688
1689             /* use the directly determined last velocity, not actually the averaged half steps */
1690             if (bTrotter && ir->eI == eiVV)
1691             {
1692                 enerd->term[F_EKIN] = last_ekin;
1693             }
1694             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1695
1696             if (integratorHasConservedEnergyQuantity(ir))
1697             {
1698                 if (EI_VV(ir->eI))
1699                 {
1700                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1701                 }
1702                 else
1703                 {
1704                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1705                 }
1706             }
1707             /* #########  END PREPARING EDR OUTPUT  ###########  */
1708         }
1709
1710         /* Output stuff */
1711         if (MASTER(cr))
1712         {
1713             if (fplog && do_log && bDoExpanded)
1714             {
1715                 /* only needed if doing expanded ensemble */
1716                 PrintFreeEnergyInfoToFile(fplog,
1717                                           ir->fepvals,
1718                                           ir->expandedvals,
1719                                           ir->bSimTemp ? ir->simtempvals : nullptr,
1720                                           state_global->dfhist,
1721                                           state->fep_state,
1722                                           ir->nstlog,
1723                                           step);
1724             }
1725             if (bCalcEner)
1726             {
1727                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1728                                                  bCalcEnerStep,
1729                                                  t,
1730                                                  mdatoms->tmass,
1731                                                  enerd,
1732                                                  ir->fepvals,
1733                                                  ir->expandedvals,
1734                                                  lastbox,
1735                                                  PTCouplingArrays{ state->boxv,
1736                                                                    state->nosehoover_xi,
1737                                                                    state->nosehoover_vxi,
1738                                                                    state->nhpres_xi,
1739                                                                    state->nhpres_vxi },
1740                                                  state->fep_state,
1741                                                  shake_vir,
1742                                                  force_vir,
1743                                                  total_vir,
1744                                                  pres,
1745                                                  ekind,
1746                                                  mu_tot,
1747                                                  constr);
1748             }
1749             else
1750             {
1751                 energyOutput.recordNonEnergyStep();
1752             }
1753
1754             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1755             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1756
1757             if (doSimulatedAnnealing)
1758             {
1759                 gmx::EnergyOutput::printAnnealingTemperatures(
1760                         do_log ? fplog : nullptr, groups, &(ir->opts));
1761             }
1762             if (do_log || do_ene || do_dr || do_or)
1763             {
1764                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1765                                                    do_ene,
1766                                                    do_dr,
1767                                                    do_or,
1768                                                    do_log ? fplog : nullptr,
1769                                                    step,
1770                                                    t,
1771                                                    fr->fcdata.get(),
1772                                                    awh.get());
1773             }
1774             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1775             {
1776                 const bool isInitialOutput = false;
1777                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1778             }
1779
1780             if (ir->bPull)
1781             {
1782                 pull_print_output(pull_work, step, t);
1783             }
1784
1785             if (do_per_step(step, ir->nstlog))
1786             {
1787                 if (fflush(fplog) != 0)
1788                 {
1789                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1790                 }
1791             }
1792         }
1793         if (bDoExpanded)
1794         {
1795             /* Have to do this part _after_ outputting the logfile and the edr file */
1796             /* Gets written into the state at the beginning of next loop*/
1797             state->fep_state = lamnew;
1798         }
1799         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1800         {
1801             state->fep_state = awh->fepLambdaState();
1802         }
1803         /* Print the remaining wall clock time for the run */
1804         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1805         {
1806             if (shellfc)
1807             {
1808                 fprintf(stderr, "\n");
1809             }
1810             print_time(stderr, walltime_accounting, step, ir, cr);
1811         }
1812
1813         /* Ion/water position swapping.
1814          * Not done in last step since trajectory writing happens before this call
1815          * in the MD loop and exchanges would be lost anyway. */
1816         bNeedRepartition = FALSE;
1817         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep && do_per_step(step, ir->swap->nstswap))
1818         {
1819             bNeedRepartition = do_swapcoords(cr,
1820                                              step,
1821                                              t,
1822                                              ir,
1823                                              swap,
1824                                              wcycle,
1825                                              as_rvec_array(state->x.data()),
1826                                              state->box,
1827                                              MASTER(cr) && mdrunOptions.verbose,
1828                                              bRerunMD);
1829
1830             if (bNeedRepartition && DOMAINDECOMP(cr))
1831             {
1832                 dd_collect_state(cr->dd, state, state_global);
1833             }
1834         }
1835
1836         /* Replica exchange */
1837         bExchanged = FALSE;
1838         if (bDoReplEx)
1839         {
1840             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1841         }
1842
1843         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1844         {
1845             dd_partition_system(fplog,
1846                                 mdlog,
1847                                 step,
1848                                 cr,
1849                                 TRUE,
1850                                 1,
1851                                 state_global,
1852                                 *top_global,
1853                                 ir,
1854                                 imdSession,
1855                                 pull_work,
1856                                 state,
1857                                 &f,
1858                                 mdAtoms,
1859                                 &top,
1860                                 fr,
1861                                 vsite,
1862                                 constr,
1863                                 nrnb,
1864                                 wcycle,
1865                                 FALSE);
1866             shouldCheckNumberOfBondedInteractions = true;
1867             upd.setNumAtoms(state->natoms);
1868         }
1869
1870         bFirstStep = FALSE;
1871         bInitStep  = FALSE;
1872
1873         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1874         /* With all integrators, except VV, we need to retain the pressure
1875          * at the current step for coupling at the next step.
1876          */
1877         if ((state->flags & (1U << estPRES_PREV))
1878             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1879         {
1880             /* Store the pressure in t_state for pressure coupling
1881              * at the next MD step.
1882              */
1883             copy_mat(pres, state->pres_prev);
1884         }
1885
1886         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1887
1888         if ((membed != nullptr) && (!bLastStep))
1889         {
1890             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1891         }
1892
1893         cycles = wallcycle_stop(wcycle, ewcSTEP);
1894         if (DOMAINDECOMP(cr) && wcycle)
1895         {
1896             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1897         }
1898
1899         /* increase the MD step number */
1900         step++;
1901         step_rel++;
1902
1903 #if GMX_FAHCORE
1904         if (MASTER(cr))
1905         {
1906             fcReportProgress(ir->nsteps + ir->init_step, step);
1907         }
1908 #endif
1909
1910         resetHandler->resetCounters(
1911                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1912
1913         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1914         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1915     }
1916     /* End of main MD loop */
1917
1918     /* Closing TNG files can include compressing data. Therefore it is good to do that
1919      * before stopping the time measurements. */
1920     mdoutf_tng_close(outf);
1921
1922     /* Stop measuring walltime */
1923     walltime_accounting_end_time(walltime_accounting);
1924
1925     if (!thisRankHasDuty(cr, DUTY_PME))
1926     {
1927         /* Tell the PME only node to finish */
1928         gmx_pme_send_finish(cr);
1929     }
1930
1931     if (MASTER(cr))
1932     {
1933         if (ir->nstcalcenergy > 0)
1934         {
1935             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
1936
1937             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
1938             energyOutput.printAverages(fplog, groups);
1939         }
1940     }
1941     done_mdoutf(outf);
1942
1943     if (bPMETune)
1944     {
1945         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1946     }
1947
1948     done_shellfc(fplog, shellfc, step_rel);
1949
1950     if (useReplicaExchange && MASTER(cr))
1951     {
1952         print_replica_exchange_statistics(fplog, repl_ex);
1953     }
1954
1955     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1956
1957     global_stat_destroy(gstat);
1958 }