Merge "Merge branch 'release-2019' into master"
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53
54 #include "gromacs/awh/awh.h"
55 #include "gromacs/commandline/filenm.h"
56 #include "gromacs/domdec/collect.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/domdec_struct.h"
61 #include "gromacs/domdec/mdsetup.h"
62 #include "gromacs/domdec/partition.h"
63 #include "gromacs/essentialdynamics/edsam.h"
64 #include "gromacs/ewald/pme.h"
65 #include "gromacs/ewald/pme_load_balancing.h"
66 #include "gromacs/ewald/pme_pp_comm_gpu.h"
67 #include "gromacs/fileio/trxio.h"
68 #include "gromacs/gmxlib/network.h"
69 #include "gromacs/gmxlib/nrnb.h"
70 #include "gromacs/gpu_utils/gpu_utils.h"
71 #include "gromacs/imd/imd.h"
72 #include "gromacs/listed_forces/manage_threading.h"
73 #include "gromacs/math/functions.h"
74 #include "gromacs/math/utilities.h"
75 #include "gromacs/math/vec.h"
76 #include "gromacs/math/vectypes.h"
77 #include "gromacs/mdlib/checkpointhandler.h"
78 #include "gromacs/mdlib/compute_io.h"
79 #include "gromacs/mdlib/constr.h"
80 #include "gromacs/mdlib/ebin.h"
81 #include "gromacs/mdlib/enerdata_utils.h"
82 #include "gromacs/mdlib/energyoutput.h"
83 #include "gromacs/mdlib/expanded.h"
84 #include "gromacs/mdlib/force.h"
85 #include "gromacs/mdlib/force_flags.h"
86 #include "gromacs/mdlib/forcerec.h"
87 #include "gromacs/mdlib/md_support.h"
88 #include "gromacs/mdlib/mdatoms.h"
89 #include "gromacs/mdlib/mdoutf.h"
90 #include "gromacs/mdlib/membed.h"
91 #include "gromacs/mdlib/resethandler.h"
92 #include "gromacs/mdlib/sighandler.h"
93 #include "gromacs/mdlib/simulationsignal.h"
94 #include "gromacs/mdlib/stat.h"
95 #include "gromacs/mdlib/stophandler.h"
96 #include "gromacs/mdlib/tgroup.h"
97 #include "gromacs/mdlib/trajectory_writing.h"
98 #include "gromacs/mdlib/update.h"
99 #include "gromacs/mdlib/update_constrain_cuda.h"
100 #include "gromacs/mdlib/vcm.h"
101 #include "gromacs/mdlib/vsite.h"
102 #include "gromacs/mdrunutility/handlerestart.h"
103 #include "gromacs/mdrunutility/multisim.h"
104 #include "gromacs/mdrunutility/printtime.h"
105 #include "gromacs/mdtypes/awh_history.h"
106 #include "gromacs/mdtypes/awh_params.h"
107 #include "gromacs/mdtypes/commrec.h"
108 #include "gromacs/mdtypes/df_history.h"
109 #include "gromacs/mdtypes/energyhistory.h"
110 #include "gromacs/mdtypes/fcdata.h"
111 #include "gromacs/mdtypes/forcerec.h"
112 #include "gromacs/mdtypes/group.h"
113 #include "gromacs/mdtypes/inputrec.h"
114 #include "gromacs/mdtypes/interaction_const.h"
115 #include "gromacs/mdtypes/md_enums.h"
116 #include "gromacs/mdtypes/mdatom.h"
117 #include "gromacs/mdtypes/mdrunoptions.h"
118 #include "gromacs/mdtypes/observableshistory.h"
119 #include "gromacs/mdtypes/pullhistory.h"
120 #include "gromacs/mdtypes/simulation_workload.h"
121 #include "gromacs/mdtypes/state.h"
122 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
123 #include "gromacs/modularsimulator/energyelement.h"
124 #include "gromacs/nbnxm/gpu_data_mgmt.h"
125 #include "gromacs/nbnxm/nbnxm.h"
126 #include "gromacs/pbcutil/mshift.h"
127 #include "gromacs/pbcutil/pbc.h"
128 #include "gromacs/pulling/output.h"
129 #include "gromacs/pulling/pull.h"
130 #include "gromacs/swap/swapcoords.h"
131 #include "gromacs/timing/wallcycle.h"
132 #include "gromacs/timing/walltime_accounting.h"
133 #include "gromacs/topology/atoms.h"
134 #include "gromacs/topology/idef.h"
135 #include "gromacs/topology/mtop_util.h"
136 #include "gromacs/topology/topology.h"
137 #include "gromacs/trajectory/trajectoryframe.h"
138 #include "gromacs/utility/basedefinitions.h"
139 #include "gromacs/utility/cstringutil.h"
140 #include "gromacs/utility/fatalerror.h"
141 #include "gromacs/utility/logger.h"
142 #include "gromacs/utility/real.h"
143 #include "gromacs/utility/smalloc.h"
144
145 #include "legacysimulator.h"
146 #include "replicaexchange.h"
147 #include "shellfc.h"
148
149 #if GMX_FAHCORE
150 #include "corewrap.h"
151 #endif
152
153 using gmx::SimulationSignaller;
154
155 void gmx::LegacySimulator::do_md()
156 {
157     // TODO Historically, the EM and MD "integrators" used different
158     // names for the t_inputrec *parameter, but these must have the
159     // same name, now that it's a member of a struct. We use this ir
160     // alias to avoid a large ripple of nearly useless changes.
161     // t_inputrec is being replaced by IMdpOptionsProvider, so this
162     // will go away eventually.
163     t_inputrec                 *ir   = inputrec;
164     int64_t                     step, step_rel;
165     double                      t, t0 = ir->init_t, lam0[efptNR];
166     gmx_bool                    bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
167     gmx_bool                    bNS, bNStList, bStopCM,
168                                 bFirstStep, bInitStep, bLastStep = FALSE;
169     gmx_bool                    bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
170     gmx_bool                    do_ene, do_log, do_verbose;
171     gmx_bool                    bMasterState;
172     unsigned int                force_flags;
173     tensor                      force_vir     = {{0}}, shake_vir = {{0}}, total_vir = {{0}},
174                                 tmp_vir       = {{0}}, pres = {{0}};
175     int                         i, m;
176     rvec                        mu_tot;
177     matrix                      parrinellorahmanMu, M;
178     gmx_repl_ex_t               repl_ex = nullptr;
179     gmx_localtop_t              top;
180     PaddedHostVector<gmx::RVec> f {};
181     gmx_global_stat_t           gstat;
182     t_graph                    *graph = nullptr;
183     gmx_shellfc_t              *shellfc;
184     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
185     gmx_bool                    bTemp, bPres, bTrotter;
186     real                        dvdl_constr;
187     std::vector<RVec>           cbuf;
188     matrix                      lastbox;
189     int                         lamnew  = 0;
190     /* for FEP */
191     int                         nstfep = 0;
192     double                      cycles;
193     real                        saved_conserved_quantity = 0;
194     real                        last_ekin                = 0;
195     t_extmass                   MassQ;
196     char                        sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool              bPMETune         = FALSE;
200     gmx_bool              bPMETunePrinting = FALSE;
201
202     bool                  bInteractiveMDstep = false;
203
204     /* Domain decomposition could incorrectly miss a bonded
205        interaction, but checking for that requires a global
206        communication stage, which does not otherwise happen in DD
207        code. So we do that alongside the first global energy reduction
208        after a new DD is made. These variables handle whether the
209        check happens, and the result it returns. */
210     bool              shouldCheckNumberOfBondedInteractions = false;
211     int               totalNumberOfBondedInteractions       = -1;
212
213     SimulationSignals signals;
214     // Most global communnication stages don't propagate mdrun
215     // signals, and will use this object to achieve that.
216     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
217
218     if (!mdrunOptions.writeConfout)
219     {
220         // This is on by default, and the main known use case for
221         // turning it off is for convenience in benchmarking, which is
222         // something that should not show up in the general user
223         // interface.
224         GMX_LOG(mdlog.info).asParagraph().
225             appendText("The -noconfout functionality is deprecated, and may be removed in a future version.");
226     }
227
228     /* md-vv uses averaged full step velocities for T-control
229        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
230        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
231     bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
232
233     const bool bRerunMD      = false;
234
235     int        nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
236     bGStatEveryStep = (nstglobalcomm == 1);
237
238     SimulationGroups                  *groups = &top_global->groups;
239
240     std::unique_ptr<EssentialDynamics> ed = nullptr;
241     if (opt2bSet("-ei", nfile, fnm))
242     {
243         /* Initialize essential dynamics sampling */
244         ed = init_edsam(mdlog,
245                         opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm),
246                         top_global,
247                         ir, cr, constr,
248                         state_global, observablesHistory,
249                         oenv,
250                         startingBehavior);
251     }
252     else if (observablesHistory->edsamHistory)
253     {
254         gmx_fatal(FARGS,
255                   "The checkpoint is from a run with essential dynamics sampling, "
256                   "but the current run did not specify the -ei option. "
257                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
258     }
259
260     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
261     Update     upd(ir, deform);
262     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
263     if (startingBehavior != StartingBehavior::RestartWithAppending)
264     {
265         pleaseCiteCouplingAlgorithms(fplog, *ir);
266     }
267     gmx_mdoutf       *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir, top_global, oenv, wcycle,
268                                          startingBehavior);
269     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work, mdoutf_get_fp_dhdl(outf), false, mdModulesNotifier);
270
271     gstat = global_stat_init(ir);
272
273     /* Check for polarizable models and flexible constraints */
274     shellfc = init_shell_flexcon(fplog,
275                                  top_global, constr ? constr->numFlexibleConstraints() : 0,
276                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
277
278     {
279         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
280         if ((io > 2000) && MASTER(cr))
281         {
282             fprintf(stderr,
283                     "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
284                     io);
285         }
286     }
287
288     // Local state only becomes valid now.
289     std::unique_ptr<t_state> stateInstance;
290     t_state *                state;
291
292
293     auto mdatoms = mdAtoms->mdatoms();
294
295     std::unique_ptr<UpdateConstrainCuda> integrator;
296
297     if (DOMAINDECOMP(cr))
298     {
299         dd_init_local_top(*top_global, &top);
300
301         stateInstance = std::make_unique<t_state>();
302         state         = stateInstance.get();
303         dd_init_local_state(cr->dd, state_global, state);
304
305         /* Distribute the charge groups over the nodes from the master node */
306         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
307                             state_global, *top_global, ir, imdSession,
308                             pull_work,
309                             state, &f, mdAtoms, &top, fr,
310                             vsite, constr,
311                             nrnb, nullptr, FALSE);
312         shouldCheckNumberOfBondedInteractions = true;
313         upd.setNumAtoms(state->natoms);
314     }
315     else
316     {
317         state_change_natoms(state_global, state_global->natoms);
318         f.resizeWithPadding(state_global->natoms);
319         /* Copy the pointer to the global state */
320         state = state_global;
321
322         /* Generate and initialize new topology */
323         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr,
324                                   &graph, mdAtoms, constr, vsite, shellfc);
325
326         upd.setNumAtoms(state->natoms);
327     }
328
329 /*****************************************************************************************/
330 // TODO: The following block of code should be refactored, once:
331 //       1. We have the useGpuForBufferOps variable set and available here and in do_force(...)
332 //       2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
333 //          stream owned by the StatePropagatorDataGpu
334     const auto &simulationWork     = runScheduleWork->simulationWork;
335     const bool  useGpuForPme       = simulationWork.useGpuPme;
336     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
337     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
338     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
339     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
340
341
342     StatePropagatorDataGpu *stateGpu = fr->stateGpu;
343
344     if (useGpuForUpdate)
345     {
346         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "Domain decomposition is not supported with the GPU update.\n");
347         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
348                            "Either PME or short-ranged non-bonded interaction tasks must run on the GPU to use GPU update.\n");
349         GMX_RELEASE_ASSERT(ir->eI == eiMD, "Only the md integrator is supported with the GPU update.\n");
350         GMX_RELEASE_ASSERT(ir->etc != etcNOSEHOOVER, "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
351         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN, "Only Parrinello-Rahman pressure control is supported with the GPU update.\n");
352         GMX_RELEASE_ASSERT(!mdatoms->haveVsites, "Virtual sites are not supported with the GPU update.\n");
353         GMX_RELEASE_ASSERT(ed == nullptr, "Essential dynamics is not supported with the GPU update.\n");
354         GMX_RELEASE_ASSERT(!ir->bPull && !ir->pull, "Pulling is not supported with the GPU update.\n");
355         GMX_RELEASE_ASSERT(fcd->orires.nr == 0, "Orientation restraints are not supported with the GPU update.\n");
356         GMX_RELEASE_ASSERT(fcd->disres.npair  == 0, "Distance restraints are not supported with the GPU update.\n");
357         GMX_RELEASE_ASSERT(ir->efep == efepNO, "Free energy perturbations are not supported with the GPU update.");
358
359         if (constr != nullptr && constr->numConstraintsTotal() > 0)
360         {
361             GMX_LOG(mdlog.info).asParagraph().
362                 appendText("Updating coordinates and applying constraints on the GPU.");
363         }
364         else
365         {
366             GMX_LOG(mdlog.info).asParagraph().
367                 appendText("Updating coordinates on the GPU.");
368         }
369         integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
370     }
371
372     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
373     {
374         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
375     }
376     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
377     {
378         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
379     }
380     if (useGpuForUpdate)
381     {
382         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
383     }
384 /*****************************************************************************************/
385
386     // NOTE: The global state is no longer used at this point.
387     // But state_global is still used as temporary storage space for writing
388     // the global state to file and potentially for replica exchange.
389     // (Global topology should persist.)
390
391     update_mdatoms(mdatoms, state->lambda[efptMASS]);
392
393     if (ir->bExpanded)
394     {
395         /* Check nstexpanded here, because the grompp check was broken */
396         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
397         {
398             gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
399         }
400         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation,
401                                ir, state->dfhist);
402     }
403
404     if (MASTER(cr))
405     {
406         EnergyElement::initializeEnergyHistory(
407                 startingBehavior, observablesHistory, &energyOutput);
408     }
409
410     preparePrevStepPullCom(ir, pull_work, mdatoms, state, state_global, cr,
411                            startingBehavior != StartingBehavior::NewSimulation);
412
413     // TODO: Remove this by converting AWH into a ForceProvider
414     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingBehavior != StartingBehavior::NewSimulation,
415                                 shellfc != nullptr,
416                                 opt2fn("-awh", nfile, fnm), pull_work);
417
418     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
419     if (useReplicaExchange && MASTER(cr))
420     {
421         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir,
422                                         replExParams);
423     }
424     /* PME tuning is only supported in the Verlet scheme, with PME for
425      * Coulomb. It is not supported with only LJ PME. */
426     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) &&
427                 !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP);
428
429     pme_load_balancing_t *pme_loadbal      = nullptr;
430     if (bPMETune)
431     {
432         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box,
433                          *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu(),
434                          &bPMETunePrinting);
435     }
436
437     if (!ir->bContinuation)
438     {
439         if (state->flags & (1U << estV))
440         {
441             auto v = makeArrayRef(state->v);
442             /* Set the velocities of vsites, shells and frozen atoms to zero */
443             for (i = 0; i < mdatoms->homenr; i++)
444             {
445                 if (mdatoms->ptype[i] == eptVSite ||
446                     mdatoms->ptype[i] == eptShell)
447                 {
448                     clear_rvec(v[i]);
449                 }
450                 else if (mdatoms->cFREEZE)
451                 {
452                     for (m = 0; m < DIM; m++)
453                     {
454                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
455                         {
456                             v[i][m] = 0;
457                         }
458                     }
459                 }
460             }
461         }
462
463         if (constr)
464         {
465             /* Constrain the initial coordinates and velocities */
466             do_constrain_first(fplog, constr, ir, mdatoms,
467                                state->natoms,
468                                state->x.arrayRefWithPadding(),
469                                state->v.arrayRefWithPadding(),
470                                state->box, state->lambda[efptBONDED]);
471         }
472         if (vsite)
473         {
474             /* Construct the virtual sites for the initial configuration */
475             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr,
476                              top.idef.iparams, top.idef.il,
477                              fr->ePBC, fr->bMolPBC, cr, state->box);
478         }
479     }
480
481     if (ir->efep != efepNO)
482     {
483         /* Set free energy calculation frequency as the greatest common
484          * denominator of nstdhdl and repl_ex_nst. */
485         nstfep = ir->fepvals->nstdhdl;
486         if (ir->bExpanded)
487         {
488             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
489         }
490         if (useReplicaExchange)
491         {
492             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
493         }
494     }
495
496     /* Be REALLY careful about what flags you set here. You CANNOT assume
497      * this is the first step, since we might be restarting from a checkpoint,
498      * and in that case we should not do any modifications to the state.
499      */
500     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
501
502     // When restarting from a checkpoint, it can be appropriate to
503     // initialize ekind from quantities in the checkpoint. Otherwise,
504     // compute_globals must initialize ekind before the simulation
505     // starts/restarts. However, only the master rank knows what was
506     // found in the checkpoint file, so we have to communicate in
507     // order to coordinate the restart.
508     //
509     // TODO Consider removing this communication if/when checkpoint
510     // reading directly follows .tpr reading, because all ranks can
511     // agree on hasReadEkinState at that time.
512     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
513     if (PAR(cr))
514     {
515         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
516     }
517     if (hasReadEkinState)
518     {
519         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
520     }
521
522     unsigned int cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
523                                | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
524                                | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
525                                | (hasReadEkinState ? CGLO_READEKIN : 0));
526
527     bSumEkinhOld = FALSE;
528
529     t_vcm vcm(top_global->groups, *ir);
530     reportComRemovalInfo(fplog, vcm);
531
532     /* To minimize communication, compute_globals computes the COM velocity
533      * and the kinetic energy for the velocities without COM motion removed.
534      * Thus to get the kinetic energy without the COM contribution, we need
535      * to call compute_globals twice.
536      */
537     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
538     {
539         unsigned int cglo_flags_iteration = cglo_flags;
540         if (bStopCM && cgloIteration == 0)
541         {
542             cglo_flags_iteration |= CGLO_STOPCM;
543             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
544         }
545         compute_globals(gstat, cr, ir, fr, ekind,
546                         state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
547                         mdatoms, nrnb, &vcm,
548                         nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
549                         constr, &nullSignaller, state->box,
550                         &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration
551                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
552         if (cglo_flags_iteration & CGLO_STOPCM)
553         {
554             /* At initialization, do not pass x with acceleration-correction mode
555              * to avoid (incorrect) correction of the initial coordinates.
556              */
557             rvec *xPtr = nullptr;
558             if (vcm.mode != ecmLINEAR_ACCELERATION_CORRECTION)
559             {
560                 xPtr = state->x.rvec_array();
561             }
562             process_and_stopcm_grp(fplog, &vcm, *mdatoms, xPtr, state->v.rvec_array());
563             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
564         }
565     }
566     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
567                                     top_global, &top, state->x.rvec_array(), state->box,
568                                     &shouldCheckNumberOfBondedInteractions);
569     if (ir->eI == eiVVAK)
570     {
571         /* a second call to get the half step temperature initialized as well */
572         /* we do the same call as above, but turn the pressure off -- internally to
573            compute_globals, this is recognized as a velocity verlet half-step
574            kinetic energy calculation.  This minimized excess variables, but
575            perhaps loses some logic?*/
576
577         compute_globals(gstat, cr, ir, fr, ekind,
578                         state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
579                         mdatoms, nrnb, &vcm,
580                         nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
581                         constr, &nullSignaller, state->box,
582                         nullptr, &bSumEkinhOld,
583                         cglo_flags & ~CGLO_PRESSURE);
584     }
585
586     /* Calculate the initial half step temperature, and save the ekinh_old */
587     if (startingBehavior == StartingBehavior::NewSimulation)
588     {
589         for (i = 0; (i < ir->opts.ngtc); i++)
590         {
591             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
592         }
593     }
594
595     /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
596        temperature control */
597     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
598
599     if (MASTER(cr))
600     {
601         if (!ir->bContinuation)
602         {
603             if (constr && ir->eConstrAlg == econtLINCS)
604             {
605                 fprintf(fplog,
606                         "RMS relative constraint deviation after constraining: %.2e\n",
607                         constr->rmsd());
608             }
609             if (EI_STATE_VELOCITY(ir->eI))
610             {
611                 real temp = enerd->term[F_TEMP];
612                 if (ir->eI != eiVV)
613                 {
614                     /* Result of Ekin averaged over velocities of -half
615                      * and +half step, while we only have -half step here.
616                      */
617                     temp *= 2;
618                 }
619                 fprintf(fplog, "Initial temperature: %g K\n", temp);
620             }
621         }
622
623         char tbuf[20];
624         fprintf(stderr, "starting mdrun '%s'\n",
625                 *(top_global->name));
626         if (ir->nsteps >= 0)
627         {
628             sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
629         }
630         else
631         {
632             sprintf(tbuf, "%s", "infinite");
633         }
634         if (ir->init_step > 0)
635         {
636             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
637                     gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
638                     gmx_step_str(ir->init_step, sbuf2),
639                     ir->init_step*ir->delta_t);
640         }
641         else
642         {
643             fprintf(stderr, "%s steps, %s ps.\n",
644                     gmx_step_str(ir->nsteps, sbuf), tbuf);
645         }
646         fprintf(fplog, "\n");
647     }
648
649     walltime_accounting_start_time(walltime_accounting);
650     wallcycle_start(wcycle, ewcRUN);
651     print_start(fplog, cr, walltime_accounting, "mdrun");
652
653 #if GMX_FAHCORE
654     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
655     int chkpt_ret = fcCheckPointParallel( cr->nodeid,
656                                           NULL, 0);
657     if (chkpt_ret == 0)
658     {
659         gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
660     }
661 #endif
662
663     /***********************************************************
664      *
665      *             Loop over MD steps
666      *
667      ************************************************************/
668
669     bFirstStep       = TRUE;
670     /* Skip the first Nose-Hoover integration when we get the state from tpx */
671     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
672     bSumEkinhOld     = FALSE;
673     bExchanged       = FALSE;
674     bNeedRepartition = FALSE;
675
676     bool simulationsShareState = false;
677     int  nstSignalComm         = nstglobalcomm;
678     {
679         // TODO This implementation of ensemble orientation restraints is nasty because
680         // a user can't just do multi-sim with single-sim orientation restraints.
681         bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
682         bool awhUsesMultiSim         = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
683
684         // Replica exchange, ensemble restraints and AWH need all
685         // simulations to remain synchronized, so they need
686         // checkpoints and stop conditions to act on the same step, so
687         // the propagation of such signals must take place between
688         // simulations, not just within simulations.
689         // TODO: Make algorithm initializers set these flags.
690         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
691
692         if (simulationsShareState)
693         {
694             // Inter-simulation signal communication does not need to happen
695             // often, so we use a minimum of 200 steps to reduce overhead.
696             const int c_minimumInterSimulationSignallingInterval = 200;
697             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm;
698         }
699     }
700
701     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
702                 compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
703                 MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
704                 mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
705
706     auto checkpointHandler = std::make_unique<CheckpointHandler>(
707                 compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
708                 simulationsShareState, ir->nstlist == 0, MASTER(cr),
709                 mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period);
710
711     const bool resetCountersIsLocal = true;
712     auto       resetHandler         = std::make_unique<ResetHandler>(
713                 compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal,
714                 ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
715                 mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
716
717     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
718
719     step     = ir->init_step;
720     step_rel = 0;
721
722     // TODO extract this to new multi-simulation module
723     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
724     {
725         if (!multisim_int_all_are_equal(ms, ir->nsteps))
726         {
727             GMX_LOG(mdlog.warning).appendText(
728                     "Note: The number of steps is not consistent across multi simulations,\n"
729                     "but we are proceeding anyway!");
730         }
731         if (!multisim_int_all_are_equal(ms, ir->init_step))
732         {
733             GMX_LOG(mdlog.warning).appendText(
734                     "Note: The initial step is not consistent across multi simulations,\n"
735                     "but we are proceeding anyway!");
736         }
737     }
738
739     /* and stop now if we should */
740     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
741     while (!bLastStep)
742     {
743
744         /* Determine if this is a neighbor search step */
745         bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
746
747         if (bPMETune && bNStList)
748         {
749             // This has to be here because PME load balancing is called so early.
750             // TODO: Move to after all booleans are defined.
751             if (useGpuForUpdate && !bFirstStep)
752             {
753                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
754                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
755             }
756             /* PME grid + cut-off optimization with GPUs or PME nodes */
757             pme_loadbal_do(pme_loadbal, cr,
758                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
759                            fplog, mdlog,
760                            *ir, fr, state->box, state->x,
761                            wcycle,
762                            step, step_rel,
763                            &bPMETunePrinting);
764         }
765
766         wallcycle_start(wcycle, ewcSTEP);
767
768         bLastStep = (step_rel == ir->nsteps);
769         t         = t0 + step*ir->delta_t;
770
771         // TODO Refactor this, so that nstfep does not need a default value of zero
772         if (ir->efep != efepNO || ir->bSimTemp)
773         {
774             /* find and set the current lambdas */
775             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
776
777             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
778             bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
779             bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
780                             && (ir->bExpanded) && (step > 0) &&
781                             (startingBehavior == StartingBehavior::NewSimulation));
782         }
783
784         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep &&
785                      do_per_step(step, replExParams.exchangeInterval));
786
787         if (doSimulatedAnnealing)
788         {
789             update_annealing_target_temp(ir, t, &upd);
790         }
791
792         /* Stop Center of Mass motion */
793         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
794
795         /* Determine whether or not to do Neighbour Searching */
796         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
797
798         /* Note that the stopHandler will cause termination at nstglobalcomm
799          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
800          * nstpcouple steps, we have computed the half-step kinetic energy
801          * of the previous step and can always output energies at the last step.
802          */
803         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
804
805         /* do_log triggers energy and virial calculation. Because this leads
806          * to different code paths, forces can be different. Thus for exact
807          * continuation we should avoid extra log output.
808          * Note that the || bLastStep can result in non-exact continuation
809          * beyond the last step. But we don't consider that to be an issue.
810          */
811         do_log     =
812             (do_per_step(step, ir->nstlog) ||
813              (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) ||
814              bLastStep);
815         do_verbose = mdrunOptions.verbose &&
816             (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
817
818         if (useGpuForUpdate && !bFirstStep)
819         {
820             // Copy velocities from the GPU when needed:
821             // - On search steps to keep copy on host (device buffers are reinitialized).
822             // - When needed for the output.
823             if (bNS || do_per_step(step, ir->nstvout))
824             {
825                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
826                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
827             }
828
829             // Copy coordinate from the GPU when needed at the search step.
830             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
831             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
832             if (bNS)
833             {
834                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
835                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
836             }
837         }
838
839         if (bNS && !(bFirstStep && ir->bContinuation))
840         {
841             bMasterState = FALSE;
842             /* Correct the new box if it is too skewed */
843             if (inputrecDynamicBox(ir))
844             {
845                 if (correct_box(fplog, step, state->box, graph))
846                 {
847                     bMasterState = TRUE;
848                 }
849             }
850             if (DOMAINDECOMP(cr) && bMasterState)
851             {
852                 dd_collect_state(cr->dd, state, state_global);
853             }
854
855             if (DOMAINDECOMP(cr))
856             {
857                 /* Repartition the domain decomposition */
858                 dd_partition_system(fplog, mdlog, step, cr,
859                                     bMasterState, nstglobalcomm,
860                                     state_global, *top_global, ir, imdSession,
861                                     pull_work,
862                                     state, &f, mdAtoms, &top, fr,
863                                     vsite, constr,
864                                     nrnb, wcycle,
865                                     do_verbose && !bPMETunePrinting);
866                 shouldCheckNumberOfBondedInteractions = true;
867                 upd.setNumAtoms(state->natoms);
868             }
869         }
870
871         if (MASTER(cr) && do_log)
872         {
873             energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
874         }
875
876         if (ir->efep != efepNO)
877         {
878             update_mdatoms(mdatoms, state->lambda[efptMASS]);
879         }
880
881         if (bExchanged)
882         {
883
884             /* We need the kinetic energy at minus the half step for determining
885              * the full step kinetic energy and possibly for T-coupling.*/
886             /* This may not be quite working correctly yet . . . . */
887             compute_globals(gstat, cr, ir, fr, ekind,
888                             state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
889                             mdatoms, nrnb, &vcm,
890                             wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
891                             constr, &nullSignaller, state->box,
892                             &totalNumberOfBondedInteractions, &bSumEkinhOld,
893                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
894             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
895                                             top_global, &top, state->x.rvec_array(), state->box,
896                                             &shouldCheckNumberOfBondedInteractions);
897         }
898         clear_mat(force_vir);
899
900         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
901
902         /* Determine the energy and pressure:
903          * at nstcalcenergy steps and at energy output steps (set below).
904          */
905         if (EI_VV(ir->eI) && (!bInitStep))
906         {
907             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
908             bCalcVir      = bCalcEnerStep ||
909                 (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
910         }
911         else
912         {
913             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
914             bCalcVir      = bCalcEnerStep ||
915                 (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
916         }
917         bCalcEner = bCalcEnerStep;
918
919         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
920
921         if (do_ene || do_log || bDoReplEx)
922         {
923             bCalcVir  = TRUE;
924             bCalcEner = TRUE;
925         }
926
927         /* Do we need global communication ? */
928         bGStat = (bCalcVir || bCalcEner || bStopCM ||
929                   do_per_step(step, nstglobalcomm) ||
930                   (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
931
932         force_flags = (GMX_FORCE_STATECHANGED |
933                        ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) |
934                        GMX_FORCE_ALLFORCES |
935                        (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
936                        (bCalcEner ? GMX_FORCE_ENERGY : 0) |
937                        (bDoFEP ? GMX_FORCE_DHDL : 0)
938                        );
939
940         if (shellfc)
941         {
942             /* Now is the time to relax the shells */
943             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
944                                 enforcedRotation, step,
945                                 ir, imdSession, pull_work, bNS, force_flags, &top,
946                                 constr, enerd, fcd,
947                                 state->natoms,
948                                 state->x.arrayRefWithPadding(),
949                                 state->v.arrayRefWithPadding(),
950                                 state->box,
951                                 state->lambda,
952                                 &state->hist,
953                                 f.arrayRefWithPadding(), force_vir, mdatoms,
954                                 nrnb, wcycle, graph,
955                                 shellfc, fr, runScheduleWork, t, mu_tot,
956                                 vsite,
957                                 ddBalanceRegionHandler);
958         }
959         else
960         {
961             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias is updated
962                (or the AWH update will be performed twice for one step when continuing). It would be best to
963                call this update function from do_md_trajectory_writing but that would occur after do_force.
964                One would have to divide the update_awh function into one function applying the AWH force
965                and one doing the AWH bias update. The update AWH bias function could then be called after
966                do_md_trajectory_writing (then containing update_awh_history).
967                The checkpointing will in the future probably moved to the start of the md loop which will
968                rid of this issue. */
969             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
970             {
971                 awh->updateHistory(state_global->awhHistory.get());
972             }
973
974             /* The coordinates (x) are shifted (to get whole molecules)
975              * in do_force.
976              * This is parallellized as well, and does communication too.
977              * Check comments in sim_util.c
978              */
979             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession,
980                      pull_work,
981                      step, nrnb, wcycle, &top,
982                      state->box, state->x.arrayRefWithPadding(), &state->hist,
983                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
984                      state->lambda, graph,
985                      fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
986                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
987                      ddBalanceRegionHandler);
988         }
989
990         // VV integrators do not need the following velocity half step
991         // if it is the first step after starting from a checkpoint.
992         // That is, the half step is needed on all other steps, and
993         // also the first step when starting from a .tpr file.
994         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
995         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
996         {
997             rvec *vbuf = nullptr;
998
999             wallcycle_start(wcycle, ewcUPDATE);
1000             if (ir->eI == eiVV && bInitStep)
1001             {
1002                 /* if using velocity verlet with full time step Ekin,
1003                  * take the first half step only to compute the
1004                  * virial for the first step. From there,
1005                  * revert back to the initial coordinates
1006                  * so that the input is actually the initial step.
1007                  */
1008                 snew(vbuf, state->natoms);
1009                 copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */
1010             }
1011             else
1012             {
1013                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1014                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
1015             }
1016
1017             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1018                           ekind, M, &upd, etrtVELOCITY1,
1019                           cr, constr);
1020
1021             wallcycle_stop(wcycle, ewcUPDATE);
1022             constrain_velocities(step, nullptr,
1023                                  state,
1024                                  shake_vir,
1025                                  constr,
1026                                  bCalcVir, do_log, do_ene);
1027             wallcycle_start(wcycle, ewcUPDATE);
1028             /* if VV, compute the pressure and constraints */
1029             /* For VV2, we strictly only need this if using pressure
1030              * control, but we really would like to have accurate pressures
1031              * printed out.
1032              * Think about ways around this in the future?
1033              * For now, keep this choice in comments.
1034              */
1035             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1036             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1037             bPres = TRUE;
1038             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1039             if (bCalcEner && ir->eI == eiVVAK)
1040             {
1041                 bSumEkinhOld = TRUE;
1042             }
1043             /* for vv, the first half of the integration actually corresponds to the previous step.
1044                So we need information from the last step in the first half of the integration */
1045             if (bGStat || do_per_step(step-1, nstglobalcomm))
1046             {
1047                 wallcycle_stop(wcycle, ewcUPDATE);
1048                 compute_globals(gstat, cr, ir, fr, ekind,
1049                                 state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1050                                 mdatoms, nrnb, &vcm,
1051                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1052                                 constr, &nullSignaller, state->box,
1053                                 &totalNumberOfBondedInteractions, &bSumEkinhOld,
1054                                 (bGStat ? CGLO_GSTAT : 0)
1055                                 | (bCalcEner ? CGLO_ENERGY : 0)
1056                                 | (bTemp ? CGLO_TEMPERATURE : 0)
1057                                 | (bPres ? CGLO_PRESSURE : 0)
1058                                 | (bPres ? CGLO_CONSTRAINT : 0)
1059                                 | (bStopCM ? CGLO_STOPCM : 0)
1060                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
1061                                 | CGLO_SCALEEKIN
1062                                 );
1063                 /* explanation of above:
1064                    a) We compute Ekin at the full time step
1065                    if 1) we are using the AveVel Ekin, and it's not the
1066                    initial step, or 2) if we are using AveEkin, but need the full
1067                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1068                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1069                    EkinAveVel because it's needed for the pressure */
1070                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1071                                                 top_global, &top, state->x.rvec_array(), state->box,
1072                                                 &shouldCheckNumberOfBondedInteractions);
1073                 if (bStopCM)
1074                 {
1075                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
1076                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1077                 }
1078                 wallcycle_start(wcycle, ewcUPDATE);
1079             }
1080             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1081             if (!bInitStep)
1082             {
1083                 if (bTrotter)
1084                 {
1085                     m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
1086                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
1087
1088                     /* TODO This is only needed when we're about to write
1089                      * a checkpoint, because we use it after the restart
1090                      * (in a kludge?). But what should we be doing if
1091                      * the startingBehavior is NewSimulation or bInitStep are true? */
1092                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1093                     {
1094                         copy_mat(shake_vir, state->svir_prev);
1095                         copy_mat(force_vir, state->fvir_prev);
1096                     }
1097                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1098                     {
1099                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1100                         enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1101                         enerd->term[F_EKIN] = trace(ekind->ekin);
1102                     }
1103                 }
1104                 else if (bExchanged)
1105                 {
1106                     wallcycle_stop(wcycle, ewcUPDATE);
1107                     /* We need the kinetic energy at minus the half step for determining
1108                      * the full step kinetic energy and possibly for T-coupling.*/
1109                     /* This may not be quite working correctly yet . . . . */
1110                     compute_globals(gstat, cr, ir, fr, ekind,
1111                                     state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1112                                     mdatoms, nrnb, &vcm,
1113                                     wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
1114                                     constr, &nullSignaller, state->box,
1115                                     nullptr, &bSumEkinhOld,
1116                                     CGLO_GSTAT | CGLO_TEMPERATURE);
1117                     wallcycle_start(wcycle, ewcUPDATE);
1118                 }
1119             }
1120             /* if it's the initial step, we performed this first step just to get the constraint virial */
1121             if (ir->eI == eiVV && bInitStep)
1122             {
1123                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1124                 sfree(vbuf);
1125             }
1126             wallcycle_stop(wcycle, ewcUPDATE);
1127         }
1128
1129         /* compute the conserved quantity */
1130         if (EI_VV(ir->eI))
1131         {
1132             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1133             if (ir->eI == eiVV)
1134             {
1135                 last_ekin = enerd->term[F_EKIN];
1136             }
1137             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1138             {
1139                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1140             }
1141             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1142             if (ir->efep != efepNO)
1143             {
1144                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1145             }
1146         }
1147
1148         /* ########  END FIRST UPDATE STEP  ############## */
1149         /* ########  If doing VV, we now have v(dt) ###### */
1150         if (bDoExpanded)
1151         {
1152             /* perform extended ensemble sampling in lambda - we don't
1153                actually move to the new state before outputting
1154                statistics, but if performing simulated tempering, we
1155                do update the velocities and the tau_t. */
1156
1157             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
1158             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1159             if (MASTER(cr))
1160             {
1161                 copy_df_history(state_global->dfhist, state->dfhist);
1162             }
1163         }
1164
1165         // Copy coordinate from the GPU for the output if the update is offloaded and
1166         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1167         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork &&
1168             (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)))
1169         {
1170             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1171             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1172         }
1173         /* Now we have the energies and forces corresponding to the
1174          * coordinates at time t. We must output all of this before
1175          * the update.
1176          */
1177         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
1178                                  ir, state, state_global, observablesHistory,
1179                                  top_global, fr,
1180                                  outf, energyOutput, ekind, f,
1181                                  checkpointHandler->isCheckpointingStep(),
1182                                  bRerunMD, bLastStep,
1183                                  mdrunOptions.writeConfout,
1184                                  bSumEkinhOld);
1185         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1186         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1187
1188         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1189         if (startingBehavior != StartingBehavior::NewSimulation &&
1190             (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1191         {
1192             copy_mat(state->svir_prev, shake_vir);
1193             copy_mat(state->fvir_prev, force_vir);
1194         }
1195
1196         stopHandler->setSignal();
1197         resetHandler->setSignal(walltime_accounting);
1198
1199         if (bGStat || !PAR(cr))
1200         {
1201             /* In parallel we only have to check for checkpointing in steps
1202              * where we do global communication,
1203              *  otherwise the other nodes don't know.
1204              */
1205             checkpointHandler->setSignal(walltime_accounting);
1206         }
1207
1208         /* #########   START SECOND UPDATE STEP ################# */
1209
1210         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen controlled
1211            in preprocessing */
1212
1213         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1214         {
1215             gmx_bool bIfRandomize;
1216             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1217             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1218             if (constr && bIfRandomize)
1219             {
1220                 constrain_velocities(step, nullptr,
1221                                      state,
1222                                      tmp_vir,
1223                                      constr,
1224                                      bCalcVir, do_log, do_ene);
1225             }
1226         }
1227         /* Box is changed in update() when we do pressure coupling,
1228          * but we should still use the old box for energy corrections and when
1229          * writing it to the energy file, so it matches the trajectory files for
1230          * the same timestep above. Make a copy in a separate array.
1231          */
1232         copy_mat(state->box, lastbox);
1233
1234         dvdl_constr = 0;
1235
1236         wallcycle_start(wcycle, ewcUPDATE);
1237         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1238         if (bTrotter)
1239         {
1240             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1241             /* We can only do Berendsen coupling after we have summed
1242              * the kinetic energy or virial. Since the happens
1243              * in global_state after update, we should only do it at
1244              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1245              */
1246         }
1247         else
1248         {
1249             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1250             update_pcouple_before_coordinates(fplog, step, ir, state,
1251                                               parrinellorahmanMu, M,
1252                                               bInitStep);
1253         }
1254
1255         if (EI_VV(ir->eI))
1256         {
1257             /* velocity half-step update */
1258             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1259                           ekind, M, &upd, etrtVELOCITY2,
1260                           cr, constr);
1261         }
1262
1263         /* Above, initialize just copies ekinh into ekin,
1264          * it doesn't copy position (for VV),
1265          * and entire integrator for MD.
1266          */
1267
1268         if (ir->eI == eiVVAK)
1269         {
1270             cbuf.resize(state->x.size());
1271             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1272         }
1273
1274         /* With leap-frog type integrators we compute the kinetic energy
1275          * at a whole time step as the average of the half-time step kinetic
1276          * energies of two subsequent steps. Therefore we need to compute the
1277          * half step kinetic energy also if we need energies at the next step.
1278          */
1279         const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm));
1280
1281         if (useGpuForUpdate)
1282         {
1283             if (bNS)
1284             {
1285                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
1286                                 top.idef, *mdatoms, ekind->ngtc);
1287                 t_pbc pbc;
1288                 set_pbc(&pbc, epbcXYZ, state->box);
1289                 integrator->setPbc(&pbc);
1290
1291                 // Copy data to the GPU after buffers might have being reinitialized
1292                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1293                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1294             }
1295
1296             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::All);
1297
1298             // TODO: Use StepWorkload fields.
1299             bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
1300
1301             bool doTempCouple       = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1302             bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1303
1304             // This applies Leap-Frog, LINCS and SETTLE in succession
1305             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, useGpuFBufferOps),
1306                                   ir->delta_t, true, bCalcVir, shake_vir,
1307                                   doTempCouple, ekind->tcstat,
1308                                   doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
1309
1310             // Copy velocities D2H after update if:
1311             // - Globals are computed this step (includes the energy output steps).
1312             // - Temperature is needed for the next step.
1313             if (bGStat || needHalfStepKineticEnergy)
1314             {
1315                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1316                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1317             }
1318         }
1319         else
1320         {
1321             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1322                           ekind, M, &upd, etrtPOSITION, cr, constr);
1323
1324             wallcycle_stop(wcycle, ewcUPDATE);
1325
1326             constrain_coordinates(step, &dvdl_constr, state,
1327                                   shake_vir,
1328                                   &upd, constr,
1329                                   bCalcVir, do_log, do_ene);
1330
1331             update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
1332                                   cr, nrnb, wcycle, &upd, constr, do_log, do_ene);
1333             finish_update(ir, mdatoms,
1334                           state, graph,
1335                           nrnb, wcycle, &upd, constr);
1336         }
1337
1338         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1339         {
1340             updatePrevStepPullCom(pull_work, state);
1341         }
1342
1343         if (ir->eI == eiVVAK)
1344         {
1345             /* erase F_EKIN and F_TEMP here? */
1346             /* just compute the kinetic energy at the half step to perform a trotter step */
1347             compute_globals(gstat, cr, ir, fr, ekind,
1348                             state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1349                             mdatoms, nrnb, &vcm,
1350                             wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1351                             constr, &nullSignaller, lastbox,
1352                             nullptr, &bSumEkinhOld,
1353                             (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
1354                             );
1355             wallcycle_start(wcycle, ewcUPDATE);
1356             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1357             /* now we know the scaling, we can compute the positions again */
1358             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1359
1360             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1361                           ekind, M, &upd, etrtPOSITION, cr, constr);
1362             wallcycle_stop(wcycle, ewcUPDATE);
1363
1364             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1365             /* are the small terms in the shake_vir here due
1366              * to numerical errors, or are they important
1367              * physically? I'm thinking they are just errors, but not completely sure.
1368              * For now, will call without actually constraining, constr=NULL*/
1369             finish_update(ir, mdatoms,
1370                           state, graph,
1371                           nrnb, wcycle, &upd, nullptr);
1372         }
1373         if (EI_VV(ir->eI))
1374         {
1375             /* this factor or 2 correction is necessary
1376                because half of the constraint force is removed
1377                in the vv step, so we have to double it.  See
1378                the Redmine issue #1255.  It is not yet clear
1379                if the factor of 2 is exact, or just a very
1380                good approximation, and this will be
1381                investigated.  The next step is to see if this
1382                can be done adding a dhdl contribution from the
1383                rattle step, but this is somewhat more
1384                complicated with the current code. Will be
1385                investigated, hopefully for 4.6.3. However,
1386                this current solution is much better than
1387                having it completely wrong.
1388              */
1389             enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
1390         }
1391         else
1392         {
1393             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1394         }
1395
1396         if (vsite != nullptr)
1397         {
1398             wallcycle_start(wcycle, ewcVSITECONSTR);
1399             if (graph != nullptr)
1400             {
1401                 shift_self(graph, state->box, state->x.rvec_array());
1402             }
1403             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
1404                              top.idef.iparams, top.idef.il,
1405                              fr->ePBC, fr->bMolPBC, cr, state->box);
1406
1407             if (graph != nullptr)
1408             {
1409                 unshift_self(graph, state->box, state->x.rvec_array());
1410             }
1411             wallcycle_stop(wcycle, ewcVSITECONSTR);
1412         }
1413
1414         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1415         /* With Leap-Frog we can skip compute_globals at
1416          * non-communication steps, but we need to calculate
1417          * the kinetic energy one step before communication.
1418          */
1419         {
1420             // Organize to do inter-simulation signalling on steps if
1421             // and when algorithms require it.
1422             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1423
1424             // With leap-frog we also need to compute the half-step
1425             // kinetic energy at the step before we need to write
1426             // the full-step kinetic energy
1427             const bool needEkinAtNextStep =
1428                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) ||
1429                                     step_rel + 1 == ir->nsteps));
1430
1431             if (bGStat || needEkinAtNextStep || doInterSimSignal)
1432             {
1433                 // Copy coordinates when needed to stop the CM motion.
1434                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1435                 {
1436                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1437                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1438                 }
1439                 // Since we're already communicating at this step, we
1440                 // can propagate intra-simulation signals. Note that
1441                 // check_nstglobalcomm has the responsibility for
1442                 // choosing the value of nstglobalcomm that is one way
1443                 // bGStat becomes true, so we can't get into a
1444                 // situation where e.g. checkpointing can't be
1445                 // signalled.
1446                 bool                doIntraSimSignal = true;
1447                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1448
1449                 compute_globals(gstat, cr, ir, fr, ekind,
1450                                 state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1451                                 mdatoms, nrnb, &vcm,
1452                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1453                                 constr, &signaller,
1454                                 lastbox,
1455                                 &totalNumberOfBondedInteractions, &bSumEkinhOld,
1456                                 (bGStat ? CGLO_GSTAT : 0)
1457                                 | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1458                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1459                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1460                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
1461                                 | CGLO_CONSTRAINT
1462                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
1463                                 );
1464                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1465                                                 top_global, &top, state->x.rvec_array(), state->box,
1466                                                 &shouldCheckNumberOfBondedInteractions);
1467                 if (!EI_VV(ir->eI) && bStopCM)
1468                 {
1469                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
1470                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1471
1472                     // TODO: The special case of removing CM motion should be dealt more gracefully
1473                     if (useGpuForUpdate)
1474                     {
1475                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1476                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1477                     }
1478                 }
1479             }
1480         }
1481
1482         /* #############  END CALC EKIN AND PRESSURE ################# */
1483
1484         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1485            the virial that should probably be addressed eventually. state->veta has better properies,
1486            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1487            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1488
1489         if (ir->efep != efepNO && !EI_VV(ir->eI))
1490         {
1491             /* Sum up the foreign energy and dhdl terms for md and sd.
1492                Currently done every step so that dhdl is correct in the .edr */
1493             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1494         }
1495
1496         update_pcouple_after_coordinates(fplog, step, ir, mdatoms,
1497                                          pres, force_vir, shake_vir,
1498                                          parrinellorahmanMu,
1499                                          state, nrnb, &upd);
1500
1501         /* ################# END UPDATE STEP 2 ################# */
1502         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1503
1504         /* The coordinates (x) were unshifted in update */
1505         if (!bGStat)
1506         {
1507             /* We will not sum ekinh_old,
1508              * so signal that we still have to do it.
1509              */
1510             bSumEkinhOld = TRUE;
1511         }
1512
1513         if (bCalcEner)
1514         {
1515             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1516
1517             /* use the directly determined last velocity, not actually the averaged half steps */
1518             if (bTrotter && ir->eI == eiVV)
1519             {
1520                 enerd->term[F_EKIN] = last_ekin;
1521             }
1522             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1523
1524             if (integratorHasConservedEnergyQuantity(ir))
1525             {
1526                 if (EI_VV(ir->eI))
1527                 {
1528                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1529                 }
1530                 else
1531                 {
1532                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1533                 }
1534             }
1535             /* #########  END PREPARING EDR OUTPUT  ###########  */
1536         }
1537
1538         /* Output stuff */
1539         if (MASTER(cr))
1540         {
1541             if (fplog && do_log && bDoExpanded)
1542             {
1543                 /* only needed if doing expanded ensemble */
1544                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr,
1545                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1546             }
1547             if (bCalcEner)
1548             {
1549                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep,
1550                                                  t, mdatoms->tmass, enerd, state,
1551                                                  ir->fepvals, ir->expandedvals, lastbox,
1552                                                  shake_vir, force_vir, total_vir, pres,
1553                                                  ekind, mu_tot, constr);
1554             }
1555             else
1556             {
1557                 energyOutput.recordNonEnergyStep();
1558             }
1559
1560             gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
1561             gmx_bool do_or  = do_per_step(step, ir->nstorireout);
1562
1563             if (doSimulatedAnnealing)
1564             {
1565                 energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
1566             }
1567             if (do_log || do_ene || do_dr || do_or)
1568             {
1569                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1570                                                    do_log ? fplog : nullptr,
1571                                                    step, t,
1572                                                    fcd, awh.get());
1573             }
1574
1575             if (ir->bPull)
1576             {
1577                 pull_print_output(pull_work, step, t);
1578             }
1579
1580             if (do_per_step(step, ir->nstlog))
1581             {
1582                 if (fflush(fplog) != 0)
1583                 {
1584                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1585                 }
1586             }
1587         }
1588         if (bDoExpanded)
1589         {
1590             /* Have to do this part _after_ outputting the logfile and the edr file */
1591             /* Gets written into the state at the beginning of next loop*/
1592             state->fep_state = lamnew;
1593         }
1594         /* Print the remaining wall clock time for the run */
1595         if (isMasterSimMasterRank(ms, MASTER(cr)) &&
1596             (do_verbose || gmx_got_usr_signal()) &&
1597             !bPMETunePrinting)
1598         {
1599             if (shellfc)
1600             {
1601                 fprintf(stderr, "\n");
1602             }
1603             print_time(stderr, walltime_accounting, step, ir, cr);
1604         }
1605
1606         /* Ion/water position swapping.
1607          * Not done in last step since trajectory writing happens before this call
1608          * in the MD loop and exchanges would be lost anyway. */
1609         bNeedRepartition = FALSE;
1610         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
1611             do_per_step(step, ir->swap->nstswap))
1612         {
1613             bNeedRepartition = do_swapcoords(cr, step, t, ir, swap, wcycle,
1614                                              as_rvec_array(state->x.data()),
1615                                              state->box,
1616                                              MASTER(cr) && mdrunOptions.verbose,
1617                                              bRerunMD);
1618
1619             if (bNeedRepartition && DOMAINDECOMP(cr))
1620             {
1621                 dd_collect_state(cr->dd, state, state_global);
1622             }
1623         }
1624
1625         /* Replica exchange */
1626         bExchanged = FALSE;
1627         if (bDoReplEx)
1628         {
1629             bExchanged = replica_exchange(fplog, cr, ms, repl_ex,
1630                                           state_global, enerd,
1631                                           state, step, t);
1632         }
1633
1634         if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
1635         {
1636             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1,
1637                                 state_global, *top_global, ir, imdSession,
1638                                 pull_work,
1639                                 state, &f, mdAtoms, &top, fr,
1640                                 vsite, constr,
1641                                 nrnb, wcycle, FALSE);
1642             shouldCheckNumberOfBondedInteractions = true;
1643             upd.setNumAtoms(state->natoms);
1644         }
1645
1646         bFirstStep             = FALSE;
1647         bInitStep              = FALSE;
1648
1649         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1650         /* With all integrators, except VV, we need to retain the pressure
1651          * at the current step for coupling at the next step.
1652          */
1653         if ((state->flags & (1U<<estPRES_PREV)) &&
1654             (bGStatEveryStep ||
1655              (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1656         {
1657             /* Store the pressure in t_state for pressure coupling
1658              * at the next MD step.
1659              */
1660             copy_mat(pres, state->pres_prev);
1661         }
1662
1663         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1664
1665         if ( (membed != nullptr) && (!bLastStep) )
1666         {
1667             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1668         }
1669
1670         cycles = wallcycle_stop(wcycle, ewcSTEP);
1671         if (DOMAINDECOMP(cr) && wcycle)
1672         {
1673             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1674         }
1675
1676         /* increase the MD step number */
1677         step++;
1678         step_rel++;
1679
1680         resetHandler->resetCounters(
1681                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(),
1682                 nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1683
1684         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1685         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1686
1687     }
1688     /* End of main MD loop */
1689
1690     /* Closing TNG files can include compressing data. Therefore it is good to do that
1691      * before stopping the time measurements. */
1692     mdoutf_tng_close(outf);
1693
1694     /* Stop measuring walltime */
1695     walltime_accounting_end_time(walltime_accounting);
1696
1697     if (!thisRankHasDuty(cr, DUTY_PME))
1698     {
1699         /* Tell the PME only node to finish */
1700         gmx_pme_send_finish(cr);
1701     }
1702
1703     if (MASTER(cr))
1704     {
1705         if (ir->nstcalcenergy > 0)
1706         {
1707             energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
1708             energyOutput.printAverages(fplog, groups);
1709         }
1710     }
1711     done_mdoutf(outf);
1712
1713     if (bPMETune)
1714     {
1715         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1716     }
1717
1718     done_shellfc(fplog, shellfc, step_rel);
1719
1720     if (useReplicaExchange && MASTER(cr))
1721     {
1722         print_replica_exchange_statistics(fplog, repl_ex);
1723     }
1724
1725     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1726
1727     if (fr->pmePpCommGpu)
1728     {
1729         // destroy object since it is no longer required. (This needs to be done while the GPU context still exists.)
1730         fr->pmePpCommGpu.reset();
1731     }
1732
1733     global_stat_destroy(gstat);
1734
1735 }