Explicitly destroy PME-PP GPU communication object
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53
54 #include "gromacs/awh/awh.h"
55 #include "gromacs/commandline/filenm.h"
56 #include "gromacs/domdec/collect.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/domdec_struct.h"
61 #include "gromacs/domdec/mdsetup.h"
62 #include "gromacs/domdec/partition.h"
63 #include "gromacs/essentialdynamics/edsam.h"
64 #include "gromacs/ewald/pme.h"
65 #include "gromacs/ewald/pme_load_balancing.h"
66 #include "gromacs/ewald/pme_pp_comm_gpu.h"
67 #include "gromacs/fileio/trxio.h"
68 #include "gromacs/gmxlib/network.h"
69 #include "gromacs/gmxlib/nrnb.h"
70 #include "gromacs/gpu_utils/gpu_utils.h"
71 #include "gromacs/imd/imd.h"
72 #include "gromacs/listed_forces/manage_threading.h"
73 #include "gromacs/math/functions.h"
74 #include "gromacs/math/utilities.h"
75 #include "gromacs/math/vec.h"
76 #include "gromacs/math/vectypes.h"
77 #include "gromacs/mdlib/checkpointhandler.h"
78 #include "gromacs/mdlib/compute_io.h"
79 #include "gromacs/mdlib/constr.h"
80 #include "gromacs/mdlib/ebin.h"
81 #include "gromacs/mdlib/enerdata_utils.h"
82 #include "gromacs/mdlib/energyoutput.h"
83 #include "gromacs/mdlib/expanded.h"
84 #include "gromacs/mdlib/force.h"
85 #include "gromacs/mdlib/force_flags.h"
86 #include "gromacs/mdlib/forcerec.h"
87 #include "gromacs/mdlib/md_support.h"
88 #include "gromacs/mdlib/mdatoms.h"
89 #include "gromacs/mdlib/mdoutf.h"
90 #include "gromacs/mdlib/membed.h"
91 #include "gromacs/mdlib/resethandler.h"
92 #include "gromacs/mdlib/sighandler.h"
93 #include "gromacs/mdlib/simulationsignal.h"
94 #include "gromacs/mdlib/stat.h"
95 #include "gromacs/mdlib/stophandler.h"
96 #include "gromacs/mdlib/tgroup.h"
97 #include "gromacs/mdlib/trajectory_writing.h"
98 #include "gromacs/mdlib/update.h"
99 #include "gromacs/mdlib/update_constrain_cuda.h"
100 #include "gromacs/mdlib/vcm.h"
101 #include "gromacs/mdlib/vsite.h"
102 #include "gromacs/mdrunutility/handlerestart.h"
103 #include "gromacs/mdrunutility/multisim.h"
104 #include "gromacs/mdrunutility/printtime.h"
105 #include "gromacs/mdtypes/awh_history.h"
106 #include "gromacs/mdtypes/awh_params.h"
107 #include "gromacs/mdtypes/commrec.h"
108 #include "gromacs/mdtypes/df_history.h"
109 #include "gromacs/mdtypes/energyhistory.h"
110 #include "gromacs/mdtypes/fcdata.h"
111 #include "gromacs/mdtypes/forcerec.h"
112 #include "gromacs/mdtypes/group.h"
113 #include "gromacs/mdtypes/inputrec.h"
114 #include "gromacs/mdtypes/interaction_const.h"
115 #include "gromacs/mdtypes/md_enums.h"
116 #include "gromacs/mdtypes/mdatom.h"
117 #include "gromacs/mdtypes/mdrunoptions.h"
118 #include "gromacs/mdtypes/observableshistory.h"
119 #include "gromacs/mdtypes/pullhistory.h"
120 #include "gromacs/mdtypes/simulation_workload.h"
121 #include "gromacs/mdtypes/state.h"
122 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
123 #include "gromacs/modularsimulator/energyelement.h"
124 #include "gromacs/nbnxm/gpu_data_mgmt.h"
125 #include "gromacs/nbnxm/nbnxm.h"
126 #include "gromacs/pbcutil/mshift.h"
127 #include "gromacs/pbcutil/pbc.h"
128 #include "gromacs/pulling/output.h"
129 #include "gromacs/pulling/pull.h"
130 #include "gromacs/swap/swapcoords.h"
131 #include "gromacs/timing/wallcycle.h"
132 #include "gromacs/timing/walltime_accounting.h"
133 #include "gromacs/topology/atoms.h"
134 #include "gromacs/topology/idef.h"
135 #include "gromacs/topology/mtop_util.h"
136 #include "gromacs/topology/topology.h"
137 #include "gromacs/trajectory/trajectoryframe.h"
138 #include "gromacs/utility/basedefinitions.h"
139 #include "gromacs/utility/cstringutil.h"
140 #include "gromacs/utility/fatalerror.h"
141 #include "gromacs/utility/logger.h"
142 #include "gromacs/utility/real.h"
143 #include "gromacs/utility/smalloc.h"
144
145 #include "legacysimulator.h"
146 #include "replicaexchange.h"
147 #include "shellfc.h"
148
149 #if GMX_FAHCORE
150 #include "corewrap.h"
151 #endif
152
153 using gmx::SimulationSignaller;
154
155 void gmx::LegacySimulator::do_md()
156 {
157     // TODO Historically, the EM and MD "integrators" used different
158     // names for the t_inputrec *parameter, but these must have the
159     // same name, now that it's a member of a struct. We use this ir
160     // alias to avoid a large ripple of nearly useless changes.
161     // t_inputrec is being replaced by IMdpOptionsProvider, so this
162     // will go away eventually.
163     t_inputrec                 *ir   = inputrec;
164     int64_t                     step, step_rel;
165     double                      t, t0 = ir->init_t, lam0[efptNR];
166     gmx_bool                    bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
167     gmx_bool                    bNS, bNStList, bStopCM,
168                                 bFirstStep, bInitStep, bLastStep = FALSE;
169     gmx_bool                    bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
170     gmx_bool                    do_ene, do_log, do_verbose;
171     gmx_bool                    bMasterState;
172     unsigned int                force_flags;
173     tensor                      force_vir     = {{0}}, shake_vir = {{0}}, total_vir = {{0}},
174                                 tmp_vir       = {{0}}, pres = {{0}};
175     int                         i, m;
176     rvec                        mu_tot;
177     matrix                      parrinellorahmanMu, M;
178     gmx_repl_ex_t               repl_ex = nullptr;
179     gmx_localtop_t              top;
180     PaddedHostVector<gmx::RVec> f {};
181     gmx_global_stat_t           gstat;
182     t_graph                    *graph = nullptr;
183     gmx_shellfc_t              *shellfc;
184     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
185     gmx_bool                    bTemp, bPres, bTrotter;
186     real                        dvdl_constr;
187     std::vector<RVec>           cbuf;
188     matrix                      lastbox;
189     int                         lamnew  = 0;
190     /* for FEP */
191     int                         nstfep = 0;
192     double                      cycles;
193     real                        saved_conserved_quantity = 0;
194     real                        last_ekin                = 0;
195     t_extmass                   MassQ;
196     char                        sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool              bPMETune         = FALSE;
200     gmx_bool              bPMETunePrinting = FALSE;
201
202     bool                  bInteractiveMDstep = false;
203
204     /* Domain decomposition could incorrectly miss a bonded
205        interaction, but checking for that requires a global
206        communication stage, which does not otherwise happen in DD
207        code. So we do that alongside the first global energy reduction
208        after a new DD is made. These variables handle whether the
209        check happens, and the result it returns. */
210     bool              shouldCheckNumberOfBondedInteractions = false;
211     int               totalNumberOfBondedInteractions       = -1;
212
213     SimulationSignals signals;
214     // Most global communnication stages don't propagate mdrun
215     // signals, and will use this object to achieve that.
216     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
217
218     if (!mdrunOptions.writeConfout)
219     {
220         // This is on by default, and the main known use case for
221         // turning it off is for convenience in benchmarking, which is
222         // something that should not show up in the general user
223         // interface.
224         GMX_LOG(mdlog.info).asParagraph().
225             appendText("The -noconfout functionality is deprecated, and may be removed in a future version.");
226     }
227
228     /* md-vv uses averaged full step velocities for T-control
229        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
230        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
231     bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
232
233     const bool bRerunMD      = false;
234
235     int        nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
236     bGStatEveryStep = (nstglobalcomm == 1);
237
238     SimulationGroups                  *groups = &top_global->groups;
239
240     std::unique_ptr<EssentialDynamics> ed = nullptr;
241     if (opt2bSet("-ei", nfile, fnm))
242     {
243         /* Initialize essential dynamics sampling */
244         ed = init_edsam(mdlog,
245                         opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm),
246                         top_global,
247                         ir, cr, constr,
248                         state_global, observablesHistory,
249                         oenv,
250                         startingBehavior);
251     }
252     else if (observablesHistory->edsamHistory)
253     {
254         gmx_fatal(FARGS,
255                   "The checkpoint is from a run with essential dynamics sampling, "
256                   "but the current run did not specify the -ei option. "
257                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
258     }
259
260     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
261     Update     upd(ir, deform);
262     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
263     if (startingBehavior != StartingBehavior::RestartWithAppending)
264     {
265         pleaseCiteCouplingAlgorithms(fplog, *ir);
266     }
267     gmx_mdoutf       *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir, top_global, oenv, wcycle,
268                                          startingBehavior);
269     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work, mdoutf_get_fp_dhdl(outf), false, mdModulesNotifier);
270
271     gstat = global_stat_init(ir);
272
273     /* Check for polarizable models and flexible constraints */
274     shellfc = init_shell_flexcon(fplog,
275                                  top_global, constr ? constr->numFlexibleConstraints() : 0,
276                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
277
278     {
279         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
280         if ((io > 2000) && MASTER(cr))
281         {
282             fprintf(stderr,
283                     "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
284                     io);
285         }
286     }
287
288     // Local state only becomes valid now.
289     std::unique_ptr<t_state> stateInstance;
290     t_state *                state;
291
292
293     auto mdatoms = mdAtoms->mdatoms();
294
295     std::unique_ptr<UpdateConstrainCuda> integrator;
296
297     if (DOMAINDECOMP(cr))
298     {
299         dd_init_local_top(*top_global, &top);
300
301         stateInstance = std::make_unique<t_state>();
302         state         = stateInstance.get();
303         dd_init_local_state(cr->dd, state_global, state);
304
305         /* Distribute the charge groups over the nodes from the master node */
306         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
307                             state_global, *top_global, ir, imdSession,
308                             pull_work,
309                             state, &f, mdAtoms, &top, fr,
310                             vsite, constr,
311                             nrnb, nullptr, FALSE);
312         shouldCheckNumberOfBondedInteractions = true;
313         upd.setNumAtoms(state->natoms);
314     }
315     else
316     {
317         state_change_natoms(state_global, state_global->natoms);
318         f.resizeWithPadding(state_global->natoms);
319         /* Copy the pointer to the global state */
320         state = state_global;
321
322         /* Generate and initialize new topology */
323         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr,
324                                   &graph, mdAtoms, constr, vsite, shellfc);
325
326         upd.setNumAtoms(state->natoms);
327     }
328
329 /*****************************************************************************************/
330 // TODO: The following block of code should be refactored, once:
331 //       1. We have the useGpuForBufferOps variable set and available here and in do_force(...)
332 //       2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
333 //          stream owned by the StatePropagatorDataGpu
334     const auto &simulationWork     = runScheduleWork->simulationWork;
335     const bool  useGpuForPme       = simulationWork.useGpuPme;
336     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
337     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
338     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
339     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
340
341
342     StatePropagatorDataGpu *stateGpu = fr->stateGpu;
343
344     if (useGpuForUpdate)
345     {
346         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "Domain decomposition is not supported with the GPU update.\n");
347         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
348                            "Either PME or short-ranged non-bonded interaction tasks must run on the GPU to use GPU update.\n");
349         GMX_RELEASE_ASSERT(ir->eI == eiMD, "Only the md integrator is supported with the GPU update.\n");
350         GMX_RELEASE_ASSERT(ir->etc != etcNOSEHOOVER, "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
351         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN, "Only Parrinello-Rahman pressure control is supported with the GPU update.\n");
352         GMX_RELEASE_ASSERT(!mdatoms->haveVsites, "Virtual sites are not supported with the GPU update.\n");
353         GMX_RELEASE_ASSERT(ed == nullptr, "Essential dynamics is not supported with the GPU update.\n");
354         GMX_RELEASE_ASSERT(!ir->bPull && !ir->pull, "Pulling is not supported with the GPU update.\n");
355         GMX_RELEASE_ASSERT(fcd->orires.nr == 0, "Orientation restraints are not supported with the GPU update.\n");
356         GMX_RELEASE_ASSERT(fcd->disres.npair  == 0, "Distance restraints are not supported with the GPU update.\n");
357         GMX_RELEASE_ASSERT(ir->efep == efepNO, "Free energy perturbations are not supported with the GPU update.");
358
359         if (constr != nullptr && constr->numConstraintsTotal() > 0)
360         {
361             GMX_LOG(mdlog.info).asParagraph().
362                 appendText("Updating coordinates and applying constraints on the GPU.");
363         }
364         else
365         {
366             GMX_LOG(mdlog.info).asParagraph().
367                 appendText("Updating coordinates on the GPU.");
368         }
369         integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
370     }
371
372     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
373     {
374         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
375     }
376     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
377     {
378         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
379     }
380     if (useGpuForUpdate)
381     {
382         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
383     }
384 /*****************************************************************************************/
385
386     // NOTE: The global state is no longer used at this point.
387     // But state_global is still used as temporary storage space for writing
388     // the global state to file and potentially for replica exchange.
389     // (Global topology should persist.)
390
391     update_mdatoms(mdatoms, state->lambda[efptMASS]);
392
393     if (ir->bExpanded)
394     {
395         /* Check nstexpanded here, because the grompp check was broken */
396         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
397         {
398             gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
399         }
400         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation,
401                                ir, state->dfhist);
402     }
403
404     if (MASTER(cr))
405     {
406         EnergyElement::initializeEnergyHistory(
407                 startingBehavior, observablesHistory, &energyOutput);
408     }
409
410     preparePrevStepPullCom(ir, pull_work, mdatoms, state, state_global, cr,
411                            startingBehavior != StartingBehavior::NewSimulation);
412
413     // TODO: Remove this by converting AWH into a ForceProvider
414     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingBehavior != StartingBehavior::NewSimulation,
415                                 shellfc != nullptr,
416                                 opt2fn("-awh", nfile, fnm), pull_work);
417
418     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
419     if (useReplicaExchange && MASTER(cr))
420     {
421         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir,
422                                         replExParams);
423     }
424     /* PME tuning is only supported in the Verlet scheme, with PME for
425      * Coulomb. It is not supported with only LJ PME. */
426     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) &&
427                 !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP);
428
429     pme_load_balancing_t *pme_loadbal      = nullptr;
430     if (bPMETune)
431     {
432         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box,
433                          *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu(),
434                          &bPMETunePrinting);
435     }
436
437     if (!ir->bContinuation)
438     {
439         if (state->flags & (1U << estV))
440         {
441             auto v = makeArrayRef(state->v);
442             /* Set the velocities of vsites, shells and frozen atoms to zero */
443             for (i = 0; i < mdatoms->homenr; i++)
444             {
445                 if (mdatoms->ptype[i] == eptVSite ||
446                     mdatoms->ptype[i] == eptShell)
447                 {
448                     clear_rvec(v[i]);
449                 }
450                 else if (mdatoms->cFREEZE)
451                 {
452                     for (m = 0; m < DIM; m++)
453                     {
454                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
455                         {
456                             v[i][m] = 0;
457                         }
458                     }
459                 }
460             }
461         }
462
463         if (constr)
464         {
465             /* Constrain the initial coordinates and velocities */
466             do_constrain_first(fplog, constr, ir, mdatoms,
467                                state->natoms,
468                                state->x.arrayRefWithPadding(),
469                                state->v.arrayRefWithPadding(),
470                                state->box, state->lambda[efptBONDED]);
471         }
472         if (vsite)
473         {
474             /* Construct the virtual sites for the initial configuration */
475             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr,
476                              top.idef.iparams, top.idef.il,
477                              fr->ePBC, fr->bMolPBC, cr, state->box);
478         }
479     }
480
481     if (ir->efep != efepNO)
482     {
483         /* Set free energy calculation frequency as the greatest common
484          * denominator of nstdhdl and repl_ex_nst. */
485         nstfep = ir->fepvals->nstdhdl;
486         if (ir->bExpanded)
487         {
488             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
489         }
490         if (useReplicaExchange)
491         {
492             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
493         }
494     }
495
496     /* Be REALLY careful about what flags you set here. You CANNOT assume
497      * this is the first step, since we might be restarting from a checkpoint,
498      * and in that case we should not do any modifications to the state.
499      */
500     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
501
502     // When restarting from a checkpoint, it can be appropriate to
503     // initialize ekind from quantities in the checkpoint. Otherwise,
504     // compute_globals must initialize ekind before the simulation
505     // starts/restarts. However, only the master rank knows what was
506     // found in the checkpoint file, so we have to communicate in
507     // order to coordinate the restart.
508     //
509     // TODO Consider removing this communication if/when checkpoint
510     // reading directly follows .tpr reading, because all ranks can
511     // agree on hasReadEkinState at that time.
512     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
513     if (PAR(cr))
514     {
515         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
516     }
517     if (hasReadEkinState)
518     {
519         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
520     }
521
522     unsigned int cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
523                                | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
524                                | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
525                                | (hasReadEkinState ? CGLO_READEKIN : 0));
526
527     bSumEkinhOld = FALSE;
528
529     t_vcm vcm(top_global->groups, *ir);
530     reportComRemovalInfo(fplog, vcm);
531
532     /* To minimize communication, compute_globals computes the COM velocity
533      * and the kinetic energy for the velocities without COM motion removed.
534      * Thus to get the kinetic energy without the COM contribution, we need
535      * to call compute_globals twice.
536      */
537     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
538     {
539         unsigned int cglo_flags_iteration = cglo_flags;
540         if (bStopCM && cgloIteration == 0)
541         {
542             cglo_flags_iteration |= CGLO_STOPCM;
543             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
544         }
545         compute_globals(gstat, cr, ir, fr, ekind,
546                         state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
547                         mdatoms, nrnb, &vcm,
548                         nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
549                         constr, &nullSignaller, state->box,
550                         &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration
551                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
552         if (cglo_flags_iteration & CGLO_STOPCM)
553         {
554             /* At initialization, do not pass x with acceleration-correction mode
555              * to avoid (incorrect) correction of the initial coordinates.
556              */
557             rvec *xPtr = nullptr;
558             if (vcm.mode != ecmLINEAR_ACCELERATION_CORRECTION)
559             {
560                 xPtr = state->x.rvec_array();
561             }
562             process_and_stopcm_grp(fplog, &vcm, *mdatoms, xPtr, state->v.rvec_array());
563             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
564         }
565     }
566     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
567                                     top_global, &top, state->x.rvec_array(), state->box,
568                                     &shouldCheckNumberOfBondedInteractions);
569     if (ir->eI == eiVVAK)
570     {
571         /* a second call to get the half step temperature initialized as well */
572         /* we do the same call as above, but turn the pressure off -- internally to
573            compute_globals, this is recognized as a velocity verlet half-step
574            kinetic energy calculation.  This minimized excess variables, but
575            perhaps loses some logic?*/
576
577         compute_globals(gstat, cr, ir, fr, ekind,
578                         state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
579                         mdatoms, nrnb, &vcm,
580                         nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
581                         constr, &nullSignaller, state->box,
582                         nullptr, &bSumEkinhOld,
583                         cglo_flags & ~CGLO_PRESSURE);
584     }
585
586     /* Calculate the initial half step temperature, and save the ekinh_old */
587     if (startingBehavior == StartingBehavior::NewSimulation)
588     {
589         for (i = 0; (i < ir->opts.ngtc); i++)
590         {
591             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
592         }
593     }
594
595     /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
596        temperature control */
597     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
598
599     if (MASTER(cr))
600     {
601         if (!ir->bContinuation)
602         {
603             if (constr && ir->eConstrAlg == econtLINCS)
604             {
605                 fprintf(fplog,
606                         "RMS relative constraint deviation after constraining: %.2e\n",
607                         constr->rmsd());
608             }
609             if (EI_STATE_VELOCITY(ir->eI))
610             {
611                 real temp = enerd->term[F_TEMP];
612                 if (ir->eI != eiVV)
613                 {
614                     /* Result of Ekin averaged over velocities of -half
615                      * and +half step, while we only have -half step here.
616                      */
617                     temp *= 2;
618                 }
619                 fprintf(fplog, "Initial temperature: %g K\n", temp);
620             }
621         }
622
623         char tbuf[20];
624         fprintf(stderr, "starting mdrun '%s'\n",
625                 *(top_global->name));
626         if (ir->nsteps >= 0)
627         {
628             sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
629         }
630         else
631         {
632             sprintf(tbuf, "%s", "infinite");
633         }
634         if (ir->init_step > 0)
635         {
636             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
637                     gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
638                     gmx_step_str(ir->init_step, sbuf2),
639                     ir->init_step*ir->delta_t);
640         }
641         else
642         {
643             fprintf(stderr, "%s steps, %s ps.\n",
644                     gmx_step_str(ir->nsteps, sbuf), tbuf);
645         }
646         fprintf(fplog, "\n");
647     }
648
649     walltime_accounting_start_time(walltime_accounting);
650     wallcycle_start(wcycle, ewcRUN);
651     print_start(fplog, cr, walltime_accounting, "mdrun");
652
653 #if GMX_FAHCORE
654     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
655     int chkpt_ret = fcCheckPointParallel( cr->nodeid,
656                                           NULL, 0);
657     if (chkpt_ret == 0)
658     {
659         gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
660     }
661 #endif
662
663     /***********************************************************
664      *
665      *             Loop over MD steps
666      *
667      ************************************************************/
668
669     bFirstStep       = TRUE;
670     /* Skip the first Nose-Hoover integration when we get the state from tpx */
671     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
672     bSumEkinhOld     = FALSE;
673     bExchanged       = FALSE;
674     bNeedRepartition = FALSE;
675
676     bool simulationsShareState = false;
677     int  nstSignalComm         = nstglobalcomm;
678     {
679         // TODO This implementation of ensemble orientation restraints is nasty because
680         // a user can't just do multi-sim with single-sim orientation restraints.
681         bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
682         bool awhUsesMultiSim         = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
683
684         // Replica exchange, ensemble restraints and AWH need all
685         // simulations to remain synchronized, so they need
686         // checkpoints and stop conditions to act on the same step, so
687         // the propagation of such signals must take place between
688         // simulations, not just within simulations.
689         // TODO: Make algorithm initializers set these flags.
690         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
691
692         if (simulationsShareState)
693         {
694             // Inter-simulation signal communication does not need to happen
695             // often, so we use a minimum of 200 steps to reduce overhead.
696             const int c_minimumInterSimulationSignallingInterval = 200;
697             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm;
698         }
699     }
700
701     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
702                 compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
703                 MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
704                 mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
705
706     auto checkpointHandler = std::make_unique<CheckpointHandler>(
707                 compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
708                 simulationsShareState, ir->nstlist == 0, MASTER(cr),
709                 mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period);
710
711     const bool resetCountersIsLocal = true;
712     auto       resetHandler         = std::make_unique<ResetHandler>(
713                 compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal,
714                 ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
715                 mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
716
717     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
718
719     step     = ir->init_step;
720     step_rel = 0;
721
722     // TODO extract this to new multi-simulation module
723     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
724     {
725         if (!multisim_int_all_are_equal(ms, ir->nsteps))
726         {
727             GMX_LOG(mdlog.warning).appendText(
728                     "Note: The number of steps is not consistent across multi simulations,\n"
729                     "but we are proceeding anyway!");
730         }
731         if (!multisim_int_all_are_equal(ms, ir->init_step))
732         {
733             GMX_LOG(mdlog.warning).appendText(
734                     "Note: The initial step is not consistent across multi simulations,\n"
735                     "but we are proceeding anyway!");
736         }
737     }
738
739     /* and stop now if we should */
740     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
741     while (!bLastStep)
742     {
743
744         /* Determine if this is a neighbor search step */
745         bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
746
747         if (bPMETune && bNStList)
748         {
749             // This has to be here because PME load balancing is called so early.
750             // TODO: Move to after all booleans are defined.
751             if (useGpuForUpdate && !bFirstStep)
752             {
753                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
754                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
755             }
756             /* PME grid + cut-off optimization with GPUs or PME nodes */
757             pme_loadbal_do(pme_loadbal, cr,
758                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
759                            fplog, mdlog,
760                            *ir, fr, state->box, state->x,
761                            wcycle,
762                            step, step_rel,
763                            &bPMETunePrinting);
764         }
765
766         wallcycle_start(wcycle, ewcSTEP);
767
768         bLastStep = (step_rel == ir->nsteps);
769         t         = t0 + step*ir->delta_t;
770
771         // TODO Refactor this, so that nstfep does not need a default value of zero
772         if (ir->efep != efepNO || ir->bSimTemp)
773         {
774             /* find and set the current lambdas */
775             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
776
777             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
778             bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
779             bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
780                             && (ir->bExpanded) && (step > 0) &&
781                             (startingBehavior == StartingBehavior::NewSimulation));
782         }
783
784         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep &&
785                      do_per_step(step, replExParams.exchangeInterval));
786
787         if (doSimulatedAnnealing)
788         {
789             update_annealing_target_temp(ir, t, &upd);
790         }
791
792         /* Stop Center of Mass motion */
793         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
794
795         /* Determine whether or not to do Neighbour Searching */
796         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
797
798         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
799
800         /* do_log triggers energy and virial calculation. Because this leads
801          * to different code paths, forces can be different. Thus for exact
802          * continuation we should avoid extra log output.
803          * Note that the || bLastStep can result in non-exact continuation
804          * beyond the last step. But we don't consider that to be an issue.
805          */
806         do_log     =
807             (do_per_step(step, ir->nstlog) ||
808              (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) ||
809              bLastStep);
810         do_verbose = mdrunOptions.verbose &&
811             (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
812
813         if (useGpuForUpdate && !bFirstStep)
814         {
815             // Copy velocities from the GPU when needed:
816             // - On search steps to keep copy on host (device buffers are reinitialized).
817             // - When needed for the output.
818             if (bNS || do_per_step(step, ir->nstvout))
819             {
820                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
821                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
822             }
823
824             // Copy coordinate from the GPU when needed at the search step.
825             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
826             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
827             if (bNS)
828             {
829                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
830                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
831             }
832         }
833
834         if (bNS && !(bFirstStep && ir->bContinuation))
835         {
836             bMasterState = FALSE;
837             /* Correct the new box if it is too skewed */
838             if (inputrecDynamicBox(ir))
839             {
840                 if (correct_box(fplog, step, state->box, graph))
841                 {
842                     bMasterState = TRUE;
843                 }
844             }
845             if (DOMAINDECOMP(cr) && bMasterState)
846             {
847                 dd_collect_state(cr->dd, state, state_global);
848             }
849
850             if (DOMAINDECOMP(cr))
851             {
852                 /* Repartition the domain decomposition */
853                 dd_partition_system(fplog, mdlog, step, cr,
854                                     bMasterState, nstglobalcomm,
855                                     state_global, *top_global, ir, imdSession,
856                                     pull_work,
857                                     state, &f, mdAtoms, &top, fr,
858                                     vsite, constr,
859                                     nrnb, wcycle,
860                                     do_verbose && !bPMETunePrinting);
861                 shouldCheckNumberOfBondedInteractions = true;
862                 upd.setNumAtoms(state->natoms);
863             }
864         }
865
866         if (MASTER(cr) && do_log)
867         {
868             energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
869         }
870
871         if (ir->efep != efepNO)
872         {
873             update_mdatoms(mdatoms, state->lambda[efptMASS]);
874         }
875
876         if (bExchanged)
877         {
878
879             /* We need the kinetic energy at minus the half step for determining
880              * the full step kinetic energy and possibly for T-coupling.*/
881             /* This may not be quite working correctly yet . . . . */
882             compute_globals(gstat, cr, ir, fr, ekind,
883                             state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
884                             mdatoms, nrnb, &vcm,
885                             wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
886                             constr, &nullSignaller, state->box,
887                             &totalNumberOfBondedInteractions, &bSumEkinhOld,
888                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
889             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
890                                             top_global, &top, state->x.rvec_array(), state->box,
891                                             &shouldCheckNumberOfBondedInteractions);
892         }
893         clear_mat(force_vir);
894
895         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
896
897         /* Determine the energy and pressure:
898          * at nstcalcenergy steps and at energy output steps (set below).
899          */
900         if (EI_VV(ir->eI) && (!bInitStep))
901         {
902             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
903             bCalcVir      = bCalcEnerStep ||
904                 (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
905         }
906         else
907         {
908             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
909             bCalcVir      = bCalcEnerStep ||
910                 (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
911         }
912         bCalcEner = bCalcEnerStep;
913
914         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
915
916         if (do_ene || do_log || bDoReplEx)
917         {
918             bCalcVir  = TRUE;
919             bCalcEner = TRUE;
920         }
921
922         /* Do we need global communication ? */
923         bGStat = (bCalcVir || bCalcEner || bStopCM ||
924                   do_per_step(step, nstglobalcomm) ||
925                   (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
926
927         force_flags = (GMX_FORCE_STATECHANGED |
928                        ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) |
929                        GMX_FORCE_ALLFORCES |
930                        (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
931                        (bCalcEner ? GMX_FORCE_ENERGY : 0) |
932                        (bDoFEP ? GMX_FORCE_DHDL : 0)
933                        );
934
935         if (shellfc)
936         {
937             /* Now is the time to relax the shells */
938             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
939                                 enforcedRotation, step,
940                                 ir, imdSession, pull_work, bNS, force_flags, &top,
941                                 constr, enerd, fcd,
942                                 state->natoms,
943                                 state->x.arrayRefWithPadding(),
944                                 state->v.arrayRefWithPadding(),
945                                 state->box,
946                                 state->lambda,
947                                 &state->hist,
948                                 f.arrayRefWithPadding(), force_vir, mdatoms,
949                                 nrnb, wcycle, graph,
950                                 shellfc, fr, runScheduleWork, t, mu_tot,
951                                 vsite,
952                                 ddBalanceRegionHandler);
953         }
954         else
955         {
956             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias is updated
957                (or the AWH update will be performed twice for one step when continuing). It would be best to
958                call this update function from do_md_trajectory_writing but that would occur after do_force.
959                One would have to divide the update_awh function into one function applying the AWH force
960                and one doing the AWH bias update. The update AWH bias function could then be called after
961                do_md_trajectory_writing (then containing update_awh_history).
962                The checkpointing will in the future probably moved to the start of the md loop which will
963                rid of this issue. */
964             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
965             {
966                 awh->updateHistory(state_global->awhHistory.get());
967             }
968
969             /* The coordinates (x) are shifted (to get whole molecules)
970              * in do_force.
971              * This is parallellized as well, and does communication too.
972              * Check comments in sim_util.c
973              */
974             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession,
975                      pull_work,
976                      step, nrnb, wcycle, &top,
977                      state->box, state->x.arrayRefWithPadding(), &state->hist,
978                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
979                      state->lambda, graph,
980                      fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
981                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
982                      ddBalanceRegionHandler);
983         }
984
985         // VV integrators do not need the following velocity half step
986         // if it is the first step after starting from a checkpoint.
987         // That is, the half step is needed on all other steps, and
988         // also the first step when starting from a .tpr file.
989         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
990         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
991         {
992             rvec *vbuf = nullptr;
993
994             wallcycle_start(wcycle, ewcUPDATE);
995             if (ir->eI == eiVV && bInitStep)
996             {
997                 /* if using velocity verlet with full time step Ekin,
998                  * take the first half step only to compute the
999                  * virial for the first step. From there,
1000                  * revert back to the initial coordinates
1001                  * so that the input is actually the initial step.
1002                  */
1003                 snew(vbuf, state->natoms);
1004                 copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */
1005             }
1006             else
1007             {
1008                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
1009                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
1010             }
1011
1012             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1013                           ekind, M, &upd, etrtVELOCITY1,
1014                           cr, constr);
1015
1016             wallcycle_stop(wcycle, ewcUPDATE);
1017             constrain_velocities(step, nullptr,
1018                                  state,
1019                                  shake_vir,
1020                                  constr,
1021                                  bCalcVir, do_log, do_ene);
1022             wallcycle_start(wcycle, ewcUPDATE);
1023             /* if VV, compute the pressure and constraints */
1024             /* For VV2, we strictly only need this if using pressure
1025              * control, but we really would like to have accurate pressures
1026              * printed out.
1027              * Think about ways around this in the future?
1028              * For now, keep this choice in comments.
1029              */
1030             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
1031             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
1032             bPres = TRUE;
1033             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
1034             if (bCalcEner && ir->eI == eiVVAK)
1035             {
1036                 bSumEkinhOld = TRUE;
1037             }
1038             /* for vv, the first half of the integration actually corresponds to the previous step.
1039                So we need information from the last step in the first half of the integration */
1040             if (bGStat || do_per_step(step-1, nstglobalcomm))
1041             {
1042                 wallcycle_stop(wcycle, ewcUPDATE);
1043                 compute_globals(gstat, cr, ir, fr, ekind,
1044                                 state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1045                                 mdatoms, nrnb, &vcm,
1046                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1047                                 constr, &nullSignaller, state->box,
1048                                 &totalNumberOfBondedInteractions, &bSumEkinhOld,
1049                                 (bGStat ? CGLO_GSTAT : 0)
1050                                 | (bCalcEner ? CGLO_ENERGY : 0)
1051                                 | (bTemp ? CGLO_TEMPERATURE : 0)
1052                                 | (bPres ? CGLO_PRESSURE : 0)
1053                                 | (bPres ? CGLO_CONSTRAINT : 0)
1054                                 | (bStopCM ? CGLO_STOPCM : 0)
1055                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
1056                                 | CGLO_SCALEEKIN
1057                                 );
1058                 /* explanation of above:
1059                    a) We compute Ekin at the full time step
1060                    if 1) we are using the AveVel Ekin, and it's not the
1061                    initial step, or 2) if we are using AveEkin, but need the full
1062                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1063                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1064                    EkinAveVel because it's needed for the pressure */
1065                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1066                                                 top_global, &top, state->x.rvec_array(), state->box,
1067                                                 &shouldCheckNumberOfBondedInteractions);
1068                 if (bStopCM)
1069                 {
1070                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
1071                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1072                 }
1073                 wallcycle_start(wcycle, ewcUPDATE);
1074             }
1075             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1076             if (!bInitStep)
1077             {
1078                 if (bTrotter)
1079                 {
1080                     m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
1081                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
1082
1083                     /* TODO This is only needed when we're about to write
1084                      * a checkpoint, because we use it after the restart
1085                      * (in a kludge?). But what should we be doing if
1086                      * the startingBehavior is NewSimulation or bInitStep are true? */
1087                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1088                     {
1089                         copy_mat(shake_vir, state->svir_prev);
1090                         copy_mat(force_vir, state->fvir_prev);
1091                     }
1092                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1093                     {
1094                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1095                         enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1096                         enerd->term[F_EKIN] = trace(ekind->ekin);
1097                     }
1098                 }
1099                 else if (bExchanged)
1100                 {
1101                     wallcycle_stop(wcycle, ewcUPDATE);
1102                     /* We need the kinetic energy at minus the half step for determining
1103                      * the full step kinetic energy and possibly for T-coupling.*/
1104                     /* This may not be quite working correctly yet . . . . */
1105                     compute_globals(gstat, cr, ir, fr, ekind,
1106                                     state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1107                                     mdatoms, nrnb, &vcm,
1108                                     wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
1109                                     constr, &nullSignaller, state->box,
1110                                     nullptr, &bSumEkinhOld,
1111                                     CGLO_GSTAT | CGLO_TEMPERATURE);
1112                     wallcycle_start(wcycle, ewcUPDATE);
1113                 }
1114             }
1115             /* if it's the initial step, we performed this first step just to get the constraint virial */
1116             if (ir->eI == eiVV && bInitStep)
1117             {
1118                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1119                 sfree(vbuf);
1120             }
1121             wallcycle_stop(wcycle, ewcUPDATE);
1122         }
1123
1124         /* compute the conserved quantity */
1125         if (EI_VV(ir->eI))
1126         {
1127             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1128             if (ir->eI == eiVV)
1129             {
1130                 last_ekin = enerd->term[F_EKIN];
1131             }
1132             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1133             {
1134                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1135             }
1136             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1137             if (ir->efep != efepNO)
1138             {
1139                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1140             }
1141         }
1142
1143         /* ########  END FIRST UPDATE STEP  ############## */
1144         /* ########  If doing VV, we now have v(dt) ###### */
1145         if (bDoExpanded)
1146         {
1147             /* perform extended ensemble sampling in lambda - we don't
1148                actually move to the new state before outputting
1149                statistics, but if performing simulated tempering, we
1150                do update the velocities and the tau_t. */
1151
1152             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
1153             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1154             if (MASTER(cr))
1155             {
1156                 copy_df_history(state_global->dfhist, state->dfhist);
1157             }
1158         }
1159
1160         // Copy coordinate from the GPU for the output if the update is offloaded and
1161         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1162         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork &&
1163             (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)))
1164         {
1165             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1166             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1167         }
1168         /* Now we have the energies and forces corresponding to the
1169          * coordinates at time t. We must output all of this before
1170          * the update.
1171          */
1172         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
1173                                  ir, state, state_global, observablesHistory,
1174                                  top_global, fr,
1175                                  outf, energyOutput, ekind, f,
1176                                  checkpointHandler->isCheckpointingStep(),
1177                                  bRerunMD, bLastStep,
1178                                  mdrunOptions.writeConfout,
1179                                  bSumEkinhOld);
1180         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1181         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1182
1183         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1184         if (startingBehavior != StartingBehavior::NewSimulation &&
1185             (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1186         {
1187             copy_mat(state->svir_prev, shake_vir);
1188             copy_mat(state->fvir_prev, force_vir);
1189         }
1190
1191         stopHandler->setSignal();
1192         resetHandler->setSignal(walltime_accounting);
1193
1194         if (bGStat || !PAR(cr))
1195         {
1196             /* In parallel we only have to check for checkpointing in steps
1197              * where we do global communication,
1198              *  otherwise the other nodes don't know.
1199              */
1200             checkpointHandler->setSignal(walltime_accounting);
1201         }
1202
1203         /* #########   START SECOND UPDATE STEP ################# */
1204
1205         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen controlled
1206            in preprocessing */
1207
1208         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1209         {
1210             gmx_bool bIfRandomize;
1211             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1212             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1213             if (constr && bIfRandomize)
1214             {
1215                 constrain_velocities(step, nullptr,
1216                                      state,
1217                                      tmp_vir,
1218                                      constr,
1219                                      bCalcVir, do_log, do_ene);
1220             }
1221         }
1222         /* Box is changed in update() when we do pressure coupling,
1223          * but we should still use the old box for energy corrections and when
1224          * writing it to the energy file, so it matches the trajectory files for
1225          * the same timestep above. Make a copy in a separate array.
1226          */
1227         copy_mat(state->box, lastbox);
1228
1229         dvdl_constr = 0;
1230
1231         wallcycle_start(wcycle, ewcUPDATE);
1232         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1233         if (bTrotter)
1234         {
1235             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1236             /* We can only do Berendsen coupling after we have summed
1237              * the kinetic energy or virial. Since the happens
1238              * in global_state after update, we should only do it at
1239              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1240              */
1241         }
1242         else
1243         {
1244             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1245             update_pcouple_before_coordinates(fplog, step, ir, state,
1246                                               parrinellorahmanMu, M,
1247                                               bInitStep);
1248         }
1249
1250         if (EI_VV(ir->eI))
1251         {
1252             /* velocity half-step update */
1253             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1254                           ekind, M, &upd, etrtVELOCITY2,
1255                           cr, constr);
1256         }
1257
1258         /* Above, initialize just copies ekinh into ekin,
1259          * it doesn't copy position (for VV),
1260          * and entire integrator for MD.
1261          */
1262
1263         if (ir->eI == eiVVAK)
1264         {
1265             cbuf.resize(state->x.size());
1266             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1267         }
1268
1269         /* With leap-frog type integrators we compute the kinetic energy
1270          * at a whole time step as the average of the half-time step kinetic
1271          * energies of two subsequent steps. Therefore we need to compute the
1272          * half step kinetic energy also if we need energies at the next step.
1273          */
1274         const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm));
1275
1276         if (useGpuForUpdate)
1277         {
1278             if (bNS)
1279             {
1280                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
1281                                 top.idef, *mdatoms, ekind->ngtc);
1282                 t_pbc pbc;
1283                 set_pbc(&pbc, epbcXYZ, state->box);
1284                 integrator->setPbc(&pbc);
1285
1286                 // Copy data to the GPU after buffers might have being reinitialized
1287                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1288                 stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1289             }
1290
1291             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::All);
1292
1293             // TODO: Use StepWorkload fields.
1294             bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
1295
1296             bool doTempCouple       = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1297             bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1298
1299             // This applies Leap-Frog, LINCS and SETTLE in succession
1300             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, useGpuFBufferOps),
1301                                   ir->delta_t, true, bCalcVir, shake_vir,
1302                                   doTempCouple, ekind->tcstat,
1303                                   doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
1304
1305             // Copy velocities D2H after update if:
1306             // - Globals are computed this step (includes the energy output steps).
1307             // - Temperature is needed for the next step.
1308             if (bGStat || needHalfStepKineticEnergy)
1309             {
1310                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1311                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1312             }
1313         }
1314         else
1315         {
1316             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1317                           ekind, M, &upd, etrtPOSITION, cr, constr);
1318
1319             wallcycle_stop(wcycle, ewcUPDATE);
1320
1321             constrain_coordinates(step, &dvdl_constr, state,
1322                                   shake_vir,
1323                                   &upd, constr,
1324                                   bCalcVir, do_log, do_ene);
1325
1326             update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
1327                                   cr, nrnb, wcycle, &upd, constr, do_log, do_ene);
1328             finish_update(ir, mdatoms,
1329                           state, graph,
1330                           nrnb, wcycle, &upd, constr);
1331         }
1332
1333         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1334         {
1335             updatePrevStepPullCom(pull_work, state);
1336         }
1337
1338         if (ir->eI == eiVVAK)
1339         {
1340             /* erase F_EKIN and F_TEMP here? */
1341             /* just compute the kinetic energy at the half step to perform a trotter step */
1342             compute_globals(gstat, cr, ir, fr, ekind,
1343                             state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1344                             mdatoms, nrnb, &vcm,
1345                             wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1346                             constr, &nullSignaller, lastbox,
1347                             nullptr, &bSumEkinhOld,
1348                             (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
1349                             );
1350             wallcycle_start(wcycle, ewcUPDATE);
1351             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1352             /* now we know the scaling, we can compute the positions again */
1353             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1354
1355             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1356                           ekind, M, &upd, etrtPOSITION, cr, constr);
1357             wallcycle_stop(wcycle, ewcUPDATE);
1358
1359             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1360             /* are the small terms in the shake_vir here due
1361              * to numerical errors, or are they important
1362              * physically? I'm thinking they are just errors, but not completely sure.
1363              * For now, will call without actually constraining, constr=NULL*/
1364             finish_update(ir, mdatoms,
1365                           state, graph,
1366                           nrnb, wcycle, &upd, nullptr);
1367         }
1368         if (EI_VV(ir->eI))
1369         {
1370             /* this factor or 2 correction is necessary
1371                because half of the constraint force is removed
1372                in the vv step, so we have to double it.  See
1373                the Redmine issue #1255.  It is not yet clear
1374                if the factor of 2 is exact, or just a very
1375                good approximation, and this will be
1376                investigated.  The next step is to see if this
1377                can be done adding a dhdl contribution from the
1378                rattle step, but this is somewhat more
1379                complicated with the current code. Will be
1380                investigated, hopefully for 4.6.3. However,
1381                this current solution is much better than
1382                having it completely wrong.
1383              */
1384             enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
1385         }
1386         else
1387         {
1388             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1389         }
1390
1391         if (vsite != nullptr)
1392         {
1393             wallcycle_start(wcycle, ewcVSITECONSTR);
1394             if (graph != nullptr)
1395             {
1396                 shift_self(graph, state->box, state->x.rvec_array());
1397             }
1398             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
1399                              top.idef.iparams, top.idef.il,
1400                              fr->ePBC, fr->bMolPBC, cr, state->box);
1401
1402             if (graph != nullptr)
1403             {
1404                 unshift_self(graph, state->box, state->x.rvec_array());
1405             }
1406             wallcycle_stop(wcycle, ewcVSITECONSTR);
1407         }
1408
1409         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1410         /* With Leap-Frog we can skip compute_globals at
1411          * non-communication steps, but we need to calculate
1412          * the kinetic energy one step before communication.
1413          */
1414         {
1415             // Organize to do inter-simulation signalling on steps if
1416             // and when algorithms require it.
1417             bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1418
1419             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1420             {
1421                 // Copy coordinates when needed to stop the CM motion.
1422                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
1423                 {
1424                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1425                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1426                 }
1427                 // Since we're already communicating at this step, we
1428                 // can propagate intra-simulation signals. Note that
1429                 // check_nstglobalcomm has the responsibility for
1430                 // choosing the value of nstglobalcomm that is one way
1431                 // bGStat becomes true, so we can't get into a
1432                 // situation where e.g. checkpointing can't be
1433                 // signalled.
1434                 bool                doIntraSimSignal = true;
1435                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1436
1437                 compute_globals(gstat, cr, ir, fr, ekind,
1438                                 state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1439                                 mdatoms, nrnb, &vcm,
1440                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1441                                 constr, &signaller,
1442                                 lastbox,
1443                                 &totalNumberOfBondedInteractions, &bSumEkinhOld,
1444                                 (bGStat ? CGLO_GSTAT : 0)
1445                                 | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1446                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1447                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1448                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
1449                                 | CGLO_CONSTRAINT
1450                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
1451                                 );
1452                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1453                                                 top_global, &top, state->x.rvec_array(), state->box,
1454                                                 &shouldCheckNumberOfBondedInteractions);
1455                 if (!EI_VV(ir->eI) && bStopCM)
1456                 {
1457                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
1458                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1459
1460                     // TODO: The special case of removing CM motion should be dealt more gracefully
1461                     if (useGpuForUpdate)
1462                     {
1463                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1464                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1465                     }
1466                 }
1467             }
1468         }
1469
1470         /* #############  END CALC EKIN AND PRESSURE ################# */
1471
1472         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1473            the virial that should probably be addressed eventually. state->veta has better properies,
1474            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1475            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1476
1477         if (ir->efep != efepNO && !EI_VV(ir->eI))
1478         {
1479             /* Sum up the foreign energy and dhdl terms for md and sd.
1480                Currently done every step so that dhdl is correct in the .edr */
1481             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1482         }
1483
1484         update_pcouple_after_coordinates(fplog, step, ir, mdatoms,
1485                                          pres, force_vir, shake_vir,
1486                                          parrinellorahmanMu,
1487                                          state, nrnb, &upd);
1488
1489         /* ################# END UPDATE STEP 2 ################# */
1490         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1491
1492         /* The coordinates (x) were unshifted in update */
1493         if (!bGStat)
1494         {
1495             /* We will not sum ekinh_old,
1496              * so signal that we still have to do it.
1497              */
1498             bSumEkinhOld = TRUE;
1499         }
1500
1501         if (bCalcEner)
1502         {
1503             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1504
1505             /* use the directly determined last velocity, not actually the averaged half steps */
1506             if (bTrotter && ir->eI == eiVV)
1507             {
1508                 enerd->term[F_EKIN] = last_ekin;
1509             }
1510             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1511
1512             if (integratorHasConservedEnergyQuantity(ir))
1513             {
1514                 if (EI_VV(ir->eI))
1515                 {
1516                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1517                 }
1518                 else
1519                 {
1520                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1521                 }
1522             }
1523             /* #########  END PREPARING EDR OUTPUT  ###########  */
1524         }
1525
1526         /* Output stuff */
1527         if (MASTER(cr))
1528         {
1529             if (fplog && do_log && bDoExpanded)
1530             {
1531                 /* only needed if doing expanded ensemble */
1532                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr,
1533                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1534             }
1535             if (bCalcEner)
1536             {
1537                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep,
1538                                                  t, mdatoms->tmass, enerd, state,
1539                                                  ir->fepvals, ir->expandedvals, lastbox,
1540                                                  shake_vir, force_vir, total_vir, pres,
1541                                                  ekind, mu_tot, constr);
1542             }
1543             else
1544             {
1545                 energyOutput.recordNonEnergyStep();
1546             }
1547
1548             gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
1549             gmx_bool do_or  = do_per_step(step, ir->nstorireout);
1550
1551             if (doSimulatedAnnealing)
1552             {
1553                 energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
1554             }
1555             if (do_log || do_ene || do_dr || do_or)
1556             {
1557                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1558                                                    do_log ? fplog : nullptr,
1559                                                    step, t,
1560                                                    fcd, awh.get());
1561             }
1562
1563             if (ir->bPull)
1564             {
1565                 pull_print_output(pull_work, step, t);
1566             }
1567
1568             if (do_per_step(step, ir->nstlog))
1569             {
1570                 if (fflush(fplog) != 0)
1571                 {
1572                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1573                 }
1574             }
1575         }
1576         if (bDoExpanded)
1577         {
1578             /* Have to do this part _after_ outputting the logfile and the edr file */
1579             /* Gets written into the state at the beginning of next loop*/
1580             state->fep_state = lamnew;
1581         }
1582         /* Print the remaining wall clock time for the run */
1583         if (isMasterSimMasterRank(ms, MASTER(cr)) &&
1584             (do_verbose || gmx_got_usr_signal()) &&
1585             !bPMETunePrinting)
1586         {
1587             if (shellfc)
1588             {
1589                 fprintf(stderr, "\n");
1590             }
1591             print_time(stderr, walltime_accounting, step, ir, cr);
1592         }
1593
1594         /* Ion/water position swapping.
1595          * Not done in last step since trajectory writing happens before this call
1596          * in the MD loop and exchanges would be lost anyway. */
1597         bNeedRepartition = FALSE;
1598         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
1599             do_per_step(step, ir->swap->nstswap))
1600         {
1601             bNeedRepartition = do_swapcoords(cr, step, t, ir, swap, wcycle,
1602                                              as_rvec_array(state->x.data()),
1603                                              state->box,
1604                                              MASTER(cr) && mdrunOptions.verbose,
1605                                              bRerunMD);
1606
1607             if (bNeedRepartition && DOMAINDECOMP(cr))
1608             {
1609                 dd_collect_state(cr->dd, state, state_global);
1610             }
1611         }
1612
1613         /* Replica exchange */
1614         bExchanged = FALSE;
1615         if (bDoReplEx)
1616         {
1617             bExchanged = replica_exchange(fplog, cr, ms, repl_ex,
1618                                           state_global, enerd,
1619                                           state, step, t);
1620         }
1621
1622         if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
1623         {
1624             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1,
1625                                 state_global, *top_global, ir, imdSession,
1626                                 pull_work,
1627                                 state, &f, mdAtoms, &top, fr,
1628                                 vsite, constr,
1629                                 nrnb, wcycle, FALSE);
1630             shouldCheckNumberOfBondedInteractions = true;
1631             upd.setNumAtoms(state->natoms);
1632         }
1633
1634         bFirstStep             = FALSE;
1635         bInitStep              = FALSE;
1636
1637         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1638         /* With all integrators, except VV, we need to retain the pressure
1639          * at the current step for coupling at the next step.
1640          */
1641         if ((state->flags & (1U<<estPRES_PREV)) &&
1642             (bGStatEveryStep ||
1643              (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1644         {
1645             /* Store the pressure in t_state for pressure coupling
1646              * at the next MD step.
1647              */
1648             copy_mat(pres, state->pres_prev);
1649         }
1650
1651         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1652
1653         if ( (membed != nullptr) && (!bLastStep) )
1654         {
1655             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1656         }
1657
1658         cycles = wallcycle_stop(wcycle, ewcSTEP);
1659         if (DOMAINDECOMP(cr) && wcycle)
1660         {
1661             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1662         }
1663
1664         /* increase the MD step number */
1665         step++;
1666         step_rel++;
1667
1668         resetHandler->resetCounters(
1669                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(),
1670                 nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1671
1672         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1673         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1674
1675     }
1676     /* End of main MD loop */
1677
1678     /* Closing TNG files can include compressing data. Therefore it is good to do that
1679      * before stopping the time measurements. */
1680     mdoutf_tng_close(outf);
1681
1682     /* Stop measuring walltime */
1683     walltime_accounting_end_time(walltime_accounting);
1684
1685     if (!thisRankHasDuty(cr, DUTY_PME))
1686     {
1687         /* Tell the PME only node to finish */
1688         gmx_pme_send_finish(cr);
1689     }
1690
1691     if (MASTER(cr))
1692     {
1693         if (ir->nstcalcenergy > 0)
1694         {
1695             energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
1696             energyOutput.printAverages(fplog, groups);
1697         }
1698     }
1699     done_mdoutf(outf);
1700
1701     if (bPMETune)
1702     {
1703         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1704     }
1705
1706     done_shellfc(fplog, shellfc, step_rel);
1707
1708     if (useReplicaExchange && MASTER(cr))
1709     {
1710         print_replica_exchange_statistics(fplog, repl_ex);
1711     }
1712
1713     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1714
1715     if (fr->pmePpCommGpu)
1716     {
1717         // destroy object since it is no longer required. (This needs to be done while the GPU context still exists.)
1718         fr->pmePpCommGpu.reset();
1719     }
1720
1721     global_stat_destroy(gstat);
1722
1723 }