StatePropagatorDataGpu object to manage GPU forces, positions and velocities buffers
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53
54 #include "gromacs/awh/awh.h"
55 #include "gromacs/commandline/filenm.h"
56 #include "gromacs/domdec/collect.h"
57 #include "gromacs/domdec/dlbtiming.h"
58 #include "gromacs/domdec/domdec.h"
59 #include "gromacs/domdec/domdec_network.h"
60 #include "gromacs/domdec/domdec_struct.h"
61 #include "gromacs/domdec/mdsetup.h"
62 #include "gromacs/domdec/partition.h"
63 #include "gromacs/essentialdynamics/edsam.h"
64 #include "gromacs/ewald/pme.h"
65 #include "gromacs/ewald/pme_load_balancing.h"
66 #include "gromacs/fileio/trxio.h"
67 #include "gromacs/gmxlib/network.h"
68 #include "gromacs/gmxlib/nrnb.h"
69 #include "gromacs/gpu_utils/gpu_utils.h"
70 #include "gromacs/imd/imd.h"
71 #include "gromacs/listed_forces/manage_threading.h"
72 #include "gromacs/math/functions.h"
73 #include "gromacs/math/utilities.h"
74 #include "gromacs/math/vec.h"
75 #include "gromacs/math/vectypes.h"
76 #include "gromacs/mdlib/checkpointhandler.h"
77 #include "gromacs/mdlib/compute_io.h"
78 #include "gromacs/mdlib/constr.h"
79 #include "gromacs/mdlib/ebin.h"
80 #include "gromacs/mdlib/enerdata_utils.h"
81 #include "gromacs/mdlib/energyoutput.h"
82 #include "gromacs/mdlib/expanded.h"
83 #include "gromacs/mdlib/force.h"
84 #include "gromacs/mdlib/force_flags.h"
85 #include "gromacs/mdlib/forcerec.h"
86 #include "gromacs/mdlib/md_support.h"
87 #include "gromacs/mdlib/mdatoms.h"
88 #include "gromacs/mdlib/mdoutf.h"
89 #include "gromacs/mdlib/membed.h"
90 #include "gromacs/mdlib/resethandler.h"
91 #include "gromacs/mdlib/sighandler.h"
92 #include "gromacs/mdlib/simulationsignal.h"
93 #include "gromacs/mdlib/stat.h"
94 #include "gromacs/mdlib/stophandler.h"
95 #include "gromacs/mdlib/tgroup.h"
96 #include "gromacs/mdlib/trajectory_writing.h"
97 #include "gromacs/mdlib/update.h"
98 #include "gromacs/mdlib/update_constrain_cuda.h"
99 #include "gromacs/mdlib/vcm.h"
100 #include "gromacs/mdlib/vsite.h"
101 #include "gromacs/mdrunutility/handlerestart.h"
102 #include "gromacs/mdrunutility/multisim.h"
103 #include "gromacs/mdrunutility/printtime.h"
104 #include "gromacs/mdtypes/awh_history.h"
105 #include "gromacs/mdtypes/awh_params.h"
106 #include "gromacs/mdtypes/commrec.h"
107 #include "gromacs/mdtypes/df_history.h"
108 #include "gromacs/mdtypes/energyhistory.h"
109 #include "gromacs/mdtypes/fcdata.h"
110 #include "gromacs/mdtypes/forcerec.h"
111 #include "gromacs/mdtypes/group.h"
112 #include "gromacs/mdtypes/inputrec.h"
113 #include "gromacs/mdtypes/interaction_const.h"
114 #include "gromacs/mdtypes/md_enums.h"
115 #include "gromacs/mdtypes/mdatom.h"
116 #include "gromacs/mdtypes/mdrunoptions.h"
117 #include "gromacs/mdtypes/observableshistory.h"
118 #include "gromacs/mdtypes/pullhistory.h"
119 #include "gromacs/mdtypes/state.h"
120 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
121 #include "gromacs/modularsimulator/energyelement.h"
122 #include "gromacs/nbnxm/gpu_data_mgmt.h"
123 #include "gromacs/nbnxm/nbnxm.h"
124 #include "gromacs/pbcutil/mshift.h"
125 #include "gromacs/pbcutil/pbc.h"
126 #include "gromacs/pulling/output.h"
127 #include "gromacs/pulling/pull.h"
128 #include "gromacs/swap/swapcoords.h"
129 #include "gromacs/timing/wallcycle.h"
130 #include "gromacs/timing/walltime_accounting.h"
131 #include "gromacs/topology/atoms.h"
132 #include "gromacs/topology/idef.h"
133 #include "gromacs/topology/mtop_util.h"
134 #include "gromacs/topology/topology.h"
135 #include "gromacs/trajectory/trajectoryframe.h"
136 #include "gromacs/utility/basedefinitions.h"
137 #include "gromacs/utility/cstringutil.h"
138 #include "gromacs/utility/fatalerror.h"
139 #include "gromacs/utility/logger.h"
140 #include "gromacs/utility/real.h"
141 #include "gromacs/utility/smalloc.h"
142
143 #include "legacysimulator.h"
144 #include "replicaexchange.h"
145 #include "shellfc.h"
146
147 #if GMX_FAHCORE
148 #include "corewrap.h"
149 #endif
150
151 using gmx::SimulationSignaller;
152
153 void gmx::LegacySimulator::do_md()
154 {
155     // TODO Historically, the EM and MD "integrators" used different
156     // names for the t_inputrec *parameter, but these must have the
157     // same name, now that it's a member of a struct. We use this ir
158     // alias to avoid a large ripple of nearly useless changes.
159     // t_inputrec is being replaced by IMdpOptionsProvider, so this
160     // will go away eventually.
161     t_inputrec                 *ir   = inputrec;
162     int64_t                     step, step_rel;
163     double                      t, t0 = ir->init_t, lam0[efptNR];
164     gmx_bool                    bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
165     gmx_bool                    bNS, bNStList, bStopCM,
166                                 bFirstStep, bInitStep, bLastStep = FALSE;
167     gmx_bool                    bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
168     gmx_bool                    do_ene, do_log, do_verbose;
169     gmx_bool                    bMasterState;
170     unsigned int                force_flags;
171     tensor                      force_vir     = {{0}}, shake_vir = {{0}}, total_vir = {{0}},
172                                 tmp_vir       = {{0}}, pres = {{0}};
173     int                         i, m;
174     rvec                        mu_tot;
175     matrix                      parrinellorahmanMu, M;
176     gmx_repl_ex_t               repl_ex = nullptr;
177     gmx_localtop_t              top;
178     PaddedHostVector<gmx::RVec> f {};
179     gmx_global_stat_t           gstat;
180     t_graph                    *graph = nullptr;
181     gmx_shellfc_t              *shellfc;
182     gmx_bool                    bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
183     gmx_bool                    bTemp, bPres, bTrotter;
184     real                        dvdl_constr;
185     std::vector<RVec>           cbuf;
186     matrix                      lastbox;
187     int                         lamnew  = 0;
188     /* for FEP */
189     int                         nstfep = 0;
190     double                      cycles;
191     real                        saved_conserved_quantity = 0;
192     real                        last_ekin                = 0;
193     t_extmass                   MassQ;
194     char                        sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
195
196     /* PME load balancing data for GPU kernels */
197     gmx_bool              bPMETune         = FALSE;
198     gmx_bool              bPMETunePrinting = FALSE;
199
200     bool                  bInteractiveMDstep = false;
201
202     /* Domain decomposition could incorrectly miss a bonded
203        interaction, but checking for that requires a global
204        communication stage, which does not otherwise happen in DD
205        code. So we do that alongside the first global energy reduction
206        after a new DD is made. These variables handle whether the
207        check happens, and the result it returns. */
208     bool              shouldCheckNumberOfBondedInteractions = false;
209     int               totalNumberOfBondedInteractions       = -1;
210
211     SimulationSignals signals;
212     // Most global communnication stages don't propagate mdrun
213     // signals, and will use this object to achieve that.
214     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
215
216     if (!mdrunOptions.writeConfout)
217     {
218         // This is on by default, and the main known use case for
219         // turning it off is for convenience in benchmarking, which is
220         // something that should not show up in the general user
221         // interface.
222         GMX_LOG(mdlog.info).asParagraph().
223             appendText("The -noconfout functionality is deprecated, and may be removed in a future version.");
224     }
225
226     /* md-vv uses averaged full step velocities for T-control
227        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
228        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
229     bTrotter = (EI_VV(ir->eI) && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
230
231     const bool bRerunMD      = false;
232
233     int        nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
234     bGStatEveryStep = (nstglobalcomm == 1);
235
236     SimulationGroups                  *groups = &top_global->groups;
237
238     std::unique_ptr<EssentialDynamics> ed = nullptr;
239     if (opt2bSet("-ei", nfile, fnm) || observablesHistory->edsamHistory != nullptr)
240     {
241         /* Initialize essential dynamics sampling */
242         ed = init_edsam(mdlog,
243                         opt2fn_null("-ei", nfile, fnm), opt2fn("-eo", nfile, fnm),
244                         top_global,
245                         ir, cr, constr,
246                         state_global, observablesHistory,
247                         oenv,
248                         startingBehavior);
249     }
250
251     initialize_lambdas(fplog, *ir, MASTER(cr), &state_global->fep_state, state_global->lambda, lam0);
252     Update     upd(ir, deform);
253     const bool doSimulatedAnnealing = initSimulatedAnnealing(ir, &upd);
254     if (startingBehavior != StartingBehavior::RestartWithAppending)
255     {
256         pleaseCiteCouplingAlgorithms(fplog, *ir);
257     }
258     gmx_mdoutf       *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, mdModulesNotifier, ir, top_global, oenv, wcycle,
259                                          startingBehavior);
260     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf), top_global, ir, pull_work, mdoutf_get_fp_dhdl(outf), false, mdModulesNotifier);
261
262     gstat = global_stat_init(ir);
263
264     /* Check for polarizable models and flexible constraints */
265     shellfc = init_shell_flexcon(fplog,
266                                  top_global, constr ? constr->numFlexibleConstraints() : 0,
267                                  ir->nstcalcenergy, DOMAINDECOMP(cr));
268
269     {
270         double io = compute_io(ir, top_global->natoms, *groups, energyOutput.numEnergyTerms(), 1);
271         if ((io > 2000) && MASTER(cr))
272         {
273             fprintf(stderr,
274                     "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
275                     io);
276         }
277     }
278
279     // Local state only becomes valid now.
280     std::unique_ptr<t_state> stateInstance;
281     t_state *                state;
282
283
284     auto mdatoms = mdAtoms->mdatoms();
285
286     std::unique_ptr<UpdateConstrainCuda> integrator;
287
288     if (DOMAINDECOMP(cr))
289     {
290         dd_init_local_top(*top_global, &top);
291
292         stateInstance = std::make_unique<t_state>();
293         state         = stateInstance.get();
294         dd_init_local_state(cr->dd, state_global, state);
295
296         /* Distribute the charge groups over the nodes from the master node */
297         dd_partition_system(fplog, mdlog, ir->init_step, cr, TRUE, 1,
298                             state_global, *top_global, ir, imdSession,
299                             pull_work,
300                             state, &f, mdAtoms, &top, fr,
301                             vsite, constr,
302                             nrnb, nullptr, FALSE);
303         shouldCheckNumberOfBondedInteractions = true;
304         upd.setNumAtoms(state->natoms);
305     }
306     else
307     {
308         state_change_natoms(state_global, state_global->natoms);
309         f.resizeWithPadding(state_global->natoms);
310         /* Copy the pointer to the global state */
311         state = state_global;
312
313         /* Generate and initialize new topology */
314         mdAlgorithmsSetupAtomData(cr, ir, *top_global, &top, fr,
315                                   &graph, mdAtoms, constr, vsite, shellfc);
316
317         upd.setNumAtoms(state->natoms);
318     }
319
320 /*****************************************************************************************/
321 // TODO: The following block of code should be refactored, once:
322 //       1. We have the useGpuForBufferOps variable set and available here and in do_force(...)
323 //       2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
324 //          stream owned by the StatePropagatorDataGpu
325     bool useGpuForPme       = (fr->pmedata != nullptr) && (pme_run_mode(fr->pmedata) != PmeRunMode::CPU);
326     bool useGpuForNonbonded = fr->nbv->useGpu();
327     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
328     bool useGpuForBufferOps   = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
329
330     if (useGpuForUpdate)
331     {
332         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), "Domain decomposition is not supported with the GPU update.\n");
333         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr),
334                            "Either PME or short-ranged non-bonded interaction tasks must run on the GPU to use GPU update.\n");
335         GMX_RELEASE_ASSERT(ir->eI == eiMD, "Only the md integrator is supported with the GPU update.\n");
336         GMX_RELEASE_ASSERT(ir->etc != etcNOSEHOOVER, "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
337         GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN, "Only Parrinello-Rahman pressure control is supported with the GPU update.\n");
338         GMX_RELEASE_ASSERT(!mdatoms->haveVsites, "Virtual sites are not supported with the GPU update.\n");
339         GMX_RELEASE_ASSERT(ed == nullptr, "Essential dynamics is not supported with the GPU update.\n");
340         GMX_RELEASE_ASSERT(!ir->bPull && !ir->pull, "Pulling is not supported with the GPU update.\n");
341         GMX_RELEASE_ASSERT(fcd->orires.nr == 0, "Orientation restraints are not supported with the GPU update.\n");
342         GMX_RELEASE_ASSERT(fcd->disres.npair  == 0, "Distance restraints are not supported with the GPU update.\n");
343         GMX_RELEASE_ASSERT(ir->efep == efepNO, "Free energy perturbations are not supported with the GPU update.");
344
345         if (constr != nullptr && constr->numConstraintsTotal() > 0)
346         {
347             GMX_LOG(mdlog.info).asParagraph().
348                 appendText("Updating coordinates and applying constraints on the GPU.");
349         }
350         else
351         {
352             GMX_LOG(mdlog.info).asParagraph().
353                 appendText("Updating coordinates on the GPU.");
354         }
355         integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, nullptr);
356     }
357
358     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
359     {
360         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
361     }
362     if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
363     {
364         changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
365     }
366     if (useGpuForUpdate)
367     {
368         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
369     }
370 /*****************************************************************************************/
371
372     // NOTE: The global state is no longer used at this point.
373     // But state_global is still used as temporary storage space for writing
374     // the global state to file and potentially for replica exchange.
375     // (Global topology should persist.)
376
377     update_mdatoms(mdatoms, state->lambda[efptMASS]);
378
379     if (ir->bExpanded)
380     {
381         /* Check nstexpanded here, because the grompp check was broken */
382         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
383         {
384             gmx_fatal(FARGS, "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
385         }
386         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation,
387                                ir, state->dfhist);
388     }
389
390     if (MASTER(cr))
391     {
392         EnergyElement::initializeEnergyHistory(
393                 startingBehavior, observablesHistory, &energyOutput);
394     }
395
396     preparePrevStepPullCom(ir, pull_work, mdatoms, state, state_global, cr,
397                            startingBehavior != StartingBehavior::NewSimulation);
398
399     // TODO: Remove this by converting AWH into a ForceProvider
400     auto awh = prepareAwhModule(fplog, *ir, state_global, cr, ms, startingBehavior != StartingBehavior::NewSimulation,
401                                 shellfc != nullptr,
402                                 opt2fn("-awh", nfile, fnm), pull_work);
403
404     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
405     if (useReplicaExchange && MASTER(cr))
406     {
407         repl_ex = init_replica_exchange(fplog, ms, top_global->natoms, ir,
408                                         replExParams);
409     }
410     /* PME tuning is only supported in the Verlet scheme, with PME for
411      * Coulomb. It is not supported with only LJ PME. */
412     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) &&
413                 !mdrunOptions.reproducible && ir->cutoff_scheme != ecutsGROUP);
414
415     pme_load_balancing_t *pme_loadbal      = nullptr;
416     if (bPMETune)
417     {
418         pme_loadbal_init(&pme_loadbal, cr, mdlog, *ir, state->box,
419                          *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu(),
420                          &bPMETunePrinting);
421     }
422
423     if (!ir->bContinuation)
424     {
425         if (state->flags & (1U << estV))
426         {
427             auto v = makeArrayRef(state->v);
428             /* Set the velocities of vsites, shells and frozen atoms to zero */
429             for (i = 0; i < mdatoms->homenr; i++)
430             {
431                 if (mdatoms->ptype[i] == eptVSite ||
432                     mdatoms->ptype[i] == eptShell)
433                 {
434                     clear_rvec(v[i]);
435                 }
436                 else if (mdatoms->cFREEZE)
437                 {
438                     for (m = 0; m < DIM; m++)
439                     {
440                         if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
441                         {
442                             v[i][m] = 0;
443                         }
444                     }
445                 }
446             }
447         }
448
449         if (constr)
450         {
451             /* Constrain the initial coordinates and velocities */
452             do_constrain_first(fplog, constr, ir, mdatoms,
453                                state->natoms,
454                                state->x.arrayRefWithPadding(),
455                                state->v.arrayRefWithPadding(),
456                                state->box, state->lambda[efptBONDED]);
457         }
458         if (vsite)
459         {
460             /* Construct the virtual sites for the initial configuration */
461             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, nullptr,
462                              top.idef.iparams, top.idef.il,
463                              fr->ePBC, fr->bMolPBC, cr, state->box);
464         }
465     }
466
467     if (ir->efep != efepNO)
468     {
469         /* Set free energy calculation frequency as the greatest common
470          * denominator of nstdhdl and repl_ex_nst. */
471         nstfep = ir->fepvals->nstdhdl;
472         if (ir->bExpanded)
473         {
474             nstfep = gmx_greatest_common_divisor(ir->expandedvals->nstexpanded, nstfep);
475         }
476         if (useReplicaExchange)
477         {
478             nstfep = gmx_greatest_common_divisor(replExParams.exchangeInterval, nstfep);
479         }
480     }
481
482     /* Be REALLY careful about what flags you set here. You CANNOT assume
483      * this is the first step, since we might be restarting from a checkpoint,
484      * and in that case we should not do any modifications to the state.
485      */
486     bStopCM = (ir->comm_mode != ecmNO && !ir->bContinuation);
487
488     // When restarting from a checkpoint, it can be appropriate to
489     // initialize ekind from quantities in the checkpoint. Otherwise,
490     // compute_globals must initialize ekind before the simulation
491     // starts/restarts. However, only the master rank knows what was
492     // found in the checkpoint file, so we have to communicate in
493     // order to coordinate the restart.
494     //
495     // TODO Consider removing this communication if/when checkpoint
496     // reading directly follows .tpr reading, because all ranks can
497     // agree on hasReadEkinState at that time.
498     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
499     if (PAR(cr))
500     {
501         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr);
502     }
503     if (hasReadEkinState)
504     {
505         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
506     }
507
508     unsigned int cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
509                                | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
510                                | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0)
511                                | (hasReadEkinState ? CGLO_READEKIN : 0));
512
513     bSumEkinhOld = FALSE;
514
515     t_vcm vcm(top_global->groups, *ir);
516     reportComRemovalInfo(fplog, vcm);
517
518     /* To minimize communication, compute_globals computes the COM velocity
519      * and the kinetic energy for the velocities without COM motion removed.
520      * Thus to get the kinetic energy without the COM contribution, we need
521      * to call compute_globals twice.
522      */
523     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
524     {
525         unsigned int cglo_flags_iteration = cglo_flags;
526         if (bStopCM && cgloIteration == 0)
527         {
528             cglo_flags_iteration |= CGLO_STOPCM;
529             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
530         }
531         compute_globals(gstat, cr, ir, fr, ekind,
532                         state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
533                         mdatoms, nrnb, &vcm,
534                         nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
535                         constr, &nullSignaller, state->box,
536                         &totalNumberOfBondedInteractions, &bSumEkinhOld, cglo_flags_iteration
537                         | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0));
538         if (cglo_flags_iteration & CGLO_STOPCM)
539         {
540             /* At initialization, do not pass x with acceleration-correction mode
541              * to avoid (incorrect) correction of the initial coordinates.
542              */
543             rvec *xPtr = nullptr;
544             if (vcm.mode != ecmLINEAR_ACCELERATION_CORRECTION)
545             {
546                 xPtr = state->x.rvec_array();
547             }
548             process_and_stopcm_grp(fplog, &vcm, *mdatoms, xPtr, state->v.rvec_array());
549             inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
550         }
551     }
552     checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
553                                     top_global, &top, state->x.rvec_array(), state->box,
554                                     &shouldCheckNumberOfBondedInteractions);
555     if (ir->eI == eiVVAK)
556     {
557         /* a second call to get the half step temperature initialized as well */
558         /* we do the same call as above, but turn the pressure off -- internally to
559            compute_globals, this is recognized as a velocity verlet half-step
560            kinetic energy calculation.  This minimized excess variables, but
561            perhaps loses some logic?*/
562
563         compute_globals(gstat, cr, ir, fr, ekind,
564                         state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
565                         mdatoms, nrnb, &vcm,
566                         nullptr, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
567                         constr, &nullSignaller, state->box,
568                         nullptr, &bSumEkinhOld,
569                         cglo_flags & ~CGLO_PRESSURE);
570     }
571
572     /* Calculate the initial half step temperature, and save the ekinh_old */
573     if (startingBehavior == StartingBehavior::NewSimulation)
574     {
575         for (i = 0; (i < ir->opts.ngtc); i++)
576         {
577             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
578         }
579     }
580
581     /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
582        temperature control */
583     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
584
585     if (MASTER(cr))
586     {
587         if (!ir->bContinuation)
588         {
589             if (constr && ir->eConstrAlg == econtLINCS)
590             {
591                 fprintf(fplog,
592                         "RMS relative constraint deviation after constraining: %.2e\n",
593                         constr->rmsd());
594             }
595             if (EI_STATE_VELOCITY(ir->eI))
596             {
597                 real temp = enerd->term[F_TEMP];
598                 if (ir->eI != eiVV)
599                 {
600                     /* Result of Ekin averaged over velocities of -half
601                      * and +half step, while we only have -half step here.
602                      */
603                     temp *= 2;
604                 }
605                 fprintf(fplog, "Initial temperature: %g K\n", temp);
606             }
607         }
608
609         char tbuf[20];
610         fprintf(stderr, "starting mdrun '%s'\n",
611                 *(top_global->name));
612         if (ir->nsteps >= 0)
613         {
614             sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
615         }
616         else
617         {
618             sprintf(tbuf, "%s", "infinite");
619         }
620         if (ir->init_step > 0)
621         {
622             fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
623                     gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
624                     gmx_step_str(ir->init_step, sbuf2),
625                     ir->init_step*ir->delta_t);
626         }
627         else
628         {
629             fprintf(stderr, "%s steps, %s ps.\n",
630                     gmx_step_str(ir->nsteps, sbuf), tbuf);
631         }
632         fprintf(fplog, "\n");
633     }
634
635     walltime_accounting_start_time(walltime_accounting);
636     wallcycle_start(wcycle, ewcRUN);
637     print_start(fplog, cr, walltime_accounting, "mdrun");
638
639 #if GMX_FAHCORE
640     /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
641     int chkpt_ret = fcCheckPointParallel( cr->nodeid,
642                                           NULL, 0);
643     if (chkpt_ret == 0)
644     {
645         gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
646     }
647 #endif
648
649     /***********************************************************
650      *
651      *             Loop over MD steps
652      *
653      ************************************************************/
654
655     bFirstStep       = TRUE;
656     /* Skip the first Nose-Hoover integration when we get the state from tpx */
657     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
658     bSumEkinhOld     = FALSE;
659     bExchanged       = FALSE;
660     bNeedRepartition = FALSE;
661
662     bool simulationsShareState = false;
663     int  nstSignalComm         = nstglobalcomm;
664     {
665         // TODO This implementation of ensemble orientation restraints is nasty because
666         // a user can't just do multi-sim with single-sim orientation restraints.
667         bool usingEnsembleRestraints = (fcd->disres.nsystems > 1) || ((ms != nullptr) && (fcd->orires.nr != 0));
668         bool awhUsesMultiSim         = (ir->bDoAwh && ir->awhParams->shareBiasMultisim && (ms != nullptr));
669
670         // Replica exchange, ensemble restraints and AWH need all
671         // simulations to remain synchronized, so they need
672         // checkpoints and stop conditions to act on the same step, so
673         // the propagation of such signals must take place between
674         // simulations, not just within simulations.
675         // TODO: Make algorithm initializers set these flags.
676         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
677
678         if (simulationsShareState)
679         {
680             // Inter-simulation signal communication does not need to happen
681             // often, so we use a minimum of 200 steps to reduce overhead.
682             const int c_minimumInterSimulationSignallingInterval = 200;
683             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1)/nstglobalcomm)*nstglobalcomm;
684         }
685     }
686
687     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
688                 compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]), simulationsShareState,
689                 MASTER(cr), ir->nstlist, mdrunOptions.reproducible, nstSignalComm,
690                 mdrunOptions.maximumHoursToRun, ir->nstlist == 0, fplog, step, bNS, walltime_accounting);
691
692     auto checkpointHandler = std::make_unique<CheckpointHandler>(
693                 compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
694                 simulationsShareState, ir->nstlist == 0, MASTER(cr),
695                 mdrunOptions.writeConfout, mdrunOptions.checkpointOptions.period);
696
697     const bool resetCountersIsLocal = true;
698     auto       resetHandler         = std::make_unique<ResetHandler>(
699                 compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]), !resetCountersIsLocal,
700                 ir->nsteps, MASTER(cr), mdrunOptions.timingOptions.resetHalfway,
701                 mdrunOptions.maximumHoursToRun, mdlog, wcycle, walltime_accounting);
702
703     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
704
705     step     = ir->init_step;
706     step_rel = 0;
707
708     // TODO extract this to new multi-simulation module
709     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
710     {
711         if (!multisim_int_all_are_equal(ms, ir->nsteps))
712         {
713             GMX_LOG(mdlog.warning).appendText(
714                     "Note: The number of steps is not consistent across multi simulations,\n"
715                     "but we are proceeding anyway!");
716         }
717         if (!multisim_int_all_are_equal(ms, ir->init_step))
718         {
719             GMX_LOG(mdlog.warning).appendText(
720                     "Note: The initial step is not consistent across multi simulations,\n"
721                     "but we are proceeding anyway!");
722         }
723     }
724
725     /* and stop now if we should */
726     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
727     while (!bLastStep)
728     {
729
730         /* Determine if this is a neighbor search step */
731         bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
732
733         if (bPMETune && bNStList)
734         {
735             /* PME grid + cut-off optimization with GPUs or PME nodes */
736             pme_loadbal_do(pme_loadbal, cr,
737                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
738                            fplog, mdlog,
739                            *ir, fr, state->box, state->x,
740                            wcycle,
741                            step, step_rel,
742                            &bPMETunePrinting);
743         }
744
745         wallcycle_start(wcycle, ewcSTEP);
746
747         bLastStep = (step_rel == ir->nsteps);
748         t         = t0 + step*ir->delta_t;
749
750         // TODO Refactor this, so that nstfep does not need a default value of zero
751         if (ir->efep != efepNO || ir->bSimTemp)
752         {
753             /* find and set the current lambdas */
754             setCurrentLambdasLocal(step, ir->fepvals, lam0, state->lambda, state->fep_state);
755
756             bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
757             bDoFEP       = ((ir->efep != efepNO) && do_per_step(step, nstfep));
758             bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded)
759                             && (ir->bExpanded) && (step > 0) &&
760                             (startingBehavior == StartingBehavior::NewSimulation));
761         }
762
763         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep &&
764                      do_per_step(step, replExParams.exchangeInterval));
765
766         if (doSimulatedAnnealing)
767         {
768             update_annealing_target_temp(ir, t, &upd);
769         }
770
771         /* Stop Center of Mass motion */
772         bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
773
774         /* Determine whether or not to do Neighbour Searching */
775         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
776
777         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
778
779         /* do_log triggers energy and virial calculation. Because this leads
780          * to different code paths, forces can be different. Thus for exact
781          * continuation we should avoid extra log output.
782          * Note that the || bLastStep can result in non-exact continuation
783          * beyond the last step. But we don't consider that to be an issue.
784          */
785         do_log     =
786             (do_per_step(step, ir->nstlog) ||
787              (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) ||
788              bLastStep);
789         do_verbose = mdrunOptions.verbose &&
790             (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
791
792         if (bNS && !(bFirstStep && ir->bContinuation))
793         {
794             bMasterState = FALSE;
795             /* Correct the new box if it is too skewed */
796             if (inputrecDynamicBox(ir))
797             {
798                 if (correct_box(fplog, step, state->box, graph))
799                 {
800                     bMasterState = TRUE;
801                 }
802             }
803             if (DOMAINDECOMP(cr) && bMasterState)
804             {
805                 dd_collect_state(cr->dd, state, state_global);
806             }
807
808             if (DOMAINDECOMP(cr))
809             {
810                 /* Repartition the domain decomposition */
811                 dd_partition_system(fplog, mdlog, step, cr,
812                                     bMasterState, nstglobalcomm,
813                                     state_global, *top_global, ir, imdSession,
814                                     pull_work,
815                                     state, &f, mdAtoms, &top, fr,
816                                     vsite, constr,
817                                     nrnb, wcycle,
818                                     do_verbose && !bPMETunePrinting);
819                 shouldCheckNumberOfBondedInteractions = true;
820                 upd.setNumAtoms(state->natoms);
821             }
822         }
823
824         if (MASTER(cr) && do_log)
825         {
826             energyOutput.printHeader(fplog, step, t); /* can we improve the information printed here? */
827         }
828
829         if (ir->efep != efepNO)
830         {
831             update_mdatoms(mdatoms, state->lambda[efptMASS]);
832         }
833
834         if (bExchanged)
835         {
836
837             /* We need the kinetic energy at minus the half step for determining
838              * the full step kinetic energy and possibly for T-coupling.*/
839             /* This may not be quite working correctly yet . . . . */
840             compute_globals(gstat, cr, ir, fr, ekind,
841                             state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
842                             mdatoms, nrnb, &vcm,
843                             wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
844                             constr, &nullSignaller, state->box,
845                             &totalNumberOfBondedInteractions, &bSumEkinhOld,
846                             CGLO_GSTAT | CGLO_TEMPERATURE | CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS);
847             checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
848                                             top_global, &top, state->x.rvec_array(), state->box,
849                                             &shouldCheckNumberOfBondedInteractions);
850         }
851         clear_mat(force_vir);
852
853         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
854
855         /* Determine the energy and pressure:
856          * at nstcalcenergy steps and at energy output steps (set below).
857          */
858         if (EI_VV(ir->eI) && (!bInitStep))
859         {
860             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
861             bCalcVir      = bCalcEnerStep ||
862                 (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
863         }
864         else
865         {
866             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
867             bCalcVir      = bCalcEnerStep ||
868                 (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
869         }
870         bCalcEner = bCalcEnerStep;
871
872         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
873
874         if (do_ene || do_log || bDoReplEx)
875         {
876             bCalcVir  = TRUE;
877             bCalcEner = TRUE;
878         }
879
880         /* Do we need global communication ? */
881         bGStat = (bCalcVir || bCalcEner || bStopCM ||
882                   do_per_step(step, nstglobalcomm) ||
883                   (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step-1, nstglobalcomm)));
884
885         force_flags = (GMX_FORCE_STATECHANGED |
886                        ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0) |
887                        GMX_FORCE_ALLFORCES |
888                        (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
889                        (bCalcEner ? GMX_FORCE_ENERGY : 0) |
890                        (bDoFEP ? GMX_FORCE_DHDL : 0)
891                        );
892
893         if (shellfc)
894         {
895             /* Now is the time to relax the shells */
896             relax_shell_flexcon(fplog, cr, ms, mdrunOptions.verbose,
897                                 enforcedRotation, step,
898                                 ir, imdSession, pull_work, bNS, force_flags, &top,
899                                 constr, enerd, fcd,
900                                 state->natoms,
901                                 state->x.arrayRefWithPadding(),
902                                 state->v.arrayRefWithPadding(),
903                                 state->box,
904                                 state->lambda,
905                                 &state->hist,
906                                 f.arrayRefWithPadding(), force_vir, mdatoms,
907                                 nrnb, wcycle, graph,
908                                 shellfc, fr, runScheduleWork, t, mu_tot,
909                                 vsite,
910                                 ddBalanceRegionHandler);
911         }
912         else
913         {
914             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias is updated
915                (or the AWH update will be performed twice for one step when continuing). It would be best to
916                call this update function from do_md_trajectory_writing but that would occur after do_force.
917                One would have to divide the update_awh function into one function applying the AWH force
918                and one doing the AWH bias update. The update AWH bias function could then be called after
919                do_md_trajectory_writing (then containing update_awh_history).
920                The checkpointing will in the future probably moved to the start of the md loop which will
921                rid of this issue. */
922             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
923             {
924                 awh->updateHistory(state_global->awhHistory.get());
925             }
926
927             /* The coordinates (x) are shifted (to get whole molecules)
928              * in do_force.
929              * This is parallellized as well, and does communication too.
930              * Check comments in sim_util.c
931              */
932             do_force(fplog, cr, ms, ir, awh.get(), enforcedRotation, imdSession,
933                      pull_work,
934                      step, nrnb, wcycle, &top,
935                      state->box, state->x.arrayRefWithPadding(), &state->hist,
936                      f.arrayRefWithPadding(), force_vir, mdatoms, enerd, fcd,
937                      state->lambda, graph,
938                      fr, runScheduleWork, vsite, mu_tot, t, ed ? ed->getLegacyED() : nullptr,
939                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
940                      ddBalanceRegionHandler);
941         }
942
943         // VV integrators do not need the following velocity half step
944         // if it is the first step after starting from a checkpoint.
945         // That is, the half step is needed on all other steps, and
946         // also the first step when starting from a .tpr file.
947         if (EI_VV(ir->eI) && (!bFirstStep || startingBehavior == StartingBehavior::NewSimulation))
948         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
949         {
950             rvec *vbuf = nullptr;
951
952             wallcycle_start(wcycle, ewcUPDATE);
953             if (ir->eI == eiVV && bInitStep)
954             {
955                 /* if using velocity verlet with full time step Ekin,
956                  * take the first half step only to compute the
957                  * virial for the first step. From there,
958                  * revert back to the initial coordinates
959                  * so that the input is actually the initial step.
960                  */
961                 snew(vbuf, state->natoms);
962                 copy_rvecn(state->v.rvec_array(), vbuf, 0, state->natoms); /* should make this better for parallelizing? */
963             }
964             else
965             {
966                 /* this is for NHC in the Ekin(t+dt/2) version of vv */
967                 trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
968             }
969
970             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
971                           ekind, M, &upd, etrtVELOCITY1,
972                           cr, constr);
973
974             wallcycle_stop(wcycle, ewcUPDATE);
975             constrain_velocities(step, nullptr,
976                                  state,
977                                  shake_vir,
978                                  constr,
979                                  bCalcVir, do_log, do_ene);
980             wallcycle_start(wcycle, ewcUPDATE);
981             /* if VV, compute the pressure and constraints */
982             /* For VV2, we strictly only need this if using pressure
983              * control, but we really would like to have accurate pressures
984              * printed out.
985              * Think about ways around this in the future?
986              * For now, keep this choice in comments.
987              */
988             /*bPres = (ir->eI==eiVV || inputrecNptTrotter(ir)); */
989             /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && inputrecNptTrotter(ir)));*/
990             bPres = TRUE;
991             bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
992             if (bCalcEner && ir->eI == eiVVAK)
993             {
994                 bSumEkinhOld = TRUE;
995             }
996             /* for vv, the first half of the integration actually corresponds to the previous step.
997                So we need information from the last step in the first half of the integration */
998             if (bGStat || do_per_step(step-1, nstglobalcomm))
999             {
1000                 wallcycle_stop(wcycle, ewcUPDATE);
1001                 compute_globals(gstat, cr, ir, fr, ekind,
1002                                 state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1003                                 mdatoms, nrnb, &vcm,
1004                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1005                                 constr, &nullSignaller, state->box,
1006                                 &totalNumberOfBondedInteractions, &bSumEkinhOld,
1007                                 (bGStat ? CGLO_GSTAT : 0)
1008                                 | (bCalcEner ? CGLO_ENERGY : 0)
1009                                 | (bTemp ? CGLO_TEMPERATURE : 0)
1010                                 | (bPres ? CGLO_PRESSURE : 0)
1011                                 | (bPres ? CGLO_CONSTRAINT : 0)
1012                                 | (bStopCM ? CGLO_STOPCM : 0)
1013                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
1014                                 | CGLO_SCALEEKIN
1015                                 );
1016                 /* explanation of above:
1017                    a) We compute Ekin at the full time step
1018                    if 1) we are using the AveVel Ekin, and it's not the
1019                    initial step, or 2) if we are using AveEkin, but need the full
1020                    time step kinetic energy for the pressure (always true now, since we want accurate statistics).
1021                    b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
1022                    EkinAveVel because it's needed for the pressure */
1023                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1024                                                 top_global, &top, state->x.rvec_array(), state->box,
1025                                                 &shouldCheckNumberOfBondedInteractions);
1026                 if (bStopCM)
1027                 {
1028                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
1029                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1030                 }
1031                 wallcycle_start(wcycle, ewcUPDATE);
1032             }
1033             /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
1034             if (!bInitStep)
1035             {
1036                 if (bTrotter)
1037                 {
1038                     m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
1039                     trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
1040
1041                     /* TODO This is only needed when we're about to write
1042                      * a checkpoint, because we use it after the restart
1043                      * (in a kludge?). But what should we be doing if
1044                      * the startingBehavior is NewSimulation or bInitStep are true? */
1045                     if (inputrecNptTrotter(ir) || inputrecNphTrotter(ir))
1046                     {
1047                         copy_mat(shake_vir, state->svir_prev);
1048                         copy_mat(force_vir, state->fvir_prev);
1049                     }
1050                     if (inputrecNvtTrotter(ir) && ir->eI == eiVV)
1051                     {
1052                         /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
1053                         enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, nullptr, (ir->eI == eiVV), FALSE);
1054                         enerd->term[F_EKIN] = trace(ekind->ekin);
1055                     }
1056                 }
1057                 else if (bExchanged)
1058                 {
1059                     wallcycle_stop(wcycle, ewcUPDATE);
1060                     /* We need the kinetic energy at minus the half step for determining
1061                      * the full step kinetic energy and possibly for T-coupling.*/
1062                     /* This may not be quite working correctly yet . . . . */
1063                     compute_globals(gstat, cr, ir, fr, ekind,
1064                                     state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1065                                     mdatoms, nrnb, &vcm,
1066                                     wcycle, enerd, nullptr, nullptr, nullptr, nullptr, mu_tot,
1067                                     constr, &nullSignaller, state->box,
1068                                     nullptr, &bSumEkinhOld,
1069                                     CGLO_GSTAT | CGLO_TEMPERATURE);
1070                     wallcycle_start(wcycle, ewcUPDATE);
1071                 }
1072             }
1073             /* if it's the initial step, we performed this first step just to get the constraint virial */
1074             if (ir->eI == eiVV && bInitStep)
1075             {
1076                 copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
1077                 sfree(vbuf);
1078             }
1079             wallcycle_stop(wcycle, ewcUPDATE);
1080         }
1081
1082         /* compute the conserved quantity */
1083         if (EI_VV(ir->eI))
1084         {
1085             saved_conserved_quantity = NPT_energy(ir, state, &MassQ);
1086             if (ir->eI == eiVV)
1087             {
1088                 last_ekin = enerd->term[F_EKIN];
1089             }
1090             if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
1091             {
1092                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
1093             }
1094             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
1095             if (ir->efep != efepNO)
1096             {
1097                 sum_dhdl(enerd, state->lambda, *ir->fepvals);
1098             }
1099         }
1100
1101         /* ########  END FIRST UPDATE STEP  ############## */
1102         /* ########  If doing VV, we now have v(dt) ###### */
1103         if (bDoExpanded)
1104         {
1105             /* perform extended ensemble sampling in lambda - we don't
1106                actually move to the new state before outputting
1107                statistics, but if performing simulated tempering, we
1108                do update the velocities and the tau_t. */
1109
1110             lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, state->fep_state, state->dfhist, step, state->v.rvec_array(), mdatoms);
1111             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1112             if (MASTER(cr))
1113             {
1114                 copy_df_history(state_global->dfhist, state->dfhist);
1115             }
1116         }
1117
1118         /* Now we have the energies and forces corresponding to the
1119          * coordinates at time t. We must output all of this before
1120          * the update.
1121          */
1122         do_md_trajectory_writing(fplog, cr, nfile, fnm, step, step_rel, t,
1123                                  ir, state, state_global, observablesHistory,
1124                                  top_global, fr,
1125                                  outf, energyOutput, ekind, f,
1126                                  checkpointHandler->isCheckpointingStep(),
1127                                  bRerunMD, bLastStep,
1128                                  mdrunOptions.writeConfout,
1129                                  bSumEkinhOld);
1130         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1131         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x.rvec_array(), t);
1132
1133         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1134         if (startingBehavior != StartingBehavior::NewSimulation &&
1135             (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1136         {
1137             copy_mat(state->svir_prev, shake_vir);
1138             copy_mat(state->fvir_prev, force_vir);
1139         }
1140
1141         stopHandler->setSignal();
1142         resetHandler->setSignal(walltime_accounting);
1143
1144         if (bGStat || !PAR(cr))
1145         {
1146             /* In parallel we only have to check for checkpointing in steps
1147              * where we do global communication,
1148              *  otherwise the other nodes don't know.
1149              */
1150             checkpointHandler->setSignal(walltime_accounting);
1151         }
1152
1153         /* #########   START SECOND UPDATE STEP ################# */
1154
1155         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen controlled
1156            in preprocessing */
1157
1158         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1159         {
1160             gmx_bool bIfRandomize;
1161             bIfRandomize = update_randomize_velocities(ir, step, cr, mdatoms, state->v, &upd, constr);
1162             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1163             if (constr && bIfRandomize)
1164             {
1165                 constrain_velocities(step, nullptr,
1166                                      state,
1167                                      tmp_vir,
1168                                      constr,
1169                                      bCalcVir, do_log, do_ene);
1170             }
1171         }
1172         /* Box is changed in update() when we do pressure coupling,
1173          * but we should still use the old box for energy corrections and when
1174          * writing it to the energy file, so it matches the trajectory files for
1175          * the same timestep above. Make a copy in a separate array.
1176          */
1177         copy_mat(state->box, lastbox);
1178
1179         dvdl_constr = 0;
1180
1181         wallcycle_start(wcycle, ewcUPDATE);
1182         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1183         if (bTrotter)
1184         {
1185             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
1186             /* We can only do Berendsen coupling after we have summed
1187              * the kinetic energy or virial. Since the happens
1188              * in global_state after update, we should only do it at
1189              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1190              */
1191         }
1192         else
1193         {
1194             update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
1195             update_pcouple_before_coordinates(fplog, step, ir, state,
1196                                               parrinellorahmanMu, M,
1197                                               bInitStep);
1198         }
1199
1200         if (EI_VV(ir->eI))
1201         {
1202             /* velocity half-step update */
1203             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1204                           ekind, M, &upd, etrtVELOCITY2,
1205                           cr, constr);
1206         }
1207
1208         /* Above, initialize just copies ekinh into ekin,
1209          * it doesn't copy position (for VV),
1210          * and entire integrator for MD.
1211          */
1212
1213         if (ir->eI == eiVVAK)
1214         {
1215             cbuf.resize(state->x.size());
1216             std::copy(state->x.begin(), state->x.end(), cbuf.begin());
1217         }
1218
1219         if (useGpuForUpdate)
1220         {
1221             StatePropagatorDataGpu *stateGpu = fr->stateGpu;
1222             if (bNS)
1223             {
1224                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
1225                                 top.idef, *mdatoms, ekind->ngtc);
1226                 t_pbc pbc;
1227                 set_pbc(&pbc, epbcXYZ, state->box);
1228                 integrator->setPbc(&pbc);
1229             }
1230
1231             stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
1232             stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
1233             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
1234
1235             bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1236             bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1237
1238             // This applies Leap-Frog, LINCS and SETTLE in succession
1239             integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir,
1240                                   doTempCouple, ekind->tcstat,
1241                                   doPressureCouple, ir->nstpcouple*ir->delta_t, M);
1242             stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
1243             stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
1244             stateGpu->synchronizeStream();
1245         }
1246         else
1247         {
1248             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1249                           ekind, M, &upd, etrtPOSITION, cr, constr);
1250
1251             wallcycle_stop(wcycle, ewcUPDATE);
1252
1253             constrain_coordinates(step, &dvdl_constr, state,
1254                                   shake_vir,
1255                                   &upd, constr,
1256                                   bCalcVir, do_log, do_ene);
1257
1258             update_sd_second_half(step, &dvdl_constr, ir, mdatoms, state,
1259                                   cr, nrnb, wcycle, &upd, constr, do_log, do_ene);
1260             finish_update(ir, mdatoms,
1261                           state, graph,
1262                           nrnb, wcycle, &upd, constr);
1263         }
1264
1265         if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1266         {
1267             updatePrevStepPullCom(pull_work, state);
1268         }
1269
1270         if (ir->eI == eiVVAK)
1271         {
1272             /* erase F_EKIN and F_TEMP here? */
1273             /* just compute the kinetic energy at the half step to perform a trotter step */
1274             compute_globals(gstat, cr, ir, fr, ekind,
1275                             state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1276                             mdatoms, nrnb, &vcm,
1277                             wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1278                             constr, &nullSignaller, lastbox,
1279                             nullptr, &bSumEkinhOld,
1280                             (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE
1281                             );
1282             wallcycle_start(wcycle, ewcUPDATE);
1283             trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
1284             /* now we know the scaling, we can compute the positions again */
1285             std::copy(cbuf.begin(), cbuf.end(), state->x.begin());
1286
1287             update_coords(step, ir, mdatoms, state, f.arrayRefWithPadding(), fcd,
1288                           ekind, M, &upd, etrtPOSITION, cr, constr);
1289             wallcycle_stop(wcycle, ewcUPDATE);
1290
1291             /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
1292             /* are the small terms in the shake_vir here due
1293              * to numerical errors, or are they important
1294              * physically? I'm thinking they are just errors, but not completely sure.
1295              * For now, will call without actually constraining, constr=NULL*/
1296             finish_update(ir, mdatoms,
1297                           state, graph,
1298                           nrnb, wcycle, &upd, nullptr);
1299         }
1300         if (EI_VV(ir->eI))
1301         {
1302             /* this factor or 2 correction is necessary
1303                because half of the constraint force is removed
1304                in the vv step, so we have to double it.  See
1305                the Redmine issue #1255.  It is not yet clear
1306                if the factor of 2 is exact, or just a very
1307                good approximation, and this will be
1308                investigated.  The next step is to see if this
1309                can be done adding a dhdl contribution from the
1310                rattle step, but this is somewhat more
1311                complicated with the current code. Will be
1312                investigated, hopefully for 4.6.3. However,
1313                this current solution is much better than
1314                having it completely wrong.
1315              */
1316             enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
1317         }
1318         else
1319         {
1320             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1321         }
1322
1323         if (vsite != nullptr)
1324         {
1325             wallcycle_start(wcycle, ewcVSITECONSTR);
1326             if (graph != nullptr)
1327             {
1328                 shift_self(graph, state->box, state->x.rvec_array());
1329             }
1330             construct_vsites(vsite, state->x.rvec_array(), ir->delta_t, state->v.rvec_array(),
1331                              top.idef.iparams, top.idef.il,
1332                              fr->ePBC, fr->bMolPBC, cr, state->box);
1333
1334             if (graph != nullptr)
1335             {
1336                 unshift_self(graph, state->box, state->x.rvec_array());
1337             }
1338             wallcycle_stop(wcycle, ewcVSITECONSTR);
1339         }
1340
1341         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1342         /* With Leap-Frog we can skip compute_globals at
1343          * non-communication steps, but we need to calculate
1344          * the kinetic energy one step before communication.
1345          */
1346         {
1347             // Organize to do inter-simulation signalling on steps if
1348             // and when algorithms require it.
1349             bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1350
1351             if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)) || doInterSimSignal)
1352             {
1353                 // Since we're already communicating at this step, we
1354                 // can propagate intra-simulation signals. Note that
1355                 // check_nstglobalcomm has the responsibility for
1356                 // choosing the value of nstglobalcomm that is one way
1357                 // bGStat becomes true, so we can't get into a
1358                 // situation where e.g. checkpointing can't be
1359                 // signalled.
1360                 bool                doIntraSimSignal = true;
1361                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1362
1363                 compute_globals(gstat, cr, ir, fr, ekind,
1364                                 state->x.rvec_array(), state->v.rvec_array(), state->box, state->lambda[efptVDW],
1365                                 mdatoms, nrnb, &vcm,
1366                                 wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
1367                                 constr, &signaller,
1368                                 lastbox,
1369                                 &totalNumberOfBondedInteractions, &bSumEkinhOld,
1370                                 (bGStat ? CGLO_GSTAT : 0)
1371                                 | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1372                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1373                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1374                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
1375                                 | CGLO_CONSTRAINT
1376                                 | (shouldCheckNumberOfBondedInteractions ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS : 0)
1377                                 );
1378                 checkNumberOfBondedInteractions(mdlog, cr, totalNumberOfBondedInteractions,
1379                                                 top_global, &top, state->x.rvec_array(), state->box,
1380                                                 &shouldCheckNumberOfBondedInteractions);
1381                 if (!EI_VV(ir->eI) && bStopCM)
1382                 {
1383                     process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array());
1384                     inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
1385                 }
1386             }
1387         }
1388
1389         /* #############  END CALC EKIN AND PRESSURE ################# */
1390
1391         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1392            the virial that should probably be addressed eventually. state->veta has better properies,
1393            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1394            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1395
1396         if (ir->efep != efepNO && !EI_VV(ir->eI))
1397         {
1398             /* Sum up the foreign energy and dhdl terms for md and sd.
1399                Currently done every step so that dhdl is correct in the .edr */
1400             sum_dhdl(enerd, state->lambda, *ir->fepvals);
1401         }
1402
1403         update_pcouple_after_coordinates(fplog, step, ir, mdatoms,
1404                                          pres, force_vir, shake_vir,
1405                                          parrinellorahmanMu,
1406                                          state, nrnb, &upd);
1407
1408         /* ################# END UPDATE STEP 2 ################# */
1409         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1410
1411         /* The coordinates (x) were unshifted in update */
1412         if (!bGStat)
1413         {
1414             /* We will not sum ekinh_old,
1415              * so signal that we still have to do it.
1416              */
1417             bSumEkinhOld = TRUE;
1418         }
1419
1420         if (bCalcEner)
1421         {
1422             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1423
1424             /* use the directly determined last velocity, not actually the averaged half steps */
1425             if (bTrotter && ir->eI == eiVV)
1426             {
1427                 enerd->term[F_EKIN] = last_ekin;
1428             }
1429             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1430
1431             if (integratorHasConservedEnergyQuantity(ir))
1432             {
1433                 if (EI_VV(ir->eI))
1434                 {
1435                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1436                 }
1437                 else
1438                 {
1439                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1440                 }
1441             }
1442             /* #########  END PREPARING EDR OUTPUT  ###########  */
1443         }
1444
1445         /* Output stuff */
1446         if (MASTER(cr))
1447         {
1448             if (fplog && do_log && bDoExpanded)
1449             {
1450                 /* only needed if doing expanded ensemble */
1451                 PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : nullptr,
1452                                           state_global->dfhist, state->fep_state, ir->nstlog, step);
1453             }
1454             if (bCalcEner)
1455             {
1456                 energyOutput.addDataAtEnergyStep(bDoDHDL, bCalcEnerStep,
1457                                                  t, mdatoms->tmass, enerd, state,
1458                                                  ir->fepvals, ir->expandedvals, lastbox,
1459                                                  shake_vir, force_vir, total_vir, pres,
1460                                                  ekind, mu_tot, constr);
1461             }
1462             else
1463             {
1464                 energyOutput.recordNonEnergyStep();
1465             }
1466
1467             gmx_bool do_dr  = do_per_step(step, ir->nstdisreout);
1468             gmx_bool do_or  = do_per_step(step, ir->nstorireout);
1469
1470             if (doSimulatedAnnealing)
1471             {
1472                 energyOutput.printAnnealingTemperatures(do_log ? fplog : nullptr, groups, &(ir->opts));
1473             }
1474             if (do_log || do_ene || do_dr || do_or)
1475             {
1476                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf), do_ene, do_dr, do_or,
1477                                                    do_log ? fplog : nullptr,
1478                                                    step, t,
1479                                                    fcd, awh.get());
1480             }
1481
1482             if (ir->bPull)
1483             {
1484                 pull_print_output(pull_work, step, t);
1485             }
1486
1487             if (do_per_step(step, ir->nstlog))
1488             {
1489                 if (fflush(fplog) != 0)
1490                 {
1491                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1492                 }
1493             }
1494         }
1495         if (bDoExpanded)
1496         {
1497             /* Have to do this part _after_ outputting the logfile and the edr file */
1498             /* Gets written into the state at the beginning of next loop*/
1499             state->fep_state = lamnew;
1500         }
1501         /* Print the remaining wall clock time for the run */
1502         if (isMasterSimMasterRank(ms, MASTER(cr)) &&
1503             (do_verbose || gmx_got_usr_signal()) &&
1504             !bPMETunePrinting)
1505         {
1506             if (shellfc)
1507             {
1508                 fprintf(stderr, "\n");
1509             }
1510             print_time(stderr, walltime_accounting, step, ir, cr);
1511         }
1512
1513         /* Ion/water position swapping.
1514          * Not done in last step since trajectory writing happens before this call
1515          * in the MD loop and exchanges would be lost anyway. */
1516         bNeedRepartition = FALSE;
1517         if ((ir->eSwapCoords != eswapNO) && (step > 0) && !bLastStep &&
1518             do_per_step(step, ir->swap->nstswap))
1519         {
1520             bNeedRepartition = do_swapcoords(cr, step, t, ir, swap, wcycle,
1521                                              as_rvec_array(state->x.data()),
1522                                              state->box,
1523                                              MASTER(cr) && mdrunOptions.verbose,
1524                                              bRerunMD);
1525
1526             if (bNeedRepartition && DOMAINDECOMP(cr))
1527             {
1528                 dd_collect_state(cr->dd, state, state_global);
1529             }
1530         }
1531
1532         /* Replica exchange */
1533         bExchanged = FALSE;
1534         if (bDoReplEx)
1535         {
1536             bExchanged = replica_exchange(fplog, cr, ms, repl_ex,
1537                                           state_global, enerd,
1538                                           state, step, t);
1539         }
1540
1541         if ( (bExchanged || bNeedRepartition) && DOMAINDECOMP(cr) )
1542         {
1543             dd_partition_system(fplog, mdlog, step, cr, TRUE, 1,
1544                                 state_global, *top_global, ir, imdSession,
1545                                 pull_work,
1546                                 state, &f, mdAtoms, &top, fr,
1547                                 vsite, constr,
1548                                 nrnb, wcycle, FALSE);
1549             shouldCheckNumberOfBondedInteractions = true;
1550             upd.setNumAtoms(state->natoms);
1551         }
1552
1553         bFirstStep             = FALSE;
1554         bInitStep              = FALSE;
1555
1556         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1557         /* With all integrators, except VV, we need to retain the pressure
1558          * at the current step for coupling at the next step.
1559          */
1560         if ((state->flags & (1U<<estPRES_PREV)) &&
1561             (bGStatEveryStep ||
1562              (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1563         {
1564             /* Store the pressure in t_state for pressure coupling
1565              * at the next MD step.
1566              */
1567             copy_mat(pres, state->pres_prev);
1568         }
1569
1570         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
1571
1572         if ( (membed != nullptr) && (!bLastStep) )
1573         {
1574             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
1575         }
1576
1577         cycles = wallcycle_stop(wcycle, ewcSTEP);
1578         if (DOMAINDECOMP(cr) && wcycle)
1579         {
1580             dd_cycles_add(cr->dd, cycles, ddCyclStep);
1581         }
1582
1583         /* increase the MD step number */
1584         step++;
1585         step_rel++;
1586
1587         resetHandler->resetCounters(
1588                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(),
1589                 nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
1590
1591         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
1592         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
1593
1594     }
1595     /* End of main MD loop */
1596
1597     /* Closing TNG files can include compressing data. Therefore it is good to do that
1598      * before stopping the time measurements. */
1599     mdoutf_tng_close(outf);
1600
1601     /* Stop measuring walltime */
1602     walltime_accounting_end_time(walltime_accounting);
1603
1604     if (!thisRankHasDuty(cr, DUTY_PME))
1605     {
1606         /* Tell the PME only node to finish */
1607         gmx_pme_send_finish(cr);
1608     }
1609
1610     if (MASTER(cr))
1611     {
1612         if (ir->nstcalcenergy > 0)
1613         {
1614             energyOutput.printAnnealingTemperatures(fplog, groups, &(ir->opts));
1615             energyOutput.printAverages(fplog, groups);
1616         }
1617     }
1618     done_mdoutf(outf);
1619
1620     if (bPMETune)
1621     {
1622         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
1623     }
1624
1625     done_shellfc(fplog, shellfc, step_rel);
1626
1627     if (useReplicaExchange && MASTER(cr))
1628     {
1629         print_replica_exchange_statistics(fplog, repl_ex);
1630     }
1631
1632     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
1633
1634     global_stat_destroy(gstat);
1635
1636 }