Move the ownership of the xUpdatedOnDevice event to update constraints
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/pullhistory.h"
130 #include "gromacs/mdtypes/simulation_workload.h"
131 #include "gromacs/mdtypes/state.h"
132 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
133 #include "gromacs/modularsimulator/energydata.h"
134 #include "gromacs/nbnxm/gpu_data_mgmt.h"
135 #include "gromacs/nbnxm/nbnxm.h"
136 #include "gromacs/pbcutil/pbc.h"
137 #include "gromacs/pulling/output.h"
138 #include "gromacs/pulling/pull.h"
139 #include "gromacs/swap/swapcoords.h"
140 #include "gromacs/timing/wallcycle.h"
141 #include "gromacs/timing/walltime_accounting.h"
142 #include "gromacs/topology/atoms.h"
143 #include "gromacs/topology/idef.h"
144 #include "gromacs/topology/mtop_util.h"
145 #include "gromacs/topology/topology.h"
146 #include "gromacs/trajectory/trajectoryframe.h"
147 #include "gromacs/utility/basedefinitions.h"
148 #include "gromacs/utility/cstringutil.h"
149 #include "gromacs/utility/fatalerror.h"
150 #include "gromacs/utility/logger.h"
151 #include "gromacs/utility/real.h"
152 #include "gromacs/utility/smalloc.h"
153
154 #include "legacysimulator.h"
155 #include "replicaexchange.h"
156 #include "shellfc.h"
157
158 using gmx::SimulationSignaller;
159
160 void gmx::LegacySimulator::do_md()
161 {
162     // TODO Historically, the EM and MD "integrators" used different
163     // names for the t_inputrec *parameter, but these must have the
164     // same name, now that it's a member of a struct. We use this ir
165     // alias to avoid a large ripple of nearly useless changes.
166     // t_inputrec is being replaced by IMdpOptionsProvider, so this
167     // will go away eventually.
168     const t_inputrec* ir = inputrec;
169
170     int64_t      step, step_rel;
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     const t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t top(top_global.ffparams);
365
366     ForceBuffers     f(simulationWork.useMts,
367                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
368                                ? PinningPolicy::PinnedIfSupported
369                                : PinningPolicy::CannotBePinned);
370     const t_mdatoms* md = mdAtoms->mdatoms();
371     if (DOMAINDECOMP(cr))
372     {
373         stateInstance = std::make_unique<t_state>();
374         state         = stateInstance.get();
375         dd_init_local_state(*cr->dd, state_global, state);
376
377         /* Distribute the charge groups over the nodes from the master node */
378         dd_partition_system(fplog,
379                             mdlog,
380                             ir->init_step,
381                             cr,
382                             TRUE,
383                             1,
384                             state_global,
385                             top_global,
386                             *ir,
387                             imdSession,
388                             pull_work,
389                             state,
390                             &f,
391                             mdAtoms,
392                             &top,
393                             fr,
394                             vsite,
395                             constr,
396                             nrnb,
397                             nullptr,
398                             FALSE);
399         upd.updateAfterPartition(state->natoms,
400                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
401                                              : gmx::ArrayRef<const unsigned short>(),
402                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
403                                          : gmx::ArrayRef<const unsigned short>());
404     }
405     else
406     {
407         state_change_natoms(state_global, state_global->natoms);
408         /* Copy the pointer to the global state */
409         state = state_global;
410
411         /* Generate and initialize new topology */
412         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
413
414         upd.updateAfterPartition(state->natoms,
415                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
416                                              : gmx::ArrayRef<const unsigned short>(),
417                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
418                                          : gmx::ArrayRef<const unsigned short>());
419     }
420
421     std::unique_ptr<UpdateConstrainGpu> integrator;
422
423     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
424
425     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
426     if (useGpuForUpdate)
427     {
428         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
429                                    || constr->numConstraintsTotal() == 0,
430                            "Constraints in domain decomposition are only supported with update "
431                            "groups if using GPU update.\n");
432         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
433                                    || constr->numConstraintsTotal() == 0,
434                            "SHAKE is not supported with GPU update.");
435         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
436                            "Either PME or short-ranged non-bonded interaction tasks must run on "
437                            "the GPU to use GPU update.\n");
438         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
439                            "Only the md integrator is supported with the GPU update.\n");
440         GMX_RELEASE_ASSERT(
441                 ir->etc != TemperatureCoupling::NoseHoover,
442                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
443         GMX_RELEASE_ASSERT(
444                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
445                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
446                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
447                 "with the GPU update.\n");
448         GMX_RELEASE_ASSERT(!md->haveVsites,
449                            "Virtual sites are not supported with the GPU update.\n");
450         GMX_RELEASE_ASSERT(ed == nullptr,
451                            "Essential dynamics is not supported with the GPU update.\n");
452         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
453                            "Constraints pulling is not supported with the GPU update.\n");
454         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
455                            "Orientation restraints are not supported with the GPU update.\n");
456         GMX_RELEASE_ASSERT(
457                 ir->efep == FreeEnergyPerturbationType::No
458                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
459                 "Free energy perturbation of masses and constraints are not supported with the GPU "
460                 "update.");
461
462         if (constr != nullptr && constr->numConstraintsTotal() > 0)
463         {
464             GMX_LOG(mdlog.info)
465                     .asParagraph()
466                     .appendText("Updating coordinates and applying constraints on the GPU.");
467         }
468         else
469         {
470             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
471         }
472         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
473                            "Device stream manager should be initialized in order to use GPU "
474                            "update-constraints.");
475         GMX_RELEASE_ASSERT(
476                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
477                 "Update stream should be initialized in order to use GPU "
478                 "update-constraints.");
479         integrator = std::make_unique<UpdateConstrainGpu>(
480                 *ir,
481                 top_global,
482                 ekind->ngtc,
483                 fr->deviceStreamManager->context(),
484                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
485                 wcycle);
486
487         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
488
489         integrator->setPbc(PbcType::Xyz, state->box);
490     }
491
492     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
493     {
494         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
495     }
496     if (useGpuForUpdate)
497     {
498         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
499     }
500
501     // NOTE: The global state is no longer used at this point.
502     // But state_global is still used as temporary storage space for writing
503     // the global state to file and potentially for replica exchange.
504     // (Global topology should persist.)
505
506     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
507
508     if (ir->bExpanded)
509     {
510         /* Check nstexpanded here, because the grompp check was broken */
511         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
512         {
513             gmx_fatal(FARGS,
514                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
515         }
516         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
517     }
518
519     if (MASTER(cr))
520     {
521         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
522     }
523
524     preparePrevStepPullCom(ir,
525                            pull_work,
526                            gmx::arrayRefFromArray(md->massT, md->nr),
527                            state,
528                            state_global,
529                            cr,
530                            startingBehavior != StartingBehavior::NewSimulation);
531
532     // TODO: Remove this by converting AWH into a ForceProvider
533     auto awh = prepareAwhModule(fplog,
534                                 *ir,
535                                 state_global,
536                                 cr,
537                                 ms,
538                                 startingBehavior != StartingBehavior::NewSimulation,
539                                 shellfc != nullptr,
540                                 opt2fn("-awh", nfile, fnm),
541                                 pull_work);
542
543     if (useReplicaExchange && MASTER(cr))
544     {
545         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
546     }
547     /* PME tuning is only supported in the Verlet scheme, with PME for
548      * Coulomb. It is not supported with only LJ PME. */
549     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
550                 && ir->cutoff_scheme != CutoffScheme::Group);
551
552     pme_load_balancing_t* pme_loadbal = nullptr;
553     if (bPMETune)
554     {
555         pme_loadbal_init(
556                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
557     }
558
559     if (!ir->bContinuation)
560     {
561         if (state->flags & enumValueToBitMask(StateEntry::V))
562         {
563             auto v = makeArrayRef(state->v);
564             /* Set the velocities of vsites, shells and frozen atoms to zero */
565             for (i = 0; i < md->homenr; i++)
566             {
567                 if (md->ptype[i] == ParticleType::Shell)
568                 {
569                     clear_rvec(v[i]);
570                 }
571                 else if (md->cFREEZE)
572                 {
573                     for (m = 0; m < DIM; m++)
574                     {
575                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
576                         {
577                             v[i][m] = 0;
578                         }
579                     }
580                 }
581             }
582         }
583
584         if (constr)
585         {
586             /* Constrain the initial coordinates and velocities */
587             do_constrain_first(fplog,
588                                constr,
589                                ir,
590                                md->nr,
591                                md->homenr,
592                                state->x.arrayRefWithPadding(),
593                                state->v.arrayRefWithPadding(),
594                                state->box,
595                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
596         }
597     }
598
599     const int nstfep = computeFepPeriod(*ir, replExParams);
600
601     /* Be REALLY careful about what flags you set here. You CANNOT assume
602      * this is the first step, since we might be restarting from a checkpoint,
603      * and in that case we should not do any modifications to the state.
604      */
605     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
606
607     // When restarting from a checkpoint, it can be appropriate to
608     // initialize ekind from quantities in the checkpoint. Otherwise,
609     // compute_globals must initialize ekind before the simulation
610     // starts/restarts. However, only the master rank knows what was
611     // found in the checkpoint file, so we have to communicate in
612     // order to coordinate the restart.
613     //
614     // TODO Consider removing this communication if/when checkpoint
615     // reading directly follows .tpr reading, because all ranks can
616     // agree on hasReadEkinState at that time.
617     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
618     if (PAR(cr))
619     {
620         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
621     }
622     if (hasReadEkinState)
623     {
624         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
625     }
626
627     unsigned int cglo_flags =
628             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
629              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
630
631     bSumEkinhOld = FALSE;
632
633     t_vcm vcm(top_global.groups, *ir);
634     reportComRemovalInfo(fplog, vcm);
635
636     /* To minimize communication, compute_globals computes the COM velocity
637      * and the kinetic energy for the velocities without COM motion removed.
638      * Thus to get the kinetic energy without the COM contribution, we need
639      * to call compute_globals twice.
640      */
641     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
642     {
643         unsigned int cglo_flags_iteration = cglo_flags;
644         if (bStopCM && cgloIteration == 0)
645         {
646             cglo_flags_iteration |= CGLO_STOPCM;
647             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
648         }
649         if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
650             && cgloIteration == 0)
651         {
652             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
653         }
654         compute_globals(gstat,
655                         cr,
656                         ir,
657                         fr,
658                         ekind,
659                         makeConstArrayRef(state->x),
660                         makeConstArrayRef(state->v),
661                         state->box,
662                         md,
663                         nrnb,
664                         &vcm,
665                         nullptr,
666                         enerd,
667                         force_vir,
668                         shake_vir,
669                         total_vir,
670                         pres,
671                         gmx::ArrayRef<real>{},
672                         &nullSignaller,
673                         state->box,
674                         &bSumEkinhOld,
675                         cglo_flags_iteration);
676         if (cglo_flags_iteration & CGLO_STOPCM)
677         {
678             /* At initialization, do not pass x with acceleration-correction mode
679              * to avoid (incorrect) correction of the initial coordinates.
680              */
681             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
682                              ? ArrayRef<RVec>()
683                              : makeArrayRef(state->x);
684             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
685             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
686         }
687     }
688     if (DOMAINDECOMP(cr))
689     {
690         dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
691                 &top, makeConstArrayRef(state->x), state->box);
692     }
693     if (ir->eI == IntegrationAlgorithm::VVAK)
694     {
695         /* a second call to get the half step temperature initialized as well */
696         /* we do the same call as above, but turn the pressure off -- internally to
697            compute_globals, this is recognized as a velocity verlet half-step
698            kinetic energy calculation.  This minimized excess variables, but
699            perhaps loses some logic?*/
700
701         compute_globals(gstat,
702                         cr,
703                         ir,
704                         fr,
705                         ekind,
706                         makeConstArrayRef(state->x),
707                         makeConstArrayRef(state->v),
708                         state->box,
709                         md,
710                         nrnb,
711                         &vcm,
712                         nullptr,
713                         enerd,
714                         force_vir,
715                         shake_vir,
716                         total_vir,
717                         pres,
718                         gmx::ArrayRef<real>{},
719                         &nullSignaller,
720                         state->box,
721                         &bSumEkinhOld,
722                         cglo_flags & ~CGLO_PRESSURE);
723     }
724
725     /* Calculate the initial half step temperature, and save the ekinh_old */
726     if (startingBehavior == StartingBehavior::NewSimulation)
727     {
728         for (i = 0; (i < ir->opts.ngtc); i++)
729         {
730             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
731         }
732     }
733
734     /* need to make an initiation call to get the Trotter variables set, as well as other constants
735        for non-trotter temperature control */
736     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
737
738     if (MASTER(cr))
739     {
740         if (!ir->bContinuation)
741         {
742             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
743             {
744                 fprintf(fplog,
745                         "RMS relative constraint deviation after constraining: %.2e\n",
746                         constr->rmsd());
747             }
748             if (EI_STATE_VELOCITY(ir->eI))
749             {
750                 real temp = enerd->term[F_TEMP];
751                 if (ir->eI != IntegrationAlgorithm::VV)
752                 {
753                     /* Result of Ekin averaged over velocities of -half
754                      * and +half step, while we only have -half step here.
755                      */
756                     temp *= 2;
757                 }
758                 fprintf(fplog, "Initial temperature: %g K\n", temp);
759             }
760         }
761
762         char tbuf[20];
763         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
764         if (ir->nsteps >= 0)
765         {
766             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
767         }
768         else
769         {
770             sprintf(tbuf, "%s", "infinite");
771         }
772         if (ir->init_step > 0)
773         {
774             fprintf(stderr,
775                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
776                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
777                     tbuf,
778                     gmx_step_str(ir->init_step, sbuf2),
779                     ir->init_step * ir->delta_t);
780         }
781         else
782         {
783             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
784         }
785         fprintf(fplog, "\n");
786     }
787
788     walltime_accounting_start_time(walltime_accounting);
789     wallcycle_start(wcycle, WallCycleCounter::Run);
790     print_start(fplog, cr, walltime_accounting, "mdrun");
791
792     /***********************************************************
793      *
794      *             Loop over MD steps
795      *
796      ************************************************************/
797
798     bFirstStep = TRUE;
799     /* Skip the first Nose-Hoover integration when we get the state from tpx */
800     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
801     bSumEkinhOld     = FALSE;
802     bExchanged       = FALSE;
803     bNeedRepartition = FALSE;
804
805     step     = ir->init_step;
806     step_rel = 0;
807
808     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
809             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
810             simulationsShareState,
811             MASTER(cr),
812             ir->nstlist,
813             mdrunOptions.reproducible,
814             nstSignalComm,
815             mdrunOptions.maximumHoursToRun,
816             ir->nstlist == 0,
817             fplog,
818             step,
819             bNS,
820             walltime_accounting);
821
822     auto checkpointHandler = std::make_unique<CheckpointHandler>(
823             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
824             simulationsShareState,
825             ir->nstlist == 0,
826             MASTER(cr),
827             mdrunOptions.writeConfout,
828             mdrunOptions.checkpointOptions.period);
829
830     const bool resetCountersIsLocal = true;
831     auto       resetHandler         = std::make_unique<ResetHandler>(
832             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
833             !resetCountersIsLocal,
834             ir->nsteps,
835             MASTER(cr),
836             mdrunOptions.timingOptions.resetHalfway,
837             mdrunOptions.maximumHoursToRun,
838             mdlog,
839             wcycle,
840             walltime_accounting);
841
842     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
843
844     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
845     {
846         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
847     }
848
849     /* and stop now if we should */
850     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
851     while (!bLastStep)
852     {
853
854         /* Determine if this is a neighbor search step */
855         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
856
857         if (bPMETune && bNStList)
858         {
859             // This has to be here because PME load balancing is called so early.
860             // TODO: Move to after all booleans are defined.
861             if (useGpuForUpdate && !bFirstStep)
862             {
863                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
864                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
865             }
866             /* PME grid + cut-off optimization with GPUs or PME nodes */
867             pme_loadbal_do(pme_loadbal,
868                            cr,
869                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
870                            fplog,
871                            mdlog,
872                            *ir,
873                            fr,
874                            state->box,
875                            state->x,
876                            wcycle,
877                            step,
878                            step_rel,
879                            &bPMETunePrinting,
880                            simulationWork.useGpuPmePpCommunication);
881         }
882
883         wallcycle_start(wcycle, WallCycleCounter::Step);
884
885         bLastStep = (step_rel == ir->nsteps);
886         t         = t0 + step * ir->delta_t;
887
888         // TODO Refactor this, so that nstfep does not need a default value of zero
889         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
890         {
891             /* find and set the current lambdas */
892             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
893
894             bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
895             bDoFEP  = ((ir->efep != FreeEnergyPerturbationType::No) && do_per_step(step, nstfep));
896             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
897                            && (!bFirstStep));
898         }
899
900         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
901                      && do_per_step(step, replExParams.exchangeInterval));
902
903         if (doSimulatedAnnealing)
904         {
905             // TODO: Avoid changing inputrec (#3854)
906             // Simulated annealing updates the reference temperature.
907             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
908             update_annealing_target_temp(nonConstInputrec, t, &upd);
909         }
910
911         /* Stop Center of Mass motion */
912         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
913
914         /* Determine whether or not to do Neighbour Searching */
915         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
916
917         /* Note that the stopHandler will cause termination at nstglobalcomm
918          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
919          * nstpcouple steps, we have computed the half-step kinetic energy
920          * of the previous step and can always output energies at the last step.
921          */
922         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
923
924         /* do_log triggers energy and virial calculation. Because this leads
925          * to different code paths, forces can be different. Thus for exact
926          * continuation we should avoid extra log output.
927          * Note that the || bLastStep can result in non-exact continuation
928          * beyond the last step. But we don't consider that to be an issue.
929          */
930         do_log     = (do_per_step(step, ir->nstlog)
931                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
932         do_verbose = mdrunOptions.verbose
933                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
934
935         if (useGpuForUpdate && !bFirstStep && bNS)
936         {
937             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
938             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
939             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
940             // Copy coordinate from the GPU when needed at the search step.
941             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
942             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
943             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
944             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
945         }
946
947         // We only need to calculate virtual velocities if we are writing them in the current step
948         const bool needVirtualVelocitiesThisStep =
949                 (vsite != nullptr)
950                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
951
952         if (vsite != nullptr)
953         {
954             // Virtual sites need to be updated before domain decomposition and forces are calculated
955             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
956             // md-vv calculates virtual velocities once it has full-step real velocities
957             vsite->construct(state->x,
958                              state->v,
959                              state->box,
960                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
961                                      ? VSiteOperation::PositionsAndVelocities
962                                      : VSiteOperation::Positions);
963             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
964         }
965
966         if (bNS && !(bFirstStep && ir->bContinuation))
967         {
968             bMasterState = FALSE;
969             /* Correct the new box if it is too skewed */
970             if (inputrecDynamicBox(ir))
971             {
972                 if (correct_box(fplog, step, state->box))
973                 {
974                     bMasterState = TRUE;
975                     // If update is offloaded, it should be informed about the box size change
976                     if (useGpuForUpdate)
977                     {
978                         integrator->setPbc(PbcType::Xyz, state->box);
979                     }
980                 }
981             }
982             if (DOMAINDECOMP(cr) && bMasterState)
983             {
984                 dd_collect_state(cr->dd, state, state_global);
985             }
986
987             if (DOMAINDECOMP(cr))
988             {
989                 /* Repartition the domain decomposition */
990                 dd_partition_system(fplog,
991                                     mdlog,
992                                     step,
993                                     cr,
994                                     bMasterState,
995                                     nstglobalcomm,
996                                     state_global,
997                                     top_global,
998                                     *ir,
999                                     imdSession,
1000                                     pull_work,
1001                                     state,
1002                                     &f,
1003                                     mdAtoms,
1004                                     &top,
1005                                     fr,
1006                                     vsite,
1007                                     constr,
1008                                     nrnb,
1009                                     wcycle,
1010                                     do_verbose && !bPMETunePrinting);
1011                 upd.updateAfterPartition(state->natoms,
1012                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1013                                                      : gmx::ArrayRef<const unsigned short>(),
1014                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1015                                                  : gmx::ArrayRef<const unsigned short>());
1016             }
1017         }
1018
1019         // Allocate or re-size GPU halo exchange object, if necessary
1020         if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange)
1021         {
1022             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1023                                "GPU device manager has to be initialized to use GPU "
1024                                "version of halo exchange.");
1025             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1026         }
1027
1028         if (MASTER(cr) && do_log)
1029         {
1030             gmx::EnergyOutput::printHeader(
1031                     fplog, step, t); /* can we improve the information printed here? */
1032         }
1033
1034         if (ir->efep != FreeEnergyPerturbationType::No)
1035         {
1036             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1037         }
1038
1039         if (bExchanged)
1040         {
1041             /* We need the kinetic energy at minus the half step for determining
1042              * the full step kinetic energy and possibly for T-coupling.*/
1043             /* This may not be quite working correctly yet . . . . */
1044             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1045             if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions())
1046             {
1047                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1048             }
1049             compute_globals(gstat,
1050                             cr,
1051                             ir,
1052                             fr,
1053                             ekind,
1054                             makeConstArrayRef(state->x),
1055                             makeConstArrayRef(state->v),
1056                             state->box,
1057                             md,
1058                             nrnb,
1059                             &vcm,
1060                             wcycle,
1061                             enerd,
1062                             nullptr,
1063                             nullptr,
1064                             nullptr,
1065                             nullptr,
1066                             gmx::ArrayRef<real>{},
1067                             &nullSignaller,
1068                             state->box,
1069                             &bSumEkinhOld,
1070                             cglo_flags);
1071             if (DOMAINDECOMP(cr))
1072             {
1073                 dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1074                         &top, makeConstArrayRef(state->x), state->box);
1075             }
1076         }
1077         clear_mat(force_vir);
1078
1079         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1080
1081         /* Determine the energy and pressure:
1082          * at nstcalcenergy steps and at energy output steps (set below).
1083          */
1084         if (EI_VV(ir->eI) && (!bInitStep))
1085         {
1086             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1087             bCalcVir      = bCalcEnerStep
1088                        || (ir->epc != PressureCoupling::No
1089                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1090         }
1091         else
1092         {
1093             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1094             bCalcVir      = bCalcEnerStep
1095                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1096         }
1097         bCalcEner = bCalcEnerStep;
1098
1099         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1100
1101         if (do_ene || do_log || bDoReplEx)
1102         {
1103             bCalcVir  = TRUE;
1104             bCalcEner = TRUE;
1105         }
1106
1107         /* Do we need global communication ? */
1108         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1109                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1110
1111         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1112                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1113                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (bDoFEP ? GMX_FORCE_DHDL : 0));
1114         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1115         {
1116             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1117             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1118         }
1119
1120         if (shellfc)
1121         {
1122             /* Now is the time to relax the shells */
1123             relax_shell_flexcon(fplog,
1124                                 cr,
1125                                 ms,
1126                                 mdrunOptions.verbose,
1127                                 enforcedRotation,
1128                                 step,
1129                                 ir,
1130                                 imdSession,
1131                                 pull_work,
1132                                 bNS,
1133                                 force_flags,
1134                                 &top,
1135                                 constr,
1136                                 enerd,
1137                                 state->natoms,
1138                                 state->x.arrayRefWithPadding(),
1139                                 state->v.arrayRefWithPadding(),
1140                                 state->box,
1141                                 state->lambda,
1142                                 &state->hist,
1143                                 &f.view(),
1144                                 force_vir,
1145                                 *md,
1146                                 nrnb,
1147                                 wcycle,
1148                                 shellfc,
1149                                 fr,
1150                                 runScheduleWork,
1151                                 t,
1152                                 mu_tot,
1153                                 vsite,
1154                                 ddBalanceRegionHandler);
1155         }
1156         else
1157         {
1158             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1159                is updated (or the AWH update will be performed twice for one step when continuing).
1160                It would be best to call this update function from do_md_trajectory_writing but that
1161                would occur after do_force. One would have to divide the update_awh function into one
1162                function applying the AWH force and one doing the AWH bias update. The update AWH
1163                bias function could then be called after do_md_trajectory_writing (then containing
1164                update_awh_history). The checkpointing will in the future probably moved to the start
1165                of the md loop which will rid of this issue. */
1166             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1167             {
1168                 awh->updateHistory(state_global->awhHistory.get());
1169             }
1170
1171             /* The coordinates (x) are shifted (to get whole molecules)
1172              * in do_force.
1173              * This is parallellized as well, and does communication too.
1174              * Check comments in sim_util.c
1175              */
1176             do_force(fplog,
1177                      cr,
1178                      ms,
1179                      *ir,
1180                      awh.get(),
1181                      enforcedRotation,
1182                      imdSession,
1183                      pull_work,
1184                      step,
1185                      nrnb,
1186                      wcycle,
1187                      &top,
1188                      state->box,
1189                      state->x.arrayRefWithPadding(),
1190                      &state->hist,
1191                      &f.view(),
1192                      force_vir,
1193                      md,
1194                      enerd,
1195                      state->lambda,
1196                      fr,
1197                      runScheduleWork,
1198                      vsite,
1199                      mu_tot,
1200                      t,
1201                      ed ? ed->getLegacyED() : nullptr,
1202                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1203                      ddBalanceRegionHandler);
1204         }
1205
1206         // VV integrators do not need the following velocity half step
1207         // if it is the first step after starting from a checkpoint.
1208         // That is, the half step is needed on all other steps, and
1209         // also the first step when starting from a .tpr file.
1210         if (EI_VV(ir->eI))
1211         {
1212             integrateVVFirstStep(step,
1213                                  bFirstStep,
1214                                  bInitStep,
1215                                  startingBehavior,
1216                                  nstglobalcomm,
1217                                  ir,
1218                                  fr,
1219                                  cr,
1220                                  state,
1221                                  mdAtoms->mdatoms(),
1222                                  fcdata,
1223                                  &MassQ,
1224                                  &vcm,
1225                                  top,
1226                                  enerd,
1227                                  ekind,
1228                                  gstat,
1229                                  &last_ekin,
1230                                  bCalcVir,
1231                                  total_vir,
1232                                  shake_vir,
1233                                  force_vir,
1234                                  pres,
1235                                  M,
1236                                  do_log,
1237                                  do_ene,
1238                                  bCalcEner,
1239                                  bGStat,
1240                                  bStopCM,
1241                                  bTrotter,
1242                                  bExchanged,
1243                                  &bSumEkinhOld,
1244                                  &saved_conserved_quantity,
1245                                  &f,
1246                                  &upd,
1247                                  constr,
1248                                  &nullSignaller,
1249                                  trotter_seq,
1250                                  nrnb,
1251                                  fplog,
1252                                  wcycle);
1253             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1254             {
1255                 // Positions were calculated earlier
1256                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1257                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1258                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1259             }
1260         }
1261
1262         /* ########  END FIRST UPDATE STEP  ############## */
1263         /* ########  If doing VV, we now have v(dt) ###### */
1264         if (bDoExpanded)
1265         {
1266             /* perform extended ensemble sampling in lambda - we don't
1267                actually move to the new state before outputting
1268                statistics, but if performing simulated tempering, we
1269                do update the velocities and the tau_t. */
1270             // TODO: Avoid changing inputrec (#3854)
1271             // Simulated tempering updates the reference temperature.
1272             // Expanded ensemble without simulated tempering does not change the inputrec.
1273             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1274             lamnew                 = ExpandedEnsembleDynamics(fplog,
1275                                               nonConstInputrec,
1276                                               enerd,
1277                                               state,
1278                                               &MassQ,
1279                                               state->fep_state,
1280                                               state->dfhist,
1281                                               step,
1282                                               state->v.rvec_array(),
1283                                               md->homenr,
1284                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1285                                                                       : gmx::ArrayRef<const unsigned short>());
1286             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1287             if (MASTER(cr))
1288             {
1289                 copy_df_history(state_global->dfhist, state->dfhist);
1290             }
1291         }
1292
1293         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1294         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1295         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1296             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1297                 || checkpointHandler->isCheckpointingStep()))
1298         {
1299             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1300             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1301         }
1302         // Copy velocities if needed for the output/checkpointing.
1303         // NOTE: Copy on the search steps is done at the beginning of the step.
1304         if (useGpuForUpdate && !bNS
1305             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1306         {
1307             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1308             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1309         }
1310         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1311         // and update is offloaded hence forces are kept on the GPU for update and have not been
1312         // already transferred in do_force().
1313         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1314         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1315         //       prior to GPU update.
1316         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1317         //       copy call in do_force(...).
1318         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1319         //       on host after the D2H copy in do_force(...).
1320         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1321             && do_per_step(step, ir->nstfout))
1322         {
1323             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1324             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1325         }
1326         /* Now we have the energies and forces corresponding to the
1327          * coordinates at time t. We must output all of this before
1328          * the update.
1329          */
1330         do_md_trajectory_writing(fplog,
1331                                  cr,
1332                                  nfile,
1333                                  fnm,
1334                                  step,
1335                                  step_rel,
1336                                  t,
1337                                  ir,
1338                                  state,
1339                                  state_global,
1340                                  observablesHistory,
1341                                  top_global,
1342                                  fr,
1343                                  outf,
1344                                  energyOutput,
1345                                  ekind,
1346                                  f.view().force(),
1347                                  checkpointHandler->isCheckpointingStep(),
1348                                  bRerunMD,
1349                                  bLastStep,
1350                                  mdrunOptions.writeConfout,
1351                                  bSumEkinhOld);
1352         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1353         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1354
1355         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1356         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1357             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1358         {
1359             copy_mat(state->svir_prev, shake_vir);
1360             copy_mat(state->fvir_prev, force_vir);
1361         }
1362
1363         stopHandler->setSignal();
1364         resetHandler->setSignal(walltime_accounting);
1365
1366         if (bGStat || !PAR(cr))
1367         {
1368             /* In parallel we only have to check for checkpointing in steps
1369              * where we do global communication,
1370              *  otherwise the other nodes don't know.
1371              */
1372             checkpointHandler->setSignal(walltime_accounting);
1373         }
1374
1375         /* #########   START SECOND UPDATE STEP ################# */
1376
1377         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1378            controlled in preprocessing */
1379
1380         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1381         {
1382             gmx_bool bIfRandomize;
1383             bIfRandomize = update_randomize_velocities(ir,
1384                                                        step,
1385                                                        cr,
1386                                                        md->homenr,
1387                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1388                                                                : gmx::ArrayRef<const unsigned short>(),
1389                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1390                                                        state->v,
1391                                                        &upd,
1392                                                        constr);
1393             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1394             if (constr && bIfRandomize)
1395             {
1396                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1397             }
1398         }
1399         /* Box is changed in update() when we do pressure coupling,
1400          * but we should still use the old box for energy corrections and when
1401          * writing it to the energy file, so it matches the trajectory files for
1402          * the same timestep above. Make a copy in a separate array.
1403          */
1404         copy_mat(state->box, lastbox);
1405
1406         dvdl_constr = 0;
1407
1408         if (!useGpuForUpdate)
1409         {
1410             wallcycle_start(wcycle, WallCycleCounter::Update);
1411         }
1412         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1413         if (bTrotter)
1414         {
1415             trotter_update(ir,
1416                            step,
1417                            ekind,
1418                            enerd,
1419                            state,
1420                            total_vir,
1421                            md->homenr,
1422                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1423                                    : gmx::ArrayRef<const unsigned short>(),
1424                            gmx::arrayRefFromArray(md->invmass, md->nr),
1425                            &MassQ,
1426                            trotter_seq,
1427                            TrotterSequence::Three);
1428             /* We can only do Berendsen coupling after we have summed
1429              * the kinetic energy or virial. Since the happens
1430              * in global_state after update, we should only do it at
1431              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1432              */
1433         }
1434         else
1435         {
1436             update_tcouple(step,
1437                            ir,
1438                            state,
1439                            ekind,
1440                            &MassQ,
1441                            md->homenr,
1442                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1443                                    : gmx::ArrayRef<const unsigned short>());
1444             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1445         }
1446
1447         /* With leap-frog type integrators we compute the kinetic energy
1448          * at a whole time step as the average of the half-time step kinetic
1449          * energies of two subsequent steps. Therefore we need to compute the
1450          * half step kinetic energy also if we need energies at the next step.
1451          */
1452         const bool needHalfStepKineticEnergy =
1453                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1454
1455         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1456         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1457         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1458                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1459
1460         if (EI_VV(ir->eI))
1461         {
1462             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1463
1464             integrateVVSecondStep(step,
1465                                   ir,
1466                                   fr,
1467                                   cr,
1468                                   state,
1469                                   mdAtoms->mdatoms(),
1470                                   fcdata,
1471                                   &MassQ,
1472                                   &vcm,
1473                                   pull_work,
1474                                   enerd,
1475                                   ekind,
1476                                   gstat,
1477                                   &dvdl_constr,
1478                                   bCalcVir,
1479                                   total_vir,
1480                                   shake_vir,
1481                                   force_vir,
1482                                   pres,
1483                                   M,
1484                                   lastbox,
1485                                   do_log,
1486                                   do_ene,
1487                                   bGStat,
1488                                   &bSumEkinhOld,
1489                                   &f,
1490                                   &cbuf,
1491                                   &upd,
1492                                   constr,
1493                                   &nullSignaller,
1494                                   trotter_seq,
1495                                   nrnb,
1496                                   wcycle);
1497         }
1498         else
1499         {
1500             if (useGpuForUpdate)
1501             {
1502                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1503                 {
1504                     integrator->set(stateGpu->getCoordinates(),
1505                                     stateGpu->getVelocities(),
1506                                     stateGpu->getForces(),
1507                                     top.idef,
1508                                     *md);
1509
1510                     // Copy data to the GPU after buffers might have being reinitialized
1511                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1512                      * the previous step. We don't check that now. */
1513                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1514                     // coordinates have been copied already if PME or buffer ops has not needed it this step.
1515                     const bool useGpuPmeOnThisRank = runScheduleWork->simulationWork.useGpuPme
1516                                                      && thisRankHasDuty(cr, DUTY_PME)
1517                                                      && runScheduleWork->stepWork.computeSlowForces;
1518                     if (!useGpuPmeOnThisRank && !runScheduleWork->stepWork.useGpuXBufferOps)
1519                     {
1520                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1521                     }
1522                 }
1523
1524                 if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
1525                     && !thisRankHasDuty(cr, DUTY_PME))
1526                 {
1527                     // The PME forces were recieved to the host, so have to be copied
1528                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1529                 }
1530                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1531                 {
1532                     // The buffer ops were not offloaded this step, so the forces are on the
1533                     // host and have to be copied
1534                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1535                 }
1536
1537                 const bool doTemperatureScaling =
1538                         (ir->etc != TemperatureCoupling::No
1539                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1540
1541                 // This applies Leap-Frog, LINCS and SETTLE in succession
1542                 integrator->integrate(
1543                         stateGpu->getForcesReadyOnDeviceEvent(
1544                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1545                         ir->delta_t,
1546                         true,
1547                         bCalcVir,
1548                         shake_vir,
1549                         doTemperatureScaling,
1550                         ekind->tcstat,
1551                         doParrinelloRahman,
1552                         ir->nstpcouple * ir->delta_t,
1553                         M);
1554
1555                 // Copy velocities D2H after update if:
1556                 // - Globals are computed this step (includes the energy output steps).
1557                 // - Temperature is needed for the next step.
1558                 if (bGStat || needHalfStepKineticEnergy)
1559                 {
1560                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1561                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1562                 }
1563             }
1564             else
1565             {
1566                 /* With multiple time stepping we need to do an additional normal
1567                  * update step to obtain the virial, as the actual MTS integration
1568                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1569                  * Using that acceleration would result in a virial with the slow
1570                  * force contribution would be a factor mtsFactor too large.
1571                  */
1572                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1573                 {
1574                     upd.update_for_constraint_virial(*ir,
1575                                                      md->homenr,
1576                                                      md->havePartiallyFrozenAtoms,
1577                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1578                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1579                                                      *state,
1580                                                      f.view().forceWithPadding(),
1581                                                      *ekind);
1582
1583                     constrain_coordinates(constr,
1584                                           do_log,
1585                                           do_ene,
1586                                           step,
1587                                           state,
1588                                           upd.xp()->arrayRefWithPadding(),
1589                                           &dvdl_constr,
1590                                           bCalcVir,
1591                                           shake_vir);
1592                 }
1593
1594                 ArrayRefWithPadding<const RVec> forceCombined =
1595                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1596                                 ? f.view().forceMtsCombinedWithPadding()
1597                                 : f.view().forceWithPadding();
1598                 upd.update_coords(*ir,
1599                                   step,
1600                                   md->homenr,
1601                                   md->havePartiallyFrozenAtoms,
1602                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1603                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1604                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1605                                   state,
1606                                   forceCombined,
1607                                   fcdata,
1608                                   ekind,
1609                                   M,
1610                                   etrtPOSITION,
1611                                   cr,
1612                                   constr != nullptr);
1613
1614                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1615
1616                 constrain_coordinates(constr,
1617                                       do_log,
1618                                       do_ene,
1619                                       step,
1620                                       state,
1621                                       upd.xp()->arrayRefWithPadding(),
1622                                       &dvdl_constr,
1623                                       bCalcVir && !simulationWork.useMts,
1624                                       shake_vir);
1625
1626                 upd.update_sd_second_half(*ir,
1627                                           step,
1628                                           &dvdl_constr,
1629                                           md->homenr,
1630                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1631                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1632                                           state,
1633                                           cr,
1634                                           nrnb,
1635                                           wcycle,
1636                                           constr,
1637                                           do_log,
1638                                           do_ene);
1639                 upd.finish_update(
1640                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1641             }
1642
1643             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1644             {
1645                 updatePrevStepPullCom(pull_work, state);
1646             }
1647
1648             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1649         }
1650
1651         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1652         /* With Leap-Frog we can skip compute_globals at
1653          * non-communication steps, but we need to calculate
1654          * the kinetic energy one step before communication.
1655          */
1656         {
1657             // Organize to do inter-simulation signalling on steps if
1658             // and when algorithms require it.
1659             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1660
1661             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1662             {
1663                 // Copy coordinates when needed to stop the CM motion.
1664                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1665                 {
1666                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1667                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1668                 }
1669                 // Since we're already communicating at this step, we
1670                 // can propagate intra-simulation signals. Note that
1671                 // check_nstglobalcomm has the responsibility for
1672                 // choosing the value of nstglobalcomm that is one way
1673                 // bGStat becomes true, so we can't get into a
1674                 // situation where e.g. checkpointing can't be
1675                 // signalled.
1676                 bool                doIntraSimSignal = true;
1677                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1678
1679                 compute_globals(
1680                         gstat,
1681                         cr,
1682                         ir,
1683                         fr,
1684                         ekind,
1685                         makeConstArrayRef(state->x),
1686                         makeConstArrayRef(state->v),
1687                         state->box,
1688                         md,
1689                         nrnb,
1690                         &vcm,
1691                         wcycle,
1692                         enerd,
1693                         force_vir,
1694                         shake_vir,
1695                         total_vir,
1696                         pres,
1697                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1698                                                                            : gmx::ArrayRef<real>{},
1699                         &signaller,
1700                         lastbox,
1701                         &bSumEkinhOld,
1702                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1703                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1704                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1705                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1706                                 | (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
1707                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1708                                            : 0));
1709                 if (DOMAINDECOMP(cr))
1710                 {
1711                     dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1712                             &top, makeConstArrayRef(state->x), state->box);
1713                 }
1714                 if (!EI_VV(ir->eI) && bStopCM)
1715                 {
1716                     process_and_stopcm_grp(
1717                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1718                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1719
1720                     // TODO: The special case of removing CM motion should be dealt more gracefully
1721                     if (useGpuForUpdate)
1722                     {
1723                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1724                         // Here we block until the H2D copy completes because event sync with the
1725                         // force kernels that use the coordinates on the next steps is not implemented
1726                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1727                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1728                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1729                         if (vcm.mode != ComRemovalAlgorithm::No)
1730                         {
1731                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1732                         }
1733                     }
1734                 }
1735             }
1736         }
1737
1738         /* #############  END CALC EKIN AND PRESSURE ################# */
1739
1740         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1741            the virial that should probably be addressed eventually. state->veta has better properies,
1742            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1743            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1744
1745         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1746         {
1747             /* Sum up the foreign energy and dK/dl terms for md and sd.
1748                Currently done every step so that dH/dl is correct in the .edr */
1749             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1750         }
1751
1752         update_pcouple_after_coordinates(fplog,
1753                                          step,
1754                                          ir,
1755                                          md->homenr,
1756                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1757                                                      : gmx::ArrayRef<const unsigned short>(),
1758                                          pres,
1759                                          force_vir,
1760                                          shake_vir,
1761                                          pressureCouplingMu,
1762                                          state,
1763                                          nrnb,
1764                                          upd.deform(),
1765                                          !useGpuForUpdate);
1766
1767         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1768                                                   && do_per_step(step, inputrec->nstpcouple));
1769         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1770                                                  && do_per_step(step, inputrec->nstpcouple));
1771         if (useGpuForUpdate
1772             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1773         {
1774             integrator->scaleCoordinates(pressureCouplingMu);
1775             if (doCRescalePressureCoupling)
1776             {
1777                 matrix pressureCouplingInvMu;
1778                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1779                 integrator->scaleVelocities(pressureCouplingInvMu);
1780             }
1781             integrator->setPbc(PbcType::Xyz, state->box);
1782         }
1783
1784         /* ################# END UPDATE STEP 2 ################# */
1785         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1786
1787         /* The coordinates (x) were unshifted in update */
1788         if (!bGStat)
1789         {
1790             /* We will not sum ekinh_old,
1791              * so signal that we still have to do it.
1792              */
1793             bSumEkinhOld = TRUE;
1794         }
1795
1796         if (bCalcEner)
1797         {
1798             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1799
1800             /* use the directly determined last velocity, not actually the averaged half steps */
1801             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1802             {
1803                 enerd->term[F_EKIN] = last_ekin;
1804             }
1805             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1806
1807             if (integratorHasConservedEnergyQuantity(ir))
1808             {
1809                 if (EI_VV(ir->eI))
1810                 {
1811                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1812                 }
1813                 else
1814                 {
1815                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1816                 }
1817             }
1818             /* #########  END PREPARING EDR OUTPUT  ###########  */
1819         }
1820
1821         /* Output stuff */
1822         if (MASTER(cr))
1823         {
1824             if (fplog && do_log && bDoExpanded)
1825             {
1826                 /* only needed if doing expanded ensemble */
1827                 PrintFreeEnergyInfoToFile(fplog,
1828                                           ir->fepvals.get(),
1829                                           ir->expandedvals.get(),
1830                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1831                                           state_global->dfhist,
1832                                           state->fep_state,
1833                                           ir->nstlog,
1834                                           step);
1835             }
1836             if (bCalcEner)
1837             {
1838                 energyOutput.addDataAtEnergyStep(bDoDHDL,
1839                                                  bCalcEnerStep,
1840                                                  t,
1841                                                  md->tmass,
1842                                                  enerd,
1843                                                  ir->fepvals.get(),
1844                                                  ir->expandedvals.get(),
1845                                                  lastbox,
1846                                                  PTCouplingArrays{ state->boxv,
1847                                                                    state->nosehoover_xi,
1848                                                                    state->nosehoover_vxi,
1849                                                                    state->nhpres_xi,
1850                                                                    state->nhpres_vxi },
1851                                                  state->fep_state,
1852                                                  total_vir,
1853                                                  pres,
1854                                                  ekind,
1855                                                  mu_tot,
1856                                                  constr);
1857             }
1858             else
1859             {
1860                 energyOutput.recordNonEnergyStep();
1861             }
1862
1863             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1864             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1865
1866             if (doSimulatedAnnealing)
1867             {
1868                 gmx::EnergyOutput::printAnnealingTemperatures(
1869                         do_log ? fplog : nullptr, groups, &(ir->opts));
1870             }
1871             if (do_log || do_ene || do_dr || do_or)
1872             {
1873                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1874                                                    do_ene,
1875                                                    do_dr,
1876                                                    do_or,
1877                                                    do_log ? fplog : nullptr,
1878                                                    step,
1879                                                    t,
1880                                                    fr->fcdata.get(),
1881                                                    awh.get());
1882             }
1883             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1884             {
1885                 const bool isInitialOutput = false;
1886                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1887             }
1888
1889             if (ir->bPull)
1890             {
1891                 pull_print_output(pull_work, step, t);
1892             }
1893
1894             if (do_per_step(step, ir->nstlog))
1895             {
1896                 if (fflush(fplog) != 0)
1897                 {
1898                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1899                 }
1900             }
1901         }
1902         if (bDoExpanded)
1903         {
1904             /* Have to do this part _after_ outputting the logfile and the edr file */
1905             /* Gets written into the state at the beginning of next loop*/
1906             state->fep_state = lamnew;
1907         }
1908         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1909         {
1910             state->fep_state = awh->fepLambdaState();
1911         }
1912         /* Print the remaining wall clock time for the run */
1913         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1914         {
1915             if (shellfc)
1916             {
1917                 fprintf(stderr, "\n");
1918             }
1919             print_time(stderr, walltime_accounting, step, ir, cr);
1920         }
1921
1922         /* Ion/water position swapping.
1923          * Not done in last step since trajectory writing happens before this call
1924          * in the MD loop and exchanges would be lost anyway. */
1925         bNeedRepartition = FALSE;
1926         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1927             && do_per_step(step, ir->swap->nstswap))
1928         {
1929             bNeedRepartition = do_swapcoords(cr,
1930                                              step,
1931                                              t,
1932                                              ir,
1933                                              swap,
1934                                              wcycle,
1935                                              as_rvec_array(state->x.data()),
1936                                              state->box,
1937                                              MASTER(cr) && mdrunOptions.verbose,
1938                                              bRerunMD);
1939
1940             if (bNeedRepartition && DOMAINDECOMP(cr))
1941             {
1942                 dd_collect_state(cr->dd, state, state_global);
1943             }
1944         }
1945
1946         /* Replica exchange */
1947         bExchanged = FALSE;
1948         if (bDoReplEx)
1949         {
1950             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1951         }
1952
1953         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1954         {
1955             dd_partition_system(fplog,
1956                                 mdlog,
1957                                 step,
1958                                 cr,
1959                                 TRUE,
1960                                 1,
1961                                 state_global,
1962                                 top_global,
1963                                 *ir,
1964                                 imdSession,
1965                                 pull_work,
1966                                 state,
1967                                 &f,
1968                                 mdAtoms,
1969                                 &top,
1970                                 fr,
1971                                 vsite,
1972                                 constr,
1973                                 nrnb,
1974                                 wcycle,
1975                                 FALSE);
1976             upd.updateAfterPartition(state->natoms,
1977                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1978                                                  : gmx::ArrayRef<const unsigned short>(),
1979                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1980                                              : gmx::ArrayRef<const unsigned short>());
1981         }
1982
1983         bFirstStep = FALSE;
1984         bInitStep  = FALSE;
1985
1986         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
1987         /* With all integrators, except VV, we need to retain the pressure
1988          * at the current step for coupling at the next step.
1989          */
1990         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
1991             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
1992         {
1993             /* Store the pressure in t_state for pressure coupling
1994              * at the next MD step.
1995              */
1996             copy_mat(pres, state->pres_prev);
1997         }
1998
1999         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2000
2001         if ((membed != nullptr) && (!bLastStep))
2002         {
2003             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2004         }
2005
2006         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2007         if (DOMAINDECOMP(cr) && wcycle)
2008         {
2009             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2010         }
2011
2012         /* increase the MD step number */
2013         step++;
2014         step_rel++;
2015
2016 #if GMX_FAHCORE
2017         if (MASTER(cr))
2018         {
2019             fcReportProgress(ir->nsteps + ir->init_step, step);
2020         }
2021 #endif
2022
2023         resetHandler->resetCounters(
2024                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2025
2026         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2027         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2028     }
2029     /* End of main MD loop */
2030
2031     /* Closing TNG files can include compressing data. Therefore it is good to do that
2032      * before stopping the time measurements. */
2033     mdoutf_tng_close(outf);
2034
2035     /* Stop measuring walltime */
2036     walltime_accounting_end_time(walltime_accounting);
2037
2038     if (!thisRankHasDuty(cr, DUTY_PME))
2039     {
2040         /* Tell the PME only node to finish */
2041         gmx_pme_send_finish(cr);
2042     }
2043
2044     if (MASTER(cr))
2045     {
2046         if (ir->nstcalcenergy > 0)
2047         {
2048             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2049
2050             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2051             energyOutput.printAverages(fplog, groups);
2052         }
2053     }
2054     done_mdoutf(outf);
2055
2056     if (bPMETune)
2057     {
2058         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2059     }
2060
2061     done_shellfc(fplog, shellfc, step_rel);
2062
2063     if (useReplicaExchange && MASTER(cr))
2064     {
2065         print_replica_exchange_statistics(fplog, repl_ex);
2066     }
2067
2068     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2069
2070     global_stat_destroy(gstat);
2071 }