Make CpuPpLongRangeNonbondeds class
[alexxy/gromacs.git] / src / gromacs / mdrun / md.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011-2019,2020,2021, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the integrator for normal molecular dynamics simulations
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include <cinttypes>
47 #include <cmath>
48 #include <cstdio>
49 #include <cstdlib>
50
51 #include <algorithm>
52 #include <memory>
53 #include <numeric>
54
55 #include "gromacs/applied_forces/awh/awh.h"
56 #include "gromacs/applied_forces/awh/read_params.h"
57 #include "gromacs/commandline/filenm.h"
58 #include "gromacs/domdec/collect.h"
59 #include "gromacs/domdec/dlbtiming.h"
60 #include "gromacs/domdec/domdec.h"
61 #include "gromacs/domdec/domdec_network.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localtopologychecker.h"
65 #include "gromacs/domdec/mdsetup.h"
66 #include "gromacs/domdec/partition.h"
67 #include "gromacs/essentialdynamics/edsam.h"
68 #include "gromacs/ewald/pme_load_balancing.h"
69 #include "gromacs/ewald/pme_pp.h"
70 #include "gromacs/fileio/trxio.h"
71 #include "gromacs/gmxlib/network.h"
72 #include "gromacs/gmxlib/nrnb.h"
73 #include "gromacs/gpu_utils/device_stream_manager.h"
74 #include "gromacs/gpu_utils/gpu_utils.h"
75 #include "gromacs/imd/imd.h"
76 #include "gromacs/listed_forces/listed_forces.h"
77 #include "gromacs/math/functions.h"
78 #include "gromacs/math/invertmatrix.h"
79 #include "gromacs/math/vec.h"
80 #include "gromacs/math/vectypes.h"
81 #include "gromacs/mdlib/checkpointhandler.h"
82 #include "gromacs/mdlib/compute_io.h"
83 #include "gromacs/mdlib/constr.h"
84 #include "gromacs/mdlib/coupling.h"
85 #include "gromacs/mdlib/ebin.h"
86 #include "gromacs/mdlib/enerdata_utils.h"
87 #include "gromacs/mdlib/energyoutput.h"
88 #include "gromacs/mdlib/expanded.h"
89 #include "gromacs/mdlib/force.h"
90 #include "gromacs/mdlib/force_flags.h"
91 #include "gromacs/mdlib/forcerec.h"
92 #include "gromacs/mdlib/freeenergyparameters.h"
93 #include "gromacs/mdlib/md_support.h"
94 #include "gromacs/mdlib/mdatoms.h"
95 #include "gromacs/mdlib/mdoutf.h"
96 #include "gromacs/mdlib/membed.h"
97 #include "gromacs/mdlib/resethandler.h"
98 #include "gromacs/mdlib/sighandler.h"
99 #include "gromacs/mdlib/simulationsignal.h"
100 #include "gromacs/mdlib/stat.h"
101 #include "gromacs/mdlib/stophandler.h"
102 #include "gromacs/mdlib/tgroup.h"
103 #include "gromacs/mdlib/trajectory_writing.h"
104 #include "gromacs/mdlib/update.h"
105 #include "gromacs/mdlib/update_constrain_gpu.h"
106 #include "gromacs/mdlib/update_vv.h"
107 #include "gromacs/mdlib/vcm.h"
108 #include "gromacs/mdlib/vsite.h"
109 #include "gromacs/mdrunutility/freeenergy.h"
110 #include "gromacs/mdrunutility/handlerestart.h"
111 #include "gromacs/mdrunutility/multisim.h"
112 #include "gromacs/mdrunutility/printtime.h"
113 #include "gromacs/mdtypes/awh_history.h"
114 #include "gromacs/mdtypes/awh_params.h"
115 #include "gromacs/mdtypes/commrec.h"
116 #include "gromacs/mdtypes/df_history.h"
117 #include "gromacs/mdtypes/energyhistory.h"
118 #include "gromacs/mdtypes/fcdata.h"
119 #include "gromacs/mdtypes/forcebuffers.h"
120 #include "gromacs/mdtypes/forcerec.h"
121 #include "gromacs/mdtypes/group.h"
122 #include "gromacs/mdtypes/inputrec.h"
123 #include "gromacs/mdtypes/interaction_const.h"
124 #include "gromacs/mdtypes/md_enums.h"
125 #include "gromacs/mdtypes/mdatom.h"
126 #include "gromacs/mdtypes/mdrunoptions.h"
127 #include "gromacs/mdtypes/multipletimestepping.h"
128 #include "gromacs/mdtypes/observableshistory.h"
129 #include "gromacs/mdtypes/observablesreducer.h"
130 #include "gromacs/mdtypes/pullhistory.h"
131 #include "gromacs/mdtypes/simulation_workload.h"
132 #include "gromacs/mdtypes/state.h"
133 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
134 #include "gromacs/modularsimulator/energydata.h"
135 #include "gromacs/nbnxm/gpu_data_mgmt.h"
136 #include "gromacs/nbnxm/nbnxm.h"
137 #include "gromacs/pbcutil/pbc.h"
138 #include "gromacs/pulling/output.h"
139 #include "gromacs/pulling/pull.h"
140 #include "gromacs/swap/swapcoords.h"
141 #include "gromacs/timing/wallcycle.h"
142 #include "gromacs/timing/walltime_accounting.h"
143 #include "gromacs/topology/atoms.h"
144 #include "gromacs/topology/idef.h"
145 #include "gromacs/topology/mtop_util.h"
146 #include "gromacs/topology/topology.h"
147 #include "gromacs/trajectory/trajectoryframe.h"
148 #include "gromacs/utility/basedefinitions.h"
149 #include "gromacs/utility/cstringutil.h"
150 #include "gromacs/utility/fatalerror.h"
151 #include "gromacs/utility/logger.h"
152 #include "gromacs/utility/real.h"
153 #include "gromacs/utility/smalloc.h"
154
155 #include "legacysimulator.h"
156 #include "replicaexchange.h"
157 #include "shellfc.h"
158
159 using gmx::SimulationSignaller;
160
161 void gmx::LegacySimulator::do_md()
162 {
163     // TODO Historically, the EM and MD "integrators" used different
164     // names for the t_inputrec *parameter, but these must have the
165     // same name, now that it's a member of a struct. We use this ir
166     // alias to avoid a large ripple of nearly useless changes.
167     // t_inputrec is being replaced by IMdpOptionsProvider, so this
168     // will go away eventually.
169     const t_inputrec* ir = inputrec;
170
171     double       t, t0 = ir->init_t;
172     gmx_bool     bGStatEveryStep, bGStat, bCalcVir, bCalcEnerStep, bCalcEner;
173     gmx_bool     bNS = FALSE, bNStList, bStopCM, bFirstStep, bInitStep, bLastStep = FALSE;
174     gmx_bool     bDoExpanded = FALSE;
175     gmx_bool     do_ene, do_log, do_verbose;
176     gmx_bool     bMasterState;
177     unsigned int force_flags;
178     tensor force_vir = { { 0 } }, shake_vir = { { 0 } }, total_vir = { { 0 } }, pres = { { 0 } };
179     int    i, m;
180     rvec   mu_tot;
181     matrix pressureCouplingMu, M;
182     gmx_repl_ex_t     repl_ex = nullptr;
183     gmx_global_stat_t gstat;
184     gmx_shellfc_t*    shellfc;
185     gmx_bool          bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
186     gmx_bool          bTrotter;
187     real              dvdl_constr;
188     std::vector<RVec> cbuf;
189     matrix            lastbox;
190     int               lamnew = 0;
191     /* for FEP */
192     double    cycles;
193     real      saved_conserved_quantity = 0;
194     real      last_ekin                = 0;
195     t_extmass MassQ;
196     char      sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
197
198     /* PME load balancing data for GPU kernels */
199     gmx_bool bPMETune         = FALSE;
200     gmx_bool bPMETunePrinting = FALSE;
201
202     bool bInteractiveMDstep = false;
203
204     SimulationSignals signals;
205     // Most global communnication stages don't propagate mdrun
206     // signals, and will use this object to achieve that.
207     SimulationSignaller nullSignaller(nullptr, nullptr, nullptr, false, false);
208
209     if (!mdrunOptions.writeConfout)
210     {
211         // This is on by default, and the main known use case for
212         // turning it off is for convenience in benchmarking, which is
213         // something that should not show up in the general user
214         // interface.
215         GMX_LOG(mdlog.info)
216                 .asParagraph()
217                 .appendText(
218                         "The -noconfout functionality is deprecated, and may be removed in a "
219                         "future version.");
220     }
221
222     /* md-vv uses averaged full step velocities for T-control
223        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
224        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
225     bTrotter = (EI_VV(ir->eI)
226                 && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir) || inputrecNvtTrotter(ir)));
227
228     const bool bRerunMD = false;
229
230     int nstglobalcomm = computeGlobalCommunicationPeriod(mdlog, ir, cr);
231     bGStatEveryStep   = (nstglobalcomm == 1);
232
233     const SimulationGroups* groups = &top_global.groups;
234
235     std::unique_ptr<EssentialDynamics> ed = nullptr;
236     if (opt2bSet("-ei", nfile, fnm))
237     {
238         /* Initialize essential dynamics sampling */
239         ed = init_edsam(mdlog,
240                         opt2fn_null("-ei", nfile, fnm),
241                         opt2fn("-eo", nfile, fnm),
242                         top_global,
243                         *ir,
244                         cr,
245                         constr,
246                         state_global,
247                         observablesHistory,
248                         oenv,
249                         startingBehavior);
250     }
251     else if (observablesHistory->edsamHistory)
252     {
253         gmx_fatal(FARGS,
254                   "The checkpoint is from a run with essential dynamics sampling, "
255                   "but the current run did not specify the -ei option. "
256                   "Either specify the -ei option to mdrun, or do not use this checkpoint file.");
257     }
258
259     int*                fep_state = MASTER(cr) ? &state_global->fep_state : nullptr;
260     gmx::ArrayRef<real> lambda    = MASTER(cr) ? state_global->lambda : gmx::ArrayRef<real>();
261     initialize_lambdas(fplog,
262                        ir->efep,
263                        ir->bSimTemp,
264                        *ir->fepvals,
265                        ir->simtempvals->temperatures,
266                        gmx::arrayRefFromArray(ir->opts.ref_t, ir->opts.ngtc),
267                        MASTER(cr),
268                        fep_state,
269                        lambda);
270     Update upd(*ir, deform);
271     bool   doSimulatedAnnealing = false;
272     {
273         // TODO: Avoid changing inputrec (#3854)
274         // Simulated annealing updates the reference temperature.
275         auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
276         doSimulatedAnnealing   = initSimulatedAnnealing(nonConstInputrec, &upd);
277     }
278     const bool useReplicaExchange = (replExParams.exchangeInterval > 0);
279
280     t_fcdata& fcdata = *fr->fcdata;
281
282     bool simulationsShareState = false;
283     int  nstSignalComm         = nstglobalcomm;
284     {
285         // TODO This implementation of ensemble orientation restraints is nasty because
286         // a user can't just do multi-sim with single-sim orientation restraints.
287         bool usingEnsembleRestraints = (fcdata.disres->nsystems > 1) || ((ms != nullptr) && fcdata.orires);
288         bool awhUsesMultiSim = (ir->bDoAwh && ir->awhParams->shareBiasMultisim() && (ms != nullptr));
289
290         // Replica exchange, ensemble restraints and AWH need all
291         // simulations to remain synchronized, so they need
292         // checkpoints and stop conditions to act on the same step, so
293         // the propagation of such signals must take place between
294         // simulations, not just within simulations.
295         // TODO: Make algorithm initializers set these flags.
296         simulationsShareState = useReplicaExchange || usingEnsembleRestraints || awhUsesMultiSim;
297
298         if (simulationsShareState)
299         {
300             // Inter-simulation signal communication does not need to happen
301             // often, so we use a minimum of 200 steps to reduce overhead.
302             const int c_minimumInterSimulationSignallingInterval = 200;
303             nstSignalComm = ((c_minimumInterSimulationSignallingInterval + nstglobalcomm - 1) / nstglobalcomm)
304                             * nstglobalcomm;
305         }
306     }
307
308     if (startingBehavior != StartingBehavior::RestartWithAppending)
309     {
310         pleaseCiteCouplingAlgorithms(fplog, *ir);
311     }
312     gmx_mdoutf*       outf = init_mdoutf(fplog,
313                                    nfile,
314                                    fnm,
315                                    mdrunOptions,
316                                    cr,
317                                    outputProvider,
318                                    mdModulesNotifiers,
319                                    ir,
320                                    top_global,
321                                    oenv,
322                                    wcycle,
323                                    startingBehavior,
324                                    simulationsShareState,
325                                    ms);
326     gmx::EnergyOutput energyOutput(mdoutf_get_fp_ene(outf),
327                                    top_global,
328                                    *ir,
329                                    pull_work,
330                                    mdoutf_get_fp_dhdl(outf),
331                                    false,
332                                    startingBehavior,
333                                    simulationsShareState,
334                                    mdModulesNotifiers);
335
336     gstat = global_stat_init(ir);
337
338     const auto& simulationWork     = runScheduleWork->simulationWork;
339     const bool  useGpuForPme       = simulationWork.useGpuPme;
340     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
341     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
342     const bool  useGpuForUpdate    = simulationWork.useGpuUpdate;
343
344     /* Check for polarizable models and flexible constraints */
345     shellfc = init_shell_flexcon(fplog,
346                                  top_global,
347                                  constr ? constr->numFlexibleConstraints() : 0,
348                                  ir->nstcalcenergy,
349                                  DOMAINDECOMP(cr),
350                                  useGpuForPme);
351
352     {
353         double io = compute_io(ir, top_global.natoms, *groups, energyOutput.numEnergyTerms(), 1);
354         if ((io > 2000) && MASTER(cr))
355         {
356             fprintf(stderr, "\nWARNING: This run will generate roughly %.0f Mb of data\n\n", io);
357         }
358     }
359
360     // Local state only becomes valid now.
361     std::unique_ptr<t_state> stateInstance;
362     t_state*                 state;
363
364     gmx_localtop_t     top(top_global.ffparams);
365     ObservablesReducer observablesReducer = observablesReducerBuilder->build();
366
367     ForceBuffers     f(simulationWork.useMts,
368                    ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
369                                ? PinningPolicy::PinnedIfSupported
370                                : PinningPolicy::CannotBePinned);
371     const t_mdatoms* md = mdAtoms->mdatoms();
372     if (DOMAINDECOMP(cr))
373     {
374         stateInstance = std::make_unique<t_state>();
375         state         = stateInstance.get();
376         dd_init_local_state(*cr->dd, state_global, state);
377
378         /* Distribute the charge groups over the nodes from the master node */
379         dd_partition_system(fplog,
380                             mdlog,
381                             ir->init_step,
382                             cr,
383                             TRUE,
384                             1,
385                             state_global,
386                             top_global,
387                             *ir,
388                             imdSession,
389                             pull_work,
390                             state,
391                             &f,
392                             mdAtoms,
393                             &top,
394                             fr,
395                             vsite,
396                             constr,
397                             nrnb,
398                             nullptr,
399                             FALSE);
400         upd.updateAfterPartition(state->natoms,
401                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
402                                              : gmx::ArrayRef<const unsigned short>(),
403                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
404                                          : gmx::ArrayRef<const unsigned short>());
405         fr->longRangeNonbondeds->updateAfterPartition(*md);
406     }
407     else
408     {
409         state_change_natoms(state_global, state_global->natoms);
410         /* Copy the pointer to the global state */
411         state = state_global;
412
413         /* Generate and initialize new topology */
414         mdAlgorithmsSetupAtomData(cr, *ir, top_global, &top, fr, &f, mdAtoms, constr, vsite, shellfc);
415
416         upd.updateAfterPartition(state->natoms,
417                                  md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
418                                              : gmx::ArrayRef<const unsigned short>(),
419                                  md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
420                                          : gmx::ArrayRef<const unsigned short>());
421         fr->longRangeNonbondeds->updateAfterPartition(*md);
422     }
423
424     std::unique_ptr<UpdateConstrainGpu> integrator;
425
426     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
427
428     // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
429     if (useGpuForUpdate)
430     {
431         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
432                                    || constr->numConstraintsTotal() == 0,
433                            "Constraints in domain decomposition are only supported with update "
434                            "groups if using GPU update.\n");
435         GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
436                                    || constr->numConstraintsTotal() == 0,
437                            "SHAKE is not supported with GPU update.");
438         GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps),
439                            "Either PME or short-ranged non-bonded interaction tasks must run on "
440                            "the GPU to use GPU update.\n");
441         GMX_RELEASE_ASSERT(ir->eI == IntegrationAlgorithm::MD,
442                            "Only the md integrator is supported with the GPU update.\n");
443         GMX_RELEASE_ASSERT(
444                 ir->etc != TemperatureCoupling::NoseHoover,
445                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
446         GMX_RELEASE_ASSERT(
447                 ir->epc == PressureCoupling::No || ir->epc == PressureCoupling::ParrinelloRahman
448                         || ir->epc == PressureCoupling::Berendsen || ir->epc == PressureCoupling::CRescale,
449                 "Only Parrinello-Rahman, Berendsen, and C-rescale pressure coupling are supported "
450                 "with the GPU update.\n");
451         GMX_RELEASE_ASSERT(!md->haveVsites,
452                            "Virtual sites are not supported with the GPU update.\n");
453         GMX_RELEASE_ASSERT(ed == nullptr,
454                            "Essential dynamics is not supported with the GPU update.\n");
455         GMX_RELEASE_ASSERT(!ir->bPull || !pull_have_constraint(*ir->pull),
456                            "Constraints pulling is not supported with the GPU update.\n");
457         GMX_RELEASE_ASSERT(fcdata.orires == nullptr,
458                            "Orientation restraints are not supported with the GPU update.\n");
459         GMX_RELEASE_ASSERT(
460                 ir->efep == FreeEnergyPerturbationType::No
461                         || (!haveFepPerturbedMasses(top_global) && !havePerturbedConstraints(top_global)),
462                 "Free energy perturbation of masses and constraints are not supported with the GPU "
463                 "update.");
464
465         if (constr != nullptr && constr->numConstraintsTotal() > 0)
466         {
467             GMX_LOG(mdlog.info)
468                     .asParagraph()
469                     .appendText("Updating coordinates and applying constraints on the GPU.");
470         }
471         else
472         {
473             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
474         }
475         GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
476                            "Device stream manager should be initialized in order to use GPU "
477                            "update-constraints.");
478         GMX_RELEASE_ASSERT(
479                 fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
480                 "Update stream should be initialized in order to use GPU "
481                 "update-constraints.");
482         integrator = std::make_unique<UpdateConstrainGpu>(
483                 *ir,
484                 top_global,
485                 ekind->ngtc,
486                 fr->deviceStreamManager->context(),
487                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
488                 wcycle);
489
490         stateGpu->setXUpdatedOnDeviceEvent(integrator->xUpdatedOnDeviceEvent());
491
492         integrator->setPbc(PbcType::Xyz, state->box);
493     }
494
495     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
496     {
497         changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
498     }
499     if (useGpuForUpdate)
500     {
501         changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
502     }
503
504     // NOTE: The global state is no longer used at this point.
505     // But state_global is still used as temporary storage space for writing
506     // the global state to file and potentially for replica exchange.
507     // (Global topology should persist.)
508
509     update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
510
511     if (ir->bExpanded)
512     {
513         /* Check nstexpanded here, because the grompp check was broken */
514         if (ir->expandedvals->nstexpanded % ir->nstcalcenergy != 0)
515         {
516             gmx_fatal(FARGS,
517                       "With expanded ensemble, nstexpanded should be a multiple of nstcalcenergy");
518         }
519         init_expanded_ensemble(startingBehavior != StartingBehavior::NewSimulation, ir, state->dfhist);
520     }
521
522     if (MASTER(cr))
523     {
524         EnergyData::initializeEnergyHistory(startingBehavior, observablesHistory, &energyOutput);
525     }
526
527     preparePrevStepPullCom(ir,
528                            pull_work,
529                            gmx::arrayRefFromArray(md->massT, md->nr),
530                            state,
531                            state_global,
532                            cr,
533                            startingBehavior != StartingBehavior::NewSimulation);
534
535     // TODO: Remove this by converting AWH into a ForceProvider
536     auto awh = prepareAwhModule(fplog,
537                                 *ir,
538                                 state_global,
539                                 cr,
540                                 ms,
541                                 startingBehavior != StartingBehavior::NewSimulation,
542                                 shellfc != nullptr,
543                                 opt2fn("-awh", nfile, fnm),
544                                 pull_work);
545
546     if (useReplicaExchange && MASTER(cr))
547     {
548         repl_ex = init_replica_exchange(fplog, ms, top_global.natoms, ir, replExParams);
549     }
550     /* PME tuning is only supported in the Verlet scheme, with PME for
551      * Coulomb. It is not supported with only LJ PME. */
552     bPMETune = (mdrunOptions.tunePme && EEL_PME(fr->ic->eeltype) && !mdrunOptions.reproducible
553                 && ir->cutoff_scheme != CutoffScheme::Group);
554
555     pme_load_balancing_t* pme_loadbal = nullptr;
556     if (bPMETune)
557     {
558         pme_loadbal_init(
559                 &pme_loadbal, cr, mdlog, *ir, state->box, *fr->ic, *fr->nbv, fr->pmedata, fr->nbv->useGpu());
560     }
561
562     if (!ir->bContinuation)
563     {
564         if (state->flags & enumValueToBitMask(StateEntry::V))
565         {
566             auto v = makeArrayRef(state->v);
567             /* Set the velocities of vsites, shells and frozen atoms to zero */
568             for (i = 0; i < md->homenr; i++)
569             {
570                 if (md->ptype[i] == ParticleType::Shell)
571                 {
572                     clear_rvec(v[i]);
573                 }
574                 else if (md->cFREEZE)
575                 {
576                     for (m = 0; m < DIM; m++)
577                     {
578                         if (ir->opts.nFreeze[md->cFREEZE[i]][m])
579                         {
580                             v[i][m] = 0;
581                         }
582                     }
583                 }
584             }
585         }
586
587         if (constr)
588         {
589             /* Constrain the initial coordinates and velocities */
590             do_constrain_first(fplog,
591                                constr,
592                                ir,
593                                md->nr,
594                                md->homenr,
595                                state->x.arrayRefWithPadding(),
596                                state->v.arrayRefWithPadding(),
597                                state->box,
598                                state->lambda[FreeEnergyPerturbationCouplingType::Bonded]);
599         }
600     }
601
602     const int nstfep = computeFepPeriod(*ir, replExParams);
603
604     /* Be REALLY careful about what flags you set here. You CANNOT assume
605      * this is the first step, since we might be restarting from a checkpoint,
606      * and in that case we should not do any modifications to the state.
607      */
608     bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && !ir->bContinuation);
609
610     // When restarting from a checkpoint, it can be appropriate to
611     // initialize ekind from quantities in the checkpoint. Otherwise,
612     // compute_globals must initialize ekind before the simulation
613     // starts/restarts. However, only the master rank knows what was
614     // found in the checkpoint file, so we have to communicate in
615     // order to coordinate the restart.
616     //
617     // TODO Consider removing this communication if/when checkpoint
618     // reading directly follows .tpr reading, because all ranks can
619     // agree on hasReadEkinState at that time.
620     bool hasReadEkinState = MASTER(cr) ? state_global->ekinstate.hasReadEkinState : false;
621     if (PAR(cr))
622     {
623         gmx_bcast(sizeof(hasReadEkinState), &hasReadEkinState, cr->mpi_comm_mygroup);
624     }
625     if (hasReadEkinState)
626     {
627         restore_ekinstate_from_state(cr, ekind, &state_global->ekinstate);
628     }
629
630     unsigned int cglo_flags =
631             (CGLO_TEMPERATURE | CGLO_GSTAT | (EI_VV(ir->eI) ? CGLO_PRESSURE : 0)
632              | (EI_VV(ir->eI) ? CGLO_CONSTRAINT : 0) | (hasReadEkinState ? CGLO_READEKIN : 0));
633
634     bSumEkinhOld = FALSE;
635
636     t_vcm vcm(top_global.groups, *ir);
637     reportComRemovalInfo(fplog, vcm);
638
639     int64_t step     = ir->init_step;
640     int64_t step_rel = 0;
641
642     /* To minimize communication, compute_globals computes the COM velocity
643      * and the kinetic energy for the velocities without COM motion removed.
644      * Thus to get the kinetic energy without the COM contribution, we need
645      * to call compute_globals twice.
646      */
647     for (int cgloIteration = 0; cgloIteration < (bStopCM ? 2 : 1); cgloIteration++)
648     {
649         unsigned int cglo_flags_iteration = cglo_flags;
650         if (bStopCM && cgloIteration == 0)
651         {
652             cglo_flags_iteration |= CGLO_STOPCM;
653             cglo_flags_iteration &= ~CGLO_TEMPERATURE;
654         }
655         if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
656             && cgloIteration == 0)
657         {
658             cglo_flags_iteration |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
659         }
660         compute_globals(gstat,
661                         cr,
662                         ir,
663                         fr,
664                         ekind,
665                         makeConstArrayRef(state->x),
666                         makeConstArrayRef(state->v),
667                         state->box,
668                         md,
669                         nrnb,
670                         &vcm,
671                         nullptr,
672                         enerd,
673                         force_vir,
674                         shake_vir,
675                         total_vir,
676                         pres,
677                         gmx::ArrayRef<real>{},
678                         &nullSignaller,
679                         state->box,
680                         &bSumEkinhOld,
681                         cglo_flags_iteration,
682                         step,
683                         &observablesReducer);
684         if (cglo_flags_iteration & CGLO_STOPCM)
685         {
686             /* At initialization, do not pass x with acceleration-correction mode
687              * to avoid (incorrect) correction of the initial coordinates.
688              */
689             auto x = (vcm.mode == ComRemovalAlgorithm::LinearAccelerationCorrection)
690                              ? ArrayRef<RVec>()
691                              : makeArrayRef(state->x);
692             process_and_stopcm_grp(fplog, &vcm, *md, x, makeArrayRef(state->v));
693             inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
694         }
695     }
696     if (DOMAINDECOMP(cr))
697     {
698         dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
699                 &top, makeConstArrayRef(state->x), state->box);
700     }
701     if (ir->eI == IntegrationAlgorithm::VVAK)
702     {
703         /* a second call to get the half step temperature initialized as well */
704         /* we do the same call as above, but turn the pressure off -- internally to
705            compute_globals, this is recognized as a velocity verlet half-step
706            kinetic energy calculation.  This minimized excess variables, but
707            perhaps loses some logic?*/
708
709         compute_globals(gstat,
710                         cr,
711                         ir,
712                         fr,
713                         ekind,
714                         makeConstArrayRef(state->x),
715                         makeConstArrayRef(state->v),
716                         state->box,
717                         md,
718                         nrnb,
719                         &vcm,
720                         nullptr,
721                         enerd,
722                         force_vir,
723                         shake_vir,
724                         total_vir,
725                         pres,
726                         gmx::ArrayRef<real>{},
727                         &nullSignaller,
728                         state->box,
729                         &bSumEkinhOld,
730                         cglo_flags & ~CGLO_PRESSURE,
731                         step,
732                         &observablesReducer);
733     }
734
735     /* Calculate the initial half step temperature, and save the ekinh_old */
736     if (startingBehavior == StartingBehavior::NewSimulation)
737     {
738         for (i = 0; (i < ir->opts.ngtc); i++)
739         {
740             copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
741         }
742     }
743
744     /* need to make an initiation call to get the Trotter variables set, as well as other constants
745        for non-trotter temperature control */
746     auto trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
747
748     if (MASTER(cr))
749     {
750         if (!ir->bContinuation)
751         {
752             if (constr && ir->eConstrAlg == ConstraintAlgorithm::Lincs)
753             {
754                 fprintf(fplog,
755                         "RMS relative constraint deviation after constraining: %.2e\n",
756                         constr->rmsd());
757             }
758             if (EI_STATE_VELOCITY(ir->eI))
759             {
760                 real temp = enerd->term[F_TEMP];
761                 if (ir->eI != IntegrationAlgorithm::VV)
762                 {
763                     /* Result of Ekin averaged over velocities of -half
764                      * and +half step, while we only have -half step here.
765                      */
766                     temp *= 2;
767                 }
768                 fprintf(fplog, "Initial temperature: %g K\n", temp);
769             }
770         }
771
772         char tbuf[20];
773         fprintf(stderr, "starting mdrun '%s'\n", *(top_global.name));
774         if (ir->nsteps >= 0)
775         {
776             sprintf(tbuf, "%8.1f", (ir->init_step + ir->nsteps) * ir->delta_t);
777         }
778         else
779         {
780             sprintf(tbuf, "%s", "infinite");
781         }
782         if (ir->init_step > 0)
783         {
784             fprintf(stderr,
785                     "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
786                     gmx_step_str(ir->init_step + ir->nsteps, sbuf),
787                     tbuf,
788                     gmx_step_str(ir->init_step, sbuf2),
789                     ir->init_step * ir->delta_t);
790         }
791         else
792         {
793             fprintf(stderr, "%s steps, %s ps.\n", gmx_step_str(ir->nsteps, sbuf), tbuf);
794         }
795         fprintf(fplog, "\n");
796     }
797
798     walltime_accounting_start_time(walltime_accounting);
799     wallcycle_start(wcycle, WallCycleCounter::Run);
800     print_start(fplog, cr, walltime_accounting, "mdrun");
801
802     /***********************************************************
803      *
804      *             Loop over MD steps
805      *
806      ************************************************************/
807
808     bFirstStep = TRUE;
809     /* Skip the first Nose-Hoover integration when we get the state from tpx */
810     bInitStep        = startingBehavior == StartingBehavior::NewSimulation || EI_VV(ir->eI);
811     bSumEkinhOld     = FALSE;
812     bExchanged       = FALSE;
813     bNeedRepartition = FALSE;
814
815     auto stopHandler = stopHandlerBuilder->getStopHandlerMD(
816             compat::not_null<SimulationSignal*>(&signals[eglsSTOPCOND]),
817             simulationsShareState,
818             MASTER(cr),
819             ir->nstlist,
820             mdrunOptions.reproducible,
821             nstSignalComm,
822             mdrunOptions.maximumHoursToRun,
823             ir->nstlist == 0,
824             fplog,
825             step,
826             bNS,
827             walltime_accounting);
828
829     auto checkpointHandler = std::make_unique<CheckpointHandler>(
830             compat::make_not_null<SimulationSignal*>(&signals[eglsCHKPT]),
831             simulationsShareState,
832             ir->nstlist == 0,
833             MASTER(cr),
834             mdrunOptions.writeConfout,
835             mdrunOptions.checkpointOptions.period);
836
837     const bool resetCountersIsLocal = true;
838     auto       resetHandler         = std::make_unique<ResetHandler>(
839             compat::make_not_null<SimulationSignal*>(&signals[eglsRESETCOUNTERS]),
840             !resetCountersIsLocal,
841             ir->nsteps,
842             MASTER(cr),
843             mdrunOptions.timingOptions.resetHalfway,
844             mdrunOptions.maximumHoursToRun,
845             mdlog,
846             wcycle,
847             walltime_accounting);
848
849     const DDBalanceRegionHandler ddBalanceRegionHandler(cr);
850
851     if (MASTER(cr) && isMultiSim(ms) && !useReplicaExchange)
852     {
853         logInitialMultisimStatus(ms, cr, mdlog, simulationsShareState, ir->nsteps, ir->init_step);
854     }
855
856     /* and stop now if we should */
857     bLastStep = (bLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
858     while (!bLastStep)
859     {
860
861         /* Determine if this is a neighbor search step */
862         bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
863
864         if (bPMETune && bNStList)
865         {
866             // This has to be here because PME load balancing is called so early.
867             // TODO: Move to after all booleans are defined.
868             if (useGpuForUpdate && !bFirstStep)
869             {
870                 stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
871                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
872             }
873             /* PME grid + cut-off optimization with GPUs or PME nodes */
874             pme_loadbal_do(pme_loadbal,
875                            cr,
876                            (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr,
877                            fplog,
878                            mdlog,
879                            *ir,
880                            fr,
881                            state->box,
882                            state->x,
883                            wcycle,
884                            step,
885                            step_rel,
886                            &bPMETunePrinting,
887                            simulationWork.useGpuPmePpCommunication);
888         }
889
890         wallcycle_start(wcycle, WallCycleCounter::Step);
891
892         bLastStep = (step_rel == ir->nsteps);
893         t         = t0 + step * ir->delta_t;
894
895         // TODO Refactor this, so that nstfep does not need a default value of zero
896         if (ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
897         {
898             /* find and set the current lambdas */
899             state->lambda = currentLambdas(step, *(ir->fepvals), state->fep_state);
900
901             bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded)
902                            && (!bFirstStep));
903         }
904
905         bDoReplEx = (useReplicaExchange && (step > 0) && !bLastStep
906                      && do_per_step(step, replExParams.exchangeInterval));
907
908         if (doSimulatedAnnealing)
909         {
910             // TODO: Avoid changing inputrec (#3854)
911             // Simulated annealing updates the reference temperature.
912             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
913             update_annealing_target_temp(nonConstInputrec, t, &upd);
914         }
915
916         /* Stop Center of Mass motion */
917         bStopCM = (ir->comm_mode != ComRemovalAlgorithm::No && do_per_step(step, ir->nstcomm));
918
919         /* Determine whether or not to do Neighbour Searching */
920         bNS = (bFirstStep || bNStList || bExchanged || bNeedRepartition);
921
922         /* Note that the stopHandler will cause termination at nstglobalcomm
923          * steps. Since this concides with nstcalcenergy, nsttcouple and/or
924          * nstpcouple steps, we have computed the half-step kinetic energy
925          * of the previous step and can always output energies at the last step.
926          */
927         bLastStep = bLastStep || stopHandler->stoppingAfterCurrentStep(bNS);
928
929         /* do_log triggers energy and virial calculation. Because this leads
930          * to different code paths, forces can be different. Thus for exact
931          * continuation we should avoid extra log output.
932          * Note that the || bLastStep can result in non-exact continuation
933          * beyond the last step. But we don't consider that to be an issue.
934          */
935         do_log     = (do_per_step(step, ir->nstlog)
936                   || (bFirstStep && startingBehavior == StartingBehavior::NewSimulation) || bLastStep);
937         do_verbose = mdrunOptions.verbose
938                      && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
939
940         if (useGpuForUpdate && !bFirstStep && bNS)
941         {
942             // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
943             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
944             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
945             // Copy coordinate from the GPU when needed at the search step.
946             // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
947             // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
948             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
949             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
950         }
951
952         // We only need to calculate virtual velocities if we are writing them in the current step
953         const bool needVirtualVelocitiesThisStep =
954                 (vsite != nullptr)
955                 && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep());
956
957         if (vsite != nullptr)
958         {
959             // Virtual sites need to be updated before domain decomposition and forces are calculated
960             wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
961             // md-vv calculates virtual velocities once it has full-step real velocities
962             vsite->construct(state->x,
963                              state->v,
964                              state->box,
965                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
966                                      ? VSiteOperation::PositionsAndVelocities
967                                      : VSiteOperation::Positions);
968             wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
969         }
970
971         if (bNS && !(bFirstStep && ir->bContinuation))
972         {
973             bMasterState = FALSE;
974             /* Correct the new box if it is too skewed */
975             if (inputrecDynamicBox(ir))
976             {
977                 if (correct_box(fplog, step, state->box))
978                 {
979                     bMasterState = TRUE;
980                     // If update is offloaded, it should be informed about the box size change
981                     if (useGpuForUpdate)
982                     {
983                         integrator->setPbc(PbcType::Xyz, state->box);
984                     }
985                 }
986             }
987             if (DOMAINDECOMP(cr) && bMasterState)
988             {
989                 dd_collect_state(cr->dd, state, state_global);
990             }
991
992             if (DOMAINDECOMP(cr))
993             {
994                 /* Repartition the domain decomposition */
995                 dd_partition_system(fplog,
996                                     mdlog,
997                                     step,
998                                     cr,
999                                     bMasterState,
1000                                     nstglobalcomm,
1001                                     state_global,
1002                                     top_global,
1003                                     *ir,
1004                                     imdSession,
1005                                     pull_work,
1006                                     state,
1007                                     &f,
1008                                     mdAtoms,
1009                                     &top,
1010                                     fr,
1011                                     vsite,
1012                                     constr,
1013                                     nrnb,
1014                                     wcycle,
1015                                     do_verbose && !bPMETunePrinting);
1016                 upd.updateAfterPartition(state->natoms,
1017                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1018                                                      : gmx::ArrayRef<const unsigned short>(),
1019                                          md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1020                                                  : gmx::ArrayRef<const unsigned short>());
1021                 fr->longRangeNonbondeds->updateAfterPartition(*md);
1022             }
1023         }
1024
1025         // Allocate or re-size GPU halo exchange object, if necessary
1026         if (bNS && simulationWork.havePpDomainDecomposition && simulationWork.useGpuHaloExchange)
1027         {
1028             GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
1029                                "GPU device manager has to be initialized to use GPU "
1030                                "version of halo exchange.");
1031             constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
1032         }
1033
1034         if (MASTER(cr) && do_log)
1035         {
1036             gmx::EnergyOutput::printHeader(
1037                     fplog, step, t); /* can we improve the information printed here? */
1038         }
1039
1040         if (ir->efep != FreeEnergyPerturbationType::No)
1041         {
1042             update_mdatoms(mdAtoms->mdatoms(), state->lambda[FreeEnergyPerturbationCouplingType::Mass]);
1043         }
1044
1045         if (bExchanged)
1046         {
1047             /* We need the kinetic energy at minus the half step for determining
1048              * the full step kinetic energy and possibly for T-coupling.*/
1049             /* This may not be quite working correctly yet . . . . */
1050             int cglo_flags = CGLO_GSTAT | CGLO_TEMPERATURE;
1051             if (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions())
1052             {
1053                 cglo_flags |= CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS;
1054             }
1055             compute_globals(gstat,
1056                             cr,
1057                             ir,
1058                             fr,
1059                             ekind,
1060                             makeConstArrayRef(state->x),
1061                             makeConstArrayRef(state->v),
1062                             state->box,
1063                             md,
1064                             nrnb,
1065                             &vcm,
1066                             wcycle,
1067                             enerd,
1068                             nullptr,
1069                             nullptr,
1070                             nullptr,
1071                             nullptr,
1072                             gmx::ArrayRef<real>{},
1073                             &nullSignaller,
1074                             state->box,
1075                             &bSumEkinhOld,
1076                             cglo_flags,
1077                             step,
1078                             &observablesReducer);
1079             if (DOMAINDECOMP(cr))
1080             {
1081                 dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1082                         &top, makeConstArrayRef(state->x), state->box);
1083             }
1084         }
1085         clear_mat(force_vir);
1086
1087         checkpointHandler->decideIfCheckpointingThisStep(bNS, bFirstStep, bLastStep);
1088
1089         /* Determine the energy and pressure:
1090          * at nstcalcenergy steps and at energy output steps (set below).
1091          */
1092         if (EI_VV(ir->eI) && (!bInitStep))
1093         {
1094             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1095             bCalcVir      = bCalcEnerStep
1096                        || (ir->epc != PressureCoupling::No
1097                            && (do_per_step(step, ir->nstpcouple) || do_per_step(step - 1, ir->nstpcouple)));
1098         }
1099         else
1100         {
1101             bCalcEnerStep = do_per_step(step, ir->nstcalcenergy);
1102             bCalcVir      = bCalcEnerStep
1103                        || (ir->epc != PressureCoupling::No && do_per_step(step, ir->nstpcouple));
1104         }
1105         bCalcEner = bCalcEnerStep;
1106
1107         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
1108
1109         if (do_ene || do_log || bDoReplEx)
1110         {
1111             bCalcVir  = TRUE;
1112             bCalcEner = TRUE;
1113         }
1114
1115         // bCalcEner is only here for when the last step is not a mulitple of nstfep
1116         const bool computeDHDL = ((ir->efep != FreeEnergyPerturbationType::No || ir->bSimTemp)
1117                                   && (do_per_step(step, nstfep) || bCalcEner));
1118
1119         /* Do we need global communication ? */
1120         bGStat = (bCalcVir || bCalcEner || bStopCM || do_per_step(step, nstglobalcomm)
1121                   || (EI_VV(ir->eI) && inputrecNvtTrotter(ir) && do_per_step(step - 1, nstglobalcomm)));
1122
1123         force_flags = (GMX_FORCE_STATECHANGED | ((inputrecDynamicBox(ir)) ? GMX_FORCE_DYNAMICBOX : 0)
1124                        | GMX_FORCE_ALLFORCES | (bCalcVir ? GMX_FORCE_VIRIAL : 0)
1125                        | (bCalcEner ? GMX_FORCE_ENERGY : 0) | (computeDHDL ? GMX_FORCE_DHDL : 0));
1126         if (simulationWork.useMts && !do_per_step(step, ir->nstfout))
1127         {
1128             // TODO: merge this with stepWork.useOnlyMtsCombinedForceBuffer
1129             force_flags |= GMX_FORCE_DO_NOT_NEED_NORMAL_FORCE;
1130         }
1131
1132         if (shellfc)
1133         {
1134             /* Now is the time to relax the shells */
1135             relax_shell_flexcon(fplog,
1136                                 cr,
1137                                 ms,
1138                                 mdrunOptions.verbose,
1139                                 enforcedRotation,
1140                                 step,
1141                                 ir,
1142                                 imdSession,
1143                                 pull_work,
1144                                 bNS,
1145                                 force_flags,
1146                                 &top,
1147                                 constr,
1148                                 enerd,
1149                                 state->natoms,
1150                                 state->x.arrayRefWithPadding(),
1151                                 state->v.arrayRefWithPadding(),
1152                                 state->box,
1153                                 state->lambda,
1154                                 &state->hist,
1155                                 &f.view(),
1156                                 force_vir,
1157                                 *md,
1158                                 fr->longRangeNonbondeds.get(),
1159                                 nrnb,
1160                                 wcycle,
1161                                 shellfc,
1162                                 fr,
1163                                 runScheduleWork,
1164                                 t,
1165                                 mu_tot,
1166                                 vsite,
1167                                 ddBalanceRegionHandler);
1168         }
1169         else
1170         {
1171             /* The AWH history need to be saved _before_ doing force calculations where the AWH bias
1172                is updated (or the AWH update will be performed twice for one step when continuing).
1173                It would be best to call this update function from do_md_trajectory_writing but that
1174                would occur after do_force. One would have to divide the update_awh function into one
1175                function applying the AWH force and one doing the AWH bias update. The update AWH
1176                bias function could then be called after do_md_trajectory_writing (then containing
1177                update_awh_history). The checkpointing will in the future probably moved to the start
1178                of the md loop which will rid of this issue. */
1179             if (awh && checkpointHandler->isCheckpointingStep() && MASTER(cr))
1180             {
1181                 awh->updateHistory(state_global->awhHistory.get());
1182             }
1183
1184             /* The coordinates (x) are shifted (to get whole molecules)
1185              * in do_force.
1186              * This is parallellized as well, and does communication too.
1187              * Check comments in sim_util.c
1188              */
1189             do_force(fplog,
1190                      cr,
1191                      ms,
1192                      *ir,
1193                      awh.get(),
1194                      enforcedRotation,
1195                      imdSession,
1196                      pull_work,
1197                      step,
1198                      nrnb,
1199                      wcycle,
1200                      &top,
1201                      state->box,
1202                      state->x.arrayRefWithPadding(),
1203                      &state->hist,
1204                      &f.view(),
1205                      force_vir,
1206                      md,
1207                      enerd,
1208                      state->lambda,
1209                      fr,
1210                      runScheduleWork,
1211                      vsite,
1212                      mu_tot,
1213                      t,
1214                      ed ? ed->getLegacyED() : nullptr,
1215                      fr->longRangeNonbondeds.get(),
1216                      (bNS ? GMX_FORCE_NS : 0) | force_flags,
1217                      ddBalanceRegionHandler);
1218         }
1219
1220         // VV integrators do not need the following velocity half step
1221         // if it is the first step after starting from a checkpoint.
1222         // That is, the half step is needed on all other steps, and
1223         // also the first step when starting from a .tpr file.
1224         if (EI_VV(ir->eI))
1225         {
1226             integrateVVFirstStep(step,
1227                                  bFirstStep,
1228                                  bInitStep,
1229                                  startingBehavior,
1230                                  nstglobalcomm,
1231                                  ir,
1232                                  fr,
1233                                  cr,
1234                                  state,
1235                                  mdAtoms->mdatoms(),
1236                                  &fcdata,
1237                                  &MassQ,
1238                                  &vcm,
1239                                  top,
1240                                  enerd,
1241                                  &observablesReducer,
1242                                  ekind,
1243                                  gstat,
1244                                  &last_ekin,
1245                                  bCalcVir,
1246                                  total_vir,
1247                                  shake_vir,
1248                                  force_vir,
1249                                  pres,
1250                                  M,
1251                                  do_log,
1252                                  do_ene,
1253                                  bCalcEner,
1254                                  bGStat,
1255                                  bStopCM,
1256                                  bTrotter,
1257                                  bExchanged,
1258                                  &bSumEkinhOld,
1259                                  &saved_conserved_quantity,
1260                                  &f,
1261                                  &upd,
1262                                  constr,
1263                                  &nullSignaller,
1264                                  trotter_seq,
1265                                  nrnb,
1266                                  fplog,
1267                                  wcycle);
1268             if (vsite != nullptr && needVirtualVelocitiesThisStep)
1269             {
1270                 // Positions were calculated earlier
1271                 wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
1272                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
1273                 wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
1274             }
1275         }
1276
1277         /* ########  END FIRST UPDATE STEP  ############## */
1278         /* ########  If doing VV, we now have v(dt) ###### */
1279         if (bDoExpanded)
1280         {
1281             /* perform extended ensemble sampling in lambda - we don't
1282                actually move to the new state before outputting
1283                statistics, but if performing simulated tempering, we
1284                do update the velocities and the tau_t. */
1285             // TODO: Avoid changing inputrec (#3854)
1286             // Simulated tempering updates the reference temperature.
1287             // Expanded ensemble without simulated tempering does not change the inputrec.
1288             auto* nonConstInputrec = const_cast<t_inputrec*>(inputrec);
1289             lamnew                 = ExpandedEnsembleDynamics(fplog,
1290                                               nonConstInputrec,
1291                                               enerd,
1292                                               state,
1293                                               &MassQ,
1294                                               state->fep_state,
1295                                               state->dfhist,
1296                                               step,
1297                                               state->v.rvec_array(),
1298                                               md->homenr,
1299                                               md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1300                                                                       : gmx::ArrayRef<const unsigned short>());
1301             /* history is maintained in state->dfhist, but state_global is what is sent to trajectory and log output */
1302             if (MASTER(cr))
1303             {
1304                 copy_df_history(state_global->dfhist, state->dfhist);
1305             }
1306         }
1307
1308         // Copy coordinate from the GPU for the output/checkpointing if the update is offloaded and
1309         // coordinates have not already been copied for i) search or ii) CPU force tasks.
1310         if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork
1311             && (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)
1312                 || checkpointHandler->isCheckpointingStep()))
1313         {
1314             stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1315             stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1316         }
1317         // Copy velocities if needed for the output/checkpointing.
1318         // NOTE: Copy on the search steps is done at the beginning of the step.
1319         if (useGpuForUpdate && !bNS
1320             && (do_per_step(step, ir->nstvout) || checkpointHandler->isCheckpointingStep()))
1321         {
1322             stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1323             stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1324         }
1325         // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
1326         // and update is offloaded hence forces are kept on the GPU for update and have not been
1327         // already transferred in do_force().
1328         // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
1329         //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
1330         //       prior to GPU update.
1331         // TODO: When the output flags will be included in step workload, this copy can be combined with the
1332         //       copy call in do_force(...).
1333         // NOTE: The forces should not be copied here if the vsites are present, since they were modified
1334         //       on host after the D2H copy in do_force(...).
1335         if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite)
1336             && do_per_step(step, ir->nstfout))
1337         {
1338             stateGpu->copyForcesFromGpu(f.view().force(), AtomLocality::Local);
1339             stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
1340         }
1341         /* Now we have the energies and forces corresponding to the
1342          * coordinates at time t. We must output all of this before
1343          * the update.
1344          */
1345         do_md_trajectory_writing(fplog,
1346                                  cr,
1347                                  nfile,
1348                                  fnm,
1349                                  step,
1350                                  step_rel,
1351                                  t,
1352                                  ir,
1353                                  state,
1354                                  state_global,
1355                                  observablesHistory,
1356                                  top_global,
1357                                  fr,
1358                                  outf,
1359                                  energyOutput,
1360                                  ekind,
1361                                  f.view().force(),
1362                                  checkpointHandler->isCheckpointingStep(),
1363                                  bRerunMD,
1364                                  bLastStep,
1365                                  mdrunOptions.writeConfout,
1366                                  bSumEkinhOld);
1367         /* Check if IMD step and do IMD communication, if bIMD is TRUE. */
1368         bInteractiveMDstep = imdSession->run(step, bNS, state->box, state->x, t);
1369
1370         /* kludge -- virial is lost with restart for MTTK NPT control. Must reload (saved earlier). */
1371         if (startingBehavior != StartingBehavior::NewSimulation && bFirstStep
1372             && (inputrecNptTrotter(ir) || inputrecNphTrotter(ir)))
1373         {
1374             copy_mat(state->svir_prev, shake_vir);
1375             copy_mat(state->fvir_prev, force_vir);
1376         }
1377
1378         stopHandler->setSignal();
1379         resetHandler->setSignal(walltime_accounting);
1380
1381         if (bGStat || !PAR(cr))
1382         {
1383             /* In parallel we only have to check for checkpointing in steps
1384              * where we do global communication,
1385              *  otherwise the other nodes don't know.
1386              */
1387             checkpointHandler->setSignal(walltime_accounting);
1388         }
1389
1390         /* #########   START SECOND UPDATE STEP ################# */
1391
1392         /* at the start of step, randomize or scale the velocities ((if vv. Restriction of Andersen
1393            controlled in preprocessing */
1394
1395         if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
1396         {
1397             gmx_bool bIfRandomize;
1398             bIfRandomize = update_randomize_velocities(ir,
1399                                                        step,
1400                                                        cr,
1401                                                        md->homenr,
1402                                                        md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1403                                                                : gmx::ArrayRef<const unsigned short>(),
1404                                                        gmx::arrayRefFromArray(md->invmass, md->nr),
1405                                                        state->v,
1406                                                        &upd,
1407                                                        constr);
1408             /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
1409             if (constr && bIfRandomize)
1410             {
1411                 constrain_velocities(constr, do_log, do_ene, step, state, nullptr, false, nullptr);
1412             }
1413         }
1414         /* Box is changed in update() when we do pressure coupling,
1415          * but we should still use the old box for energy corrections and when
1416          * writing it to the energy file, so it matches the trajectory files for
1417          * the same timestep above. Make a copy in a separate array.
1418          */
1419         copy_mat(state->box, lastbox);
1420
1421         dvdl_constr = 0;
1422
1423         if (!useGpuForUpdate)
1424         {
1425             wallcycle_start(wcycle, WallCycleCounter::Update);
1426         }
1427         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
1428         if (bTrotter)
1429         {
1430             trotter_update(ir,
1431                            step,
1432                            ekind,
1433                            enerd,
1434                            state,
1435                            total_vir,
1436                            md->homenr,
1437                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1438                                    : gmx::ArrayRef<const unsigned short>(),
1439                            gmx::arrayRefFromArray(md->invmass, md->nr),
1440                            &MassQ,
1441                            trotter_seq,
1442                            TrotterSequence::Three);
1443             /* We can only do Berendsen coupling after we have summed
1444              * the kinetic energy or virial. Since the happens
1445              * in global_state after update, we should only do it at
1446              * step % nstlist = 1 with bGStatEveryStep=FALSE.
1447              */
1448         }
1449         else
1450         {
1451             update_tcouple(step,
1452                            ir,
1453                            state,
1454                            ekind,
1455                            &MassQ,
1456                            md->homenr,
1457                            md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1458                                    : gmx::ArrayRef<const unsigned short>());
1459             update_pcouple_before_coordinates(fplog, step, ir, state, pressureCouplingMu, M, bInitStep);
1460         }
1461
1462         /* With leap-frog type integrators we compute the kinetic energy
1463          * at a whole time step as the average of the half-time step kinetic
1464          * energies of two subsequent steps. Therefore we need to compute the
1465          * half step kinetic energy also if we need energies at the next step.
1466          */
1467         const bool needHalfStepKineticEnergy =
1468                 (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
1469
1470         // Parrinello-Rahman requires the pressure to be availible before the update to compute
1471         // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
1472         const bool doParrinelloRahman = (ir->epc == PressureCoupling::ParrinelloRahman
1473                                          && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
1474
1475         if (EI_VV(ir->eI))
1476         {
1477             GMX_ASSERT(!useGpuForUpdate, "GPU update is not supported with VVAK integrator.");
1478
1479             integrateVVSecondStep(step,
1480                                   ir,
1481                                   fr,
1482                                   cr,
1483                                   state,
1484                                   mdAtoms->mdatoms(),
1485                                   &fcdata,
1486                                   &MassQ,
1487                                   &vcm,
1488                                   pull_work,
1489                                   enerd,
1490                                   &observablesReducer,
1491                                   ekind,
1492                                   gstat,
1493                                   &dvdl_constr,
1494                                   bCalcVir,
1495                                   total_vir,
1496                                   shake_vir,
1497                                   force_vir,
1498                                   pres,
1499                                   M,
1500                                   lastbox,
1501                                   do_log,
1502                                   do_ene,
1503                                   bGStat,
1504                                   &bSumEkinhOld,
1505                                   &f,
1506                                   &cbuf,
1507                                   &upd,
1508                                   constr,
1509                                   &nullSignaller,
1510                                   trotter_seq,
1511                                   nrnb,
1512                                   wcycle);
1513         }
1514         else
1515         {
1516             if (useGpuForUpdate)
1517             {
1518                 if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
1519                 {
1520                     integrator->set(stateGpu->getCoordinates(),
1521                                     stateGpu->getVelocities(),
1522                                     stateGpu->getForces(),
1523                                     top.idef,
1524                                     *md);
1525
1526                     // Copy data to the GPU after buffers might have being reinitialized
1527                     /* The velocity copy is redundant if we had Center-of-Mass motion removed on
1528                      * the previous step. We don't check that now. */
1529                     stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1530                     if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
1531                         && !runScheduleWork->stepWork.useGpuXBufferOps)
1532                     {
1533                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1534                     }
1535                 }
1536
1537                 if (simulationWork.useGpuPme && simulationWork.useCpuPmePpCommunication)
1538                 {
1539                     // The PME forces were recieved to the host, so have to be copied
1540                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
1541                 }
1542                 else if (!runScheduleWork->stepWork.useGpuFBufferOps)
1543                 {
1544                     // The buffer ops were not offloaded this step, so the forces are on the
1545                     // host and have to be copied
1546                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
1547                 }
1548
1549                 const bool doTemperatureScaling =
1550                         (ir->etc != TemperatureCoupling::No
1551                          && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
1552
1553                 // This applies Leap-Frog, LINCS and SETTLE in succession
1554                 integrator->integrate(
1555                         stateGpu->getForcesReadyOnDeviceEvent(
1556                                 AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
1557                         ir->delta_t,
1558                         true,
1559                         bCalcVir,
1560                         shake_vir,
1561                         doTemperatureScaling,
1562                         ekind->tcstat,
1563                         doParrinelloRahman,
1564                         ir->nstpcouple * ir->delta_t,
1565                         M);
1566
1567                 // Copy velocities D2H after update if:
1568                 // - Globals are computed this step (includes the energy output steps).
1569                 // - Temperature is needed for the next step.
1570                 if (bGStat || needHalfStepKineticEnergy)
1571                 {
1572                     stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
1573                     stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
1574                 }
1575             }
1576             else
1577             {
1578                 /* With multiple time stepping we need to do an additional normal
1579                  * update step to obtain the virial, as the actual MTS integration
1580                  * using an acceleration where the slow forces are multiplied by mtsFactor.
1581                  * Using that acceleration would result in a virial with the slow
1582                  * force contribution would be a factor mtsFactor too large.
1583                  */
1584                 if (simulationWork.useMts && bCalcVir && constr != nullptr)
1585                 {
1586                     upd.update_for_constraint_virial(*ir,
1587                                                      md->homenr,
1588                                                      md->havePartiallyFrozenAtoms,
1589                                                      gmx::arrayRefFromArray(md->invmass, md->nr),
1590                                                      gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1591                                                      *state,
1592                                                      f.view().forceWithPadding(),
1593                                                      *ekind);
1594
1595                     constrain_coordinates(constr,
1596                                           do_log,
1597                                           do_ene,
1598                                           step,
1599                                           state,
1600                                           upd.xp()->arrayRefWithPadding(),
1601                                           &dvdl_constr,
1602                                           bCalcVir,
1603                                           shake_vir);
1604                 }
1605
1606                 ArrayRefWithPadding<const RVec> forceCombined =
1607                         (simulationWork.useMts && step % ir->mtsLevels[1].stepFactor == 0)
1608                                 ? f.view().forceMtsCombinedWithPadding()
1609                                 : f.view().forceWithPadding();
1610                 upd.update_coords(*ir,
1611                                   step,
1612                                   md->homenr,
1613                                   md->havePartiallyFrozenAtoms,
1614                                   gmx::arrayRefFromArray(md->ptype, md->nr),
1615                                   gmx::arrayRefFromArray(md->invmass, md->nr),
1616                                   gmx::arrayRefFromArray(md->invMassPerDim, md->nr),
1617                                   state,
1618                                   forceCombined,
1619                                   &fcdata,
1620                                   ekind,
1621                                   M,
1622                                   etrtPOSITION,
1623                                   cr,
1624                                   constr != nullptr);
1625
1626                 wallcycle_stop(wcycle, WallCycleCounter::Update);
1627
1628                 constrain_coordinates(constr,
1629                                       do_log,
1630                                       do_ene,
1631                                       step,
1632                                       state,
1633                                       upd.xp()->arrayRefWithPadding(),
1634                                       &dvdl_constr,
1635                                       bCalcVir && !simulationWork.useMts,
1636                                       shake_vir);
1637
1638                 upd.update_sd_second_half(*ir,
1639                                           step,
1640                                           &dvdl_constr,
1641                                           md->homenr,
1642                                           gmx::arrayRefFromArray(md->ptype, md->nr),
1643                                           gmx::arrayRefFromArray(md->invmass, md->nr),
1644                                           state,
1645                                           cr,
1646                                           nrnb,
1647                                           wcycle,
1648                                           constr,
1649                                           do_log,
1650                                           do_ene);
1651                 upd.finish_update(
1652                         *ir, md->havePartiallyFrozenAtoms, md->homenr, state, wcycle, constr != nullptr);
1653             }
1654
1655             if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
1656             {
1657                 updatePrevStepPullCom(pull_work, state);
1658             }
1659
1660             enerd->term[F_DVDL_CONSTR] += dvdl_constr;
1661         }
1662
1663         /* ############## IF NOT VV, Calculate globals HERE  ############ */
1664         /* With Leap-Frog we can skip compute_globals at
1665          * non-communication steps, but we need to calculate
1666          * the kinetic energy one step before communication.
1667          */
1668         {
1669             // Organize to do inter-simulation signalling on steps if
1670             // and when algorithms require it.
1671             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
1672
1673             if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
1674             {
1675                 // Copy coordinates when needed to stop the CM motion.
1676                 if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
1677                 {
1678                     stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
1679                     stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
1680                 }
1681                 // Since we're already communicating at this step, we
1682                 // can propagate intra-simulation signals. Note that
1683                 // check_nstglobalcomm has the responsibility for
1684                 // choosing the value of nstglobalcomm that is one way
1685                 // bGStat becomes true, so we can't get into a
1686                 // situation where e.g. checkpointing can't be
1687                 // signalled.
1688                 bool                doIntraSimSignal = true;
1689                 SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
1690
1691                 compute_globals(
1692                         gstat,
1693                         cr,
1694                         ir,
1695                         fr,
1696                         ekind,
1697                         makeConstArrayRef(state->x),
1698                         makeConstArrayRef(state->v),
1699                         state->box,
1700                         md,
1701                         nrnb,
1702                         &vcm,
1703                         wcycle,
1704                         enerd,
1705                         force_vir,
1706                         shake_vir,
1707                         total_vir,
1708                         pres,
1709                         (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
1710                                                                            : gmx::ArrayRef<real>{},
1711                         &signaller,
1712                         lastbox,
1713                         &bSumEkinhOld,
1714                         (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
1715                                 | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
1716                                 | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
1717                                 | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT
1718                                 | (DOMAINDECOMP(cr) && dd_localTopologyChecker(*cr->dd).shouldCheckNumberOfBondedInteractions()
1719                                            ? CGLO_CHECK_NUMBER_OF_BONDED_INTERACTIONS
1720                                            : 0),
1721                         step,
1722                         &observablesReducer);
1723                 if (DOMAINDECOMP(cr))
1724                 {
1725                     dd_localTopologyChecker(cr->dd)->checkNumberOfBondedInteractions(
1726                             &top, makeConstArrayRef(state->x), state->box);
1727                 }
1728                 if (!EI_VV(ir->eI) && bStopCM)
1729                 {
1730                     process_and_stopcm_grp(
1731                             fplog, &vcm, *md, makeArrayRef(state->x), makeArrayRef(state->v));
1732                     inc_nrnb(nrnb, eNR_STOPCM, md->homenr);
1733
1734                     // TODO: The special case of removing CM motion should be dealt more gracefully
1735                     if (useGpuForUpdate)
1736                     {
1737                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
1738                         // Here we block until the H2D copy completes because event sync with the
1739                         // force kernels that use the coordinates on the next steps is not implemented
1740                         // (not because of a race on state->x being modified on the CPU while H2D is in progress).
1741                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
1742                         // If the COM removal changed the velocities on the CPU, this has to be accounted for.
1743                         if (vcm.mode != ComRemovalAlgorithm::No)
1744                         {
1745                             stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
1746                         }
1747                     }
1748                 }
1749             }
1750         }
1751
1752         /* #############  END CALC EKIN AND PRESSURE ################# */
1753
1754         /* Note: this is OK, but there are some numerical precision issues with using the convergence of
1755            the virial that should probably be addressed eventually. state->veta has better properies,
1756            but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
1757            generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
1758
1759         if (ir->efep != FreeEnergyPerturbationType::No && !EI_VV(ir->eI))
1760         {
1761             /* Sum up the foreign energy and dK/dl terms for md and sd.
1762                Currently done every step so that dH/dl is correct in the .edr */
1763             accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
1764         }
1765
1766         update_pcouple_after_coordinates(fplog,
1767                                          step,
1768                                          ir,
1769                                          md->homenr,
1770                                          md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1771                                                      : gmx::ArrayRef<const unsigned short>(),
1772                                          pres,
1773                                          force_vir,
1774                                          shake_vir,
1775                                          pressureCouplingMu,
1776                                          state,
1777                                          nrnb,
1778                                          upd.deform(),
1779                                          !useGpuForUpdate);
1780
1781         const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
1782                                                   && do_per_step(step, inputrec->nstpcouple));
1783         const bool doCRescalePressureCoupling  = (inputrec->epc == PressureCoupling::CRescale
1784                                                  && do_per_step(step, inputrec->nstpcouple));
1785         if (useGpuForUpdate
1786             && (doBerendsenPressureCoupling || doCRescalePressureCoupling || doParrinelloRahman))
1787         {
1788             integrator->scaleCoordinates(pressureCouplingMu);
1789             if (doCRescalePressureCoupling)
1790             {
1791                 matrix pressureCouplingInvMu;
1792                 gmx::invertBoxMatrix(pressureCouplingMu, pressureCouplingInvMu);
1793                 integrator->scaleVelocities(pressureCouplingInvMu);
1794             }
1795             integrator->setPbc(PbcType::Xyz, state->box);
1796         }
1797
1798         /* ################# END UPDATE STEP 2 ################# */
1799         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
1800
1801         /* The coordinates (x) were unshifted in update */
1802         if (!bGStat)
1803         {
1804             /* We will not sum ekinh_old,
1805              * so signal that we still have to do it.
1806              */
1807             bSumEkinhOld = TRUE;
1808         }
1809
1810         if (bCalcEner)
1811         {
1812             /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
1813
1814             /* use the directly determined last velocity, not actually the averaged half steps */
1815             if (bTrotter && ir->eI == IntegrationAlgorithm::VV)
1816             {
1817                 enerd->term[F_EKIN] = last_ekin;
1818             }
1819             enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
1820
1821             if (integratorHasConservedEnergyQuantity(ir))
1822             {
1823                 if (EI_VV(ir->eI))
1824                 {
1825                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
1826                 }
1827                 else
1828                 {
1829                     enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + NPT_energy(ir, state, &MassQ);
1830                 }
1831             }
1832             /* #########  END PREPARING EDR OUTPUT  ###########  */
1833         }
1834
1835         /* Output stuff */
1836         if (MASTER(cr))
1837         {
1838             if (fplog && do_log && bDoExpanded)
1839             {
1840                 /* only needed if doing expanded ensemble */
1841                 PrintFreeEnergyInfoToFile(fplog,
1842                                           ir->fepvals.get(),
1843                                           ir->expandedvals.get(),
1844                                           ir->bSimTemp ? ir->simtempvals.get() : nullptr,
1845                                           state_global->dfhist,
1846                                           state->fep_state,
1847                                           ir->nstlog,
1848                                           step);
1849             }
1850             if (bCalcEner)
1851             {
1852                 const bool outputDHDL = (computeDHDL && do_per_step(step, ir->fepvals->nstdhdl));
1853
1854                 energyOutput.addDataAtEnergyStep(outputDHDL,
1855                                                  bCalcEnerStep,
1856                                                  t,
1857                                                  md->tmass,
1858                                                  enerd,
1859                                                  ir->fepvals.get(),
1860                                                  ir->expandedvals.get(),
1861                                                  lastbox,
1862                                                  PTCouplingArrays{ state->boxv,
1863                                                                    state->nosehoover_xi,
1864                                                                    state->nosehoover_vxi,
1865                                                                    state->nhpres_xi,
1866                                                                    state->nhpres_vxi },
1867                                                  state->fep_state,
1868                                                  total_vir,
1869                                                  pres,
1870                                                  ekind,
1871                                                  mu_tot,
1872                                                  constr);
1873             }
1874             else
1875             {
1876                 energyOutput.recordNonEnergyStep();
1877             }
1878
1879             gmx_bool do_dr = do_per_step(step, ir->nstdisreout);
1880             gmx_bool do_or = do_per_step(step, ir->nstorireout);
1881
1882             if (doSimulatedAnnealing)
1883             {
1884                 gmx::EnergyOutput::printAnnealingTemperatures(
1885                         do_log ? fplog : nullptr, groups, &(ir->opts));
1886             }
1887             if (do_log || do_ene || do_dr || do_or)
1888             {
1889                 energyOutput.printStepToEnergyFile(mdoutf_get_fp_ene(outf),
1890                                                    do_ene,
1891                                                    do_dr,
1892                                                    do_or,
1893                                                    do_log ? fplog : nullptr,
1894                                                    step,
1895                                                    t,
1896                                                    fr->fcdata.get(),
1897                                                    awh.get());
1898             }
1899             if (do_log && ir->bDoAwh && awh->hasFepLambdaDimension())
1900             {
1901                 const bool isInitialOutput = false;
1902                 printLambdaStateToLog(fplog, state->lambda, isInitialOutput);
1903             }
1904
1905             if (ir->bPull)
1906             {
1907                 pull_print_output(pull_work, step, t);
1908             }
1909
1910             if (do_per_step(step, ir->nstlog))
1911             {
1912                 if (fflush(fplog) != 0)
1913                 {
1914                     gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
1915                 }
1916             }
1917         }
1918         if (bDoExpanded)
1919         {
1920             /* Have to do this part _after_ outputting the logfile and the edr file */
1921             /* Gets written into the state at the beginning of next loop*/
1922             state->fep_state = lamnew;
1923         }
1924         else if (ir->bDoAwh && awh->needForeignEnergyDifferences(step))
1925         {
1926             state->fep_state = awh->fepLambdaState();
1927         }
1928         /* Print the remaining wall clock time for the run */
1929         if (isMasterSimMasterRank(ms, MASTER(cr)) && (do_verbose || gmx_got_usr_signal()) && !bPMETunePrinting)
1930         {
1931             if (shellfc)
1932             {
1933                 fprintf(stderr, "\n");
1934             }
1935             print_time(stderr, walltime_accounting, step, ir, cr);
1936         }
1937
1938         /* Ion/water position swapping.
1939          * Not done in last step since trajectory writing happens before this call
1940          * in the MD loop and exchanges would be lost anyway. */
1941         bNeedRepartition = FALSE;
1942         if ((ir->eSwapCoords != SwapType::No) && (step > 0) && !bLastStep
1943             && do_per_step(step, ir->swap->nstswap))
1944         {
1945             bNeedRepartition = do_swapcoords(cr,
1946                                              step,
1947                                              t,
1948                                              ir,
1949                                              swap,
1950                                              wcycle,
1951                                              as_rvec_array(state->x.data()),
1952                                              state->box,
1953                                              MASTER(cr) && mdrunOptions.verbose,
1954                                              bRerunMD);
1955
1956             if (bNeedRepartition && DOMAINDECOMP(cr))
1957             {
1958                 dd_collect_state(cr->dd, state, state_global);
1959             }
1960         }
1961
1962         /* Replica exchange */
1963         bExchanged = FALSE;
1964         if (bDoReplEx)
1965         {
1966             bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
1967         }
1968
1969         if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
1970         {
1971             dd_partition_system(fplog,
1972                                 mdlog,
1973                                 step,
1974                                 cr,
1975                                 TRUE,
1976                                 1,
1977                                 state_global,
1978                                 top_global,
1979                                 *ir,
1980                                 imdSession,
1981                                 pull_work,
1982                                 state,
1983                                 &f,
1984                                 mdAtoms,
1985                                 &top,
1986                                 fr,
1987                                 vsite,
1988                                 constr,
1989                                 nrnb,
1990                                 wcycle,
1991                                 FALSE);
1992             upd.updateAfterPartition(state->natoms,
1993                                      md->cFREEZE ? gmx::arrayRefFromArray(md->cFREEZE, md->nr)
1994                                                  : gmx::ArrayRef<const unsigned short>(),
1995                                      md->cTC ? gmx::arrayRefFromArray(md->cTC, md->nr)
1996                                              : gmx::ArrayRef<const unsigned short>());
1997             fr->longRangeNonbondeds->updateAfterPartition(*md);
1998         }
1999
2000         bFirstStep = FALSE;
2001         bInitStep  = FALSE;
2002
2003         /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
2004         /* With all integrators, except VV, we need to retain the pressure
2005          * at the current step for coupling at the next step.
2006          */
2007         if ((state->flags & enumValueToBitMask(StateEntry::PressurePrevious))
2008             && (bGStatEveryStep || (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
2009         {
2010             /* Store the pressure in t_state for pressure coupling
2011              * at the next MD step.
2012              */
2013             copy_mat(pres, state->pres_prev);
2014         }
2015
2016         /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
2017
2018         if ((membed != nullptr) && (!bLastStep))
2019         {
2020             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
2021         }
2022
2023         cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
2024         if (DOMAINDECOMP(cr) && wcycle)
2025         {
2026             dd_cycles_add(cr->dd, cycles, ddCyclStep);
2027         }
2028
2029         /* increase the MD step number */
2030         step++;
2031         step_rel++;
2032
2033 #if GMX_FAHCORE
2034         if (MASTER(cr))
2035         {
2036             fcReportProgress(ir->nsteps + ir->init_step, step);
2037         }
2038 #endif
2039
2040         resetHandler->resetCounters(
2041                 step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting);
2042
2043         /* If bIMD is TRUE, the master updates the IMD energy record and sends positions to VMD client */
2044         imdSession->updateEnergyRecordAndSendPositionsAndEnergies(bInteractiveMDstep, step, bCalcEner);
2045     }
2046     /* End of main MD loop */
2047
2048     /* Closing TNG files can include compressing data. Therefore it is good to do that
2049      * before stopping the time measurements. */
2050     mdoutf_tng_close(outf);
2051
2052     /* Stop measuring walltime */
2053     walltime_accounting_end_time(walltime_accounting);
2054
2055     if (simulationWork.haveSeparatePmeRank)
2056     {
2057         /* Tell the PME only node to finish */
2058         gmx_pme_send_finish(cr);
2059     }
2060
2061     if (MASTER(cr))
2062     {
2063         if (ir->nstcalcenergy > 0)
2064         {
2065             energyOutput.printEnergyConservation(fplog, ir->simulation_part, EI_MD(ir->eI));
2066
2067             gmx::EnergyOutput::printAnnealingTemperatures(fplog, groups, &(ir->opts));
2068             energyOutput.printAverages(fplog, groups);
2069         }
2070     }
2071     done_mdoutf(outf);
2072
2073     if (bPMETune)
2074     {
2075         pme_loadbal_done(pme_loadbal, fplog, mdlog, fr->nbv->useGpu());
2076     }
2077
2078     done_shellfc(fplog, shellfc, step_rel);
2079
2080     if (useReplicaExchange && MASTER(cr))
2081     {
2082         print_replica_exchange_statistics(fplog, repl_ex);
2083     }
2084
2085     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
2086
2087     global_stat_destroy(gstat);
2088 }