fdc93ac07e2827483542a22c9371b5977d4f2c8d
[alexxy/gromacs.git] / src / gromacs / mdrun / runner.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 /*! \internal \file
38  *
39  * \brief Implements the MD runner routine calling all integrators.
40  *
41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
42  * \ingroup module_mdrun
43  */
44 #include "gmxpre.h"
45
46 #include "runner.h"
47
48 #include "config.h"
49
50 #include <cassert>
51 #include <cinttypes>
52 #include <csignal>
53 #include <cstdlib>
54 #include <cstring>
55
56 #include <algorithm>
57 #include <memory>
58
59 #include "gromacs/commandline/filenm.h"
60 #include "gromacs/domdec/builder.h"
61 #include "gromacs/domdec/domdec.h"
62 #include "gromacs/domdec/domdec_struct.h"
63 #include "gromacs/domdec/gpuhaloexchange.h"
64 #include "gromacs/domdec/localatomsetmanager.h"
65 #include "gromacs/domdec/partition.h"
66 #include "gromacs/ewald/ewald_utils.h"
67 #include "gromacs/ewald/pme.h"
68 #include "gromacs/ewald/pme_gpu_program.h"
69 #include "gromacs/fileio/checkpoint.h"
70 #include "gromacs/fileio/gmxfio.h"
71 #include "gromacs/fileio/oenv.h"
72 #include "gromacs/fileio/tpxio.h"
73 #include "gromacs/gmxlib/network.h"
74 #include "gromacs/gmxlib/nrnb.h"
75 #include "gromacs/gpu_utils/gpu_utils.h"
76 #include "gromacs/hardware/cpuinfo.h"
77 #include "gromacs/hardware/detecthardware.h"
78 #include "gromacs/hardware/printhardware.h"
79 #include "gromacs/imd/imd.h"
80 #include "gromacs/listed_forces/disre.h"
81 #include "gromacs/listed_forces/gpubonded.h"
82 #include "gromacs/listed_forces/orires.h"
83 #include "gromacs/math/functions.h"
84 #include "gromacs/math/utilities.h"
85 #include "gromacs/math/vec.h"
86 #include "gromacs/mdlib/boxdeformation.h"
87 #include "gromacs/mdlib/broadcaststructs.h"
88 #include "gromacs/mdlib/calc_verletbuf.h"
89 #include "gromacs/mdlib/dispersioncorrection.h"
90 #include "gromacs/mdlib/enerdata_utils.h"
91 #include "gromacs/mdlib/force.h"
92 #include "gromacs/mdlib/forcerec.h"
93 #include "gromacs/mdlib/gmx_omp_nthreads.h"
94 #include "gromacs/mdlib/makeconstraints.h"
95 #include "gromacs/mdlib/md_support.h"
96 #include "gromacs/mdlib/mdatoms.h"
97 #include "gromacs/mdlib/membed.h"
98 #include "gromacs/mdlib/qmmm.h"
99 #include "gromacs/mdlib/sighandler.h"
100 #include "gromacs/mdlib/stophandler.h"
101 #include "gromacs/mdrun/mdmodules.h"
102 #include "gromacs/mdrun/simulationcontext.h"
103 #include "gromacs/mdrunutility/handlerestart.h"
104 #include "gromacs/mdrunutility/logging.h"
105 #include "gromacs/mdrunutility/multisim.h"
106 #include "gromacs/mdrunutility/printtime.h"
107 #include "gromacs/mdrunutility/threadaffinity.h"
108 #include "gromacs/mdtypes/commrec.h"
109 #include "gromacs/mdtypes/enerdata.h"
110 #include "gromacs/mdtypes/fcdata.h"
111 #include "gromacs/mdtypes/group.h"
112 #include "gromacs/mdtypes/inputrec.h"
113 #include "gromacs/mdtypes/md_enums.h"
114 #include "gromacs/mdtypes/mdrunoptions.h"
115 #include "gromacs/mdtypes/observableshistory.h"
116 #include "gromacs/mdtypes/simulation_workload.h"
117 #include "gromacs/mdtypes/state.h"
118 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
119 #include "gromacs/nbnxm/gpu_data_mgmt.h"
120 #include "gromacs/nbnxm/nbnxm.h"
121 #include "gromacs/nbnxm/pairlist_tuning.h"
122 #include "gromacs/pbcutil/pbc.h"
123 #include "gromacs/pulling/output.h"
124 #include "gromacs/pulling/pull.h"
125 #include "gromacs/pulling/pull_rotation.h"
126 #include "gromacs/restraint/manager.h"
127 #include "gromacs/restraint/restraintmdmodule.h"
128 #include "gromacs/restraint/restraintpotential.h"
129 #include "gromacs/swap/swapcoords.h"
130 #include "gromacs/taskassignment/decidegpuusage.h"
131 #include "gromacs/taskassignment/decidesimulationworkload.h"
132 #include "gromacs/taskassignment/resourcedivision.h"
133 #include "gromacs/taskassignment/taskassignment.h"
134 #include "gromacs/taskassignment/usergpuids.h"
135 #include "gromacs/timing/gpu_timing.h"
136 #include "gromacs/timing/wallcycle.h"
137 #include "gromacs/timing/wallcyclereporting.h"
138 #include "gromacs/topology/mtop_util.h"
139 #include "gromacs/trajectory/trajectoryframe.h"
140 #include "gromacs/utility/basenetwork.h"
141 #include "gromacs/utility/cstringutil.h"
142 #include "gromacs/utility/exceptions.h"
143 #include "gromacs/utility/fatalerror.h"
144 #include "gromacs/utility/filestream.h"
145 #include "gromacs/utility/gmxassert.h"
146 #include "gromacs/utility/gmxmpi.h"
147 #include "gromacs/utility/keyvaluetree.h"
148 #include "gromacs/utility/logger.h"
149 #include "gromacs/utility/loggerbuilder.h"
150 #include "gromacs/utility/mdmodulenotification.h"
151 #include "gromacs/utility/physicalnodecommunicator.h"
152 #include "gromacs/utility/pleasecite.h"
153 #include "gromacs/utility/programcontext.h"
154 #include "gromacs/utility/smalloc.h"
155 #include "gromacs/utility/stringutil.h"
156
157 #include "isimulator.h"
158 #include "replicaexchange.h"
159 #include "simulatorbuilder.h"
160
161 #if GMX_FAHCORE
162 #include "corewrap.h"
163 #endif
164
165 namespace gmx
166 {
167
168 /*! \brief Structure that holds boolean flags corresponding to the development
169  *        features present enabled through environemnt variables.
170  *
171  */
172 struct DevelopmentFeatureFlags
173 {
174     ///! True if the Buffer ops development feature is enabled
175     // TODO: when the trigger of the buffer ops offload is fully automated this should go away
176     bool enableGpuBufferOps     = false;
177     ///! True if the update-constraints development feature is enabled
178     // TODO This needs to be reomved when the code gets cleaned up of GMX_UPDATE_CONSTRAIN_GPU
179     bool useGpuUpdateConstrain  = false;
180     ///! True if the GPU halo exchange development feature is enabled
181     bool enableGpuHaloExchange  = false;
182     ///! True if the PME PP direct commuinication GPU development feature is enabled
183     bool enableGpuPmePPComm     = false;
184 };
185
186 /*! \brief Manage any development feature flag variables encountered
187  *
188  * The use of dev features indicated by environment variables is
189  * logged in order to ensure that runs with such features enabled can
190  * be identified from their log and standard output. Any cross
191  * dependencies are also checked, and if unsatisfied, a fatal error
192  * issued.
193  *
194  * Note that some development features overrides are applied already here:
195  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
196  *
197  * \param[in]  mdlog        Logger object.
198  * \returns                 The object populated with development feature flags.
199  */
200 static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
201 {
202     DevelopmentFeatureFlags devFlags;
203
204     devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
205     devFlags.useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
206     devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
207     devFlags.enableGpuPmePPComm    = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
208
209     if (devFlags.enableGpuBufferOps)
210     {
211         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
212                 "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
213     }
214
215     if (devFlags.enableGpuHaloExchange)
216     {
217         if (!devFlags.enableGpuBufferOps)
218         {
219             gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
220         }
221     }
222
223     if (devFlags.useGpuUpdateConstrain)
224     {
225         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
226                 "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
227     }
228
229     return devFlags;
230 }
231
232 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
233  *
234  * Used to ensure that the master thread does not modify mdrunner during copy
235  * on the spawned threads. */
236 static void threadMpiMdrunnerAccessBarrier()
237 {
238 #if GMX_THREAD_MPI
239     MPI_Barrier(MPI_COMM_WORLD);
240 #endif
241 }
242
243 Mdrunner Mdrunner::cloneOnSpawnedThread() const
244 {
245     auto newRunner = Mdrunner(std::make_unique<MDModules>());
246
247     // All runners in the same process share a restraint manager resource because it is
248     // part of the interface to the client code, which is associated only with the
249     // original thread. Handles to the same resources can be obtained by copy.
250     {
251         newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
252     }
253
254     // Copy members of master runner.
255     // \todo Replace with builder when Simulation context and/or runner phases are better defined.
256     // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
257     newRunner.hw_opt    = hw_opt;
258     newRunner.filenames = filenames;
259
260     newRunner.oenv                = oenv;
261     newRunner.mdrunOptions        = mdrunOptions;
262     newRunner.domdecOptions       = domdecOptions;
263     newRunner.nbpu_opt            = nbpu_opt;
264     newRunner.pme_opt             = pme_opt;
265     newRunner.pme_fft_opt         = pme_fft_opt;
266     newRunner.bonded_opt          = bonded_opt;
267     newRunner.update_opt          = update_opt;
268     newRunner.nstlist_cmdline     = nstlist_cmdline;
269     newRunner.replExParams        = replExParams;
270     newRunner.pforce              = pforce;
271     // Give the spawned thread the newly created valid communicator
272     // for the simulation.
273     newRunner.communicator        = MPI_COMM_WORLD;
274     newRunner.ms                  = ms;
275     newRunner.startingBehavior    = startingBehavior;
276     newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
277
278     threadMpiMdrunnerAccessBarrier();
279
280     return newRunner;
281 }
282
283 /*! \brief The callback used for running on spawned threads.
284  *
285  * Obtains the pointer to the master mdrunner object from the one
286  * argument permitted to the thread-launch API call, copies it to make
287  * a new runner for this thread, reinitializes necessary data, and
288  * proceeds to the simulation. */
289 static void mdrunner_start_fn(const void *arg)
290 {
291     try
292     {
293         auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg);
294         /* copy the arg list to make sure that it's thread-local. This
295            doesn't copy pointed-to items, of course; fnm, cr and fplog
296            are reset in the call below, all others should be const. */
297         gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
298         mdrunner.mdrunner();
299     }
300     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
301 }
302
303
304 void Mdrunner::spawnThreads(int numThreadsToLaunch)
305 {
306 #if GMX_THREAD_MPI
307     /* now spawn new threads that start mdrunner_start_fn(), while
308        the main thread returns. Thread affinity is handled later. */
309     if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE,
310                      mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS)
311     {
312         GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
313     }
314
315     // Give the master thread the newly created valid communicator for
316     // the simulation.
317     communicator = MPI_COMM_WORLD;
318     threadMpiMdrunnerAccessBarrier();
319 #else
320     GMX_UNUSED_VALUE(numThreadsToLaunch);
321     GMX_UNUSED_VALUE(mdrunner_start_fn);
322 #endif
323 }
324
325 }  // namespace gmx
326
327 /*! \brief Initialize variables for Verlet scheme simulation */
328 static void prepare_verlet_scheme(FILE                           *fplog,
329                                   t_commrec                      *cr,
330                                   t_inputrec                     *ir,
331                                   int                             nstlist_cmdline,
332                                   const gmx_mtop_t               *mtop,
333                                   const matrix                    box,
334                                   bool                            makeGpuPairList,
335                                   const gmx::CpuInfo             &cpuinfo)
336 {
337     /* For NVE simulations, we will retain the initial list buffer */
338     if (EI_DYNAMICS(ir->eI) &&
339         ir->verletbuf_tol > 0 &&
340         !(EI_MD(ir->eI) && ir->etc == etcNO))
341     {
342         /* Update the Verlet buffer size for the current run setup */
343
344         /* Here we assume SIMD-enabled kernels are being used. But as currently
345          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
346          * and 4x2 gives a larger buffer than 4x4, this is ok.
347          */
348         ListSetupType      listType  = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
349         VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
350
351         const real         rlist_new =
352             calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
353
354         if (rlist_new != ir->rlist)
355         {
356             if (fplog != nullptr)
357             {
358                 fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
359                         ir->rlist, rlist_new,
360                         listSetup.cluster_size_i, listSetup.cluster_size_j);
361             }
362             ir->rlist     = rlist_new;
363         }
364     }
365
366     if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
367     {
368         gmx_fatal(FARGS, "Can not set nstlist without %s",
369                   !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
370     }
371
372     if (EI_DYNAMICS(ir->eI))
373     {
374         /* Set or try nstlist values */
375         increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
376     }
377 }
378
379 /*! \brief Override the nslist value in inputrec
380  *
381  * with value passed on the command line (if any)
382  */
383 static void override_nsteps_cmdline(const gmx::MDLogger &mdlog,
384                                     int64_t              nsteps_cmdline,
385                                     t_inputrec          *ir)
386 {
387     assert(ir);
388
389     /* override with anything else than the default -2 */
390     if (nsteps_cmdline > -2)
391     {
392         char sbuf_steps[STEPSTRSIZE];
393         char sbuf_msg[STRLEN];
394
395         ir->nsteps = nsteps_cmdline;
396         if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
397         {
398             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
399                     gmx_step_str(nsteps_cmdline, sbuf_steps),
400                     fabs(nsteps_cmdline*ir->delta_t));
401         }
402         else
403         {
404             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
405                     gmx_step_str(nsteps_cmdline, sbuf_steps));
406         }
407
408         GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
409     }
410     else if (nsteps_cmdline < -2)
411     {
412         gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64,
413                   nsteps_cmdline);
414     }
415     /* Do nothing if nsteps_cmdline == -2 */
416 }
417
418 namespace gmx
419 {
420
421 /*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
422  *
423  * If not, and if a warning may be issued, logs a warning about
424  * falling back to CPU code. With thread-MPI, only the first
425  * call to this function should have \c issueWarning true. */
426 static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger   &mdlog,
427                                                const t_inputrec &ir,
428                                                bool              issueWarning)
429 {
430     bool        gpuIsUseful = true;
431     std::string warning;
432
433     if (ir.opts.ngener - ir.nwall > 1)
434     {
435         /* The GPU code does not support more than one energy group.
436          * If the user requested GPUs explicitly, a fatal error is given later.
437          */
438         gpuIsUseful = false;
439         warning     =
440             "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
441             "For better performance, run on the GPU without energy groups and then do "
442             "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
443     }
444
445     if (EI_TPI(ir.eI))
446     {
447         gpuIsUseful = false;
448         warning     = "TPI is not implemented for GPUs.";
449     }
450
451     if (!gpuIsUseful && issueWarning)
452     {
453         GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
454     }
455
456     return gpuIsUseful;
457 }
458
459 //! Initializes the logger for mdrun.
460 static gmx::LoggerOwner buildLogger(FILE      *fplog,
461                                     const bool isSimulationMasterRank)
462 {
463     gmx::LoggerBuilder builder;
464     if (fplog != nullptr)
465     {
466         builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
467     }
468     if (isSimulationMasterRank)
469     {
470         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning,
471                                 &gmx::TextOutputFile::standardError());
472     }
473     return builder.build();
474 }
475
476 //! Make a TaskTarget from an mdrun argument string.
477 static TaskTarget findTaskTarget(const char *optionString)
478 {
479     TaskTarget returnValue = TaskTarget::Auto;
480
481     if (strncmp(optionString, "auto", 3) == 0)
482     {
483         returnValue = TaskTarget::Auto;
484     }
485     else if (strncmp(optionString, "cpu", 3) == 0)
486     {
487         returnValue = TaskTarget::Cpu;
488     }
489     else if (strncmp(optionString, "gpu", 3) == 0)
490     {
491         returnValue = TaskTarget::Gpu;
492     }
493     else
494     {
495         GMX_ASSERT(false, "Option string should have been checked for sanity already");
496     }
497
498     return returnValue;
499 }
500
501 //! Finish run, aggregate data to print performance info.
502 static void finish_run(FILE *fplog,
503                        const gmx::MDLogger &mdlog,
504                        const t_commrec *cr,
505                        const t_inputrec *inputrec,
506                        t_nrnb nrnb[], gmx_wallcycle_t wcycle,
507                        gmx_walltime_accounting_t walltime_accounting,
508                        nonbonded_verlet_t *nbv,
509                        const gmx_pme_t *pme,
510                        gmx_bool bWriteStat)
511 {
512     double  delta_t  = 0;
513     double  nbfs     = 0, mflop = 0;
514     double  elapsed_time,
515             elapsed_time_over_all_ranks,
516             elapsed_time_over_all_threads,
517             elapsed_time_over_all_threads_over_all_ranks;
518     /* Control whether it is valid to print a report. Only the
519        simulation master may print, but it should not do so if the run
520        terminated e.g. before a scheduled reset step. This is
521        complicated by the fact that PME ranks are unaware of the
522        reason why they were sent a pmerecvqxFINISH. To avoid
523        communication deadlocks, we always do the communication for the
524        report, even if we've decided not to write the report, because
525        how long it takes to finish the run is not important when we've
526        decided not to report on the simulation performance.
527
528        Further, we only report performance for dynamical integrators,
529        because those are the only ones for which we plan to
530        consider doing any optimizations. */
531     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
532
533     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
534     {
535         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
536         printReport = false;
537     }
538
539     t_nrnb                  *nrnb_tot;
540     std::unique_ptr<t_nrnb>  nrnbTotalStorage;
541     if (cr->nnodes > 1)
542     {
543         nrnbTotalStorage = std::make_unique<t_nrnb>();
544         nrnb_tot         = nrnbTotalStorage.get();
545 #if GMX_MPI
546         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
547                       cr->mpi_comm_mysim);
548 #endif
549     }
550     else
551     {
552         nrnb_tot = nrnb;
553     }
554
555     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
556     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
557     if (cr->nnodes > 1)
558     {
559 #if GMX_MPI
560         /* reduce elapsed_time over all MPI ranks in the current simulation */
561         MPI_Allreduce(&elapsed_time,
562                       &elapsed_time_over_all_ranks,
563                       1, MPI_DOUBLE, MPI_SUM,
564                       cr->mpi_comm_mysim);
565         elapsed_time_over_all_ranks /= cr->nnodes;
566         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
567          * current simulation. */
568         MPI_Allreduce(&elapsed_time_over_all_threads,
569                       &elapsed_time_over_all_threads_over_all_ranks,
570                       1, MPI_DOUBLE, MPI_SUM,
571                       cr->mpi_comm_mysim);
572 #endif
573     }
574     else
575     {
576         elapsed_time_over_all_ranks                  = elapsed_time;
577         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
578     }
579
580     if (printReport)
581     {
582         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
583     }
584
585     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
586     {
587         print_dd_statistics(cr, inputrec, fplog);
588     }
589
590     /* TODO Move the responsibility for any scaling by thread counts
591      * to the code that handled the thread region, so that there's a
592      * mechanism to keep cycle counting working during the transition
593      * to task parallelism. */
594     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
595     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
596     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
597     auto cycle_sum(wallcycle_sum(cr, wcycle));
598
599     if (printReport)
600     {
601         auto                    nbnxn_gpu_timings = (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
602         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
603
604         if (pme_gpu_task_enabled(pme))
605         {
606             pme_gpu_get_timings(pme, &pme_gpu_timings);
607         }
608         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
609                         elapsed_time_over_all_ranks,
610                         wcycle, cycle_sum,
611                         nbnxn_gpu_timings,
612                         &pme_gpu_timings);
613
614         if (EI_DYNAMICS(inputrec->eI))
615         {
616             delta_t = inputrec->delta_t;
617         }
618
619         if (fplog)
620         {
621             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
622                        elapsed_time_over_all_ranks,
623                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
624                        delta_t, nbfs, mflop);
625         }
626         if (bWriteStat)
627         {
628             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
629                        elapsed_time_over_all_ranks,
630                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
631                        delta_t, nbfs, mflop);
632         }
633     }
634 }
635
636 int Mdrunner::mdrunner()
637 {
638     matrix                    box;
639     t_forcerec               *fr               = nullptr;
640     t_fcdata                 *fcd              = nullptr;
641     real                      ewaldcoeff_q     = 0;
642     real                      ewaldcoeff_lj    = 0;
643     int                       nChargePerturbed = -1, nTypePerturbed = 0;
644     gmx_wallcycle_t           wcycle;
645     gmx_walltime_accounting_t walltime_accounting = nullptr;
646     gmx_membed_t *            membed              = nullptr;
647     gmx_hw_info_t            *hwinfo              = nullptr;
648
649     /* CAUTION: threads may be started later on in this function, so
650        cr doesn't reflect the final parallel state right now */
651     gmx_mtop_t                      mtop;
652
653     bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data());
654     bool doRerun  = mdrunOptions.rerun;
655
656     // Handle task-assignment related user options.
657     EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ?
658                                                EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
659
660     std::vector<int> userGpuTaskAssignment;
661     try
662     {
663         userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
664     }
665     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
666     auto       nonbondedTarget = findTaskTarget(nbpu_opt);
667     auto       pmeTarget       = findTaskTarget(pme_opt);
668     auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
669     auto       bondedTarget    = findTaskTarget(bonded_opt);
670     auto       updateTarget    = findTaskTarget(update_opt);
671     PmeRunMode pmeRunMode      = PmeRunMode::None;
672
673     FILE      *fplog = nullptr;
674     // If we are appending, we don't write log output because we need
675     // to check that the old log file matches what the checkpoint file
676     // expects. Otherwise, we should start to write log output now if
677     // there is a file ready for it.
678     if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
679     {
680         fplog = gmx_fio_getfp(logFileHandle);
681     }
682     const bool       isSimulationMasterRank = findIsSimulationMasterRank(ms, communicator);
683     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
684     gmx::MDLogger    mdlog(logOwner.logger());
685
686     // report any development features that may be enabled by environment variables
687     const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog);
688
689     // TODO The thread-MPI master rank makes a working
690     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
691     // after the threads have been launched. This works because no use
692     // is made of that communicator until after the execution paths
693     // have rejoined. But it is likely that we can improve the way
694     // this is expressed, e.g. by expressly running detection only the
695     // master rank for thread-MPI, rather than relying on the mutex
696     // and reference count.
697     PhysicalNodeCommunicator physicalNodeComm(communicator, gmx_physicalnode_id_hash());
698     hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
699
700     gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo);
701
702     std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
703
704     // Print citation requests after all software/hardware printing
705     pleaseCiteGromacs(fplog);
706
707     // TODO Replace this by unique_ptr once t_inputrec is C++
708     t_inputrec               inputrecInstance;
709     t_inputrec              *inputrec = nullptr;
710     std::unique_ptr<t_state> globalState;
711
712     auto                     partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
713
714     if (isSimulationMasterRank)
715     {
716         /* Only the master rank has the global state */
717         globalState = std::make_unique<t_state>();
718
719         /* Read (nearly) all data required for the simulation
720          * and keep the partly serialized tpr contents to send to other ranks later
721          */
722         *partialDeserializedTpr = read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), &inputrecInstance, globalState.get(), &mtop);
723         inputrec                = &inputrecInstance;
724     }
725
726     /* Check and update the hardware options for internal consistency */
727     checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks, inputrec);
728
729     if (GMX_THREAD_MPI && isSimulationMasterRank)
730     {
731         bool useGpuForNonbonded = false;
732         bool useGpuForPme       = false;
733         try
734         {
735             GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
736
737             // If the user specified the number of ranks, then we must
738             // respect that, but in default mode, we need to allow for
739             // the number of GPUs to choose the number of ranks.
740             auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
741             useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi
742                     (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
743                     canUseGpuForNonbonded,
744                     gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
745                     hw_opt.nthreads_tmpi);
746             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
747                     (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
748                     *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
749
750         }
751         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
752
753         /* Determine how many thread-MPI ranks to start.
754          *
755          * TODO Over-writing the user-supplied value here does
756          * prevent any possible subsequent checks from working
757          * correctly. */
758         hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo,
759                                                 &hw_opt,
760                                                 gpuIdsToUse,
761                                                 useGpuForNonbonded,
762                                                 useGpuForPme,
763                                                 inputrec, &mtop,
764                                                 mdlog,
765                                                 doMembed);
766
767         // Now start the threads for thread MPI.
768         spawnThreads(hw_opt.nthreads_tmpi);
769         // The spawned threads enter mdrunner() and execution of
770         // master and spawned threads joins at the end of this block.
771         physicalNodeComm = PhysicalNodeCommunicator(communicator, gmx_physicalnode_id_hash());
772     }
773
774     GMX_RELEASE_ASSERT(communicator == MPI_COMM_WORLD, "Must have valid world communicator");
775     CommrecHandle crHandle = init_commrec(communicator, ms);
776     t_commrec    *cr       = crHandle.get();
777     GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
778
779     if (PAR(cr))
780     {
781         /* now broadcast everything to the non-master nodes/threads: */
782         if (!isSimulationMasterRank)
783         {
784             inputrec = &inputrecInstance;
785         }
786         init_parallel(cr, inputrec, &mtop, partialDeserializedTpr.get());
787     }
788     GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
789     partialDeserializedTpr.reset(nullptr);
790
791     // Now the number of ranks is known to all ranks, and each knows
792     // the inputrec read by the master rank. The ranks can now all run
793     // the task-deciding functions and will agree on the result
794     // without needing to communicate.
795     //
796     // TODO Should we do the communication in debug mode to support
797     // having an assertion?
798     //
799     // Note that these variables describe only their own node.
800     //
801     // Note that when bonded interactions run on a GPU they always run
802     // alongside a nonbonded task, so do not influence task assignment
803     // even though they affect the force calculation workload.
804     bool useGpuForNonbonded     = false;
805     bool useGpuForPme           = false;
806     bool useGpuForBonded        = false;
807     bool useGpuForUpdate        = false;
808     bool gpusWereDetected       = hwinfo->ngpu_compatible_tot > 0;
809     try
810     {
811         // It's possible that there are different numbers of GPUs on
812         // different nodes, which is the user's responsibility to
813         // handle. If unsuitable, we will notice that during task
814         // assignment.
815         auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
816         useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
817                                                                 emulateGpuNonbonded,
818                                                                 canUseGpuForNonbonded,
819                                                                 gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI),
820                                                                 gpusWereDetected);
821         useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
822                                                     *hwinfo, *inputrec, mtop,
823                                                     cr->nnodes, domdecOptions.numPmeRanks,
824                                                     gpusWereDetected);
825         auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
826         useGpuForBonded =
827             decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
828                                             bondedTarget, canUseGpuForBonded,
829                                             EVDW_PME(inputrec->vdwtype),
830                                             EEL_PME_EWALD(inputrec->coulombtype),
831                                             domdecOptions.numPmeRanks, gpusWereDetected);
832
833         pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
834         if (pmeRunMode == PmeRunMode::GPU)
835         {
836             if (pmeFftTarget == TaskTarget::Cpu)
837             {
838                 pmeRunMode = PmeRunMode::Mixed;
839             }
840         }
841         else if (pmeFftTarget == TaskTarget::Gpu)
842         {
843             gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft.");
844         }
845     }
846     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
847
848     // Build restraints.
849     // TODO: hide restraint implementation details from Mdrunner.
850     // There is nothing unique about restraints at this point as far as the
851     // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
852     // factory functions from the SimulationContext on which to call mdModules_->add().
853     // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
854     for (auto && restraint : restraintManager_->getRestraints())
855     {
856         auto module = RestraintMDModule::create(restraint,
857                                                 restraint->sites());
858         mdModules_->add(std::move(module));
859     }
860
861     // TODO: Error handling
862     mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
863     const auto &mdModulesNotifier = mdModules_->notifier().notifier_;
864
865     if (inputrec->internalParameters != nullptr)
866     {
867         mdModulesNotifier.notify(*inputrec->internalParameters);
868     }
869
870     if (fplog != nullptr)
871     {
872         pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
873         fprintf(fplog, "\n");
874     }
875
876     if (SIMMASTER(cr))
877     {
878         /* In rerun, set velocities to zero if present */
879         if (doRerun && ((globalState->flags & (1 << estV)) != 0))
880         {
881             // rerun does not use velocities
882             GMX_LOG(mdlog.info).asParagraph().appendText(
883                     "Rerun trajectory contains velocities. Rerun does only evaluate "
884                     "potential energy and forces. The velocities will be ignored.");
885             for (int i = 0; i < globalState->natoms; i++)
886             {
887                 clear_rvec(globalState->v[i]);
888             }
889             globalState->flags &= ~(1 << estV);
890         }
891
892         /* now make sure the state is initialized and propagated */
893         set_state_entries(globalState.get(), inputrec);
894     }
895
896     /* NM and TPI parallelize over force/energy calculations, not atoms,
897      * so we need to initialize and broadcast the global state.
898      */
899     if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
900     {
901         if (!MASTER(cr))
902         {
903             globalState = std::make_unique<t_state>();
904         }
905         broadcastStateWithoutDynamics(cr, globalState.get());
906     }
907
908     /* A parallel command line option consistency check that we can
909        only do after any threads have started. */
910     if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 ||
911                      domdecOptions.numCells[YY] > 1 ||
912                      domdecOptions.numCells[ZZ] > 1 ||
913                      domdecOptions.numPmeRanks > 0))
914     {
915         gmx_fatal(FARGS,
916                   "The -dd or -npme option request a parallel simulation, "
917 #if !GMX_MPI
918                   "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv));
919 #elif GMX_THREAD_MPI
920                   "but the number of MPI-threads (option -ntmpi) is not set or is 1");
921 #else
922                   "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv));
923 #endif
924     }
925
926     if (doRerun &&
927         (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
928     {
929         gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
930     }
931
932     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
933     {
934         if (domdecOptions.numPmeRanks > 0)
935         {
936             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
937                                  "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
938         }
939
940         domdecOptions.numPmeRanks = 0;
941     }
942
943     if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
944     {
945         /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
946          * improve performance with many threads per GPU, since our OpenMP
947          * scaling is bad, but it's difficult to automate the setup.
948          */
949         domdecOptions.numPmeRanks = 0;
950     }
951     if (useGpuForPme)
952     {
953         if (domdecOptions.numPmeRanks < 0)
954         {
955             domdecOptions.numPmeRanks = 0;
956             // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
957         }
958         else
959         {
960             GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
961         }
962     }
963
964 #if GMX_FAHCORE
965     if (MASTER(cr))
966     {
967         fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
968     }
969 #endif
970
971     /* NMR restraints must be initialized before load_checkpoint,
972      * since with time averaging the history is added to t_state.
973      * For proper consistency check we therefore need to extend
974      * t_state here.
975      * So the PME-only nodes (if present) will also initialize
976      * the distance restraints.
977      */
978     snew(fcd, 1);
979
980     /* This needs to be called before read_checkpoint to extend the state */
981     init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
982
983     init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
984
985     auto                 deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
986
987     ObservablesHistory   observablesHistory = {};
988
989     if (startingBehavior != StartingBehavior::NewSimulation)
990     {
991         /* Check if checkpoint file exists before doing continuation.
992          * This way we can use identical input options for the first and subsequent runs...
993          */
994         if (mdrunOptions.numStepsCommandline > -2)
995         {
996             /* Temporarily set the number of steps to unlimited to avoid
997              * triggering the nsteps check in load_checkpoint().
998              * This hack will go away soon when the -nsteps option is removed.
999              */
1000             inputrec->nsteps = -1;
1001         }
1002
1003         load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
1004                         logFileHandle,
1005                         cr, domdecOptions.numCells,
1006                         inputrec, globalState.get(),
1007                         &observablesHistory,
1008                         mdrunOptions.reproducible, mdModules_->notifier());
1009
1010         if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
1011         {
1012             // Now we can start normal logging to the truncated log file.
1013             fplog    = gmx_fio_getfp(logFileHandle);
1014             prepareLogAppending(fplog);
1015             logOwner = buildLogger(fplog, MASTER(cr));
1016             mdlog    = logOwner.logger();
1017         }
1018     }
1019
1020     if (mdrunOptions.numStepsCommandline > -2)
1021     {
1022         GMX_LOG(mdlog.info).asParagraph().
1023             appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
1024                        "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
1025     }
1026     /* override nsteps with value set on the commandline */
1027     override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
1028
1029     if (SIMMASTER(cr))
1030     {
1031         copy_mat(globalState->box, box);
1032     }
1033
1034     if (PAR(cr))
1035     {
1036         gmx_bcast(sizeof(box), box, cr);
1037     }
1038
1039     if (inputrec->cutoff_scheme != ecutsVERLET)
1040     {
1041         gmx_fatal(FARGS, "This group-scheme .tpr file can no longer be run by mdrun. Please update to the Verlet scheme, or use an earlier version of GROMACS if necessary.");
1042     }
1043     /* Update rlist and nstlist. */
1044     prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
1045                           useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
1046
1047     const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded);
1048     // This builder is necessary while we have multi-part construction
1049     // of DD. Before DD is constructed, we use the existence of
1050     // the builder object to indicate that further construction of DD
1051     // is needed.
1052     std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
1053     if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
1054                      inputrec->eI == eiNM))
1055     {
1056         ddBuilder = std::make_unique<DomainDecompositionBuilder>
1057                 (mdlog, cr, domdecOptions, mdrunOptions,
1058                 prefer1DAnd1PulseDD, mtop, *inputrec,
1059                 box, positionsFromStatePointer(globalState.get()));
1060     }
1061     else
1062     {
1063         /* PME, if used, is done on all nodes with 1D decomposition */
1064         cr->npmenodes = 0;
1065         cr->duty      = (DUTY_PP | DUTY_PME);
1066
1067         if (inputrec->ePBC == epbcSCREW)
1068         {
1069             gmx_fatal(FARGS,
1070                       "pbc=screw is only implemented with domain decomposition");
1071         }
1072     }
1073
1074     // Produce the task assignment for this rank.
1075     GpuTaskAssignmentsBuilder gpuTaskAssignmentsBuilder;
1076     GpuTaskAssignments        gpuTaskAssignments =
1077         gpuTaskAssignmentsBuilder.build(gpuIdsToUse,
1078                                         userGpuTaskAssignment,
1079                                         *hwinfo,
1080                                         cr,
1081                                         ms,
1082                                         physicalNodeComm,
1083                                         nonbondedTarget,
1084                                         pmeTarget,
1085                                         bondedTarget,
1086                                         updateTarget,
1087                                         useGpuForNonbonded,
1088                                         useGpuForPme,
1089                                         thisRankHasDuty(cr, DUTY_PP),
1090                                         // TODO cr->duty & DUTY_PME should imply that a PME
1091                                         // algorithm is active, but currently does not.
1092                                         EEL_PME(inputrec->coulombtype) &&
1093                                         thisRankHasDuty(cr, DUTY_PME));
1094
1095     const bool printHostName = (cr->nnodes > 1);
1096     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode);
1097
1098     // If the user chose a task assignment, give them some hints
1099     // where appropriate.
1100     if (!userGpuTaskAssignment.empty())
1101     {
1102         gpuTaskAssignments.logPerformanceHints(mdlog,
1103                                                ssize(gpuIdsToUse));
1104     }
1105
1106     // Get the device handles for the modules, nullptr when no task is assigned.
1107     gmx_device_info_t *nonbondedDeviceInfo   = gpuTaskAssignments.initNonbondedDevice(cr);
1108     gmx_device_info_t *pmeDeviceInfo         = gpuTaskAssignments.initPmeDevice();
1109
1110     // TODO Initialize GPU streams here.
1111
1112     // TODO Currently this is always built, yet DD partition code
1113     // checks if it is built before using it. Probably it should
1114     // become an MDModule that is made only when another module
1115     // requires it (e.g. pull, CompEl, density fitting), so that we
1116     // don't update the local atom sets unilaterally every step.
1117     LocalAtomSetManager atomSets;
1118     if (ddBuilder)
1119     {
1120         // TODO Pass the GPU streams to ddBuilder to use in buffer
1121         // transfers (e.g. halo exchange)
1122         cr->dd = ddBuilder->build(&atomSets);
1123         // The builder's job is done, so destruct it
1124         ddBuilder.reset(nullptr);
1125         // Note that local state still does not exist yet.
1126     }
1127
1128     if (PAR(cr))
1129     {
1130         /* After possible communicator splitting in make_dd_communicators.
1131          * we can set up the intra/inter node communication.
1132          */
1133         gmx_setup_nodecomm(fplog, cr);
1134     }
1135
1136 #if GMX_MPI
1137     if (isMultiSim(ms))
1138     {
1139         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1140                 "This is simulation %d out of %d running as a composite GROMACS\n"
1141                 "multi-simulation job. Setup for this simulation:\n",
1142                 ms->sim, ms->nsim);
1143     }
1144     GMX_LOG(mdlog.warning).appendTextFormatted(
1145             "Using %d MPI %s\n",
1146             cr->nnodes,
1147 #if GMX_THREAD_MPI
1148             cr->nnodes == 1 ? "thread" : "threads"
1149 #else
1150             cr->nnodes == 1 ? "process" : "processes"
1151 #endif
1152             );
1153     fflush(stderr);
1154 #endif
1155
1156     // If mdrun -pin auto honors any affinity setting that already
1157     // exists. If so, it is nice to provide feedback about whether
1158     // that existing affinity setting was from OpenMP or something
1159     // else, so we run this code both before and after we initialize
1160     // the OpenMP support.
1161     gmx_check_thread_affinity_set(mdlog,
1162                                   &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1163     /* Check and update the number of OpenMP threads requested */
1164     checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
1165                                             pmeRunMode, mtop, *inputrec);
1166
1167     gmx_omp_nthreads_init(mdlog, cr,
1168                           hwinfo->nthreads_hw_avail,
1169                           physicalNodeComm.size_,
1170                           hw_opt.nthreads_omp,
1171                           hw_opt.nthreads_omp_pme,
1172                           !thisRankHasDuty(cr, DUTY_PP));
1173
1174     // Enable FP exception detection, but not in
1175     // Release mode and not for compilers with known buggy FP
1176     // exception support (clang with any optimization) or suspected
1177     // buggy FP exception support (gcc 7.* with optimization).
1178 #if !defined NDEBUG && \
1179     !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
1180     && defined __OPTIMIZE__)
1181     const bool bEnableFPE = true;
1182 #else
1183     const bool bEnableFPE = false;
1184 #endif
1185     //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
1186     if (bEnableFPE)
1187     {
1188         gmx_feenableexcept();
1189     }
1190
1191     /* Now that we know the setup is consistent, check for efficiency */
1192     check_resource_division_efficiency(hwinfo,
1193                                        gpuTaskAssignments.thisRankHasAnyGpuTask(),
1194                                        mdrunOptions.ntompOptionIsSet,
1195                                        cr,
1196                                        mdlog);
1197
1198     /* getting number of PP/PME threads on this MPI / tMPI rank.
1199        PME: env variable should be read only on one node to make sure it is
1200        identical everywhere;
1201      */
1202     const int numThreadsOnThisRank =
1203         thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded) : gmx_omp_nthreads_get(emntPME);
1204     checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
1205                                   *hwinfo->hardwareTopology,
1206                                   physicalNodeComm, mdlog);
1207
1208     if (hw_opt.threadAffinity != ThreadAffinity::Off)
1209     {
1210         /* Before setting affinity, check whether the affinity has changed
1211          * - which indicates that probably the OpenMP library has changed it
1212          * since we first checked).
1213          */
1214         gmx_check_thread_affinity_set(mdlog,
1215                                       &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1216
1217         int numThreadsOnThisNode, intraNodeThreadOffset;
1218         analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
1219                                  &intraNodeThreadOffset);
1220
1221         /* Set the CPU affinity */
1222         gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
1223                                 numThreadsOnThisRank, numThreadsOnThisNode,
1224                                 intraNodeThreadOffset, nullptr);
1225     }
1226
1227     if (mdrunOptions.timingOptions.resetStep > -1)
1228     {
1229         GMX_LOG(mdlog.info).asParagraph().
1230             appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
1231     }
1232     wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
1233
1234     if (PAR(cr))
1235     {
1236         /* Master synchronizes its value of reset_counters with all nodes
1237          * including PME only nodes */
1238         int64_t reset_counters = wcycle_get_reset_counters(wcycle);
1239         gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
1240         wcycle_set_reset_counters(wcycle, reset_counters);
1241     }
1242
1243     // Membrane embedding must be initialized before we call init_forcerec()
1244     if (doMembed)
1245     {
1246         if (MASTER(cr))
1247         {
1248             fprintf(stderr, "Initializing membed");
1249         }
1250         /* Note that membed cannot work in parallel because mtop is
1251          * changed here. Fix this if we ever want to make it run with
1252          * multiple ranks. */
1253         membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr,
1254                              &mdrunOptions
1255                                  .checkpointOptions.period);
1256     }
1257
1258     const bool                   thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
1259     std::unique_ptr<MDAtoms>     mdAtoms;
1260     std::unique_ptr<gmx_vsite_t> vsite;
1261
1262     t_nrnb nrnb;
1263     if (thisRankHasDuty(cr, DUTY_PP))
1264     {
1265         mdModulesNotifier.notify(*cr);
1266         mdModulesNotifier.notify(&atomSets);
1267         mdModulesNotifier.notify(PeriodicBoundaryConditionType {inputrec->ePBC});
1268         /* Initiate forcerecord */
1269         fr                 = new t_forcerec;
1270         fr->forceProviders = mdModules_->initForceProviders();
1271         init_forcerec(fplog, mdlog, fr, fcd,
1272                       inputrec, &mtop, cr, box,
1273                       opt2fn("-table", filenames.size(), filenames.data()),
1274                       opt2fn("-tablep", filenames.size(), filenames.data()),
1275                       opt2fns("-tableb", filenames.size(), filenames.data()),
1276                       *hwinfo, nonbondedDeviceInfo,
1277                       useGpuForBonded,
1278                       pforce,
1279                       wcycle);
1280
1281         // TODO Move this to happen during domain decomposition setup,
1282         // once stream and event handling works well with that.
1283         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
1284         if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
1285         {
1286             GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
1287             void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
1288             void *streamNonLocal           = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1289             void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
1290             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1291                     "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
1292             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
1293                                                                         streamNonLocal, coordinatesOnDeviceEvent);
1294         }
1295
1296         /* Initialize the mdAtoms structure.
1297          * mdAtoms is not filled with atom data,
1298          * as this can not be done now with domain decomposition.
1299          */
1300         mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
1301         if (globalState && thisRankHasPmeGpuTask)
1302         {
1303             // The pinning of coordinates in the global state object works, because we only use
1304             // PME on GPU without DD or on a separate PME rank, and because the local state pointer
1305             // points to the global state object without DD.
1306             // FIXME: MD and EM separately set up the local state - this should happen in the same function,
1307             // which should also perform the pinning.
1308             changePinningPolicy(&globalState->x, pme_get_pinning_policy());
1309         }
1310
1311         /* Initialize the virtual site communication */
1312         vsite = initVsite(mtop, cr);
1313
1314         calc_shifts(box, fr->shift_vec);
1315
1316         /* With periodic molecules the charge groups should be whole at start up
1317          * and the virtual sites should not be far from their proper positions.
1318          */
1319         if (!inputrec->bContinuation && MASTER(cr) &&
1320             !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1321         {
1322             /* Make molecules whole at start of run */
1323             if (fr->ePBC != epbcNONE)
1324             {
1325                 do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
1326             }
1327             if (vsite)
1328             {
1329                 /* Correct initial vsite positions are required
1330                  * for the initial distribution in the domain decomposition
1331                  * and for the initial shell prediction.
1332                  */
1333                 constructVsitesGlobal(mtop, globalState->x);
1334             }
1335         }
1336
1337         if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
1338         {
1339             ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
1340             ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
1341         }
1342     }
1343     else
1344     {
1345         /* This is a PME only node */
1346
1347         GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be unitialized");
1348
1349         ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
1350         ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
1351     }
1352
1353     gmx_pme_t *sepPmeData = nullptr;
1354     // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
1355     GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
1356     gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
1357
1358     // TODO should live in ewald module once its testing is improved
1359     //
1360     // Later, this program could contain kernels that might be later
1361     // re-used as auto-tuning progresses, or subsequent simulations
1362     // are invoked.
1363     PmeGpuProgramStorage pmeGpuProgram;
1364     if (thisRankHasPmeGpuTask)
1365     {
1366         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
1367     }
1368
1369     /* Initiate PME if necessary,
1370      * either on all nodes or on dedicated PME nodes only. */
1371     if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
1372     {
1373         if (mdAtoms && mdAtoms->mdatoms())
1374         {
1375             nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
1376             if (EVDW_PME(inputrec->vdwtype))
1377             {
1378                 nTypePerturbed   = mdAtoms->mdatoms()->nTypePerturbed;
1379             }
1380         }
1381         if (cr->npmenodes > 0)
1382         {
1383             /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
1384             gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
1385             gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
1386         }
1387
1388         if (thisRankHasDuty(cr, DUTY_PME))
1389         {
1390             try
1391             {
1392                 pmedata = gmx_pme_init(cr,
1393                                        getNumPmeDomains(cr->dd),
1394                                        inputrec,
1395                                        nChargePerturbed != 0, nTypePerturbed != 0,
1396                                        mdrunOptions.reproducible,
1397                                        ewaldcoeff_q, ewaldcoeff_lj,
1398                                        gmx_omp_nthreads_get(emntPME),
1399                                        pmeRunMode, nullptr,
1400                                        pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
1401             }
1402             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1403         }
1404     }
1405
1406
1407     if (EI_DYNAMICS(inputrec->eI))
1408     {
1409         /* Turn on signal handling on all nodes */
1410         /*
1411          * (A user signal from the PME nodes (if any)
1412          * is communicated to the PP nodes.
1413          */
1414         signal_handler_install();
1415     }
1416
1417     pull_t *pull_work = nullptr;
1418     if (thisRankHasDuty(cr, DUTY_PP))
1419     {
1420         /* Assumes uniform use of the number of OpenMP threads */
1421         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
1422
1423         if (inputrec->bPull)
1424         {
1425             /* Initialize pull code */
1426             pull_work =
1427                 init_pull(fplog, inputrec->pull, inputrec,
1428                           &mtop, cr, &atomSets, inputrec->fepvals->init_lambda);
1429             if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
1430             {
1431                 initPullHistory(pull_work, &observablesHistory);
1432             }
1433             if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
1434             {
1435                 init_pull_output_files(pull_work,
1436                                        filenames.size(), filenames.data(), oenv,
1437                                        startingBehavior);
1438             }
1439         }
1440
1441         std::unique_ptr<EnforcedRotation> enforcedRotation;
1442         if (inputrec->bRot)
1443         {
1444             /* Initialize enforced rotation code */
1445             enforcedRotation = init_rot(fplog,
1446                                         inputrec,
1447                                         filenames.size(),
1448                                         filenames.data(),
1449                                         cr,
1450                                         &atomSets,
1451                                         globalState.get(),
1452                                         &mtop,
1453                                         oenv,
1454                                         mdrunOptions,
1455                                         startingBehavior);
1456         }
1457
1458         t_swap *swap = nullptr;
1459         if (inputrec->eSwapCoords != eswapNO)
1460         {
1461             /* Initialize ion swapping code */
1462             swap = init_swapcoords(fplog, inputrec,
1463                                    opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
1464                                    &mtop, globalState.get(), &observablesHistory,
1465                                    cr, &atomSets, oenv, mdrunOptions,
1466                                    startingBehavior);
1467         }
1468
1469         /* Let makeConstraints know whether we have essential dynamics constraints.
1470          * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint
1471          */
1472         bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr
1473                                     || observablesHistory.edsamHistory);
1474         auto constr              = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics,
1475                                                    fplog, *mdAtoms->mdatoms(),
1476                                                    cr, ms, &nrnb, wcycle, fr->bMolPBC);
1477
1478         /* Energy terms and groups */
1479         gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(), inputrec->fepvals->n_lambda);
1480
1481         /* Kinetic energy data */
1482         gmx_ekindata_t ekind;
1483         init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
1484
1485         /* Set up interactive MD (IMD) */
1486         auto imdSession = makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
1487                                          MASTER(cr) ? globalState->x.rvec_array() : nullptr,
1488                                          filenames.size(), filenames.data(), oenv, mdrunOptions.imdOptions,
1489                                          startingBehavior);
1490
1491         if (DOMAINDECOMP(cr))
1492         {
1493             GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
1494             /* This call is not included in init_domain_decomposition mainly
1495              * because fr->cginfo_mb is set later.
1496              */
1497             dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
1498                             domdecOptions.checkBondedInteractions,
1499                             fr->cginfo_mb);
1500         }
1501
1502         if (updateTarget == TaskTarget::Gpu)
1503         {
1504             if (SIMMASTER(cr))
1505             {
1506                 gmx_fatal(FARGS, "It is currently not possible to redirect the calculation "
1507                           "of update and constraints to the GPU!");
1508             }
1509         }
1510
1511         // Before we start the actual simulator, try if we can run the update task on the GPU.
1512         useGpuForUpdate = decideWhetherToUseGpuForUpdate(DOMAINDECOMP(cr),
1513                                                          useGpuForPme,
1514                                                          useGpuForNonbonded,
1515                                                          devFlags.enableGpuBufferOps,
1516                                                          updateTarget,
1517                                                          gpusWereDetected,
1518                                                          *inputrec,
1519                                                          *mdAtoms,
1520                                                          doEssentialDynamics,
1521                                                          fcd->orires.nr != 0,
1522                                                          fcd->disres.nsystems != 0);
1523
1524         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
1525                     false,
1526                     inputrec, doRerun, vsite.get(), ms, replExParams,
1527                     fcd, static_cast<int>(filenames.size()), filenames.data(),
1528                     &observablesHistory, membed);
1529
1530         const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
1531
1532         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
1533         if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || devFlags.enableGpuBufferOps))
1534         {
1535             const void         *pmeStream      = pme_gpu_get_device_stream(fr->pmedata);
1536             const void         *localStream    = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local) : nullptr;
1537             const void         *nonLocalStream = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) : nullptr;
1538             const void         *deviceContext  = pme_gpu_get_device_context(fr->pmedata);
1539             const int           paddingSize    = pme_gpu_get_padding_size(fr->pmedata);
1540             GpuApiCallBehavior  transferKind   = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
1541
1542             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(pmeStream,
1543                                                                      localStream,
1544                                                                      nonLocalStream,
1545                                                                      deviceContext,
1546                                                                      transferKind,
1547                                                                      paddingSize);
1548             fr->stateGpu = stateGpu.get();
1549         }
1550
1551         // TODO This is not the right place to manage the lifetime of
1552         // this data structure, but currently it's the easiest way to
1553         // make it work.
1554         MdrunScheduleWorkload runScheduleWork;
1555         // Also populates the simulation constant workload description.
1556         runScheduleWork.simulationWork = createSimulationWorkload(useGpuForNonbonded,
1557                                                                   useGpuForPme,
1558                                                                   (pmeRunMode == PmeRunMode::GPU),
1559                                                                   useGpuForBonded,
1560                                                                   useGpuForUpdate,
1561                                                                   devFlags.enableGpuBufferOps,
1562                                                                   devFlags.enableGpuHaloExchange,
1563                                                                   devFlags.enableGpuPmePPComm);
1564
1565
1566         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
1567         SimulatorBuilder simulatorBuilder;
1568
1569         // build and run simulator object based on user-input
1570         auto simulator = simulatorBuilder.build(
1571                     inputIsCompatibleWithModularSimulator,
1572                     fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
1573                     oenv,
1574                     mdrunOptions,
1575                     startingBehavior,
1576                     vsite.get(), constr.get(),
1577                     enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
1578                     deform.get(),
1579                     mdModules_->outputProvider(),
1580                     mdModules_->notifier(),
1581                     inputrec, imdSession.get(), pull_work, swap, &mtop,
1582                     fcd,
1583                     globalState.get(),
1584                     &observablesHistory,
1585                     mdAtoms.get(), &nrnb, wcycle, fr,
1586                     &enerd,
1587                     &ekind,
1588                     &runScheduleWork,
1589                     replExParams,
1590                     membed,
1591                     walltime_accounting,
1592                     std::move(stopHandlerBuilder_),
1593                     doRerun);
1594         simulator->run();
1595
1596         if (inputrec->bPull)
1597         {
1598             finish_pull(pull_work);
1599         }
1600         finish_swapcoords(swap);
1601     }
1602     else
1603     {
1604         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
1605         /* do PME only */
1606         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
1607         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
1608     }
1609
1610     wallcycle_stop(wcycle, ewcRUN);
1611
1612     /* Finish up, write some stuff
1613      * if rerunMD, don't write last frame again
1614      */
1615     finish_run(fplog, mdlog, cr,
1616                inputrec, &nrnb, wcycle, walltime_accounting,
1617                fr ? fr->nbv.get() : nullptr,
1618                pmedata,
1619                EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
1620
1621     // clean up cycle counter
1622     wallcycle_destroy(wcycle);
1623
1624 // Free PME data
1625     if (pmedata)
1626     {
1627         gmx_pme_destroy(pmedata);
1628         pmedata = nullptr;
1629     }
1630
1631     // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
1632     // before we destroy the GPU context(s) in free_gpu_resources().
1633     // Pinned buffers are associated with contexts in CUDA.
1634     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
1635     mdAtoms.reset(nullptr);
1636     globalState.reset(nullptr);
1637     mdModules_.reset(nullptr);   // destruct force providers here as they might also use the GPU
1638
1639     /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
1640     free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
1641     free_gpu(nonbondedDeviceInfo);
1642     free_gpu(pmeDeviceInfo);
1643     done_forcerec(fr, mtop.molblock.size());
1644     sfree(fcd);
1645
1646     if (doMembed)
1647     {
1648         free_membed(membed);
1649     }
1650
1651     /* Does what it says */
1652     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
1653     walltime_accounting_destroy(walltime_accounting);
1654
1655     // Ensure log file content is written
1656     if (logFileHandle)
1657     {
1658         gmx_fio_flush(logFileHandle);
1659     }
1660
1661     /* Reset FPEs (important for unit tests) by disabling them. Assumes no
1662      * exceptions were enabled before function was called. */
1663     if (bEnableFPE)
1664     {
1665         gmx_fedisableexcept();
1666     }
1667
1668     auto rc = static_cast<int>(gmx_get_stop_condition());
1669
1670 #if GMX_THREAD_MPI
1671     /* we need to join all threads. The sub-threads join when they
1672        exit this function, but the master thread needs to be told to
1673        wait for that. */
1674     if (PAR(cr) && MASTER(cr))
1675     {
1676         tMPI_Finalize();
1677     }
1678 #endif
1679     return rc;
1680 }
1681
1682 Mdrunner::~Mdrunner()
1683 {
1684     // Clean up of the Manager.
1685     // This will end up getting called on every thread-MPI rank, which is unnecessary,
1686     // but okay as long as threads synchronize some time before adding or accessing
1687     // a new set of restraints.
1688     if (restraintManager_)
1689     {
1690         restraintManager_->clear();
1691         GMX_ASSERT(restraintManager_->countRestraints() == 0,
1692                    "restraints added during runner life time should be cleared at runner destruction.");
1693     }
1694 };
1695
1696 void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller,
1697                             const std::string                        &name)
1698 {
1699     GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
1700     // Not sure if this should be logged through the md logger or something else,
1701     // but it is helpful to have some sort of INFO level message sent somewhere.
1702     //    std::cout << "Registering restraint named " << name << std::endl;
1703
1704     // When multiple restraints are used, it may be wasteful to register them separately.
1705     // Maybe instead register an entire Restraint Manager as a force provider.
1706     restraintManager_->addToSpec(std::move(puller),
1707                                  name);
1708 }
1709
1710 Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules)
1711     : mdModules_(std::move(mdModules))
1712 {
1713 }
1714
1715 Mdrunner::Mdrunner(Mdrunner &&) noexcept = default;
1716
1717 //NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
1718 Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
1719
1720 class Mdrunner::BuilderImplementation
1721 {
1722     public:
1723         BuilderImplementation() = delete;
1724         BuilderImplementation(std::unique_ptr<MDModules>           mdModules,
1725                               compat::not_null<SimulationContext*> context);
1726         ~BuilderImplementation();
1727
1728         BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options,
1729                                                     real                forceWarningThreshold,
1730                                                     StartingBehavior    startingBehavior);
1731
1732         void addDomdec(const DomdecOptions &options);
1733
1734         void addVerletList(int nstlist);
1735
1736         void addReplicaExchange(const ReplicaExchangeParameters &params);
1737
1738         void addNonBonded(const char* nbpu_opt);
1739
1740         void addPME(const char* pme_opt_, const char* pme_fft_opt_);
1741
1742         void addBondedTaskAssignment(const char* bonded_opt);
1743
1744         void addUpdateTaskAssignment(const char* update_opt);
1745
1746         void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions);
1747
1748         void addFilenames(ArrayRef <const t_filenm> filenames);
1749
1750         void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
1751
1752         void addLogFile(t_fileio *logFileHandle);
1753
1754         void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
1755
1756         Mdrunner build();
1757
1758     private:
1759
1760         // Default parameters copied from runner.h
1761         // \todo Clarify source(s) of default parameters.
1762
1763         const char* nbpu_opt_          = nullptr;
1764         const char* pme_opt_           = nullptr;
1765         const char* pme_fft_opt_       = nullptr;
1766         const char *bonded_opt_        = nullptr;
1767         const char *update_opt_        = nullptr;
1768
1769         MdrunOptions                          mdrunOptions_;
1770
1771         DomdecOptions                         domdecOptions_;
1772
1773         ReplicaExchangeParameters             replicaExchangeParameters_;
1774
1775         //! Command-line override for the duration of a neighbor list with the Verlet scheme.
1776         int         nstlist_ = 0;
1777
1778         //! Multisim communicator handle.
1779         gmx_multisim_t *multiSimulation_;
1780
1781         //! mdrun communicator
1782         MPI_Comm communicator_ = MPI_COMM_NULL;
1783
1784         //! Print a warning if any force is larger than this (in kJ/mol nm).
1785         real forceWarningThreshold_ = -1;
1786
1787         //! Whether the simulation will start afresh, or restart with/without appending.
1788         StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
1789
1790         //! The modules that comprise the functionality of mdrun.
1791         std::unique_ptr<MDModules> mdModules_;
1792
1793         //! \brief Parallelism information.
1794         gmx_hw_opt_t hardwareOptions_;
1795
1796         //! filename options for simulation.
1797         ArrayRef<const t_filenm> filenames_;
1798
1799         /*! \brief Handle to output environment.
1800          *
1801          * \todo gmx_output_env_t needs lifetime management.
1802          */
1803         gmx_output_env_t*    outputEnvironment_ = nullptr;
1804
1805         /*! \brief Non-owning handle to MD log file.
1806          *
1807          * \todo Context should own output facilities for client.
1808          * \todo Improve log file handle management.
1809          * \internal
1810          * Code managing the FILE* relies on the ability to set it to
1811          * nullptr to check whether the filehandle is valid.
1812          */
1813         t_fileio* logFileHandle_ = nullptr;
1814
1815         /*!
1816          * \brief Builder for simulation stop signal handler.
1817          */
1818         std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
1819 };
1820
1821 Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules>           mdModules,
1822                                                        compat::not_null<SimulationContext*> context) :
1823     mdModules_(std::move(mdModules))
1824 {
1825     communicator_    = context->communicator_;
1826     multiSimulation_ = context->multiSimulation_.get();
1827 }
1828
1829 Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
1830
1831 Mdrunner::BuilderImplementation &
1832 Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions    &options,
1833                                                       const real             forceWarningThreshold,
1834                                                       const StartingBehavior startingBehavior)
1835 {
1836     mdrunOptions_          = options;
1837     forceWarningThreshold_ = forceWarningThreshold;
1838     startingBehavior_      = startingBehavior;
1839     return *this;
1840 }
1841
1842 void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options)
1843 {
1844     domdecOptions_ = options;
1845 }
1846
1847 void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
1848 {
1849     nstlist_ = nstlist;
1850 }
1851
1852 void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters &params)
1853 {
1854     replicaExchangeParameters_ = params;
1855 }
1856
1857 Mdrunner Mdrunner::BuilderImplementation::build()
1858 {
1859     auto newRunner = Mdrunner(std::move(mdModules_));
1860
1861     newRunner.mdrunOptions          = mdrunOptions_;
1862     newRunner.pforce                = forceWarningThreshold_;
1863     newRunner.startingBehavior      = startingBehavior_;
1864     newRunner.domdecOptions         = domdecOptions_;
1865
1866     // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
1867     newRunner.hw_opt          = hardwareOptions_;
1868
1869     // No invariant to check. This parameter exists to optionally override other behavior.
1870     newRunner.nstlist_cmdline = nstlist_;
1871
1872     newRunner.replExParams    = replicaExchangeParameters_;
1873
1874     newRunner.filenames = filenames_;
1875
1876     newRunner.communicator = communicator_;
1877
1878     // nullptr is a valid value for the multisim handle
1879     newRunner.ms = multiSimulation_;
1880
1881     // \todo Clarify ownership and lifetime management for gmx_output_env_t
1882     // \todo Update sanity checking when output environment has clearly specified invariants.
1883     // Initialization and default values for oenv are not well specified in the current version.
1884     if (outputEnvironment_)
1885     {
1886         newRunner.oenv  = outputEnvironment_;
1887     }
1888     else
1889     {
1890         GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()"));
1891     }
1892
1893     newRunner.logFileHandle = logFileHandle_;
1894
1895     if (nbpu_opt_)
1896     {
1897         newRunner.nbpu_opt    = nbpu_opt_;
1898     }
1899     else
1900     {
1901         GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
1902     }
1903
1904     if (pme_opt_ && pme_fft_opt_)
1905     {
1906         newRunner.pme_opt     = pme_opt_;
1907         newRunner.pme_fft_opt = pme_fft_opt_;
1908     }
1909     else
1910     {
1911         GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
1912     }
1913
1914     if (bonded_opt_)
1915     {
1916         newRunner.bonded_opt = bonded_opt_;
1917     }
1918     else
1919     {
1920         GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
1921     }
1922
1923     if (update_opt_)
1924     {
1925         newRunner.update_opt = update_opt_;
1926     }
1927     else
1928     {
1929         GMX_THROW(gmx::APIError("MdrunnerBuilder::addUpdateTaskAssignment() is required before build()  "));
1930     }
1931
1932
1933     newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
1934
1935     if (stopHandlerBuilder_)
1936     {
1937         newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
1938     }
1939     else
1940     {
1941         newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
1942     }
1943
1944     return newRunner;
1945 }
1946
1947 void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
1948 {
1949     nbpu_opt_ = nbpu_opt;
1950 }
1951
1952 void Mdrunner::BuilderImplementation::addPME(const char* pme_opt,
1953                                              const char* pme_fft_opt)
1954 {
1955     pme_opt_     = pme_opt;
1956     pme_fft_opt_ = pme_fft_opt;
1957 }
1958
1959 void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
1960 {
1961     bonded_opt_ = bonded_opt;
1962 }
1963
1964 void Mdrunner::BuilderImplementation::addUpdateTaskAssignment(const char* update_opt)
1965 {
1966     update_opt_ = update_opt;
1967 }
1968
1969 void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
1970 {
1971     hardwareOptions_ = hardwareOptions;
1972 }
1973
1974 void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
1975 {
1976     filenames_ = filenames;
1977 }
1978
1979 void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
1980 {
1981     outputEnvironment_ = outputEnvironment;
1982 }
1983
1984 void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle)
1985 {
1986     logFileHandle_ = logFileHandle;
1987 }
1988
1989 void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
1990 {
1991     stopHandlerBuilder_ = std::move(builder);
1992 }
1993
1994 MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
1995                                  compat::not_null<SimulationContext*> context) :
1996     impl_ {std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context)}
1997 {
1998 }
1999
2000 MdrunnerBuilder::~MdrunnerBuilder() = default;
2001
2002 MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions    &options,
2003                                                       real                   forceWarningThreshold,
2004                                                       const StartingBehavior startingBehavior)
2005 {
2006     impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
2007     return *this;
2008 }
2009
2010 MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options)
2011 {
2012     impl_->addDomdec(options);
2013     return *this;
2014 }
2015
2016 MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist)
2017 {
2018     impl_->addVerletList(nstlist);
2019     return *this;
2020 }
2021
2022 MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters &params)
2023 {
2024     impl_->addReplicaExchange(params);
2025     return *this;
2026 }
2027
2028 MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
2029 {
2030     impl_->addNonBonded(nbpu_opt);
2031     return *this;
2032 }
2033
2034 MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt,
2035                                                     const char* pme_fft_opt)
2036 {
2037     // The builder method may become more general in the future, but in this version,
2038     // parameters for PME electrostatics are both required and the only parameters
2039     // available.
2040     if (pme_opt && pme_fft_opt)
2041     {
2042         impl_->addPME(pme_opt, pme_fft_opt);
2043     }
2044     else
2045     {
2046         GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
2047     }
2048     return *this;
2049 }
2050
2051 MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
2052 {
2053     impl_->addBondedTaskAssignment(bonded_opt);
2054     return *this;
2055 }
2056
2057 MdrunnerBuilder &MdrunnerBuilder::addUpdateTaskAssignment(const char* update_opt)
2058 {
2059     impl_->addUpdateTaskAssignment(update_opt);
2060     return *this;
2061 }
2062
2063 Mdrunner MdrunnerBuilder::build()
2064 {
2065     return impl_->build();
2066 }
2067
2068 MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
2069 {
2070     impl_->addHardwareOptions(hardwareOptions);
2071     return *this;
2072 }
2073
2074 MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
2075 {
2076     impl_->addFilenames(filenames);
2077     return *this;
2078 }
2079
2080 MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
2081 {
2082     impl_->addOutputEnvironment(outputEnvironment);
2083     return *this;
2084 }
2085
2086 MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle)
2087 {
2088     impl_->addLogFile(logFileHandle);
2089     return *this;
2090 }
2091
2092 MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
2093 {
2094     impl_->addStopHandlerBuilder(std::move(builder));
2095     return *this;
2096 }
2097
2098 MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default;
2099
2100 MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default;
2101
2102 } // namespace gmx