src/gromacs/mdrun/runner.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the MD runner routine calling all integrators.
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "runner.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cinttypes>
  52 #include <csignal>
  53 #include <cstdlib>
  54 #include <cstring>
  55
  56 #include <algorithm>
  57 #include <memory>
  58
  59 #include "gromacs/commandline/filenm.h"
  60 #include "gromacs/domdec/domdec.h"
  61 #include "gromacs/domdec/domdec_struct.h"
  62 #include "gromacs/domdec/gpuhaloexchange.h"
  63 #include "gromacs/domdec/localatomsetmanager.h"
  64 #include "gromacs/domdec/partition.h"
  65 #include "gromacs/ewald/ewald_utils.h"
  66 #include "gromacs/ewald/pme.h"
  67 #include "gromacs/ewald/pme_gpu_program.h"
  68 #include "gromacs/fileio/checkpoint.h"
  69 #include "gromacs/fileio/gmxfio.h"
  70 #include "gromacs/fileio/oenv.h"
  71 #include "gromacs/fileio/tpxio.h"
  72 #include "gromacs/gmxlib/network.h"
  73 #include "gromacs/gmxlib/nrnb.h"
  74 #include "gromacs/gpu_utils/gpu_utils.h"
  75 #include "gromacs/hardware/cpuinfo.h"
  76 #include "gromacs/hardware/detecthardware.h"
  77 #include "gromacs/hardware/printhardware.h"
  78 #include "gromacs/imd/imd.h"
  79 #include "gromacs/listed_forces/disre.h"
  80 #include "gromacs/listed_forces/gpubonded.h"
  81 #include "gromacs/listed_forces/orires.h"
  82 #include "gromacs/math/functions.h"
  83 #include "gromacs/math/utilities.h"
  84 #include "gromacs/math/vec.h"
  85 #include "gromacs/mdlib/boxdeformation.h"
  86 #include "gromacs/mdlib/broadcaststructs.h"
  87 #include "gromacs/mdlib/calc_verletbuf.h"
  88 #include "gromacs/mdlib/dispersioncorrection.h"
  89 #include "gromacs/mdlib/enerdata_utils.h"
  90 #include "gromacs/mdlib/force.h"
  91 #include "gromacs/mdlib/forcerec.h"
  92 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  93 #include "gromacs/mdlib/makeconstraints.h"
  94 #include "gromacs/mdlib/md_support.h"
  95 #include "gromacs/mdlib/mdatoms.h"
  96 #include "gromacs/mdlib/membed.h"
  97 #include "gromacs/mdlib/ppforceworkload.h"
  98 #include "gromacs/mdlib/qmmm.h"
  99 #include "gromacs/mdlib/sighandler.h"
 100 #include "gromacs/mdlib/stophandler.h"
 101 #include "gromacs/mdrun/mdmodules.h"
 102 #include "gromacs/mdrun/simulationcontext.h"
 103 #include "gromacs/mdrunutility/handlerestart.h"
 104 #include "gromacs/mdrunutility/logging.h"
 105 #include "gromacs/mdrunutility/multisim.h"
 106 #include "gromacs/mdrunutility/printtime.h"
 107 #include "gromacs/mdrunutility/threadaffinity.h"
 108 #include "gromacs/mdtypes/commrec.h"
 109 #include "gromacs/mdtypes/enerdata.h"
 110 #include "gromacs/mdtypes/fcdata.h"
 111 #include "gromacs/mdtypes/group.h"
 112 #include "gromacs/mdtypes/inputrec.h"
 113 #include "gromacs/mdtypes/md_enums.h"
 114 #include "gromacs/mdtypes/mdrunoptions.h"
 115 #include "gromacs/mdtypes/observableshistory.h"
 116 #include "gromacs/mdtypes/state.h"
 117 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 118 #include "gromacs/nbnxm/nbnxm.h"
 119 #include "gromacs/nbnxm/pairlist_tuning.h"
 120 #include "gromacs/pbcutil/pbc.h"
 121 #include "gromacs/pulling/output.h"
 122 #include "gromacs/pulling/pull.h"
 123 #include "gromacs/pulling/pull_rotation.h"
 124 #include "gromacs/restraint/manager.h"
 125 #include "gromacs/restraint/restraintmdmodule.h"
 126 #include "gromacs/restraint/restraintpotential.h"
 127 #include "gromacs/swap/swapcoords.h"
 128 #include "gromacs/taskassignment/decidegpuusage.h"
 129 #include "gromacs/taskassignment/resourcedivision.h"
 130 #include "gromacs/taskassignment/taskassignment.h"
 131 #include "gromacs/taskassignment/usergpuids.h"
 132 #include "gromacs/timing/gpu_timing.h"
 133 #include "gromacs/timing/wallcycle.h"
 134 #include "gromacs/timing/wallcyclereporting.h"
 135 #include "gromacs/topology/mtop_util.h"
 136 #include "gromacs/trajectory/trajectoryframe.h"
 137 #include "gromacs/utility/basenetwork.h"
 138 #include "gromacs/utility/cstringutil.h"
 139 #include "gromacs/utility/exceptions.h"
 140 #include "gromacs/utility/fatalerror.h"
 141 #include "gromacs/utility/filestream.h"
 142 #include "gromacs/utility/gmxassert.h"
 143 #include "gromacs/utility/gmxmpi.h"
 144 #include "gromacs/utility/keyvaluetree.h"
 145 #include "gromacs/utility/logger.h"
 146 #include "gromacs/utility/loggerbuilder.h"
 147 #include "gromacs/utility/mdmodulenotification.h"
 148 #include "gromacs/utility/physicalnodecommunicator.h"
 149 #include "gromacs/utility/pleasecite.h"
 150 #include "gromacs/utility/programcontext.h"
 151 #include "gromacs/utility/smalloc.h"
 152 #include "gromacs/utility/stringutil.h"
 153
 154 #include "isimulator.h"
 155 #include "replicaexchange.h"
 156 #include "simulatorbuilder.h"
 157
 158 #if GMX_FAHCORE
 159 #include "corewrap.h"
 160 #endif
 161
 162 namespace gmx
 163 {
 164
 165 /*! \brief environment variable to enable GPU P2P communication */
 166 static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
 167     && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
 168
 169 /*! \brief Manage any development feature flag variables encountered
 170  *
 171  * The use of dev features indicated by environment variables is
 172  * logged in order to ensure that runs with such featrues enabled can
 173  * be identified from their log and standard output. Any cross
 174  * dependencies are also checked, and if unsatisified, a fatal error
 175  * issued.
 176  *
 177  * \param[in]  mdlog        Logger object.
 178  */
 179 static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
 180 {
 181     const bool enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 182     const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
 183     const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 184
 185     if (enableGpuBufOps)
 186     {
 187         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 188                 "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
 189     }
 190
 191     if (enableGpuHaloExchange)
 192     {
 193         if (!enableGpuBufOps)
 194         {
 195             gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
 196         }
 197         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 198                 "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
 199     }
 200
 201     if (useGpuUpdateConstrain)
 202     {
 203         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 204                 "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
 205     }
 206
 207 }
 208
 209 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
 210  *
 211  * Used to ensure that the master thread does not modify mdrunner during copy
 212  * on the spawned threads. */
 213 static void threadMpiMdrunnerAccessBarrier()
 214 {
 215 #if GMX_THREAD_MPI
 216     MPI_Barrier(MPI_COMM_WORLD);
 217 #endif
 218 }
 219
 220 Mdrunner Mdrunner::cloneOnSpawnedThread() const
 221 {
 222     auto newRunner = Mdrunner(std::make_unique<MDModules>());
 223
 224     // All runners in the same process share a restraint manager resource because it is
 225     // part of the interface to the client code, which is associated only with the
 226     // original thread. Handles to the same resources can be obtained by copy.
 227     {
 228         newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
 229     }
 230
 231     // Copy original cr pointer before master thread can pass the thread barrier
 232     newRunner.cr  = reinitialize_commrec_for_this_thread(cr);
 233
 234     // Copy members of master runner.
 235     // \todo Replace with builder when Simulation context and/or runner phases are better defined.
 236     // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
 237     newRunner.hw_opt    = hw_opt;
 238     newRunner.filenames = filenames;
 239
 240     newRunner.oenv                = oenv;
 241     newRunner.mdrunOptions        = mdrunOptions;
 242     newRunner.domdecOptions       = domdecOptions;
 243     newRunner.nbpu_opt            = nbpu_opt;
 244     newRunner.pme_opt             = pme_opt;
 245     newRunner.pme_fft_opt         = pme_fft_opt;
 246     newRunner.bonded_opt          = bonded_opt;
 247     newRunner.nstlist_cmdline     = nstlist_cmdline;
 248     newRunner.replExParams        = replExParams;
 249     newRunner.pforce              = pforce;
 250     newRunner.ms                  = ms;
 251     newRunner.startingBehavior    = startingBehavior;
 252     newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
 253
 254     threadMpiMdrunnerAccessBarrier();
 255
 256     GMX_RELEASE_ASSERT(!MASTER(newRunner.cr), "cloneOnSpawnedThread should only be called on spawned threads");
 257
 258     return newRunner;
 259 }
 260
 261 /*! \brief The callback used for running on spawned threads.
 262  *
 263  * Obtains the pointer to the master mdrunner object from the one
 264  * argument permitted to the thread-launch API call, copies it to make
 265  * a new runner for this thread, reinitializes necessary data, and
 266  * proceeds to the simulation. */
 267 static void mdrunner_start_fn(const void *arg)
 268 {
 269     try
 270     {
 271         auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg);
 272         /* copy the arg list to make sure that it's thread-local. This
 273            doesn't copy pointed-to items, of course; fnm, cr and fplog
 274            are reset in the call below, all others should be const. */
 275         gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
 276         mdrunner.mdrunner();
 277     }
 278     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 279 }
 280
 281
 282 /*! \brief Start thread-MPI threads.
 283  *
 284  * Called by mdrunner() to start a specific number of threads
 285  * (including the main thread) for thread-parallel runs. This in turn
 286  * calls mdrunner() for each thread. All options are the same as for
 287  * mdrunner(). */
 288 t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const
 289 {
 290
 291     /* first check whether we even need to start tMPI */
 292     if (numThreadsToLaunch < 2)
 293     {
 294         return cr;
 295     }
 296
 297 #if GMX_THREAD_MPI
 298     /* now spawn new threads that start mdrunner_start_fn(), while
 299        the main thread returns. Thread affinity is handled later. */
 300     if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE,
 301                      mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS)
 302     {
 303         GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
 304     }
 305
 306     threadMpiMdrunnerAccessBarrier();
 307 #else
 308     GMX_UNUSED_VALUE(mdrunner_start_fn);
 309 #endif
 310
 311     return reinitialize_commrec_for_this_thread(cr);
 312 }
 313
 314 }  // namespace gmx
 315
 316 /*! \brief Initialize variables for Verlet scheme simulation */
 317 static void prepare_verlet_scheme(FILE                           *fplog,
 318                                   t_commrec                      *cr,
 319                                   t_inputrec                     *ir,
 320                                   int                             nstlist_cmdline,
 321                                   const gmx_mtop_t               *mtop,
 322                                   const matrix                    box,
 323                                   bool                            makeGpuPairList,
 324                                   const gmx::CpuInfo             &cpuinfo)
 325 {
 326     /* For NVE simulations, we will retain the initial list buffer */
 327     if (EI_DYNAMICS(ir->eI) &&
 328         ir->verletbuf_tol > 0 &&
 329         !(EI_MD(ir->eI) && ir->etc == etcNO))
 330     {
 331         /* Update the Verlet buffer size for the current run setup */
 332
 333         /* Here we assume SIMD-enabled kernels are being used. But as currently
 334          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 335          * and 4x2 gives a larger buffer than 4x4, this is ok.
 336          */
 337         ListSetupType      listType  = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
 338         VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
 339
 340         const real         rlist_new =
 341             calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
 342
 343         if (rlist_new != ir->rlist)
 344         {
 345             if (fplog != nullptr)
 346             {
 347                 fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 348                         ir->rlist, rlist_new,
 349                         listSetup.cluster_size_i, listSetup.cluster_size_j);
 350             }
 351             ir->rlist     = rlist_new;
 352         }
 353     }
 354
 355     if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
 356     {
 357         gmx_fatal(FARGS, "Can not set nstlist without %s",
 358                   !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
 359     }
 360
 361     if (EI_DYNAMICS(ir->eI))
 362     {
 363         /* Set or try nstlist values */
 364         increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
 365     }
 366 }
 367
 368 /*! \brief Override the nslist value in inputrec
 369  *
 370  * with value passed on the command line (if any)
 371  */
 372 static void override_nsteps_cmdline(const gmx::MDLogger &mdlog,
 373                                     int64_t              nsteps_cmdline,
 374                                     t_inputrec          *ir)
 375 {
 376     assert(ir);
 377
 378     /* override with anything else than the default -2 */
 379     if (nsteps_cmdline > -2)
 380     {
 381         char sbuf_steps[STEPSTRSIZE];
 382         char sbuf_msg[STRLEN];
 383
 384         ir->nsteps = nsteps_cmdline;
 385         if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
 386         {
 387             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
 388                     gmx_step_str(nsteps_cmdline, sbuf_steps),
 389                     fabs(nsteps_cmdline*ir->delta_t));
 390         }
 391         else
 392         {
 393             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
 394                     gmx_step_str(nsteps_cmdline, sbuf_steps));
 395         }
 396
 397         GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
 398     }
 399     else if (nsteps_cmdline < -2)
 400     {
 401         gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64,
 402                   nsteps_cmdline);
 403     }
 404     /* Do nothing if nsteps_cmdline == -2 */
 405 }
 406
 407 namespace gmx
 408 {
 409
 410 /*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
 411  *
 412  * If not, and if a warning may be issued, logs a warning about
 413  * falling back to CPU code. With thread-MPI, only the first
 414  * call to this function should have \c issueWarning true. */
 415 static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger   &mdlog,
 416                                                const t_inputrec &ir,
 417                                                bool              issueWarning)
 418 {
 419     bool        gpuIsUseful = true;
 420     std::string warning;
 421
 422     if (ir.opts.ngener - ir.nwall > 1)
 423     {
 424         /* The GPU code does not support more than one energy group.
 425          * If the user requested GPUs explicitly, a fatal error is given later.
 426          */
 427         gpuIsUseful = false;
 428         warning     =
 429             "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
 430             "For better performance, run on the GPU without energy groups and then do "
 431             "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
 432     }
 433
 434     if (EI_TPI(ir.eI))
 435     {
 436         gpuIsUseful = false;
 437         warning     = "TPI is not implemented for GPUs.";
 438     }
 439
 440     if (!gpuIsUseful && issueWarning)
 441     {
 442         GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
 443     }
 444
 445     return gpuIsUseful;
 446 }
 447
 448 //! Initializes the logger for mdrun.
 449 static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr)
 450 {
 451     gmx::LoggerBuilder builder;
 452     if (fplog != nullptr)
 453     {
 454         builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
 455     }
 456     if (cr == nullptr || SIMMASTER(cr))
 457     {
 458         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning,
 459                                 &gmx::TextOutputFile::standardError());
 460     }
 461     return builder.build();
 462 }
 463
 464 //! Make a TaskTarget from an mdrun argument string.
 465 static TaskTarget findTaskTarget(const char *optionString)
 466 {
 467     TaskTarget returnValue = TaskTarget::Auto;
 468
 469     if (strncmp(optionString, "auto", 3) == 0)
 470     {
 471         returnValue = TaskTarget::Auto;
 472     }
 473     else if (strncmp(optionString, "cpu", 3) == 0)
 474     {
 475         returnValue = TaskTarget::Cpu;
 476     }
 477     else if (strncmp(optionString, "gpu", 3) == 0)
 478     {
 479         returnValue = TaskTarget::Gpu;
 480     }
 481     else
 482     {
 483         GMX_ASSERT(false, "Option string should have been checked for sanity already");
 484     }
 485
 486     return returnValue;
 487 }
 488
 489 //! Finish run, aggregate data to print performance info.
 490 static void finish_run(FILE *fplog,
 491                        const gmx::MDLogger &mdlog,
 492                        const t_commrec *cr,
 493                        const t_inputrec *inputrec,
 494                        t_nrnb nrnb[], gmx_wallcycle_t wcycle,
 495                        gmx_walltime_accounting_t walltime_accounting,
 496                        nonbonded_verlet_t *nbv,
 497                        const gmx_pme_t *pme,
 498                        gmx_bool bWriteStat)
 499 {
 500     double  delta_t  = 0;
 501     double  nbfs     = 0, mflop = 0;
 502     double  elapsed_time,
 503             elapsed_time_over_all_ranks,
 504             elapsed_time_over_all_threads,
 505             elapsed_time_over_all_threads_over_all_ranks;
 506     /* Control whether it is valid to print a report. Only the
 507        simulation master may print, but it should not do so if the run
 508        terminated e.g. before a scheduled reset step. This is
 509        complicated by the fact that PME ranks are unaware of the
 510        reason why they were sent a pmerecvqxFINISH. To avoid
 511        communication deadlocks, we always do the communication for the
 512        report, even if we've decided not to write the report, because
 513        how long it takes to finish the run is not important when we've
 514        decided not to report on the simulation performance.
 515
 516        Further, we only report performance for dynamical integrators,
 517        because those are the only ones for which we plan to
 518        consider doing any optimizations. */
 519     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
 520
 521     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
 522     {
 523         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
 524         printReport = false;
 525     }
 526
 527     t_nrnb                  *nrnb_tot;
 528     std::unique_ptr<t_nrnb>  nrnbTotalStorage;
 529     if (cr->nnodes > 1)
 530     {
 531         nrnbTotalStorage = std::make_unique<t_nrnb>();
 532         nrnb_tot         = nrnbTotalStorage.get();
 533 #if GMX_MPI
 534         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
 535                       cr->mpi_comm_mysim);
 536 #endif
 537     }
 538     else
 539     {
 540         nrnb_tot = nrnb;
 541     }
 542
 543     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
 544     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
 545     if (cr->nnodes > 1)
 546     {
 547 #if GMX_MPI
 548         /* reduce elapsed_time over all MPI ranks in the current simulation */
 549         MPI_Allreduce(&elapsed_time,
 550                       &elapsed_time_over_all_ranks,
 551                       1, MPI_DOUBLE, MPI_SUM,
 552                       cr->mpi_comm_mysim);
 553         elapsed_time_over_all_ranks /= cr->nnodes;
 554         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
 555          * current simulation. */
 556         MPI_Allreduce(&elapsed_time_over_all_threads,
 557                       &elapsed_time_over_all_threads_over_all_ranks,
 558                       1, MPI_DOUBLE, MPI_SUM,
 559                       cr->mpi_comm_mysim);
 560 #endif
 561     }
 562     else
 563     {
 564         elapsed_time_over_all_ranks                  = elapsed_time;
 565         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
 566     }
 567
 568     if (printReport)
 569     {
 570         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
 571     }
 572
 573     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
 574     {
 575         print_dd_statistics(cr, inputrec, fplog);
 576     }
 577
 578     /* TODO Move the responsibility for any scaling by thread counts
 579      * to the code that handled the thread region, so that there's a
 580      * mechanism to keep cycle counting working during the transition
 581      * to task parallelism. */
 582     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 583     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
 584     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
 585     auto cycle_sum(wallcycle_sum(cr, wcycle));
 586
 587     if (printReport)
 588     {
 589         auto                    nbnxn_gpu_timings = (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
 590         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
 591
 592         if (pme_gpu_task_enabled(pme))
 593         {
 594             pme_gpu_get_timings(pme, &pme_gpu_timings);
 595         }
 596         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
 597                         elapsed_time_over_all_ranks,
 598                         wcycle, cycle_sum,
 599                         nbnxn_gpu_timings,
 600                         &pme_gpu_timings);
 601
 602         if (EI_DYNAMICS(inputrec->eI))
 603         {
 604             delta_t = inputrec->delta_t;
 605         }
 606
 607         if (fplog)
 608         {
 609             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
 610                        elapsed_time_over_all_ranks,
 611                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 612                        delta_t, nbfs, mflop);
 613         }
 614         if (bWriteStat)
 615         {
 616             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
 617                        elapsed_time_over_all_ranks,
 618                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 619                        delta_t, nbfs, mflop);
 620         }
 621     }
 622 }
 623
 624 int Mdrunner::mdrunner()
 625 {
 626     matrix                    box;
 627     t_forcerec               *fr               = nullptr;
 628     t_fcdata                 *fcd              = nullptr;
 629     real                      ewaldcoeff_q     = 0;
 630     real                      ewaldcoeff_lj    = 0;
 631     int                       nChargePerturbed = -1, nTypePerturbed = 0;
 632     gmx_wallcycle_t           wcycle;
 633     gmx_walltime_accounting_t walltime_accounting = nullptr;
 634     gmx_membed_t *            membed              = nullptr;
 635     gmx_hw_info_t            *hwinfo              = nullptr;
 636
 637     /* CAUTION: threads may be started later on in this function, so
 638        cr doesn't reflect the final parallel state right now */
 639     gmx_mtop_t                      mtop;
 640
 641     bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data());
 642     bool doRerun  = mdrunOptions.rerun;
 643
 644     // Handle task-assignment related user options.
 645     EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ?
 646                                                EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
 647
 648     std::vector<int> userGpuTaskAssignment;
 649     try
 650     {
 651         userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
 652     }
 653     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 654     auto       nonbondedTarget = findTaskTarget(nbpu_opt);
 655     auto       pmeTarget       = findTaskTarget(pme_opt);
 656     auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
 657     auto       bondedTarget    = findTaskTarget(bonded_opt);
 658     PmeRunMode pmeRunMode      = PmeRunMode::None;
 659
 660     // Here we assume that SIMMASTER(cr) does not change even after the
 661     // threads are started.
 662
 663     FILE *fplog = nullptr;
 664     // If we are appending, we don't write log output because we need
 665     // to check that the old log file matches what the checkpoint file
 666     // expects. Otherwise, we should start to write log output now if
 667     // there is a file ready for it.
 668     if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
 669     {
 670         fplog = gmx_fio_getfp(logFileHandle);
 671     }
 672     gmx::LoggerOwner logOwner(buildLogger(fplog, cr));
 673     gmx::MDLogger    mdlog(logOwner.logger());
 674
 675     // report any development features that may be enabled by environment variables
 676     manageDevelopmentFeatures(mdlog);
 677
 678     // With thread-MPI, the communicator changes after threads are
 679     // launched, so this is rebuilt for the master rank at that
 680     // time. The non-master ranks are fine to keep the one made here.
 681     PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
 682     hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
 683
 684     gmx_print_detected_hardware(fplog, isMasterSimMasterRank(ms, MASTER(cr)), mdlog, hwinfo);
 685
 686     std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
 687
 688     // Print citation requests after all software/hardware printing
 689     pleaseCiteGromacs(fplog);
 690
 691     // TODO Replace this by unique_ptr once t_inputrec is C++
 692     t_inputrec               inputrecInstance;
 693     t_inputrec              *inputrec = nullptr;
 694     std::unique_ptr<t_state> globalState;
 695
 696     if (SIMMASTER(cr))
 697     {
 698         /* Only the master rank has the global state */
 699         globalState = std::make_unique<t_state>();
 700
 701         /* Read (nearly) all data required for the simulation */
 702         read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()),
 703                        &inputrecInstance, globalState.get(), &mtop);
 704         inputrec = &inputrecInstance;
 705     }
 706
 707     /* Check and update the hardware options for internal consistency */
 708     checkAndUpdateHardwareOptions(mdlog, &hw_opt, SIMMASTER(cr), domdecOptions.numPmeRanks, inputrec);
 709
 710     if (GMX_THREAD_MPI && SIMMASTER(cr))
 711     {
 712         bool useGpuForNonbonded = false;
 713         bool useGpuForPme       = false;
 714         try
 715         {
 716             GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
 717
 718             // If the user specified the number of ranks, then we must
 719             // respect that, but in default mode, we need to allow for
 720             // the number of GPUs to choose the number of ranks.
 721             auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 722             useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi
 723                     (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
 724                     canUseGpuForNonbonded,
 725                     gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
 726                     hw_opt.nthreads_tmpi);
 727             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
 728                     (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
 729                     *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
 730
 731         }
 732         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 733
 734         /* Determine how many thread-MPI ranks to start.
 735          *
 736          * TODO Over-writing the user-supplied value here does
 737          * prevent any possible subsequent checks from working
 738          * correctly. */
 739         hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo,
 740                                                 &hw_opt,
 741                                                 gpuIdsToUse,
 742                                                 useGpuForNonbonded,
 743                                                 useGpuForPme,
 744                                                 inputrec, &mtop,
 745                                                 mdlog,
 746                                                 doMembed);
 747
 748         // Now start the threads for thread MPI.
 749         cr = spawnThreads(hw_opt.nthreads_tmpi);
 750         /* The main thread continues here with a new cr. We don't deallocate
 751            the old cr because other threads may still be reading it. */
 752         // TODO Both master and spawned threads call dup_tfn and
 753         // reinitialize_commrec_for_this_thread. Find a way to express
 754         // this better.
 755         physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
 756     }
 757     // END OF CAUTION: cr and physicalNodeComm are now reliable
 758
 759     if (PAR(cr))
 760     {
 761         /* now broadcast everything to the non-master nodes/threads: */
 762         if (!SIMMASTER(cr))
 763         {
 764             inputrec = &inputrecInstance;
 765         }
 766         init_parallel(cr, inputrec, &mtop);
 767     }
 768     GMX_RELEASE_ASSERT(inputrec != nullptr, "All range should have a valid inputrec now");
 769
 770     // Now each rank knows the inputrec that SIMMASTER read and used,
 771     // and (if applicable) cr->nnodes has been assigned the number of
 772     // thread-MPI ranks that have been chosen. The ranks can now all
 773     // run the task-deciding functions and will agree on the result
 774     // without needing to communicate.
 775     //
 776     // TODO Should we do the communication in debug mode to support
 777     // having an assertion?
 778     //
 779     // Note that these variables describe only their own node.
 780     //
 781     // Note that when bonded interactions run on a GPU they always run
 782     // alongside a nonbonded task, so do not influence task assignment
 783     // even though they affect the force calculation workload.
 784     bool useGpuForNonbonded = false;
 785     bool useGpuForPme       = false;
 786     bool useGpuForBonded    = false;
 787     try
 788     {
 789         // It's possible that there are different numbers of GPUs on
 790         // different nodes, which is the user's responsibilty to
 791         // handle. If unsuitable, we will notice that during task
 792         // assignment.
 793         bool gpusWereDetected      = hwinfo->ngpu_compatible_tot > 0;
 794         auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 795         useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
 796                                                                 emulateGpuNonbonded,
 797                                                                 canUseGpuForNonbonded,
 798                                                                 gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI),
 799                                                                 gpusWereDetected);
 800         useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
 801                                                     *hwinfo, *inputrec, mtop,
 802                                                     cr->nnodes, domdecOptions.numPmeRanks,
 803                                                     gpusWereDetected);
 804         auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
 805         useGpuForBonded =
 806             decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
 807                                             bondedTarget, canUseGpuForBonded,
 808                                             EVDW_PME(inputrec->vdwtype),
 809                                             EEL_PME_EWALD(inputrec->coulombtype),
 810                                             domdecOptions.numPmeRanks, gpusWereDetected);
 811
 812         pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
 813         if (pmeRunMode == PmeRunMode::GPU)
 814         {
 815             if (pmeFftTarget == TaskTarget::Cpu)
 816             {
 817                 pmeRunMode = PmeRunMode::Mixed;
 818             }
 819         }
 820         else if (pmeFftTarget == TaskTarget::Gpu)
 821         {
 822             gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft.");
 823         }
 824     }
 825     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 826
 827     // Build restraints.
 828     // TODO: hide restraint implementation details from Mdrunner.
 829     // There is nothing unique about restraints at this point as far as the
 830     // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
 831     // factory functions from the SimulationContext on which to call mdModules_->add().
 832     // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
 833     for (auto && restraint : restraintManager_->getRestraints())
 834     {
 835         auto module = RestraintMDModule::create(restraint,
 836                                                 restraint->sites());
 837         mdModules_->add(std::move(module));
 838     }
 839
 840     // TODO: Error handling
 841     mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
 842     const auto &mdModulesNotifier = mdModules_->notifier().notifier_;
 843
 844     if (inputrec->internalParameters != nullptr)
 845     {
 846         mdModulesNotifier.notify(*inputrec->internalParameters);
 847     }
 848
 849     if (fplog != nullptr)
 850     {
 851         pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
 852         fprintf(fplog, "\n");
 853     }
 854
 855     if (SIMMASTER(cr))
 856     {
 857         /* In rerun, set velocities to zero if present */
 858         if (doRerun && ((globalState->flags & (1 << estV)) != 0))
 859         {
 860             // rerun does not use velocities
 861             GMX_LOG(mdlog.info).asParagraph().appendText(
 862                     "Rerun trajectory contains velocities. Rerun does only evaluate "
 863                     "potential energy and forces. The velocities will be ignored.");
 864             for (int i = 0; i < globalState->natoms; i++)
 865             {
 866                 clear_rvec(globalState->v[i]);
 867             }
 868             globalState->flags &= ~(1 << estV);
 869         }
 870
 871         /* now make sure the state is initialized and propagated */
 872         set_state_entries(globalState.get(), inputrec);
 873     }
 874
 875     /* NM and TPI parallelize over force/energy calculations, not atoms,
 876      * so we need to initialize and broadcast the global state.
 877      */
 878     if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
 879     {
 880         if (!MASTER(cr))
 881         {
 882             globalState = std::make_unique<t_state>();
 883         }
 884         broadcastStateWithoutDynamics(cr, globalState.get());
 885     }
 886
 887     /* A parallel command line option consistency check that we can
 888        only do after any threads have started. */
 889     if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 ||
 890                      domdecOptions.numCells[YY] > 1 ||
 891                      domdecOptions.numCells[ZZ] > 1 ||
 892                      domdecOptions.numPmeRanks > 0))
 893     {
 894         gmx_fatal(FARGS,
 895                   "The -dd or -npme option request a parallel simulation, "
 896 #if !GMX_MPI
 897                   "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv));
 898 #else
 899 #if GMX_THREAD_MPI
 900                   "but the number of MPI-threads (option -ntmpi) is not set or is 1");
 901 #else
 902                   "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv));
 903 #endif
 904 #endif
 905     }
 906
 907     if (doRerun &&
 908         (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 909     {
 910         gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 911     }
 912
 913     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
 914     {
 915         if (domdecOptions.numPmeRanks > 0)
 916         {
 917             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
 918                                  "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
 919         }
 920
 921         domdecOptions.numPmeRanks = 0;
 922     }
 923
 924     if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
 925     {
 926         /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
 927          * improve performance with many threads per GPU, since our OpenMP
 928          * scaling is bad, but it's difficult to automate the setup.
 929          */
 930         domdecOptions.numPmeRanks = 0;
 931     }
 932     if (useGpuForPme)
 933     {
 934         if (domdecOptions.numPmeRanks < 0)
 935         {
 936             domdecOptions.numPmeRanks = 0;
 937             // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
 938         }
 939         else
 940         {
 941             GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
 942         }
 943     }
 944
 945 #if GMX_FAHCORE
 946     if (MASTER(cr))
 947     {
 948         fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
 949     }
 950 #endif
 951
 952     /* NMR restraints must be initialized before load_checkpoint,
 953      * since with time averaging the history is added to t_state.
 954      * For proper consistency check we therefore need to extend
 955      * t_state here.
 956      * So the PME-only nodes (if present) will also initialize
 957      * the distance restraints.
 958      */
 959     snew(fcd, 1);
 960
 961     /* This needs to be called before read_checkpoint to extend the state */
 962     init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
 963
 964     init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
 965
 966     auto                 deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
 967
 968     ObservablesHistory   observablesHistory = {};
 969
 970     if (startingBehavior != StartingBehavior::NewSimulation)
 971     {
 972         /* Check if checkpoint file exists before doing continuation.
 973          * This way we can use identical input options for the first and subsequent runs...
 974          */
 975         if (mdrunOptions.numStepsCommandline > -2)
 976         {
 977             /* Temporarily set the number of steps to unmlimited to avoid
 978              * triggering the nsteps check in load_checkpoint().
 979              * This hack will go away soon when the -nsteps option is removed.
 980              */
 981             inputrec->nsteps = -1;
 982         }
 983
 984         load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
 985                         logFileHandle,
 986                         cr, domdecOptions.numCells,
 987                         inputrec, globalState.get(),
 988                         &observablesHistory,
 989                         mdrunOptions.reproducible, mdModules_->notifier());
 990
 991         if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
 992         {
 993             // Now we can start normal logging to the truncated log file.
 994             fplog    = gmx_fio_getfp(logFileHandle);
 995             prepareLogAppending(fplog);
 996             logOwner = buildLogger(fplog, cr);
 997             mdlog    = logOwner.logger();
 998         }
 999     }
1000
1001     if (mdrunOptions.numStepsCommandline > -2)
1002     {
1003         GMX_LOG(mdlog.info).asParagraph().
1004             appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
1005                        "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
1006     }
1007     /* override nsteps with value set on the commamdline */
1008     override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
1009
1010     if (SIMMASTER(cr))
1011     {
1012         copy_mat(globalState->box, box);
1013     }
1014
1015     if (PAR(cr))
1016     {
1017         gmx_bcast(sizeof(box), box, cr);
1018     }
1019
1020     if (inputrec->cutoff_scheme != ecutsVERLET)
1021     {
1022         gmx_fatal(FARGS, "This group-scheme .tpr file can no longer be run by mdrun. Please update to the Verlet scheme, or use an earlier version of GROMACS if necessary.");
1023     }
1024     /* Update rlist and nstlist. */
1025     prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
1026                           useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
1027
1028     LocalAtomSetManager atomSets;
1029     if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
1030                      inputrec->eI == eiNM))
1031     {
1032         cr->dd = init_domain_decomposition(mdlog, cr, domdecOptions, mdrunOptions,
1033                                            &mtop, inputrec,
1034                                            box, positionsFromStatePointer(globalState.get()),
1035                                            &atomSets);
1036         // Note that local state still does not exist yet.
1037     }
1038     else
1039     {
1040         /* PME, if used, is done on all nodes with 1D decomposition */
1041         cr->npmenodes = 0;
1042         cr->duty      = (DUTY_PP | DUTY_PME);
1043
1044         if (inputrec->ePBC == epbcSCREW)
1045         {
1046             gmx_fatal(FARGS,
1047                       "pbc=screw is only implemented with domain decomposition");
1048         }
1049     }
1050
1051     if (PAR(cr))
1052     {
1053         /* After possible communicator splitting in make_dd_communicators.
1054          * we can set up the intra/inter node communication.
1055          */
1056         gmx_setup_nodecomm(fplog, cr);
1057     }
1058
1059 #if GMX_MPI
1060     if (isMultiSim(ms))
1061     {
1062         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1063                 "This is simulation %d out of %d running as a composite GROMACS\n"
1064                 "multi-simulation job. Setup for this simulation:\n",
1065                 ms->sim, ms->nsim);
1066     }
1067     GMX_LOG(mdlog.warning).appendTextFormatted(
1068             "Using %d MPI %s\n",
1069             cr->nnodes,
1070 #if GMX_THREAD_MPI
1071             cr->nnodes == 1 ? "thread" : "threads"
1072 #else
1073             cr->nnodes == 1 ? "process" : "processes"
1074 #endif
1075             );
1076     fflush(stderr);
1077 #endif
1078
1079     // If mdrun -pin auto honors any affinity setting that already
1080     // exists. If so, it is nice to provide feedback about whether
1081     // that existing affinity setting was from OpenMP or something
1082     // else, so we run this code both before and after we initialize
1083     // the OpenMP support.
1084     gmx_check_thread_affinity_set(mdlog,
1085                                   &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1086     /* Check and update the number of OpenMP threads requested */
1087     checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
1088                                             pmeRunMode, mtop, *inputrec);
1089
1090     gmx_omp_nthreads_init(mdlog, cr,
1091                           hwinfo->nthreads_hw_avail,
1092                           physicalNodeComm.size_,
1093                           hw_opt.nthreads_omp,
1094                           hw_opt.nthreads_omp_pme,
1095                           !thisRankHasDuty(cr, DUTY_PP));
1096
1097     // Enable FP exception detection, but not in
1098     // Release mode and not for compilers with known buggy FP
1099     // exception support (clang with any optimization) or suspected
1100     // buggy FP exception support (gcc 7.* with optimization).
1101 #if !defined NDEBUG && \
1102     !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
1103     && defined __OPTIMIZE__)
1104     const bool bEnableFPE = true;
1105 #else
1106     const bool bEnableFPE = false;
1107 #endif
1108     //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
1109     if (bEnableFPE)
1110     {
1111         gmx_feenableexcept();
1112     }
1113
1114     // Build a data structure that expresses which kinds of non-bonded
1115     // task are handled by this rank.
1116     //
1117     // TODO Later, this might become a loop over all registered modules
1118     // relevant to the mdp inputs, to find those that have such tasks.
1119     //
1120     // TODO This could move before init_domain_decomposition() as part
1121     // of refactoring that separates the responsibility for duty
1122     // assignment from setup for communication between tasks, and
1123     // setup for tasks handled with a domain (ie including short-ranged
1124     // tasks, bonded tasks, etc.).
1125     //
1126     // Note that in general useGpuForNonbonded, etc. can have a value
1127     // that is inconsistent with the presence of actual GPUs on any
1128     // rank, and that is not known to be a problem until the
1129     // duty of the ranks on a node become known.
1130     //
1131     // TODO Later we might need the concept of computeTasksOnThisRank,
1132     // from which we construct gpuTasksOnThisRank.
1133     //
1134     // Currently the DD code assigns duty to ranks that can
1135     // include PP work that currently can be executed on a single
1136     // GPU, if present and compatible.  This has to be coordinated
1137     // across PP ranks on a node, with possible multiple devices
1138     // or sharing devices on a node, either from the user
1139     // selection, or automatically.
1140     auto                 haveGpus = !gpuIdsToUse.empty();
1141     std::vector<GpuTask> gpuTasksOnThisRank;
1142     if (thisRankHasDuty(cr, DUTY_PP))
1143     {
1144         if (useGpuForNonbonded)
1145         {
1146             // Note that any bonded tasks on a GPU always accompany a
1147             // non-bonded task.
1148             if (haveGpus)
1149             {
1150                 gpuTasksOnThisRank.push_back(GpuTask::Nonbonded);
1151             }
1152             else if (nonbondedTarget == TaskTarget::Gpu)
1153             {
1154                 gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because no GPU is detected.");
1155             }
1156             else if (bondedTarget == TaskTarget::Gpu)
1157             {
1158                 gmx_fatal(FARGS, "Cannot run bonded interactions on a GPU because no GPU is detected.");
1159             }
1160         }
1161     }
1162     // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not.
1163     if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME)))
1164     {
1165         if (useGpuForPme)
1166         {
1167             if (haveGpus)
1168             {
1169                 gpuTasksOnThisRank.push_back(GpuTask::Pme);
1170             }
1171             else if (pmeTarget == TaskTarget::Gpu)
1172             {
1173                 gmx_fatal(FARGS, "Cannot run PME on a GPU because no GPU is detected.");
1174             }
1175         }
1176     }
1177
1178     GpuTaskAssignment gpuTaskAssignment;
1179     try
1180     {
1181         // Produce the task assignment for this rank.
1182         gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
1183                                               mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank,
1184                                               useGpuForBonded, pmeRunMode);
1185     }
1186     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1187
1188     /* Prevent other ranks from continuing after an issue was found
1189      * and reported as a fatal error.
1190      *
1191      * TODO This function implements a barrier so that MPI runtimes
1192      * can organize an orderly shutdown if one of the ranks has had to
1193      * issue a fatal error in various code already run. When we have
1194      * MPI-aware error handling and reporting, this should be
1195      * improved. */
1196 #if GMX_MPI
1197     if (PAR(cr))
1198     {
1199         MPI_Barrier(cr->mpi_comm_mysim);
1200     }
1201     if (isMultiSim(ms))
1202     {
1203         if (SIMMASTER(cr))
1204         {
1205             MPI_Barrier(ms->mpi_comm_masters);
1206         }
1207         /* We need another barrier to prevent non-master ranks from contiuing
1208          * when an error occured in a different simulation.
1209          */
1210         MPI_Barrier(cr->mpi_comm_mysim);
1211     }
1212 #endif
1213
1214     /* Now that we know the setup is consistent, check for efficiency */
1215     check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet,
1216                                        cr, mdlog);
1217
1218     gmx_device_info_t *nonbondedDeviceInfo = nullptr;
1219
1220     if (thisRankHasDuty(cr, DUTY_PP))
1221     {
1222         // This works because only one task of each type is currently permitted.
1223         auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
1224                                              hasTaskType<GpuTask::Nonbonded>);
1225         if (nbGpuTaskMapping != gpuTaskAssignment.end())
1226         {
1227             int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
1228             nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
1229             init_gpu(nonbondedDeviceInfo);
1230
1231             if (DOMAINDECOMP(cr))
1232             {
1233                 /* When we share GPUs over ranks, we need to know this for the DLB */
1234                 dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
1235             }
1236
1237         }
1238     }
1239
1240     gmx_device_info_t                *pmeDeviceInfo = nullptr;
1241     // Later, this program could contain kernels that might be later
1242     // re-used as auto-tuning progresses, or subsequent simulations
1243     // are invoked.
1244     PmeGpuProgramStorage pmeGpuProgram;
1245     // This works because only one task of each type is currently permitted.
1246     auto                 pmeGpuTaskMapping     = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
1247     const bool           thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
1248     if (thisRankHasPmeGpuTask)
1249     {
1250         pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_);
1251         init_gpu(pmeDeviceInfo);
1252         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
1253     }
1254
1255     /* getting number of PP/PME threads on this MPI / tMPI rank.
1256        PME: env variable should be read only on one node to make sure it is
1257        identical everywhere;
1258      */
1259     const int numThreadsOnThisRank =
1260         thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded) : gmx_omp_nthreads_get(emntPME);
1261     checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
1262                                   *hwinfo->hardwareTopology,
1263                                   physicalNodeComm, mdlog);
1264
1265     if (hw_opt.threadAffinity != ThreadAffinity::Off)
1266     {
1267         /* Before setting affinity, check whether the affinity has changed
1268          * - which indicates that probably the OpenMP library has changed it
1269          * since we first checked).
1270          */
1271         gmx_check_thread_affinity_set(mdlog,
1272                                       &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1273
1274         int numThreadsOnThisNode, intraNodeThreadOffset;
1275         analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
1276                                  &intraNodeThreadOffset);
1277
1278         /* Set the CPU affinity */
1279         gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
1280                                 numThreadsOnThisRank, numThreadsOnThisNode,
1281                                 intraNodeThreadOffset, nullptr);
1282     }
1283
1284     if (mdrunOptions.timingOptions.resetStep > -1)
1285     {
1286         GMX_LOG(mdlog.info).asParagraph().
1287             appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
1288     }
1289     wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
1290
1291     if (PAR(cr))
1292     {
1293         /* Master synchronizes its value of reset_counters with all nodes
1294          * including PME only nodes */
1295         int64_t reset_counters = wcycle_get_reset_counters(wcycle);
1296         gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
1297         wcycle_set_reset_counters(wcycle, reset_counters);
1298     }
1299
1300     // Membrane embedding must be initialized before we call init_forcerec()
1301     if (doMembed)
1302     {
1303         if (MASTER(cr))
1304         {
1305             fprintf(stderr, "Initializing membed");
1306         }
1307         /* Note that membed cannot work in parallel because mtop is
1308          * changed here. Fix this if we ever want to make it run with
1309          * multiple ranks. */
1310         membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr,
1311                              &mdrunOptions
1312                                  .checkpointOptions.period);
1313     }
1314
1315     std::unique_ptr<MDAtoms>     mdAtoms;
1316     std::unique_ptr<gmx_vsite_t> vsite;
1317
1318     t_nrnb nrnb;
1319     if (thisRankHasDuty(cr, DUTY_PP))
1320     {
1321         mdModulesNotifier.notify(*cr);
1322         mdModulesNotifier.notify(&atomSets);
1323         mdModulesNotifier.notify(PeriodicBoundaryConditionType {inputrec->ePBC});
1324         /* Initiate forcerecord */
1325         fr                 = new t_forcerec;
1326         fr->forceProviders = mdModules_->initForceProviders();
1327         init_forcerec(fplog, mdlog, fr, fcd,
1328                       inputrec, &mtop, cr, box,
1329                       opt2fn("-table", filenames.size(), filenames.data()),
1330                       opt2fn("-tablep", filenames.size(), filenames.data()),
1331                       opt2fns("-tableb", filenames.size(), filenames.data()),
1332                       *hwinfo, nonbondedDeviceInfo,
1333                       useGpuForBonded,
1334                       FALSE,
1335                       pforce,
1336                       wcycle);
1337
1338         // TODO Move this to happen during domain decomposition setup,
1339         // once stream and event handling works well with that.
1340         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
1341         if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
1342         {
1343             void *streamLocal                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1344             void *streamNonLocal                =
1345                 Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1346             void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
1347             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
1348                                                                         streamNonLocal, coordinatesOnDeviceEvent);
1349         }
1350
1351         /* Initialize the mdAtoms structure.
1352          * mdAtoms is not filled with atom data,
1353          * as this can not be done now with domain decomposition.
1354          */
1355         mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
1356         if (globalState && thisRankHasPmeGpuTask)
1357         {
1358             // The pinning of coordinates in the global state object works, because we only use
1359             // PME on GPU without DD or on a separate PME rank, and because the local state pointer
1360             // points to the global state object without DD.
1361             // FIXME: MD and EM separately set up the local state - this should happen in the same function,
1362             // which should also perform the pinning.
1363             changePinningPolicy(&globalState->x, pme_get_pinning_policy());
1364         }
1365
1366         /* Initialize the virtual site communication */
1367         vsite = initVsite(mtop, cr);
1368
1369         calc_shifts(box, fr->shift_vec);
1370
1371         /* With periodic molecules the charge groups should be whole at start up
1372          * and the virtual sites should not be far from their proper positions.
1373          */
1374         if (!inputrec->bContinuation && MASTER(cr) &&
1375             !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1376         {
1377             /* Make molecules whole at start of run */
1378             if (fr->ePBC != epbcNONE)
1379             {
1380                 do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
1381             }
1382             if (vsite)
1383             {
1384                 /* Correct initial vsite positions are required
1385                  * for the initial distribution in the domain decomposition
1386                  * and for the initial shell prediction.
1387                  */
1388                 constructVsitesGlobal(mtop, globalState->x);
1389             }
1390         }
1391
1392         if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
1393         {
1394             ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
1395             ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
1396         }
1397     }
1398     else
1399     {
1400         /* This is a PME only node */
1401
1402         GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be unitialized");
1403
1404         ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
1405         ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
1406     }
1407
1408     gmx_pme_t *sepPmeData = nullptr;
1409     // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
1410     GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
1411     gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
1412
1413     /* Initiate PME if necessary,
1414      * either on all nodes or on dedicated PME nodes only. */
1415     if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
1416     {
1417         if (mdAtoms && mdAtoms->mdatoms())
1418         {
1419             nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
1420             if (EVDW_PME(inputrec->vdwtype))
1421             {
1422                 nTypePerturbed   = mdAtoms->mdatoms()->nTypePerturbed;
1423             }
1424         }
1425         if (cr->npmenodes > 0)
1426         {
1427             /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
1428             gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
1429             gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
1430         }
1431
1432         if (thisRankHasDuty(cr, DUTY_PME))
1433         {
1434             try
1435             {
1436                 pmedata = gmx_pme_init(cr,
1437                                        getNumPmeDomains(cr->dd),
1438                                        inputrec,
1439                                        nChargePerturbed != 0, nTypePerturbed != 0,
1440                                        mdrunOptions.reproducible,
1441                                        ewaldcoeff_q, ewaldcoeff_lj,
1442                                        gmx_omp_nthreads_get(emntPME),
1443                                        pmeRunMode, nullptr,
1444                                        pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
1445             }
1446             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1447         }
1448     }
1449
1450
1451     if (EI_DYNAMICS(inputrec->eI))
1452     {
1453         /* Turn on signal handling on all nodes */
1454         /*
1455          * (A user signal from the PME nodes (if any)
1456          * is communicated to the PP nodes.
1457          */
1458         signal_handler_install();
1459     }
1460
1461     pull_t *pull_work = nullptr;
1462     if (thisRankHasDuty(cr, DUTY_PP))
1463     {
1464         /* Assumes uniform use of the number of OpenMP threads */
1465         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
1466
1467         if (inputrec->bPull)
1468         {
1469             /* Initialize pull code */
1470             pull_work =
1471                 init_pull(fplog, inputrec->pull, inputrec,
1472                           &mtop, cr, &atomSets, inputrec->fepvals->init_lambda);
1473             if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
1474             {
1475                 initPullHistory(pull_work, &observablesHistory);
1476             }
1477             if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
1478             {
1479                 init_pull_output_files(pull_work,
1480                                        filenames.size(), filenames.data(), oenv,
1481                                        startingBehavior);
1482             }
1483         }
1484
1485         std::unique_ptr<EnforcedRotation> enforcedRotation;
1486         if (inputrec->bRot)
1487         {
1488             /* Initialize enforced rotation code */
1489             enforcedRotation = init_rot(fplog,
1490                                         inputrec,
1491                                         filenames.size(),
1492                                         filenames.data(),
1493                                         cr,
1494                                         &atomSets,
1495                                         globalState.get(),
1496                                         &mtop,
1497                                         oenv,
1498                                         mdrunOptions,
1499                                         startingBehavior);
1500         }
1501
1502         t_swap *swap = nullptr;
1503         if (inputrec->eSwapCoords != eswapNO)
1504         {
1505             /* Initialize ion swapping code */
1506             swap = init_swapcoords(fplog, inputrec,
1507                                    opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
1508                                    &mtop, globalState.get(), &observablesHistory,
1509                                    cr, &atomSets, oenv, mdrunOptions,
1510                                    startingBehavior);
1511         }
1512
1513         /* Let makeConstraints know whether we have essential dynamics constraints.
1514          * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint
1515          */
1516         bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr
1517                                     || observablesHistory.edsamHistory);
1518         auto constr              = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics,
1519                                                    fplog, *mdAtoms->mdatoms(),
1520                                                    cr, ms, &nrnb, wcycle, fr->bMolPBC);
1521
1522         /* Energy terms and groups */
1523         gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(), inputrec->fepvals->n_lambda);
1524
1525         /* Kinetic energy data */
1526         gmx_ekindata_t ekind;
1527         init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
1528
1529         /* Set up interactive MD (IMD) */
1530         auto imdSession = makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
1531                                          MASTER(cr) ? globalState->x.rvec_array() : nullptr,
1532                                          filenames.size(), filenames.data(), oenv, mdrunOptions.imdOptions,
1533                                          startingBehavior);
1534
1535         if (DOMAINDECOMP(cr))
1536         {
1537             GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
1538             /* This call is not included in init_domain_decomposition mainly
1539              * because fr->cginfo_mb is set later.
1540              */
1541             dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
1542                             domdecOptions.checkBondedInteractions,
1543                             fr->cginfo_mb);
1544         }
1545
1546         // TODO This is not the right place to manage the lifetime of
1547         // this data structure, but currently it's the easiest way to
1548         // make it work.
1549         MdScheduleWorkload mdScheduleWork;
1550
1551         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
1552         SimulatorBuilder simulatorBuilder;
1553
1554         // build and run simulator object based on user-input
1555         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
1556                     false,
1557                     inputrec, doRerun, vsite.get(), ms, replExParams,
1558                     fcd, static_cast<int>(filenames.size()), filenames.data(),
1559                     &observablesHistory, membed);
1560         auto simulator = simulatorBuilder.build(
1561                     inputIsCompatibleWithModularSimulator,
1562                     fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
1563                     oenv,
1564                     mdrunOptions,
1565                     startingBehavior,
1566                     vsite.get(), constr.get(),
1567                     enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
1568                     deform.get(),
1569                     mdModules_->outputProvider(),
1570                     mdModules_->notifier(),
1571                     inputrec, imdSession.get(), pull_work, swap, &mtop,
1572                     fcd,
1573                     globalState.get(),
1574                     &observablesHistory,
1575                     mdAtoms.get(), &nrnb, wcycle, fr,
1576                     &enerd,
1577                     &ekind,
1578                     &mdScheduleWork,
1579                     replExParams,
1580                     membed,
1581                     walltime_accounting,
1582                     std::move(stopHandlerBuilder_),
1583                     doRerun);
1584         simulator->run();
1585
1586         if (inputrec->bPull)
1587         {
1588             finish_pull(pull_work);
1589         }
1590         finish_swapcoords(swap);
1591     }
1592     else
1593     {
1594         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
1595         /* do PME only */
1596         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
1597         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
1598     }
1599
1600     wallcycle_stop(wcycle, ewcRUN);
1601
1602     /* Finish up, write some stuff
1603      * if rerunMD, don't write last frame again
1604      */
1605     finish_run(fplog, mdlog, cr,
1606                inputrec, &nrnb, wcycle, walltime_accounting,
1607                fr ? fr->nbv.get() : nullptr,
1608                pmedata,
1609                EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
1610
1611     // clean up cycle counter
1612     wallcycle_destroy(wcycle);
1613
1614 // Free PME data
1615     if (pmedata)
1616     {
1617         gmx_pme_destroy(pmedata);
1618         pmedata = nullptr;
1619     }
1620
1621     // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
1622     // before we destroy the GPU context(s) in free_gpu_resources().
1623     // Pinned buffers are associated with contexts in CUDA.
1624     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
1625     mdAtoms.reset(nullptr);
1626     globalState.reset(nullptr);
1627     mdModules_.reset(nullptr);   // destruct force providers here as they might also use the GPU
1628
1629     /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
1630     free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
1631     free_gpu(nonbondedDeviceInfo);
1632     free_gpu(pmeDeviceInfo);
1633     done_forcerec(fr, mtop.molblock.size());
1634     sfree(fcd);
1635
1636     if (doMembed)
1637     {
1638         free_membed(membed);
1639     }
1640
1641     /* Does what it says */
1642     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
1643     walltime_accounting_destroy(walltime_accounting);
1644
1645     // Ensure log file content is written
1646     if (logFileHandle)
1647     {
1648         gmx_fio_flush(logFileHandle);
1649     }
1650
1651     /* Reset FPEs (important for unit tests) by disabling them. Assumes no
1652      * exceptions were enabled before function was called. */
1653     if (bEnableFPE)
1654     {
1655         gmx_fedisableexcept();
1656     }
1657
1658     auto rc = static_cast<int>(gmx_get_stop_condition());
1659
1660 #if GMX_THREAD_MPI
1661     /* we need to join all threads. The sub-threads join when they
1662        exit this function, but the master thread needs to be told to
1663        wait for that. */
1664     if (PAR(cr) && MASTER(cr))
1665     {
1666         tMPI_Finalize();
1667     }
1668 #endif
1669     return rc;
1670 }
1671
1672 Mdrunner::~Mdrunner()
1673 {
1674     // Clean up of the Manager.
1675     // This will end up getting called on every thread-MPI rank, which is unnecessary,
1676     // but okay as long as threads synchronize some time before adding or accessing
1677     // a new set of restraints.
1678     if (restraintManager_)
1679     {
1680         restraintManager_->clear();
1681         GMX_ASSERT(restraintManager_->countRestraints() == 0,
1682                    "restraints added during runner life time should be cleared at runner destruction.");
1683     }
1684 };
1685
1686 void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller,
1687                             const std::string                        &name)
1688 {
1689     GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
1690     // Not sure if this should be logged through the md logger or something else,
1691     // but it is helpful to have some sort of INFO level message sent somewhere.
1692     //    std::cout << "Registering restraint named " << name << std::endl;
1693
1694     // When multiple restraints are used, it may be wasteful to register them separately.
1695     // Maybe instead register an entire Restraint Manager as a force provider.
1696     restraintManager_->addToSpec(std::move(puller),
1697                                  name);
1698 }
1699
1700 Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules)
1701     : mdModules_(std::move(mdModules))
1702 {
1703 }
1704
1705 Mdrunner::Mdrunner(Mdrunner &&) noexcept = default;
1706
1707 //NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
1708 Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
1709
1710 class Mdrunner::BuilderImplementation
1711 {
1712     public:
1713         BuilderImplementation() = delete;
1714         BuilderImplementation(std::unique_ptr<MDModules> mdModules);
1715         ~BuilderImplementation();
1716
1717         BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options,
1718                                                     real                forceWarningThreshold,
1719                                                     StartingBehavior    startingBehavior);
1720
1721         void addDomdec(const DomdecOptions &options);
1722
1723         void addVerletList(int nstlist);
1724
1725         void addReplicaExchange(const ReplicaExchangeParameters &params);
1726
1727         void addMultiSim(gmx_multisim_t* multisim);
1728
1729         void addCommunicationRecord(t_commrec *cr);
1730
1731         void addNonBonded(const char* nbpu_opt);
1732
1733         void addPME(const char* pme_opt_, const char* pme_fft_opt_);
1734
1735         void addBondedTaskAssignment(const char* bonded_opt);
1736
1737         void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions);
1738
1739         void addFilenames(ArrayRef <const t_filenm> filenames);
1740
1741         void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
1742
1743         void addLogFile(t_fileio *logFileHandle);
1744
1745         void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
1746
1747         Mdrunner build();
1748
1749     private:
1750
1751         // Default parameters copied from runner.h
1752         // \todo Clarify source(s) of default parameters.
1753
1754         const char* nbpu_opt_    = nullptr;
1755         const char* pme_opt_     = nullptr;
1756         const char* pme_fft_opt_ = nullptr;
1757         const char *bonded_opt_  = nullptr;
1758
1759         MdrunOptions                          mdrunOptions_;
1760
1761         DomdecOptions                         domdecOptions_;
1762
1763         ReplicaExchangeParameters             replicaExchangeParameters_;
1764
1765         //! Command-line override for the duration of a neighbor list with the Verlet scheme.
1766         int         nstlist_ = 0;
1767
1768         //! Multisim communicator handle.
1769         std::unique_ptr<gmx_multisim_t*>      multisim_ = nullptr;
1770
1771         //! Non-owning communication record (only used when building command-line mdrun)
1772         t_commrec *communicationRecord_ = nullptr;
1773
1774         //! Print a warning if any force is larger than this (in kJ/mol nm).
1775         real forceWarningThreshold_ = -1;
1776
1777         //! Whether the simulation will start afresh, or restart with/without appending.
1778         StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
1779
1780         //! The modules that comprise the functionality of mdrun.
1781         std::unique_ptr<MDModules> mdModules_;
1782
1783         //! \brief Parallelism information.
1784         gmx_hw_opt_t hardwareOptions_;
1785
1786         //! filename options for simulation.
1787         ArrayRef<const t_filenm> filenames_;
1788
1789         /*! \brief Handle to output environment.
1790          *
1791          * \todo gmx_output_env_t needs lifetime management.
1792          */
1793         gmx_output_env_t*    outputEnvironment_ = nullptr;
1794
1795         /*! \brief Non-owning handle to MD log file.
1796          *
1797          * \todo Context should own output facilities for client.
1798          * \todo Improve log file handle management.
1799          * \internal
1800          * Code managing the FILE* relies on the ability to set it to
1801          * nullptr to check whether the filehandle is valid.
1802          */
1803         t_fileio* logFileHandle_ = nullptr;
1804
1805         /*!
1806          * \brief Builder for simulation stop signal handler.
1807          */
1808         std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
1809 };
1810
1811 Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules) :
1812     mdModules_(std::move(mdModules))
1813 {
1814 }
1815
1816 Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
1817
1818 Mdrunner::BuilderImplementation &
1819 Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions    &options,
1820                                                       const real             forceWarningThreshold,
1821                                                       const StartingBehavior startingBehavior)
1822 {
1823     mdrunOptions_          = options;
1824     forceWarningThreshold_ = forceWarningThreshold;
1825     startingBehavior_      = startingBehavior;
1826     return *this;
1827 }
1828
1829 void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options)
1830 {
1831     domdecOptions_ = options;
1832 }
1833
1834 void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
1835 {
1836     nstlist_ = nstlist;
1837 }
1838
1839 void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters &params)
1840 {
1841     replicaExchangeParameters_ = params;
1842 }
1843
1844 void Mdrunner::BuilderImplementation::addMultiSim(gmx_multisim_t* multisim)
1845 {
1846     multisim_ = std::make_unique<gmx_multisim_t*>(multisim);
1847 }
1848
1849 void Mdrunner::BuilderImplementation::addCommunicationRecord(t_commrec *cr)
1850 {
1851     communicationRecord_ = cr;
1852 }
1853
1854 Mdrunner Mdrunner::BuilderImplementation::build()
1855 {
1856     auto newRunner = Mdrunner(std::move(mdModules_));
1857
1858     newRunner.mdrunOptions          = mdrunOptions_;
1859     newRunner.pforce                = forceWarningThreshold_;
1860     newRunner.startingBehavior      = startingBehavior_;
1861     newRunner.domdecOptions         = domdecOptions_;
1862
1863     // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
1864     newRunner.hw_opt          = hardwareOptions_;
1865
1866     // No invariant to check. This parameter exists to optionally override other behavior.
1867     newRunner.nstlist_cmdline = nstlist_;
1868
1869     newRunner.replExParams    = replicaExchangeParameters_;
1870
1871     newRunner.filenames = filenames_;
1872
1873     GMX_ASSERT(communicationRecord_, "Bug found. It should not be possible to call build() without a valid communicationRecord_.");
1874     newRunner.cr = communicationRecord_;
1875
1876     if (multisim_)
1877     {
1878         // nullptr is a valid value for the multisim handle, so we don't check the pointed-to pointer.
1879         newRunner.ms = *multisim_;
1880     }
1881     else
1882     {
1883         GMX_THROW(gmx::APIError("MdrunnerBuilder::addMultiSim() is required before build()"));
1884     }
1885
1886     // \todo Clarify ownership and lifetime management for gmx_output_env_t
1887     // \todo Update sanity checking when output environment has clearly specified invariants.
1888     // Initialization and default values for oenv are not well specified in the current version.
1889     if (outputEnvironment_)
1890     {
1891         newRunner.oenv  = outputEnvironment_;
1892     }
1893     else
1894     {
1895         GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()"));
1896     }
1897
1898     newRunner.logFileHandle = logFileHandle_;
1899
1900     if (nbpu_opt_)
1901     {
1902         newRunner.nbpu_opt    = nbpu_opt_;
1903     }
1904     else
1905     {
1906         GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
1907     }
1908
1909     if (pme_opt_ && pme_fft_opt_)
1910     {
1911         newRunner.pme_opt     = pme_opt_;
1912         newRunner.pme_fft_opt = pme_fft_opt_;
1913     }
1914     else
1915     {
1916         GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
1917     }
1918
1919     if (bonded_opt_)
1920     {
1921         newRunner.bonded_opt = bonded_opt_;
1922     }
1923     else
1924     {
1925         GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
1926     }
1927
1928     newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
1929
1930     if (stopHandlerBuilder_)
1931     {
1932         newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
1933     }
1934     else
1935     {
1936         newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
1937     }
1938
1939     return newRunner;
1940 }
1941
1942 void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
1943 {
1944     nbpu_opt_ = nbpu_opt;
1945 }
1946
1947 void Mdrunner::BuilderImplementation::addPME(const char* pme_opt,
1948                                              const char* pme_fft_opt)
1949 {
1950     pme_opt_     = pme_opt;
1951     pme_fft_opt_ = pme_fft_opt;
1952 }
1953
1954 void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
1955 {
1956     bonded_opt_ = bonded_opt;
1957 }
1958
1959 void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
1960 {
1961     hardwareOptions_ = hardwareOptions;
1962 }
1963
1964 void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
1965 {
1966     filenames_ = filenames;
1967 }
1968
1969 void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
1970 {
1971     outputEnvironment_ = outputEnvironment;
1972 }
1973
1974 void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle)
1975 {
1976     logFileHandle_ = logFileHandle;
1977 }
1978
1979 void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
1980 {
1981     stopHandlerBuilder_ = std::move(builder);
1982 }
1983
1984 MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules) :
1985     impl_ {std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules))}
1986 {
1987 }
1988
1989 MdrunnerBuilder::~MdrunnerBuilder() = default;
1990
1991 MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions    &options,
1992                                                       real                   forceWarningThreshold,
1993                                                       const StartingBehavior startingBehavior)
1994 {
1995     impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
1996     return *this;
1997 }
1998
1999 MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options)
2000 {
2001     impl_->addDomdec(options);
2002     return *this;
2003 }
2004
2005 MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist)
2006 {
2007     impl_->addVerletList(nstlist);
2008     return *this;
2009 }
2010
2011 MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters &params)
2012 {
2013     impl_->addReplicaExchange(params);
2014     return *this;
2015 }
2016
2017 MdrunnerBuilder &MdrunnerBuilder::addMultiSim(gmx_multisim_t* multisim)
2018 {
2019     impl_->addMultiSim(multisim);
2020     return *this;
2021 }
2022
2023 MdrunnerBuilder &MdrunnerBuilder::addCommunicationRecord(t_commrec *cr)
2024 {
2025     impl_->addCommunicationRecord(cr);
2026     return *this;
2027 }
2028
2029 MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
2030 {
2031     impl_->addNonBonded(nbpu_opt);
2032     return *this;
2033 }
2034
2035 MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt,
2036                                                     const char* pme_fft_opt)
2037 {
2038     // The builder method may become more general in the future, but in this version,
2039     // parameters for PME electrostatics are both required and the only parameters
2040     // available.
2041     if (pme_opt && pme_fft_opt)
2042     {
2043         impl_->addPME(pme_opt, pme_fft_opt);
2044     }
2045     else
2046     {
2047         GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
2048     }
2049     return *this;
2050 }
2051
2052 MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
2053 {
2054     impl_->addBondedTaskAssignment(bonded_opt);
2055     return *this;
2056 }
2057
2058 Mdrunner MdrunnerBuilder::build()
2059 {
2060     return impl_->build();
2061 }
2062
2063 MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
2064 {
2065     impl_->addHardwareOptions(hardwareOptions);
2066     return *this;
2067 }
2068
2069 MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
2070 {
2071     impl_->addFilenames(filenames);
2072     return *this;
2073 }
2074
2075 MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
2076 {
2077     impl_->addOutputEnvironment(outputEnvironment);
2078     return *this;
2079 }
2080
2081 MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle)
2082 {
2083     impl_->addLogFile(logFileHandle);
2084     return *this;
2085 }
2086
2087 MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
2088 {
2089     impl_->addStopHandlerBuilder(std::move(builder));
2090     return *this;
2091 }
2092
2093 MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default;
2094
2095 MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default;
2096
2097 } // namespace gmx