src/gromacs/mdrun/runner.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the MD runner routine calling all integrators.
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "runner.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cinttypes>
  52 #include <csignal>
  53 #include <cstdlib>
  54 #include <cstring>
  55
  56 #include <algorithm>
  57 #include <memory>
  58
  59 #include "gromacs/commandline/filenm.h"
  60 #include "gromacs/domdec/builder.h"
  61 #include "gromacs/domdec/domdec.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localatomsetmanager.h"
  65 #include "gromacs/domdec/partition.h"
  66 #include "gromacs/ewald/ewald_utils.h"
  67 #include "gromacs/ewald/pme.h"
  68 #include "gromacs/ewald/pme_gpu_program.h"
  69 #include "gromacs/fileio/checkpoint.h"
  70 #include "gromacs/fileio/gmxfio.h"
  71 #include "gromacs/fileio/oenv.h"
  72 #include "gromacs/fileio/tpxio.h"
  73 #include "gromacs/gmxlib/network.h"
  74 #include "gromacs/gmxlib/nrnb.h"
  75 #include "gromacs/gpu_utils/gpu_utils.h"
  76 #include "gromacs/hardware/cpuinfo.h"
  77 #include "gromacs/hardware/detecthardware.h"
  78 #include "gromacs/hardware/printhardware.h"
  79 #include "gromacs/imd/imd.h"
  80 #include "gromacs/listed_forces/disre.h"
  81 #include "gromacs/listed_forces/gpubonded.h"
  82 #include "gromacs/listed_forces/orires.h"
  83 #include "gromacs/math/functions.h"
  84 #include "gromacs/math/utilities.h"
  85 #include "gromacs/math/vec.h"
  86 #include "gromacs/mdlib/boxdeformation.h"
  87 #include "gromacs/mdlib/broadcaststructs.h"
  88 #include "gromacs/mdlib/calc_verletbuf.h"
  89 #include "gromacs/mdlib/dispersioncorrection.h"
  90 #include "gromacs/mdlib/enerdata_utils.h"
  91 #include "gromacs/mdlib/force.h"
  92 #include "gromacs/mdlib/forcerec.h"
  93 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  94 #include "gromacs/mdlib/makeconstraints.h"
  95 #include "gromacs/mdlib/md_support.h"
  96 #include "gromacs/mdlib/mdatoms.h"
  97 #include "gromacs/mdlib/membed.h"
  98 #include "gromacs/mdlib/qmmm.h"
  99 #include "gromacs/mdlib/sighandler.h"
 100 #include "gromacs/mdlib/stophandler.h"
 101 #include "gromacs/mdrun/mdmodules.h"
 102 #include "gromacs/mdrun/simulationcontext.h"
 103 #include "gromacs/mdrunutility/handlerestart.h"
 104 #include "gromacs/mdrunutility/logging.h"
 105 #include "gromacs/mdrunutility/multisim.h"
 106 #include "gromacs/mdrunutility/printtime.h"
 107 #include "gromacs/mdrunutility/threadaffinity.h"
 108 #include "gromacs/mdtypes/commrec.h"
 109 #include "gromacs/mdtypes/enerdata.h"
 110 #include "gromacs/mdtypes/fcdata.h"
 111 #include "gromacs/mdtypes/group.h"
 112 #include "gromacs/mdtypes/inputrec.h"
 113 #include "gromacs/mdtypes/md_enums.h"
 114 #include "gromacs/mdtypes/mdrunoptions.h"
 115 #include "gromacs/mdtypes/observableshistory.h"
 116 #include "gromacs/mdtypes/simulation_workload.h"
 117 #include "gromacs/mdtypes/state.h"
 118 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 119 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 120 #include "gromacs/nbnxm/nbnxm.h"
 121 #include "gromacs/nbnxm/pairlist_tuning.h"
 122 #include "gromacs/pbcutil/pbc.h"
 123 #include "gromacs/pulling/output.h"
 124 #include "gromacs/pulling/pull.h"
 125 #include "gromacs/pulling/pull_rotation.h"
 126 #include "gromacs/restraint/manager.h"
 127 #include "gromacs/restraint/restraintmdmodule.h"
 128 #include "gromacs/restraint/restraintpotential.h"
 129 #include "gromacs/swap/swapcoords.h"
 130 #include "gromacs/taskassignment/decidegpuusage.h"
 131 #include "gromacs/taskassignment/resourcedivision.h"
 132 #include "gromacs/taskassignment/taskassignment.h"
 133 #include "gromacs/taskassignment/usergpuids.h"
 134 #include "gromacs/timing/gpu_timing.h"
 135 #include "gromacs/timing/wallcycle.h"
 136 #include "gromacs/timing/wallcyclereporting.h"
 137 #include "gromacs/topology/mtop_util.h"
 138 #include "gromacs/trajectory/trajectoryframe.h"
 139 #include "gromacs/utility/basenetwork.h"
 140 #include "gromacs/utility/cstringutil.h"
 141 #include "gromacs/utility/exceptions.h"
 142 #include "gromacs/utility/fatalerror.h"
 143 #include "gromacs/utility/filestream.h"
 144 #include "gromacs/utility/gmxassert.h"
 145 #include "gromacs/utility/gmxmpi.h"
 146 #include "gromacs/utility/keyvaluetree.h"
 147 #include "gromacs/utility/logger.h"
 148 #include "gromacs/utility/loggerbuilder.h"
 149 #include "gromacs/utility/mdmodulenotification.h"
 150 #include "gromacs/utility/physicalnodecommunicator.h"
 151 #include "gromacs/utility/pleasecite.h"
 152 #include "gromacs/utility/programcontext.h"
 153 #include "gromacs/utility/smalloc.h"
 154 #include "gromacs/utility/stringutil.h"
 155
 156 #include "isimulator.h"
 157 #include "replicaexchange.h"
 158 #include "simulatorbuilder.h"
 159
 160 #if GMX_FAHCORE
 161 #include "corewrap.h"
 162 #endif
 163
 164 namespace gmx
 165 {
 166
 167 /*! \brief environment variable to enable GPU P2P communication */
 168 static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
 169     && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
 170
 171 /*! \brief environment variable to enable GPU buffer operations */
 172 static const bool c_enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 173
 174 /*! \brief Manage any development feature flag variables encountered
 175  *
 176  * The use of dev features indicated by environment variables is
 177  * logged in order to ensure that runs with such features enabled can
 178  * be identified from their log and standard output. Any cross
 179  * dependencies are also checked, and if unsatisfied, a fatal error
 180  * issued.
 181  *
 182  * \param[in]  mdlog        Logger object.
 183  */
 184 static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
 185 {
 186     const bool enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 187     const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
 188     const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 189
 190     if (enableGpuBufOps)
 191     {
 192         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 193                 "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
 194     }
 195
 196     if (enableGpuHaloExchange)
 197     {
 198         if (!enableGpuBufOps)
 199         {
 200             gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
 201         }
 202     }
 203
 204     if (useGpuUpdateConstrain)
 205     {
 206         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 207                 "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
 208     }
 209
 210 }
 211
 212 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
 213  *
 214  * Used to ensure that the master thread does not modify mdrunner during copy
 215  * on the spawned threads. */
 216 static void threadMpiMdrunnerAccessBarrier()
 217 {
 218 #if GMX_THREAD_MPI
 219     MPI_Barrier(MPI_COMM_WORLD);
 220 #endif
 221 }
 222
 223 Mdrunner Mdrunner::cloneOnSpawnedThread() const
 224 {
 225     auto newRunner = Mdrunner(std::make_unique<MDModules>());
 226
 227     // All runners in the same process share a restraint manager resource because it is
 228     // part of the interface to the client code, which is associated only with the
 229     // original thread. Handles to the same resources can be obtained by copy.
 230     {
 231         newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
 232     }
 233
 234     // Copy members of master runner.
 235     // \todo Replace with builder when Simulation context and/or runner phases are better defined.
 236     // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
 237     newRunner.hw_opt    = hw_opt;
 238     newRunner.filenames = filenames;
 239
 240     newRunner.oenv                = oenv;
 241     newRunner.mdrunOptions        = mdrunOptions;
 242     newRunner.domdecOptions       = domdecOptions;
 243     newRunner.nbpu_opt            = nbpu_opt;
 244     newRunner.pme_opt             = pme_opt;
 245     newRunner.pme_fft_opt         = pme_fft_opt;
 246     newRunner.bonded_opt          = bonded_opt;
 247     newRunner.update_opt          = update_opt;
 248     newRunner.nstlist_cmdline     = nstlist_cmdline;
 249     newRunner.replExParams        = replExParams;
 250     newRunner.pforce              = pforce;
 251     // Give the spawned thread the newly created valid communicator
 252     // for the simulation.
 253     newRunner.communicator        = MPI_COMM_WORLD;
 254     newRunner.ms                  = ms;
 255     newRunner.startingBehavior    = startingBehavior;
 256     newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
 257
 258     threadMpiMdrunnerAccessBarrier();
 259
 260     return newRunner;
 261 }
 262
 263 /*! \brief The callback used for running on spawned threads.
 264  *
 265  * Obtains the pointer to the master mdrunner object from the one
 266  * argument permitted to the thread-launch API call, copies it to make
 267  * a new runner for this thread, reinitializes necessary data, and
 268  * proceeds to the simulation. */
 269 static void mdrunner_start_fn(const void *arg)
 270 {
 271     try
 272     {
 273         auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner *>(arg);
 274         /* copy the arg list to make sure that it's thread-local. This
 275            doesn't copy pointed-to items, of course; fnm, cr and fplog
 276            are reset in the call below, all others should be const. */
 277         gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
 278         mdrunner.mdrunner();
 279     }
 280     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 281 }
 282
 283
 284 void Mdrunner::spawnThreads(int numThreadsToLaunch)
 285 {
 286 #if GMX_THREAD_MPI
 287     /* now spawn new threads that start mdrunner_start_fn(), while
 288        the main thread returns. Thread affinity is handled later. */
 289     if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE,
 290                      mdrunner_start_fn, static_cast<const void*>(this)) != TMPI_SUCCESS)
 291     {
 292         GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
 293     }
 294
 295     // Give the master thread the newly created valid communicator for
 296     // the simulation.
 297     communicator = MPI_COMM_WORLD;
 298     threadMpiMdrunnerAccessBarrier();
 299 #else
 300     GMX_UNUSED_VALUE(numThreadsToLaunch);
 301     GMX_UNUSED_VALUE(mdrunner_start_fn);
 302 #endif
 303 }
 304
 305 }  // namespace gmx
 306
 307 /*! \brief Initialize variables for Verlet scheme simulation */
 308 static void prepare_verlet_scheme(FILE                           *fplog,
 309                                   t_commrec                      *cr,
 310                                   t_inputrec                     *ir,
 311                                   int                             nstlist_cmdline,
 312                                   const gmx_mtop_t               *mtop,
 313                                   const matrix                    box,
 314                                   bool                            makeGpuPairList,
 315                                   const gmx::CpuInfo             &cpuinfo)
 316 {
 317     /* For NVE simulations, we will retain the initial list buffer */
 318     if (EI_DYNAMICS(ir->eI) &&
 319         ir->verletbuf_tol > 0 &&
 320         !(EI_MD(ir->eI) && ir->etc == etcNO))
 321     {
 322         /* Update the Verlet buffer size for the current run setup */
 323
 324         /* Here we assume SIMD-enabled kernels are being used. But as currently
 325          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 326          * and 4x2 gives a larger buffer than 4x4, this is ok.
 327          */
 328         ListSetupType      listType  = (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
 329         VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
 330
 331         const real         rlist_new =
 332             calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
 333
 334         if (rlist_new != ir->rlist)
 335         {
 336             if (fplog != nullptr)
 337             {
 338                 fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 339                         ir->rlist, rlist_new,
 340                         listSetup.cluster_size_i, listSetup.cluster_size_j);
 341             }
 342             ir->rlist     = rlist_new;
 343         }
 344     }
 345
 346     if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
 347     {
 348         gmx_fatal(FARGS, "Can not set nstlist without %s",
 349                   !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
 350     }
 351
 352     if (EI_DYNAMICS(ir->eI))
 353     {
 354         /* Set or try nstlist values */
 355         increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
 356     }
 357 }
 358
 359 /*! \brief Override the nslist value in inputrec
 360  *
 361  * with value passed on the command line (if any)
 362  */
 363 static void override_nsteps_cmdline(const gmx::MDLogger &mdlog,
 364                                     int64_t              nsteps_cmdline,
 365                                     t_inputrec          *ir)
 366 {
 367     assert(ir);
 368
 369     /* override with anything else than the default -2 */
 370     if (nsteps_cmdline > -2)
 371     {
 372         char sbuf_steps[STEPSTRSIZE];
 373         char sbuf_msg[STRLEN];
 374
 375         ir->nsteps = nsteps_cmdline;
 376         if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
 377         {
 378             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
 379                     gmx_step_str(nsteps_cmdline, sbuf_steps),
 380                     fabs(nsteps_cmdline*ir->delta_t));
 381         }
 382         else
 383         {
 384             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
 385                     gmx_step_str(nsteps_cmdline, sbuf_steps));
 386         }
 387
 388         GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
 389     }
 390     else if (nsteps_cmdline < -2)
 391     {
 392         gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64,
 393                   nsteps_cmdline);
 394     }
 395     /* Do nothing if nsteps_cmdline == -2 */
 396 }
 397
 398 namespace gmx
 399 {
 400
 401 /*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
 402  *
 403  * If not, and if a warning may be issued, logs a warning about
 404  * falling back to CPU code. With thread-MPI, only the first
 405  * call to this function should have \c issueWarning true. */
 406 static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger   &mdlog,
 407                                                const t_inputrec &ir,
 408                                                bool              issueWarning)
 409 {
 410     bool        gpuIsUseful = true;
 411     std::string warning;
 412
 413     if (ir.opts.ngener - ir.nwall > 1)
 414     {
 415         /* The GPU code does not support more than one energy group.
 416          * If the user requested GPUs explicitly, a fatal error is given later.
 417          */
 418         gpuIsUseful = false;
 419         warning     =
 420             "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
 421             "For better performance, run on the GPU without energy groups and then do "
 422             "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
 423     }
 424
 425     if (EI_TPI(ir.eI))
 426     {
 427         gpuIsUseful = false;
 428         warning     = "TPI is not implemented for GPUs.";
 429     }
 430
 431     if (!gpuIsUseful && issueWarning)
 432     {
 433         GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
 434     }
 435
 436     return gpuIsUseful;
 437 }
 438
 439 //! Initializes the logger for mdrun.
 440 static gmx::LoggerOwner buildLogger(FILE      *fplog,
 441                                     const bool isSimulationMasterRank)
 442 {
 443     gmx::LoggerBuilder builder;
 444     if (fplog != nullptr)
 445     {
 446         builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
 447     }
 448     if (isSimulationMasterRank)
 449     {
 450         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning,
 451                                 &gmx::TextOutputFile::standardError());
 452     }
 453     return builder.build();
 454 }
 455
 456 //! Make a TaskTarget from an mdrun argument string.
 457 static TaskTarget findTaskTarget(const char *optionString)
 458 {
 459     TaskTarget returnValue = TaskTarget::Auto;
 460
 461     if (strncmp(optionString, "auto", 3) == 0)
 462     {
 463         returnValue = TaskTarget::Auto;
 464     }
 465     else if (strncmp(optionString, "cpu", 3) == 0)
 466     {
 467         returnValue = TaskTarget::Cpu;
 468     }
 469     else if (strncmp(optionString, "gpu", 3) == 0)
 470     {
 471         returnValue = TaskTarget::Gpu;
 472     }
 473     else
 474     {
 475         GMX_ASSERT(false, "Option string should have been checked for sanity already");
 476     }
 477
 478     return returnValue;
 479 }
 480
 481 //! Finish run, aggregate data to print performance info.
 482 static void finish_run(FILE *fplog,
 483                        const gmx::MDLogger &mdlog,
 484                        const t_commrec *cr,
 485                        const t_inputrec *inputrec,
 486                        t_nrnb nrnb[], gmx_wallcycle_t wcycle,
 487                        gmx_walltime_accounting_t walltime_accounting,
 488                        nonbonded_verlet_t *nbv,
 489                        const gmx_pme_t *pme,
 490                        gmx_bool bWriteStat)
 491 {
 492     double  delta_t  = 0;
 493     double  nbfs     = 0, mflop = 0;
 494     double  elapsed_time,
 495             elapsed_time_over_all_ranks,
 496             elapsed_time_over_all_threads,
 497             elapsed_time_over_all_threads_over_all_ranks;
 498     /* Control whether it is valid to print a report. Only the
 499        simulation master may print, but it should not do so if the run
 500        terminated e.g. before a scheduled reset step. This is
 501        complicated by the fact that PME ranks are unaware of the
 502        reason why they were sent a pmerecvqxFINISH. To avoid
 503        communication deadlocks, we always do the communication for the
 504        report, even if we've decided not to write the report, because
 505        how long it takes to finish the run is not important when we've
 506        decided not to report on the simulation performance.
 507
 508        Further, we only report performance for dynamical integrators,
 509        because those are the only ones for which we plan to
 510        consider doing any optimizations. */
 511     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
 512
 513     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
 514     {
 515         GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
 516         printReport = false;
 517     }
 518
 519     t_nrnb                  *nrnb_tot;
 520     std::unique_ptr<t_nrnb>  nrnbTotalStorage;
 521     if (cr->nnodes > 1)
 522     {
 523         nrnbTotalStorage = std::make_unique<t_nrnb>();
 524         nrnb_tot         = nrnbTotalStorage.get();
 525 #if GMX_MPI
 526         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
 527                       cr->mpi_comm_mysim);
 528 #endif
 529     }
 530     else
 531     {
 532         nrnb_tot = nrnb;
 533     }
 534
 535     elapsed_time                  = walltime_accounting_get_time_since_reset(walltime_accounting);
 536     elapsed_time_over_all_threads = walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
 537     if (cr->nnodes > 1)
 538     {
 539 #if GMX_MPI
 540         /* reduce elapsed_time over all MPI ranks in the current simulation */
 541         MPI_Allreduce(&elapsed_time,
 542                       &elapsed_time_over_all_ranks,
 543                       1, MPI_DOUBLE, MPI_SUM,
 544                       cr->mpi_comm_mysim);
 545         elapsed_time_over_all_ranks /= cr->nnodes;
 546         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
 547          * current simulation. */
 548         MPI_Allreduce(&elapsed_time_over_all_threads,
 549                       &elapsed_time_over_all_threads_over_all_ranks,
 550                       1, MPI_DOUBLE, MPI_SUM,
 551                       cr->mpi_comm_mysim);
 552 #endif
 553     }
 554     else
 555     {
 556         elapsed_time_over_all_ranks                  = elapsed_time;
 557         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
 558     }
 559
 560     if (printReport)
 561     {
 562         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
 563     }
 564
 565     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
 566     {
 567         print_dd_statistics(cr, inputrec, fplog);
 568     }
 569
 570     /* TODO Move the responsibility for any scaling by thread counts
 571      * to the code that handled the thread region, so that there's a
 572      * mechanism to keep cycle counting working during the transition
 573      * to task parallelism. */
 574     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 575     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
 576     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
 577     auto cycle_sum(wallcycle_sum(cr, wcycle));
 578
 579     if (printReport)
 580     {
 581         auto                    nbnxn_gpu_timings = (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
 582         gmx_wallclock_gpu_pme_t pme_gpu_timings   = {};
 583
 584         if (pme_gpu_task_enabled(pme))
 585         {
 586             pme_gpu_get_timings(pme, &pme_gpu_timings);
 587         }
 588         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
 589                         elapsed_time_over_all_ranks,
 590                         wcycle, cycle_sum,
 591                         nbnxn_gpu_timings,
 592                         &pme_gpu_timings);
 593
 594         if (EI_DYNAMICS(inputrec->eI))
 595         {
 596             delta_t = inputrec->delta_t;
 597         }
 598
 599         if (fplog)
 600         {
 601             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
 602                        elapsed_time_over_all_ranks,
 603                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 604                        delta_t, nbfs, mflop);
 605         }
 606         if (bWriteStat)
 607         {
 608             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
 609                        elapsed_time_over_all_ranks,
 610                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 611                        delta_t, nbfs, mflop);
 612         }
 613     }
 614 }
 615
 616 int Mdrunner::mdrunner()
 617 {
 618     matrix                    box;
 619     t_forcerec               *fr               = nullptr;
 620     t_fcdata                 *fcd              = nullptr;
 621     real                      ewaldcoeff_q     = 0;
 622     real                      ewaldcoeff_lj    = 0;
 623     int                       nChargePerturbed = -1, nTypePerturbed = 0;
 624     gmx_wallcycle_t           wcycle;
 625     gmx_walltime_accounting_t walltime_accounting = nullptr;
 626     gmx_membed_t *            membed              = nullptr;
 627     gmx_hw_info_t            *hwinfo              = nullptr;
 628
 629     /* CAUTION: threads may be started later on in this function, so
 630        cr doesn't reflect the final parallel state right now */
 631     gmx_mtop_t                      mtop;
 632
 633     bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data());
 634     bool doRerun  = mdrunOptions.rerun;
 635
 636     // Handle task-assignment related user options.
 637     EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ?
 638                                                EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
 639
 640     std::vector<int> userGpuTaskAssignment;
 641     try
 642     {
 643         userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
 644     }
 645     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 646     auto       nonbondedTarget = findTaskTarget(nbpu_opt);
 647     auto       pmeTarget       = findTaskTarget(pme_opt);
 648     auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
 649     auto       bondedTarget    = findTaskTarget(bonded_opt);
 650     auto       updateTarget    = findTaskTarget(update_opt);
 651     PmeRunMode pmeRunMode      = PmeRunMode::None;
 652
 653     FILE      *fplog = nullptr;
 654     // If we are appending, we don't write log output because we need
 655     // to check that the old log file matches what the checkpoint file
 656     // expects. Otherwise, we should start to write log output now if
 657     // there is a file ready for it.
 658     if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
 659     {
 660         fplog = gmx_fio_getfp(logFileHandle);
 661     }
 662     const bool       isSimulationMasterRank = findIsSimulationMasterRank(ms, communicator);
 663     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
 664     gmx::MDLogger    mdlog(logOwner.logger());
 665
 666     // report any development features that may be enabled by environment variables
 667     manageDevelopmentFeatures(mdlog);
 668
 669     // TODO The thread-MPI master rank makes a working
 670     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
 671     // after the threads have been launched. This works because no use
 672     // is made of that communicator until after the execution paths
 673     // have rejoined. But it is likely that we can improve the way
 674     // this is expressed, e.g. by expressly running detection only the
 675     // master rank for thread-MPI, rather than relying on the mutex
 676     // and reference count.
 677     PhysicalNodeCommunicator physicalNodeComm(communicator, gmx_physicalnode_id_hash());
 678     hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
 679
 680     gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo);
 681
 682     std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
 683
 684     // Print citation requests after all software/hardware printing
 685     pleaseCiteGromacs(fplog);
 686
 687     // TODO Replace this by unique_ptr once t_inputrec is C++
 688     t_inputrec               inputrecInstance;
 689     t_inputrec              *inputrec = nullptr;
 690     std::unique_ptr<t_state> globalState;
 691
 692     auto                     partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
 693
 694     if (isSimulationMasterRank)
 695     {
 696         /* Only the master rank has the global state */
 697         globalState = std::make_unique<t_state>();
 698
 699         /* Read (nearly) all data required for the simulation
 700          * and keep the partly serialized tpr contents to send to other ranks later
 701          */
 702         *partialDeserializedTpr = read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()), &inputrecInstance, globalState.get(), &mtop);
 703         inputrec                = &inputrecInstance;
 704     }
 705
 706     /* Check and update the hardware options for internal consistency */
 707     checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks, inputrec);
 708
 709     if (GMX_THREAD_MPI && isSimulationMasterRank)
 710     {
 711         bool useGpuForNonbonded = false;
 712         bool useGpuForPme       = false;
 713         try
 714         {
 715             GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
 716
 717             // If the user specified the number of ranks, then we must
 718             // respect that, but in default mode, we need to allow for
 719             // the number of GPUs to choose the number of ranks.
 720             auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 721             useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi
 722                     (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
 723                     canUseGpuForNonbonded,
 724                     gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
 725                     hw_opt.nthreads_tmpi);
 726             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi
 727                     (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment,
 728                     *hwinfo, *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
 729
 730         }
 731         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 732
 733         /* Determine how many thread-MPI ranks to start.
 734          *
 735          * TODO Over-writing the user-supplied value here does
 736          * prevent any possible subsequent checks from working
 737          * correctly. */
 738         hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo,
 739                                                 &hw_opt,
 740                                                 gpuIdsToUse,
 741                                                 useGpuForNonbonded,
 742                                                 useGpuForPme,
 743                                                 inputrec, &mtop,
 744                                                 mdlog,
 745                                                 doMembed);
 746
 747         // Now start the threads for thread MPI.
 748         spawnThreads(hw_opt.nthreads_tmpi);
 749         // The spawned threads enter mdrunner() and execution of
 750         // master and spawned threads joins at the end of this block.
 751         physicalNodeComm = PhysicalNodeCommunicator(communicator, gmx_physicalnode_id_hash());
 752     }
 753
 754     GMX_RELEASE_ASSERT(communicator == MPI_COMM_WORLD, "Must have valid world communicator");
 755     CommrecHandle crHandle = init_commrec(communicator, ms);
 756     t_commrec    *cr       = crHandle.get();
 757     GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
 758
 759     if (PAR(cr))
 760     {
 761         /* now broadcast everything to the non-master nodes/threads: */
 762         if (!isSimulationMasterRank)
 763         {
 764             inputrec = &inputrecInstance;
 765         }
 766         init_parallel(cr, inputrec, &mtop, partialDeserializedTpr.get());
 767     }
 768     GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
 769     partialDeserializedTpr.reset(nullptr);
 770
 771     // Now the number of ranks is known to all ranks, and each knows
 772     // the inputrec read by the master rank. The ranks can now all run
 773     // the task-deciding functions and will agree on the result
 774     // without needing to communicate.
 775     //
 776     // TODO Should we do the communication in debug mode to support
 777     // having an assertion?
 778     //
 779     // Note that these variables describe only their own node.
 780     //
 781     // Note that when bonded interactions run on a GPU they always run
 782     // alongside a nonbonded task, so do not influence task assignment
 783     // even though they affect the force calculation workload.
 784     bool useGpuForNonbonded     = false;
 785     bool useGpuForPme           = false;
 786     bool useGpuForBonded        = false;
 787     bool useGpuForUpdate        = false;
 788     bool gpusWereDetected       = hwinfo->ngpu_compatible_tot > 0;
 789     try
 790     {
 791         // It's possible that there are different numbers of GPUs on
 792         // different nodes, which is the user's responsibility to
 793         // handle. If unsuitable, we will notice that during task
 794         // assignment.
 795         auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 796         useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
 797                                                                 emulateGpuNonbonded,
 798                                                                 canUseGpuForNonbonded,
 799                                                                 gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI),
 800                                                                 gpusWereDetected);
 801         useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
 802                                                     *hwinfo, *inputrec, mtop,
 803                                                     cr->nnodes, domdecOptions.numPmeRanks,
 804                                                     gpusWereDetected);
 805         auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
 806         useGpuForBonded =
 807             decideWhetherToUseGpusForBonded(useGpuForNonbonded, useGpuForPme,
 808                                             bondedTarget, canUseGpuForBonded,
 809                                             EVDW_PME(inputrec->vdwtype),
 810                                             EEL_PME_EWALD(inputrec->coulombtype),
 811                                             domdecOptions.numPmeRanks, gpusWereDetected);
 812
 813         pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
 814         if (pmeRunMode == PmeRunMode::GPU)
 815         {
 816             if (pmeFftTarget == TaskTarget::Cpu)
 817             {
 818                 pmeRunMode = PmeRunMode::Mixed;
 819             }
 820         }
 821         else if (pmeFftTarget == TaskTarget::Gpu)
 822         {
 823             gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME on CPU you should not be using -pmefft.");
 824         }
 825     }
 826     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 827
 828     // Build restraints.
 829     // TODO: hide restraint implementation details from Mdrunner.
 830     // There is nothing unique about restraints at this point as far as the
 831     // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
 832     // factory functions from the SimulationContext on which to call mdModules_->add().
 833     // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
 834     for (auto && restraint : restraintManager_->getRestraints())
 835     {
 836         auto module = RestraintMDModule::create(restraint,
 837                                                 restraint->sites());
 838         mdModules_->add(std::move(module));
 839     }
 840
 841     // TODO: Error handling
 842     mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
 843     const auto &mdModulesNotifier = mdModules_->notifier().notifier_;
 844
 845     if (inputrec->internalParameters != nullptr)
 846     {
 847         mdModulesNotifier.notify(*inputrec->internalParameters);
 848     }
 849
 850     if (fplog != nullptr)
 851     {
 852         pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
 853         fprintf(fplog, "\n");
 854     }
 855
 856     if (SIMMASTER(cr))
 857     {
 858         /* In rerun, set velocities to zero if present */
 859         if (doRerun && ((globalState->flags & (1 << estV)) != 0))
 860         {
 861             // rerun does not use velocities
 862             GMX_LOG(mdlog.info).asParagraph().appendText(
 863                     "Rerun trajectory contains velocities. Rerun does only evaluate "
 864                     "potential energy and forces. The velocities will be ignored.");
 865             for (int i = 0; i < globalState->natoms; i++)
 866             {
 867                 clear_rvec(globalState->v[i]);
 868             }
 869             globalState->flags &= ~(1 << estV);
 870         }
 871
 872         /* now make sure the state is initialized and propagated */
 873         set_state_entries(globalState.get(), inputrec);
 874     }
 875
 876     /* NM and TPI parallelize over force/energy calculations, not atoms,
 877      * so we need to initialize and broadcast the global state.
 878      */
 879     if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
 880     {
 881         if (!MASTER(cr))
 882         {
 883             globalState = std::make_unique<t_state>();
 884         }
 885         broadcastStateWithoutDynamics(cr, globalState.get());
 886     }
 887
 888     /* A parallel command line option consistency check that we can
 889        only do after any threads have started. */
 890     if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 ||
 891                      domdecOptions.numCells[YY] > 1 ||
 892                      domdecOptions.numCells[ZZ] > 1 ||
 893                      domdecOptions.numPmeRanks > 0))
 894     {
 895         gmx_fatal(FARGS,
 896                   "The -dd or -npme option request a parallel simulation, "
 897 #if !GMX_MPI
 898                   "but %s was compiled without threads or MPI enabled", output_env_get_program_display_name(oenv));
 899 #elif GMX_THREAD_MPI
 900                   "but the number of MPI-threads (option -ntmpi) is not set or is 1");
 901 #else
 902                   "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec", output_env_get_program_display_name(oenv));
 903 #endif
 904     }
 905
 906     if (doRerun &&
 907         (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 908     {
 909         gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 910     }
 911
 912     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
 913     {
 914         if (domdecOptions.numPmeRanks > 0)
 915         {
 916             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
 917                                  "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
 918         }
 919
 920         domdecOptions.numPmeRanks = 0;
 921     }
 922
 923     if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
 924     {
 925         /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
 926          * improve performance with many threads per GPU, since our OpenMP
 927          * scaling is bad, but it's difficult to automate the setup.
 928          */
 929         domdecOptions.numPmeRanks = 0;
 930     }
 931     if (useGpuForPme)
 932     {
 933         if (domdecOptions.numPmeRanks < 0)
 934         {
 935             domdecOptions.numPmeRanks = 0;
 936             // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
 937         }
 938         else
 939         {
 940             GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported");
 941         }
 942     }
 943
 944 #if GMX_FAHCORE
 945     if (MASTER(cr))
 946     {
 947         fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
 948     }
 949 #endif
 950
 951     /* NMR restraints must be initialized before load_checkpoint,
 952      * since with time averaging the history is added to t_state.
 953      * For proper consistency check we therefore need to extend
 954      * t_state here.
 955      * So the PME-only nodes (if present) will also initialize
 956      * the distance restraints.
 957      */
 958     snew(fcd, 1);
 959
 960     /* This needs to be called before read_checkpoint to extend the state */
 961     init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
 962
 963     init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
 964
 965     auto                 deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
 966
 967     ObservablesHistory   observablesHistory = {};
 968
 969     if (startingBehavior != StartingBehavior::NewSimulation)
 970     {
 971         /* Check if checkpoint file exists before doing continuation.
 972          * This way we can use identical input options for the first and subsequent runs...
 973          */
 974         if (mdrunOptions.numStepsCommandline > -2)
 975         {
 976             /* Temporarily set the number of steps to unlimited to avoid
 977              * triggering the nsteps check in load_checkpoint().
 978              * This hack will go away soon when the -nsteps option is removed.
 979              */
 980             inputrec->nsteps = -1;
 981         }
 982
 983         load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
 984                         logFileHandle,
 985                         cr, domdecOptions.numCells,
 986                         inputrec, globalState.get(),
 987                         &observablesHistory,
 988                         mdrunOptions.reproducible, mdModules_->notifier());
 989
 990         if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
 991         {
 992             // Now we can start normal logging to the truncated log file.
 993             fplog    = gmx_fio_getfp(logFileHandle);
 994             prepareLogAppending(fplog);
 995             logOwner = buildLogger(fplog, MASTER(cr));
 996             mdlog    = logOwner.logger();
 997         }
 998     }
 999
1000     if (mdrunOptions.numStepsCommandline > -2)
1001     {
1002         GMX_LOG(mdlog.info).asParagraph().
1003             appendText("The -nsteps functionality is deprecated, and may be removed in a future version. "
1004                        "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field.");
1005     }
1006     /* override nsteps with value set on the commandline */
1007     override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
1008
1009     if (SIMMASTER(cr))
1010     {
1011         copy_mat(globalState->box, box);
1012     }
1013
1014     if (PAR(cr))
1015     {
1016         gmx_bcast(sizeof(box), box, cr);
1017     }
1018
1019     if (inputrec->cutoff_scheme != ecutsVERLET)
1020     {
1021         gmx_fatal(FARGS, "This group-scheme .tpr file can no longer be run by mdrun. Please update to the Verlet scheme, or use an earlier version of GROMACS if necessary.");
1022     }
1023     /* Update rlist and nstlist. */
1024     prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
1025                           useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo);
1026
1027     const bool prefer1DAnd1PulseDD = (c_enableGpuHaloExchange && useGpuForNonbonded);
1028     // This builder is necessary while we have multi-part construction
1029     // of DD. Before DD is constructed, we use the existence of
1030     // the builder object to indicate that further construction of DD
1031     // is needed.
1032     std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
1033     if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
1034                      inputrec->eI == eiNM))
1035     {
1036         ddBuilder = std::make_unique<DomainDecompositionBuilder>
1037                 (mdlog, cr, domdecOptions, mdrunOptions,
1038                 prefer1DAnd1PulseDD, mtop, *inputrec,
1039                 box, positionsFromStatePointer(globalState.get()));
1040     }
1041     else
1042     {
1043         /* PME, if used, is done on all nodes with 1D decomposition */
1044         cr->npmenodes = 0;
1045         cr->duty      = (DUTY_PP | DUTY_PME);
1046
1047         if (inputrec->ePBC == epbcSCREW)
1048         {
1049             gmx_fatal(FARGS,
1050                       "pbc=screw is only implemented with domain decomposition");
1051         }
1052     }
1053
1054     // Produce the task assignment for this rank.
1055     GpuTaskAssignmentsBuilder gpuTaskAssignmentsBuilder;
1056     GpuTaskAssignments        gpuTaskAssignments =
1057         gpuTaskAssignmentsBuilder.build(gpuIdsToUse,
1058                                         userGpuTaskAssignment,
1059                                         *hwinfo,
1060                                         cr,
1061                                         ms,
1062                                         physicalNodeComm,
1063                                         nonbondedTarget,
1064                                         pmeTarget,
1065                                         bondedTarget,
1066                                         updateTarget,
1067                                         useGpuForNonbonded,
1068                                         useGpuForPme,
1069                                         thisRankHasDuty(cr, DUTY_PP),
1070                                         // TODO cr->duty & DUTY_PME should imply that a PME
1071                                         // algorithm is active, but currently does not.
1072                                         EEL_PME(inputrec->coulombtype) &&
1073                                         thisRankHasDuty(cr, DUTY_PME));
1074
1075     const bool printHostName = (cr->nnodes > 1);
1076     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode);
1077
1078     // If the user chose a task assignment, give them some hints
1079     // where appropriate.
1080     if (!userGpuTaskAssignment.empty())
1081     {
1082         gpuTaskAssignments.logPerformanceHints(mdlog,
1083                                                ssize(gpuIdsToUse));
1084     }
1085
1086     // Get the device handles for the modules, nullptr when no task is assigned.
1087     gmx_device_info_t *nonbondedDeviceInfo   = gpuTaskAssignments.initNonbondedDevice(cr);
1088     gmx_device_info_t *pmeDeviceInfo         = gpuTaskAssignments.initPmeDevice();
1089
1090     // TODO Initialize GPU streams here.
1091
1092     // TODO Currently this is always built, yet DD partition code
1093     // checks if it is built before using it. Probably it should
1094     // become an MDModule that is made only when another module
1095     // requires it (e.g. pull, CompEl, density fitting), so that we
1096     // don't update the local atom sets unilaterally every step.
1097     LocalAtomSetManager atomSets;
1098     if (ddBuilder)
1099     {
1100         // TODO Pass the GPU streams to ddBuilder to use in buffer
1101         // transfers (e.g. halo exchange)
1102         cr->dd = ddBuilder->build(&atomSets);
1103         // The builder's job is done, so destruct it
1104         ddBuilder.reset(nullptr);
1105         // Note that local state still does not exist yet.
1106     }
1107
1108     if (PAR(cr))
1109     {
1110         /* After possible communicator splitting in make_dd_communicators.
1111          * we can set up the intra/inter node communication.
1112          */
1113         gmx_setup_nodecomm(fplog, cr);
1114     }
1115
1116 #if GMX_MPI
1117     if (isMultiSim(ms))
1118     {
1119         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1120                 "This is simulation %d out of %d running as a composite GROMACS\n"
1121                 "multi-simulation job. Setup for this simulation:\n",
1122                 ms->sim, ms->nsim);
1123     }
1124     GMX_LOG(mdlog.warning).appendTextFormatted(
1125             "Using %d MPI %s\n",
1126             cr->nnodes,
1127 #if GMX_THREAD_MPI
1128             cr->nnodes == 1 ? "thread" : "threads"
1129 #else
1130             cr->nnodes == 1 ? "process" : "processes"
1131 #endif
1132             );
1133     fflush(stderr);
1134 #endif
1135
1136     // If mdrun -pin auto honors any affinity setting that already
1137     // exists. If so, it is nice to provide feedback about whether
1138     // that existing affinity setting was from OpenMP or something
1139     // else, so we run this code both before and after we initialize
1140     // the OpenMP support.
1141     gmx_check_thread_affinity_set(mdlog,
1142                                   &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1143     /* Check and update the number of OpenMP threads requested */
1144     checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
1145                                             pmeRunMode, mtop, *inputrec);
1146
1147     gmx_omp_nthreads_init(mdlog, cr,
1148                           hwinfo->nthreads_hw_avail,
1149                           physicalNodeComm.size_,
1150                           hw_opt.nthreads_omp,
1151                           hw_opt.nthreads_omp_pme,
1152                           !thisRankHasDuty(cr, DUTY_PP));
1153
1154     // Enable FP exception detection, but not in
1155     // Release mode and not for compilers with known buggy FP
1156     // exception support (clang with any optimization) or suspected
1157     // buggy FP exception support (gcc 7.* with optimization).
1158 #if !defined NDEBUG && \
1159     !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
1160     && defined __OPTIMIZE__)
1161     const bool bEnableFPE = true;
1162 #else
1163     const bool bEnableFPE = false;
1164 #endif
1165     //FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
1166     if (bEnableFPE)
1167     {
1168         gmx_feenableexcept();
1169     }
1170
1171     /* Now that we know the setup is consistent, check for efficiency */
1172     check_resource_division_efficiency(hwinfo,
1173                                        gpuTaskAssignments.thisRankHasAnyGpuTask(),
1174                                        mdrunOptions.ntompOptionIsSet,
1175                                        cr,
1176                                        mdlog);
1177
1178     /* getting number of PP/PME threads on this MPI / tMPI rank.
1179        PME: env variable should be read only on one node to make sure it is
1180        identical everywhere;
1181      */
1182     const int numThreadsOnThisRank =
1183         thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded) : gmx_omp_nthreads_get(emntPME);
1184     checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
1185                                   *hwinfo->hardwareTopology,
1186                                   physicalNodeComm, mdlog);
1187
1188     if (hw_opt.threadAffinity != ThreadAffinity::Off)
1189     {
1190         /* Before setting affinity, check whether the affinity has changed
1191          * - which indicates that probably the OpenMP library has changed it
1192          * since we first checked).
1193          */
1194         gmx_check_thread_affinity_set(mdlog,
1195                                       &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1196
1197         int numThreadsOnThisNode, intraNodeThreadOffset;
1198         analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
1199                                  &intraNodeThreadOffset);
1200
1201         /* Set the CPU affinity */
1202         gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
1203                                 numThreadsOnThisRank, numThreadsOnThisNode,
1204                                 intraNodeThreadOffset, nullptr);
1205     }
1206
1207     if (mdrunOptions.timingOptions.resetStep > -1)
1208     {
1209         GMX_LOG(mdlog.info).asParagraph().
1210             appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
1211     }
1212     wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
1213
1214     if (PAR(cr))
1215     {
1216         /* Master synchronizes its value of reset_counters with all nodes
1217          * including PME only nodes */
1218         int64_t reset_counters = wcycle_get_reset_counters(wcycle);
1219         gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
1220         wcycle_set_reset_counters(wcycle, reset_counters);
1221     }
1222
1223     // Membrane embedding must be initialized before we call init_forcerec()
1224     if (doMembed)
1225     {
1226         if (MASTER(cr))
1227         {
1228             fprintf(stderr, "Initializing membed");
1229         }
1230         /* Note that membed cannot work in parallel because mtop is
1231          * changed here. Fix this if we ever want to make it run with
1232          * multiple ranks. */
1233         membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec, globalState.get(), cr,
1234                              &mdrunOptions
1235                                  .checkpointOptions.period);
1236     }
1237
1238     const bool                   thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
1239     std::unique_ptr<MDAtoms>     mdAtoms;
1240     std::unique_ptr<gmx_vsite_t> vsite;
1241
1242     t_nrnb nrnb;
1243     if (thisRankHasDuty(cr, DUTY_PP))
1244     {
1245         mdModulesNotifier.notify(*cr);
1246         mdModulesNotifier.notify(&atomSets);
1247         mdModulesNotifier.notify(PeriodicBoundaryConditionType {inputrec->ePBC});
1248         /* Initiate forcerecord */
1249         fr                 = new t_forcerec;
1250         fr->forceProviders = mdModules_->initForceProviders();
1251         init_forcerec(fplog, mdlog, fr, fcd,
1252                       inputrec, &mtop, cr, box,
1253                       opt2fn("-table", filenames.size(), filenames.data()),
1254                       opt2fn("-tablep", filenames.size(), filenames.data()),
1255                       opt2fns("-tableb", filenames.size(), filenames.data()),
1256                       *hwinfo, nonbondedDeviceInfo,
1257                       useGpuForBonded,
1258                       pforce,
1259                       wcycle);
1260
1261         // TODO Move this to happen during domain decomposition setup,
1262         // once stream and event handling works well with that.
1263         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
1264         if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
1265         {
1266             GMX_RELEASE_ASSERT(c_enableGpuBufOps, "Must use GMX_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
1267             void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
1268             void *streamNonLocal           = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1269             void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
1270             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
1271                     "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
1272             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
1273                                                                         streamNonLocal, coordinatesOnDeviceEvent);
1274         }
1275
1276         /* Initialize the mdAtoms structure.
1277          * mdAtoms is not filled with atom data,
1278          * as this can not be done now with domain decomposition.
1279          */
1280         mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
1281         if (globalState && thisRankHasPmeGpuTask)
1282         {
1283             // The pinning of coordinates in the global state object works, because we only use
1284             // PME on GPU without DD or on a separate PME rank, and because the local state pointer
1285             // points to the global state object without DD.
1286             // FIXME: MD and EM separately set up the local state - this should happen in the same function,
1287             // which should also perform the pinning.
1288             changePinningPolicy(&globalState->x, pme_get_pinning_policy());
1289         }
1290
1291         /* Initialize the virtual site communication */
1292         vsite = initVsite(mtop, cr);
1293
1294         calc_shifts(box, fr->shift_vec);
1295
1296         /* With periodic molecules the charge groups should be whole at start up
1297          * and the virtual sites should not be far from their proper positions.
1298          */
1299         if (!inputrec->bContinuation && MASTER(cr) &&
1300             !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1301         {
1302             /* Make molecules whole at start of run */
1303             if (fr->ePBC != epbcNONE)
1304             {
1305                 do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
1306             }
1307             if (vsite)
1308             {
1309                 /* Correct initial vsite positions are required
1310                  * for the initial distribution in the domain decomposition
1311                  * and for the initial shell prediction.
1312                  */
1313                 constructVsitesGlobal(mtop, globalState->x);
1314             }
1315         }
1316
1317         if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
1318         {
1319             ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
1320             ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
1321         }
1322     }
1323     else
1324     {
1325         /* This is a PME only node */
1326
1327         GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be unitialized");
1328
1329         ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
1330         ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
1331     }
1332
1333     gmx_pme_t *sepPmeData = nullptr;
1334     // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
1335     GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
1336     gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
1337
1338     // TODO should live in ewald module once its testing is improved
1339     //
1340     // Later, this program could contain kernels that might be later
1341     // re-used as auto-tuning progresses, or subsequent simulations
1342     // are invoked.
1343     PmeGpuProgramStorage pmeGpuProgram;
1344     if (thisRankHasPmeGpuTask)
1345     {
1346         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
1347     }
1348
1349     /* Initiate PME if necessary,
1350      * either on all nodes or on dedicated PME nodes only. */
1351     if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
1352     {
1353         if (mdAtoms && mdAtoms->mdatoms())
1354         {
1355             nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
1356             if (EVDW_PME(inputrec->vdwtype))
1357             {
1358                 nTypePerturbed   = mdAtoms->mdatoms()->nTypePerturbed;
1359             }
1360         }
1361         if (cr->npmenodes > 0)
1362         {
1363             /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
1364             gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
1365             gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
1366         }
1367
1368         if (thisRankHasDuty(cr, DUTY_PME))
1369         {
1370             try
1371             {
1372                 pmedata = gmx_pme_init(cr,
1373                                        getNumPmeDomains(cr->dd),
1374                                        inputrec,
1375                                        nChargePerturbed != 0, nTypePerturbed != 0,
1376                                        mdrunOptions.reproducible,
1377                                        ewaldcoeff_q, ewaldcoeff_lj,
1378                                        gmx_omp_nthreads_get(emntPME),
1379                                        pmeRunMode, nullptr,
1380                                        pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
1381             }
1382             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1383         }
1384     }
1385
1386
1387     if (EI_DYNAMICS(inputrec->eI))
1388     {
1389         /* Turn on signal handling on all nodes */
1390         /*
1391          * (A user signal from the PME nodes (if any)
1392          * is communicated to the PP nodes.
1393          */
1394         signal_handler_install();
1395     }
1396
1397     pull_t *pull_work = nullptr;
1398     if (thisRankHasDuty(cr, DUTY_PP))
1399     {
1400         /* Assumes uniform use of the number of OpenMP threads */
1401         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
1402
1403         if (inputrec->bPull)
1404         {
1405             /* Initialize pull code */
1406             pull_work =
1407                 init_pull(fplog, inputrec->pull, inputrec,
1408                           &mtop, cr, &atomSets, inputrec->fepvals->init_lambda);
1409             if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
1410             {
1411                 initPullHistory(pull_work, &observablesHistory);
1412             }
1413             if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
1414             {
1415                 init_pull_output_files(pull_work,
1416                                        filenames.size(), filenames.data(), oenv,
1417                                        startingBehavior);
1418             }
1419         }
1420
1421         std::unique_ptr<EnforcedRotation> enforcedRotation;
1422         if (inputrec->bRot)
1423         {
1424             /* Initialize enforced rotation code */
1425             enforcedRotation = init_rot(fplog,
1426                                         inputrec,
1427                                         filenames.size(),
1428                                         filenames.data(),
1429                                         cr,
1430                                         &atomSets,
1431                                         globalState.get(),
1432                                         &mtop,
1433                                         oenv,
1434                                         mdrunOptions,
1435                                         startingBehavior);
1436         }
1437
1438         t_swap *swap = nullptr;
1439         if (inputrec->eSwapCoords != eswapNO)
1440         {
1441             /* Initialize ion swapping code */
1442             swap = init_swapcoords(fplog, inputrec,
1443                                    opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
1444                                    &mtop, globalState.get(), &observablesHistory,
1445                                    cr, &atomSets, oenv, mdrunOptions,
1446                                    startingBehavior);
1447         }
1448
1449         /* Let makeConstraints know whether we have essential dynamics constraints.
1450          * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint
1451          */
1452         bool doEssentialDynamics = (opt2fn_null("-ei", filenames.size(), filenames.data()) != nullptr
1453                                     || observablesHistory.edsamHistory);
1454         auto constr              = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics,
1455                                                    fplog, *mdAtoms->mdatoms(),
1456                                                    cr, ms, &nrnb, wcycle, fr->bMolPBC);
1457
1458         /* Energy terms and groups */
1459         gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(), inputrec->fepvals->n_lambda);
1460
1461         /* Kinetic energy data */
1462         gmx_ekindata_t ekind;
1463         init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
1464
1465         /* Set up interactive MD (IMD) */
1466         auto imdSession = makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
1467                                          MASTER(cr) ? globalState->x.rvec_array() : nullptr,
1468                                          filenames.size(), filenames.data(), oenv, mdrunOptions.imdOptions,
1469                                          startingBehavior);
1470
1471         if (DOMAINDECOMP(cr))
1472         {
1473             GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
1474             /* This call is not included in init_domain_decomposition mainly
1475              * because fr->cginfo_mb is set later.
1476              */
1477             dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
1478                             domdecOptions.checkBondedInteractions,
1479                             fr->cginfo_mb);
1480         }
1481
1482         if (updateTarget == TaskTarget::Gpu)
1483         {
1484             if (SIMMASTER(cr))
1485             {
1486                 gmx_fatal(FARGS, "It is currently not possible to redirect the calculation "
1487                           "of update and constraints to the GPU!");
1488             }
1489         }
1490
1491         // Before we start the actual simulator, try if we can run the update task on the GPU.
1492         useGpuForUpdate = decideWhetherToUseGpuForUpdate(DOMAINDECOMP(cr),
1493                                                          useGpuForPme,
1494                                                          useGpuForNonbonded,
1495                                                          c_enableGpuBufOps,
1496                                                          updateTarget,
1497                                                          gpusWereDetected,
1498                                                          *inputrec,
1499                                                          *mdAtoms,
1500                                                          doEssentialDynamics,
1501                                                          fcd->orires.nr != 0,
1502                                                          fcd->disres.nsystems != 0);
1503
1504         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
1505                     false,
1506                     inputrec, doRerun, vsite.get(), ms, replExParams,
1507                     fcd, static_cast<int>(filenames.size()), filenames.data(),
1508                     &observablesHistory, membed);
1509
1510         const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
1511
1512         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
1513         if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || useGpuForUpdate))
1514         {
1515             const void         *pmeStream      = pme_gpu_get_device_stream(fr->pmedata);
1516             const void         *localStream    = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local) : nullptr;
1517             const void         *nonLocalStream = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) : nullptr;
1518             const void         *deviceContext  = pme_gpu_get_device_context(fr->pmedata);
1519             const int           paddingSize    = pme_gpu_get_padding_size(fr->pmedata);
1520             GpuApiCallBehavior  transferKind   = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
1521
1522             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(pmeStream,
1523                                                                      localStream,
1524                                                                      nonLocalStream,
1525                                                                      deviceContext,
1526                                                                      transferKind,
1527                                                                      paddingSize);
1528             fr->stateGpu = stateGpu.get();
1529         }
1530
1531         // TODO This is not the right place to manage the lifetime of
1532         // this data structure, but currently it's the easiest way to
1533         // make it work.
1534         MdrunScheduleWorkload runScheduleWork;
1535
1536         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
1537         SimulatorBuilder simulatorBuilder;
1538
1539         // build and run simulator object based on user-input
1540         auto simulator = simulatorBuilder.build(
1541                     inputIsCompatibleWithModularSimulator,
1542                     fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
1543                     oenv,
1544                     mdrunOptions,
1545                     startingBehavior,
1546                     vsite.get(), constr.get(),
1547                     enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr,
1548                     deform.get(),
1549                     mdModules_->outputProvider(),
1550                     mdModules_->notifier(),
1551                     inputrec, imdSession.get(), pull_work, swap, &mtop,
1552                     fcd,
1553                     globalState.get(),
1554                     &observablesHistory,
1555                     mdAtoms.get(), &nrnb, wcycle, fr,
1556                     &enerd,
1557                     &ekind,
1558                     &runScheduleWork,
1559                     replExParams,
1560                     membed,
1561                     walltime_accounting,
1562                     std::move(stopHandlerBuilder_),
1563                     doRerun,
1564                     useGpuForUpdate);
1565         simulator->run();
1566
1567         if (inputrec->bPull)
1568         {
1569             finish_pull(pull_work);
1570         }
1571         finish_swapcoords(swap);
1572     }
1573     else
1574     {
1575         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
1576         /* do PME only */
1577         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
1578         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
1579     }
1580
1581     wallcycle_stop(wcycle, ewcRUN);
1582
1583     /* Finish up, write some stuff
1584      * if rerunMD, don't write last frame again
1585      */
1586     finish_run(fplog, mdlog, cr,
1587                inputrec, &nrnb, wcycle, walltime_accounting,
1588                fr ? fr->nbv.get() : nullptr,
1589                pmedata,
1590                EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
1591
1592     // clean up cycle counter
1593     wallcycle_destroy(wcycle);
1594
1595 // Free PME data
1596     if (pmedata)
1597     {
1598         gmx_pme_destroy(pmedata);
1599         pmedata = nullptr;
1600     }
1601
1602     // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
1603     // before we destroy the GPU context(s) in free_gpu_resources().
1604     // Pinned buffers are associated with contexts in CUDA.
1605     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
1606     mdAtoms.reset(nullptr);
1607     globalState.reset(nullptr);
1608     mdModules_.reset(nullptr);   // destruct force providers here as they might also use the GPU
1609
1610     /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
1611     free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
1612     free_gpu(nonbondedDeviceInfo);
1613     free_gpu(pmeDeviceInfo);
1614     done_forcerec(fr, mtop.molblock.size());
1615     sfree(fcd);
1616
1617     if (doMembed)
1618     {
1619         free_membed(membed);
1620     }
1621
1622     /* Does what it says */
1623     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
1624     walltime_accounting_destroy(walltime_accounting);
1625
1626     // Ensure log file content is written
1627     if (logFileHandle)
1628     {
1629         gmx_fio_flush(logFileHandle);
1630     }
1631
1632     /* Reset FPEs (important for unit tests) by disabling them. Assumes no
1633      * exceptions were enabled before function was called. */
1634     if (bEnableFPE)
1635     {
1636         gmx_fedisableexcept();
1637     }
1638
1639     auto rc = static_cast<int>(gmx_get_stop_condition());
1640
1641 #if GMX_THREAD_MPI
1642     /* we need to join all threads. The sub-threads join when they
1643        exit this function, but the master thread needs to be told to
1644        wait for that. */
1645     if (PAR(cr) && MASTER(cr))
1646     {
1647         tMPI_Finalize();
1648     }
1649 #endif
1650     return rc;
1651 }
1652
1653 Mdrunner::~Mdrunner()
1654 {
1655     // Clean up of the Manager.
1656     // This will end up getting called on every thread-MPI rank, which is unnecessary,
1657     // but okay as long as threads synchronize some time before adding or accessing
1658     // a new set of restraints.
1659     if (restraintManager_)
1660     {
1661         restraintManager_->clear();
1662         GMX_ASSERT(restraintManager_->countRestraints() == 0,
1663                    "restraints added during runner life time should be cleared at runner destruction.");
1664     }
1665 };
1666
1667 void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller,
1668                             const std::string                        &name)
1669 {
1670     GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
1671     // Not sure if this should be logged through the md logger or something else,
1672     // but it is helpful to have some sort of INFO level message sent somewhere.
1673     //    std::cout << "Registering restraint named " << name << std::endl;
1674
1675     // When multiple restraints are used, it may be wasteful to register them separately.
1676     // Maybe instead register an entire Restraint Manager as a force provider.
1677     restraintManager_->addToSpec(std::move(puller),
1678                                  name);
1679 }
1680
1681 Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules)
1682     : mdModules_(std::move(mdModules))
1683 {
1684 }
1685
1686 Mdrunner::Mdrunner(Mdrunner &&) noexcept = default;
1687
1688 //NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
1689 Mdrunner &Mdrunner::operator=(Mdrunner && /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
1690
1691 class Mdrunner::BuilderImplementation
1692 {
1693     public:
1694         BuilderImplementation() = delete;
1695         BuilderImplementation(std::unique_ptr<MDModules>           mdModules,
1696                               compat::not_null<SimulationContext*> context);
1697         ~BuilderImplementation();
1698
1699         BuilderImplementation &setExtraMdrunOptions(const MdrunOptions &options,
1700                                                     real                forceWarningThreshold,
1701                                                     StartingBehavior    startingBehavior);
1702
1703         void addDomdec(const DomdecOptions &options);
1704
1705         void addVerletList(int nstlist);
1706
1707         void addReplicaExchange(const ReplicaExchangeParameters &params);
1708
1709         void addNonBonded(const char* nbpu_opt);
1710
1711         void addPME(const char* pme_opt_, const char* pme_fft_opt_);
1712
1713         void addBondedTaskAssignment(const char* bonded_opt);
1714
1715         void addUpdateTaskAssignment(const char* update_opt);
1716
1717         void addHardwareOptions(const gmx_hw_opt_t &hardwareOptions);
1718
1719         void addFilenames(ArrayRef <const t_filenm> filenames);
1720
1721         void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
1722
1723         void addLogFile(t_fileio *logFileHandle);
1724
1725         void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
1726
1727         Mdrunner build();
1728
1729     private:
1730
1731         // Default parameters copied from runner.h
1732         // \todo Clarify source(s) of default parameters.
1733
1734         const char* nbpu_opt_          = nullptr;
1735         const char* pme_opt_           = nullptr;
1736         const char* pme_fft_opt_       = nullptr;
1737         const char *bonded_opt_        = nullptr;
1738         const char *update_opt_        = nullptr;
1739
1740         MdrunOptions                          mdrunOptions_;
1741
1742         DomdecOptions                         domdecOptions_;
1743
1744         ReplicaExchangeParameters             replicaExchangeParameters_;
1745
1746         //! Command-line override for the duration of a neighbor list with the Verlet scheme.
1747         int         nstlist_ = 0;
1748
1749         //! Multisim communicator handle.
1750         gmx_multisim_t *multiSimulation_;
1751
1752         //! mdrun communicator
1753         MPI_Comm communicator_ = MPI_COMM_NULL;
1754
1755         //! Print a warning if any force is larger than this (in kJ/mol nm).
1756         real forceWarningThreshold_ = -1;
1757
1758         //! Whether the simulation will start afresh, or restart with/without appending.
1759         StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
1760
1761         //! The modules that comprise the functionality of mdrun.
1762         std::unique_ptr<MDModules> mdModules_;
1763
1764         //! \brief Parallelism information.
1765         gmx_hw_opt_t hardwareOptions_;
1766
1767         //! filename options for simulation.
1768         ArrayRef<const t_filenm> filenames_;
1769
1770         /*! \brief Handle to output environment.
1771          *
1772          * \todo gmx_output_env_t needs lifetime management.
1773          */
1774         gmx_output_env_t*    outputEnvironment_ = nullptr;
1775
1776         /*! \brief Non-owning handle to MD log file.
1777          *
1778          * \todo Context should own output facilities for client.
1779          * \todo Improve log file handle management.
1780          * \internal
1781          * Code managing the FILE* relies on the ability to set it to
1782          * nullptr to check whether the filehandle is valid.
1783          */
1784         t_fileio* logFileHandle_ = nullptr;
1785
1786         /*!
1787          * \brief Builder for simulation stop signal handler.
1788          */
1789         std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
1790 };
1791
1792 Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules>           mdModules,
1793                                                        compat::not_null<SimulationContext*> context) :
1794     mdModules_(std::move(mdModules))
1795 {
1796     communicator_    = context->communicator_;
1797     multiSimulation_ = context->multiSimulation_.get();
1798 }
1799
1800 Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
1801
1802 Mdrunner::BuilderImplementation &
1803 Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions    &options,
1804                                                       const real             forceWarningThreshold,
1805                                                       const StartingBehavior startingBehavior)
1806 {
1807     mdrunOptions_          = options;
1808     forceWarningThreshold_ = forceWarningThreshold;
1809     startingBehavior_      = startingBehavior;
1810     return *this;
1811 }
1812
1813 void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions &options)
1814 {
1815     domdecOptions_ = options;
1816 }
1817
1818 void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
1819 {
1820     nstlist_ = nstlist;
1821 }
1822
1823 void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters &params)
1824 {
1825     replicaExchangeParameters_ = params;
1826 }
1827
1828 Mdrunner Mdrunner::BuilderImplementation::build()
1829 {
1830     auto newRunner = Mdrunner(std::move(mdModules_));
1831
1832     newRunner.mdrunOptions          = mdrunOptions_;
1833     newRunner.pforce                = forceWarningThreshold_;
1834     newRunner.startingBehavior      = startingBehavior_;
1835     newRunner.domdecOptions         = domdecOptions_;
1836
1837     // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
1838     newRunner.hw_opt          = hardwareOptions_;
1839
1840     // No invariant to check. This parameter exists to optionally override other behavior.
1841     newRunner.nstlist_cmdline = nstlist_;
1842
1843     newRunner.replExParams    = replicaExchangeParameters_;
1844
1845     newRunner.filenames = filenames_;
1846
1847     newRunner.communicator = communicator_;
1848
1849     // nullptr is a valid value for the multisim handle
1850     newRunner.ms = multiSimulation_;
1851
1852     // \todo Clarify ownership and lifetime management for gmx_output_env_t
1853     // \todo Update sanity checking when output environment has clearly specified invariants.
1854     // Initialization and default values for oenv are not well specified in the current version.
1855     if (outputEnvironment_)
1856     {
1857         newRunner.oenv  = outputEnvironment_;
1858     }
1859     else
1860     {
1861         GMX_THROW(gmx::APIError("MdrunnerBuilder::addOutputEnvironment() is required before build()"));
1862     }
1863
1864     newRunner.logFileHandle = logFileHandle_;
1865
1866     if (nbpu_opt_)
1867     {
1868         newRunner.nbpu_opt    = nbpu_opt_;
1869     }
1870     else
1871     {
1872         GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
1873     }
1874
1875     if (pme_opt_ && pme_fft_opt_)
1876     {
1877         newRunner.pme_opt     = pme_opt_;
1878         newRunner.pme_fft_opt = pme_fft_opt_;
1879     }
1880     else
1881     {
1882         GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
1883     }
1884
1885     if (bonded_opt_)
1886     {
1887         newRunner.bonded_opt = bonded_opt_;
1888     }
1889     else
1890     {
1891         GMX_THROW(gmx::APIError("MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
1892     }
1893
1894     if (update_opt_)
1895     {
1896         newRunner.update_opt = update_opt_;
1897     }
1898     else
1899     {
1900         GMX_THROW(gmx::APIError("MdrunnerBuilder::addUpdateTaskAssignment() is required before build()  "));
1901     }
1902
1903
1904     newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
1905
1906     if (stopHandlerBuilder_)
1907     {
1908         newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
1909     }
1910     else
1911     {
1912         newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
1913     }
1914
1915     return newRunner;
1916 }
1917
1918 void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
1919 {
1920     nbpu_opt_ = nbpu_opt;
1921 }
1922
1923 void Mdrunner::BuilderImplementation::addPME(const char* pme_opt,
1924                                              const char* pme_fft_opt)
1925 {
1926     pme_opt_     = pme_opt;
1927     pme_fft_opt_ = pme_fft_opt;
1928 }
1929
1930 void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
1931 {
1932     bonded_opt_ = bonded_opt;
1933 }
1934
1935 void Mdrunner::BuilderImplementation::addUpdateTaskAssignment(const char* update_opt)
1936 {
1937     update_opt_ = update_opt;
1938 }
1939
1940 void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
1941 {
1942     hardwareOptions_ = hardwareOptions;
1943 }
1944
1945 void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
1946 {
1947     filenames_ = filenames;
1948 }
1949
1950 void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
1951 {
1952     outputEnvironment_ = outputEnvironment;
1953 }
1954
1955 void Mdrunner::BuilderImplementation::addLogFile(t_fileio *logFileHandle)
1956 {
1957     logFileHandle_ = logFileHandle;
1958 }
1959
1960 void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
1961 {
1962     stopHandlerBuilder_ = std::move(builder);
1963 }
1964
1965 MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
1966                                  compat::not_null<SimulationContext*> context) :
1967     impl_ {std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context)}
1968 {
1969 }
1970
1971 MdrunnerBuilder::~MdrunnerBuilder() = default;
1972
1973 MdrunnerBuilder &MdrunnerBuilder::addSimulationMethod(const MdrunOptions    &options,
1974                                                       real                   forceWarningThreshold,
1975                                                       const StartingBehavior startingBehavior)
1976 {
1977     impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
1978     return *this;
1979 }
1980
1981 MdrunnerBuilder &MdrunnerBuilder::addDomainDecomposition(const DomdecOptions &options)
1982 {
1983     impl_->addDomdec(options);
1984     return *this;
1985 }
1986
1987 MdrunnerBuilder &MdrunnerBuilder::addNeighborList(int nstlist)
1988 {
1989     impl_->addVerletList(nstlist);
1990     return *this;
1991 }
1992
1993 MdrunnerBuilder &MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters &params)
1994 {
1995     impl_->addReplicaExchange(params);
1996     return *this;
1997 }
1998
1999 MdrunnerBuilder &MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
2000 {
2001     impl_->addNonBonded(nbpu_opt);
2002     return *this;
2003 }
2004
2005 MdrunnerBuilder &MdrunnerBuilder::addElectrostatics(const char* pme_opt,
2006                                                     const char* pme_fft_opt)
2007 {
2008     // The builder method may become more general in the future, but in this version,
2009     // parameters for PME electrostatics are both required and the only parameters
2010     // available.
2011     if (pme_opt && pme_fft_opt)
2012     {
2013         impl_->addPME(pme_opt, pme_fft_opt);
2014     }
2015     else
2016     {
2017         GMX_THROW(gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
2018     }
2019     return *this;
2020 }
2021
2022 MdrunnerBuilder &MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
2023 {
2024     impl_->addBondedTaskAssignment(bonded_opt);
2025     return *this;
2026 }
2027
2028 MdrunnerBuilder &MdrunnerBuilder::addUpdateTaskAssignment(const char* update_opt)
2029 {
2030     impl_->addUpdateTaskAssignment(update_opt);
2031     return *this;
2032 }
2033
2034 Mdrunner MdrunnerBuilder::build()
2035 {
2036     return impl_->build();
2037 }
2038
2039 MdrunnerBuilder &MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t &hardwareOptions)
2040 {
2041     impl_->addHardwareOptions(hardwareOptions);
2042     return *this;
2043 }
2044
2045 MdrunnerBuilder &MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
2046 {
2047     impl_->addFilenames(filenames);
2048     return *this;
2049 }
2050
2051 MdrunnerBuilder &MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
2052 {
2053     impl_->addOutputEnvironment(outputEnvironment);
2054     return *this;
2055 }
2056
2057 MdrunnerBuilder &MdrunnerBuilder::addLogFile(t_fileio *logFileHandle)
2058 {
2059     impl_->addLogFile(logFileHandle);
2060     return *this;
2061 }
2062
2063 MdrunnerBuilder &MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
2064 {
2065     impl_->addStopHandlerBuilder(std::move(builder));
2066     return *this;
2067 }
2068
2069 MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder &&) noexcept = default;
2070
2071 MdrunnerBuilder &MdrunnerBuilder::operator=(MdrunnerBuilder &&) noexcept = default;
2072
2073 } // namespace gmx