src/gromacs/mdrun/runner.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the MD runner routine calling all integrators.
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "runner.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cinttypes>
  52 #include <csignal>
  53 #include <cstdlib>
  54 #include <cstring>
  55
  56 #include <algorithm>
  57 #include <memory>
  58
  59 #include "gromacs/commandline/filenm.h"
  60 #include "gromacs/domdec/builder.h"
  61 #include "gromacs/domdec/domdec.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localatomsetmanager.h"
  65 #include "gromacs/domdec/partition.h"
  66 #include "gromacs/ewald/ewald_utils.h"
  67 #include "gromacs/ewald/pme.h"
  68 #include "gromacs/ewald/pme_gpu_program.h"
  69 #include "gromacs/ewald/pme_pp_comm_gpu.h"
  70 #include "gromacs/fileio/checkpoint.h"
  71 #include "gromacs/fileio/gmxfio.h"
  72 #include "gromacs/fileio/oenv.h"
  73 #include "gromacs/fileio/tpxio.h"
  74 #include "gromacs/gmxlib/network.h"
  75 #include "gromacs/gmxlib/nrnb.h"
  76 #include "gromacs/gpu_utils/gpu_utils.h"
  77 #include "gromacs/hardware/cpuinfo.h"
  78 #include "gromacs/hardware/detecthardware.h"
  79 #include "gromacs/hardware/printhardware.h"
  80 #include "gromacs/imd/imd.h"
  81 #include "gromacs/listed_forces/disre.h"
  82 #include "gromacs/listed_forces/gpubonded.h"
  83 #include "gromacs/listed_forces/orires.h"
  84 #include "gromacs/math/functions.h"
  85 #include "gromacs/math/utilities.h"
  86 #include "gromacs/math/vec.h"
  87 #include "gromacs/mdlib/boxdeformation.h"
  88 #include "gromacs/mdlib/broadcaststructs.h"
  89 #include "gromacs/mdlib/calc_verletbuf.h"
  90 #include "gromacs/mdlib/dispersioncorrection.h"
  91 #include "gromacs/mdlib/enerdata_utils.h"
  92 #include "gromacs/mdlib/force.h"
  93 #include "gromacs/mdlib/forcerec.h"
  94 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  95 #include "gromacs/mdlib/makeconstraints.h"
  96 #include "gromacs/mdlib/md_support.h"
  97 #include "gromacs/mdlib/mdatoms.h"
  98 #include "gromacs/mdlib/membed.h"
  99 #include "gromacs/mdlib/qmmm.h"
 100 #include "gromacs/mdlib/sighandler.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdrun/mdmodules.h"
 103 #include "gromacs/mdrun/simulationcontext.h"
 104 #include "gromacs/mdrunutility/handlerestart.h"
 105 #include "gromacs/mdrunutility/logging.h"
 106 #include "gromacs/mdrunutility/multisim.h"
 107 #include "gromacs/mdrunutility/printtime.h"
 108 #include "gromacs/mdrunutility/threadaffinity.h"
 109 #include "gromacs/mdtypes/commrec.h"
 110 #include "gromacs/mdtypes/enerdata.h"
 111 #include "gromacs/mdtypes/fcdata.h"
 112 #include "gromacs/mdtypes/group.h"
 113 #include "gromacs/mdtypes/inputrec.h"
 114 #include "gromacs/mdtypes/md_enums.h"
 115 #include "gromacs/mdtypes/mdrunoptions.h"
 116 #include "gromacs/mdtypes/observableshistory.h"
 117 #include "gromacs/mdtypes/simulation_workload.h"
 118 #include "gromacs/mdtypes/state.h"
 119 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 120 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 121 #include "gromacs/nbnxm/nbnxm.h"
 122 #include "gromacs/nbnxm/pairlist_tuning.h"
 123 #include "gromacs/pbcutil/pbc.h"
 124 #include "gromacs/pulling/output.h"
 125 #include "gromacs/pulling/pull.h"
 126 #include "gromacs/pulling/pull_rotation.h"
 127 #include "gromacs/restraint/manager.h"
 128 #include "gromacs/restraint/restraintmdmodule.h"
 129 #include "gromacs/restraint/restraintpotential.h"
 130 #include "gromacs/swap/swapcoords.h"
 131 #include "gromacs/taskassignment/decidegpuusage.h"
 132 #include "gromacs/taskassignment/decidesimulationworkload.h"
 133 #include "gromacs/taskassignment/resourcedivision.h"
 134 #include "gromacs/taskassignment/taskassignment.h"
 135 #include "gromacs/taskassignment/usergpuids.h"
 136 #include "gromacs/timing/gpu_timing.h"
 137 #include "gromacs/timing/wallcycle.h"
 138 #include "gromacs/timing/wallcyclereporting.h"
 139 #include "gromacs/topology/mtop_util.h"
 140 #include "gromacs/trajectory/trajectoryframe.h"
 141 #include "gromacs/utility/basenetwork.h"
 142 #include "gromacs/utility/cstringutil.h"
 143 #include "gromacs/utility/exceptions.h"
 144 #include "gromacs/utility/fatalerror.h"
 145 #include "gromacs/utility/filestream.h"
 146 #include "gromacs/utility/gmxassert.h"
 147 #include "gromacs/utility/gmxmpi.h"
 148 #include "gromacs/utility/keyvaluetree.h"
 149 #include "gromacs/utility/logger.h"
 150 #include "gromacs/utility/loggerbuilder.h"
 151 #include "gromacs/utility/mdmodulenotification.h"
 152 #include "gromacs/utility/physicalnodecommunicator.h"
 153 #include "gromacs/utility/pleasecite.h"
 154 #include "gromacs/utility/programcontext.h"
 155 #include "gromacs/utility/smalloc.h"
 156 #include "gromacs/utility/stringutil.h"
 157
 158 #include "isimulator.h"
 159 #include "replicaexchange.h"
 160 #include "simulatorbuilder.h"
 161
 162 #if GMX_FAHCORE
 163 #    include "corewrap.h"
 164 #endif
 165
 166 namespace gmx
 167 {
 168
 169 /*! \brief Structure that holds boolean flags corresponding to the development
 170  *        features present enabled through environment variables.
 171  *
 172  */
 173 struct DevelopmentFeatureFlags
 174 {
 175     //! True if the Buffer ops development feature is enabled
 176     // TODO: when the trigger of the buffer ops offload is fully automated this should go away
 177     bool enableGpuBufferOps = false;
 178     //! If true, forces 'mdrun -update auto' default to 'gpu'
 179     bool forceGpuUpdateDefaultOn = false;
 180     //! True if the GPU halo exchange development feature is enabled
 181     bool enableGpuHaloExchange = false;
 182     //! True if the PME PP direct communication GPU development feature is enabled
 183     bool enableGpuPmePPComm = false;
 184 };
 185
 186 /*! \brief Manage any development feature flag variables encountered
 187  *
 188  * The use of dev features indicated by environment variables is
 189  * logged in order to ensure that runs with such features enabled can
 190  * be identified from their log and standard output. Any cross
 191  * dependencies are also checked, and if unsatisfied, a fatal error
 192  * issued.
 193  *
 194  * Note that some development features overrides are applied already here:
 195  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
 196  *
 197  * \param[in]  mdlog                Logger object.
 198  * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
 199  * \param[in]  useGpuForPme         True if the PME task is offloaded in this run.
 200  * \returns                         The object populated with development feature flags.
 201  */
 202 static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
 203                                                          const bool           useGpuForNonbonded,
 204                                                          const bool           useGpuForPme)
 205 {
 206     DevelopmentFeatureFlags devFlags;
 207
 208     // Some builds of GCC 5 give false positive warnings that these
 209     // getenv results are ignored when clearly they are used.
 210 #pragma GCC diagnostic push
 211 #pragma GCC diagnostic ignored "-Wunused-result"
 212     devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
 213                                   && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
 214     devFlags.forceGpuUpdateDefaultOn = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr);
 215     devFlags.enableGpuHaloExchange =
 216             (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 217     devFlags.enableGpuPmePPComm =
 218             (getenv("GMX_GPU_PME_PP_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 219 #pragma GCC diagnostic pop
 220
 221     if (devFlags.enableGpuBufferOps)
 222     {
 223         GMX_LOG(mdlog.warning)
 224                 .asParagraph()
 225                 .appendTextFormatted(
 226                         "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the "
 227                         "GMX_USE_GPU_BUFFER_OPS environment variable.");
 228     }
 229
 230     if (devFlags.enableGpuHaloExchange)
 231     {
 232         if (useGpuForNonbonded)
 233         {
 234             if (!devFlags.enableGpuBufferOps)
 235             {
 236                 gmx_fatal(FARGS,
 237                           "Cannot enable GPU halo exchange without GPU buffer operations, set "
 238                           "GMX_USE_GPU_BUFFER_OPS=1\n");
 239             }
 240             GMX_LOG(mdlog.warning)
 241                     .asParagraph()
 242                     .appendTextFormatted(
 243                             "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
 244                             "GMX_GPU_DD_COMMS environment variable.");
 245         }
 246         else
 247         {
 248             GMX_LOG(mdlog.warning)
 249                     .asParagraph()
 250                     .appendTextFormatted(
 251                             "NOTE: GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
 252                             "halo exchange' feature will not be enabled as nonbonded interactions "
 253                             "are not offloaded.");
 254             devFlags.enableGpuHaloExchange = false;
 255         }
 256     }
 257
 258     if (devFlags.forceGpuUpdateDefaultOn)
 259     {
 260         GMX_LOG(mdlog.warning)
 261                 .asParagraph()
 262                 .appendTextFormatted(
 263                         "NOTE: This run will default to '-update gpu' as requested by the "
 264                         "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable.");
 265     }
 266
 267     if (devFlags.enableGpuPmePPComm)
 268     {
 269         if (useGpuForPme)
 270         {
 271             GMX_LOG(mdlog.warning)
 272                     .asParagraph()
 273                     .appendTextFormatted(
 274                             "NOTE: This run uses the 'GPU PME-PP communications' feature, enabled "
 275                             "by the GMX_GPU_PME_PP_COMMS environment variable.");
 276         }
 277         else
 278         {
 279             GMX_LOG(mdlog.warning)
 280                     .asParagraph()
 281                     .appendTextFormatted(
 282                             "NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
 283                             "'GPU PME-PP communications' feature was not enabled as PME is not "
 284                             "offloaded to the GPU.");
 285             devFlags.enableGpuPmePPComm = false;
 286         }
 287     }
 288
 289     return devFlags;
 290 }
 291
 292 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
 293  *
 294  * Used to ensure that the master thread does not modify mdrunner during copy
 295  * on the spawned threads. */
 296 static void threadMpiMdrunnerAccessBarrier()
 297 {
 298 #if GMX_THREAD_MPI
 299     MPI_Barrier(MPI_COMM_WORLD);
 300 #endif
 301 }
 302
 303 Mdrunner Mdrunner::cloneOnSpawnedThread() const
 304 {
 305     auto newRunner = Mdrunner(std::make_unique<MDModules>());
 306
 307     // All runners in the same process share a restraint manager resource because it is
 308     // part of the interface to the client code, which is associated only with the
 309     // original thread. Handles to the same resources can be obtained by copy.
 310     {
 311         newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
 312     }
 313
 314     // Copy members of master runner.
 315     // \todo Replace with builder when Simulation context and/or runner phases are better defined.
 316     // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
 317     newRunner.hw_opt    = hw_opt;
 318     newRunner.filenames = filenames;
 319
 320     newRunner.oenv            = oenv;
 321     newRunner.mdrunOptions    = mdrunOptions;
 322     newRunner.domdecOptions   = domdecOptions;
 323     newRunner.nbpu_opt        = nbpu_opt;
 324     newRunner.pme_opt         = pme_opt;
 325     newRunner.pme_fft_opt     = pme_fft_opt;
 326     newRunner.bonded_opt      = bonded_opt;
 327     newRunner.update_opt      = update_opt;
 328     newRunner.nstlist_cmdline = nstlist_cmdline;
 329     newRunner.replExParams    = replExParams;
 330     newRunner.pforce          = pforce;
 331     // Give the spawned thread the newly created valid communicator
 332     // for the simulation.
 333     newRunner.communicator        = MPI_COMM_WORLD;
 334     newRunner.ms                  = ms;
 335     newRunner.startingBehavior    = startingBehavior;
 336     newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
 337
 338     threadMpiMdrunnerAccessBarrier();
 339
 340     return newRunner;
 341 }
 342
 343 /*! \brief The callback used for running on spawned threads.
 344  *
 345  * Obtains the pointer to the master mdrunner object from the one
 346  * argument permitted to the thread-launch API call, copies it to make
 347  * a new runner for this thread, reinitializes necessary data, and
 348  * proceeds to the simulation. */
 349 static void mdrunner_start_fn(const void* arg)
 350 {
 351     try
 352     {
 353         auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
 354         /* copy the arg list to make sure that it's thread-local. This
 355            doesn't copy pointed-to items, of course; fnm, cr and fplog
 356            are reset in the call below, all others should be const. */
 357         gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
 358         mdrunner.mdrunner();
 359     }
 360     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 361 }
 362
 363
 364 void Mdrunner::spawnThreads(int numThreadsToLaunch)
 365 {
 366 #if GMX_THREAD_MPI
 367     /* now spawn new threads that start mdrunner_start_fn(), while
 368        the main thread returns. Thread affinity is handled later. */
 369     if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn,
 370                      static_cast<const void*>(this))
 371         != TMPI_SUCCESS)
 372     {
 373         GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
 374     }
 375
 376     // Give the master thread the newly created valid communicator for
 377     // the simulation.
 378     communicator = MPI_COMM_WORLD;
 379     threadMpiMdrunnerAccessBarrier();
 380 #else
 381     GMX_UNUSED_VALUE(numThreadsToLaunch);
 382     GMX_UNUSED_VALUE(mdrunner_start_fn);
 383 #endif
 384 }
 385
 386 } // namespace gmx
 387
 388 /*! \brief Initialize variables for Verlet scheme simulation */
 389 static void prepare_verlet_scheme(FILE*               fplog,
 390                                   t_commrec*          cr,
 391                                   t_inputrec*         ir,
 392                                   int                 nstlist_cmdline,
 393                                   const gmx_mtop_t*   mtop,
 394                                   const matrix        box,
 395                                   bool                makeGpuPairList,
 396                                   const gmx::CpuInfo& cpuinfo)
 397 {
 398     /* For NVE simulations, we will retain the initial list buffer */
 399     if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
 400     {
 401         /* Update the Verlet buffer size for the current run setup */
 402
 403         /* Here we assume SIMD-enabled kernels are being used. But as currently
 404          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 405          * and 4x2 gives a larger buffer than 4x4, this is ok.
 406          */
 407         ListSetupType listType =
 408                 (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
 409         VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
 410
 411         const real rlist_new =
 412                 calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
 413
 414         if (rlist_new != ir->rlist)
 415         {
 416             if (fplog != nullptr)
 417             {
 418                 fprintf(fplog,
 419                         "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 420                         ir->rlist, rlist_new, listSetup.cluster_size_i, listSetup.cluster_size_j);
 421             }
 422             ir->rlist = rlist_new;
 423         }
 424     }
 425
 426     if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
 427     {
 428         gmx_fatal(FARGS, "Can not set nstlist without %s",
 429                   !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
 430     }
 431
 432     if (EI_DYNAMICS(ir->eI))
 433     {
 434         /* Set or try nstlist values */
 435         increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
 436     }
 437 }
 438
 439 /*! \brief Override the nslist value in inputrec
 440  *
 441  * with value passed on the command line (if any)
 442  */
 443 static void override_nsteps_cmdline(const gmx::MDLogger& mdlog, int64_t nsteps_cmdline, t_inputrec* ir)
 444 {
 445     assert(ir);
 446
 447     /* override with anything else than the default -2 */
 448     if (nsteps_cmdline > -2)
 449     {
 450         char sbuf_steps[STEPSTRSIZE];
 451         char sbuf_msg[STRLEN];
 452
 453         ir->nsteps = nsteps_cmdline;
 454         if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
 455         {
 456             sprintf(sbuf_msg,
 457                     "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
 458                     gmx_step_str(nsteps_cmdline, sbuf_steps), fabs(nsteps_cmdline * ir->delta_t));
 459         }
 460         else
 461         {
 462             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
 463                     gmx_step_str(nsteps_cmdline, sbuf_steps));
 464         }
 465
 466         GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
 467     }
 468     else if (nsteps_cmdline < -2)
 469     {
 470         gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64, nsteps_cmdline);
 471     }
 472     /* Do nothing if nsteps_cmdline == -2 */
 473 }
 474
 475 namespace gmx
 476 {
 477
 478 /*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
 479  *
 480  * If not, and if a warning may be issued, logs a warning about
 481  * falling back to CPU code. With thread-MPI, only the first
 482  * call to this function should have \c issueWarning true. */
 483 static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger& mdlog, const t_inputrec& ir, bool issueWarning)
 484 {
 485     bool        gpuIsUseful = true;
 486     std::string warning;
 487
 488     if (ir.opts.ngener - ir.nwall > 1)
 489     {
 490         /* The GPU code does not support more than one energy group.
 491          * If the user requested GPUs explicitly, a fatal error is given later.
 492          */
 493         gpuIsUseful = false;
 494         warning =
 495                 "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
 496                 "For better performance, run on the GPU without energy groups and then do "
 497                 "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
 498     }
 499
 500     if (EI_TPI(ir.eI))
 501     {
 502         gpuIsUseful = false;
 503         warning     = "TPI is not implemented for GPUs.";
 504     }
 505
 506     if (!gpuIsUseful && issueWarning)
 507     {
 508         GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
 509     }
 510
 511     return gpuIsUseful;
 512 }
 513
 514 //! Initializes the logger for mdrun.
 515 static gmx::LoggerOwner buildLogger(FILE* fplog, const bool isSimulationMasterRank)
 516 {
 517     gmx::LoggerBuilder builder;
 518     if (fplog != nullptr)
 519     {
 520         builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
 521     }
 522     if (isSimulationMasterRank)
 523     {
 524         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
 525     }
 526     return builder.build();
 527 }
 528
 529 //! Make a TaskTarget from an mdrun argument string.
 530 static TaskTarget findTaskTarget(const char* optionString)
 531 {
 532     TaskTarget returnValue = TaskTarget::Auto;
 533
 534     if (strncmp(optionString, "auto", 3) == 0)
 535     {
 536         returnValue = TaskTarget::Auto;
 537     }
 538     else if (strncmp(optionString, "cpu", 3) == 0)
 539     {
 540         returnValue = TaskTarget::Cpu;
 541     }
 542     else if (strncmp(optionString, "gpu", 3) == 0)
 543     {
 544         returnValue = TaskTarget::Gpu;
 545     }
 546     else
 547     {
 548         GMX_ASSERT(false, "Option string should have been checked for sanity already");
 549     }
 550
 551     return returnValue;
 552 }
 553
 554 //! Finish run, aggregate data to print performance info.
 555 static void finish_run(FILE*                     fplog,
 556                        const gmx::MDLogger&      mdlog,
 557                        const t_commrec*          cr,
 558                        const t_inputrec*         inputrec,
 559                        t_nrnb                    nrnb[],
 560                        gmx_wallcycle_t           wcycle,
 561                        gmx_walltime_accounting_t walltime_accounting,
 562                        nonbonded_verlet_t*       nbv,
 563                        const gmx_pme_t*          pme,
 564                        gmx_bool                  bWriteStat)
 565 {
 566     double delta_t = 0;
 567     double nbfs = 0, mflop = 0;
 568     double elapsed_time, elapsed_time_over_all_ranks, elapsed_time_over_all_threads,
 569             elapsed_time_over_all_threads_over_all_ranks;
 570     /* Control whether it is valid to print a report. Only the
 571        simulation master may print, but it should not do so if the run
 572        terminated e.g. before a scheduled reset step. This is
 573        complicated by the fact that PME ranks are unaware of the
 574        reason why they were sent a pmerecvqxFINISH. To avoid
 575        communication deadlocks, we always do the communication for the
 576        report, even if we've decided not to write the report, because
 577        how long it takes to finish the run is not important when we've
 578        decided not to report on the simulation performance.
 579
 580        Further, we only report performance for dynamical integrators,
 581        because those are the only ones for which we plan to
 582        consider doing any optimizations. */
 583     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
 584
 585     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
 586     {
 587         GMX_LOG(mdlog.warning)
 588                 .asParagraph()
 589                 .appendText("Simulation ended prematurely, no performance report will be written.");
 590         printReport = false;
 591     }
 592
 593     t_nrnb*                 nrnb_tot;
 594     std::unique_ptr<t_nrnb> nrnbTotalStorage;
 595     if (cr->nnodes > 1)
 596     {
 597         nrnbTotalStorage = std::make_unique<t_nrnb>();
 598         nrnb_tot         = nrnbTotalStorage.get();
 599 #if GMX_MPI
 600         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 601 #endif
 602     }
 603     else
 604     {
 605         nrnb_tot = nrnb;
 606     }
 607
 608     elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
 609     elapsed_time_over_all_threads =
 610             walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
 611     if (cr->nnodes > 1)
 612     {
 613 #if GMX_MPI
 614         /* reduce elapsed_time over all MPI ranks in the current simulation */
 615         MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM,
 616                       cr->mpi_comm_mysim);
 617         elapsed_time_over_all_ranks /= cr->nnodes;
 618         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
 619          * current simulation. */
 620         MPI_Allreduce(&elapsed_time_over_all_threads, &elapsed_time_over_all_threads_over_all_ranks,
 621                       1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 622 #endif
 623     }
 624     else
 625     {
 626         elapsed_time_over_all_ranks                  = elapsed_time;
 627         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
 628     }
 629
 630     if (printReport)
 631     {
 632         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
 633     }
 634
 635     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
 636     {
 637         print_dd_statistics(cr, inputrec, fplog);
 638     }
 639
 640     /* TODO Move the responsibility for any scaling by thread counts
 641      * to the code that handled the thread region, so that there's a
 642      * mechanism to keep cycle counting working during the transition
 643      * to task parallelism. */
 644     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 645     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
 646     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP),
 647                                    nthreads_pp, nthreads_pme);
 648     auto cycle_sum(wallcycle_sum(cr, wcycle));
 649
 650     if (printReport)
 651     {
 652         auto nbnxn_gpu_timings =
 653                 (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
 654         gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
 655
 656         if (pme_gpu_task_enabled(pme))
 657         {
 658             pme_gpu_get_timings(pme, &pme_gpu_timings);
 659         }
 660         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
 661                         elapsed_time_over_all_ranks, wcycle, cycle_sum, nbnxn_gpu_timings,
 662                         &pme_gpu_timings);
 663
 664         if (EI_DYNAMICS(inputrec->eI))
 665         {
 666             delta_t = inputrec->delta_t;
 667         }
 668
 669         if (fplog)
 670         {
 671             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
 672                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 673                        delta_t, nbfs, mflop);
 674         }
 675         if (bWriteStat)
 676         {
 677             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
 678                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 679                        delta_t, nbfs, mflop);
 680         }
 681     }
 682 }
 683
 684 int Mdrunner::mdrunner()
 685 {
 686     matrix                    box;
 687     t_forcerec*               fr               = nullptr;
 688     t_fcdata*                 fcd              = nullptr;
 689     real                      ewaldcoeff_q     = 0;
 690     real                      ewaldcoeff_lj    = 0;
 691     int                       nChargePerturbed = -1, nTypePerturbed = 0;
 692     gmx_wallcycle_t           wcycle;
 693     gmx_walltime_accounting_t walltime_accounting = nullptr;
 694     gmx_membed_t*             membed              = nullptr;
 695     gmx_hw_info_t*            hwinfo              = nullptr;
 696
 697     /* CAUTION: threads may be started later on in this function, so
 698        cr doesn't reflect the final parallel state right now */
 699     gmx_mtop_t mtop;
 700
 701     /* TODO: inputrec should tell us whether we use an algorithm, not a file option */
 702     const bool doEssentialDynamics = opt2bSet("-ei", filenames.size(), filenames.data());
 703     const bool doMembed            = opt2bSet("-membed", filenames.size(), filenames.data());
 704     const bool doRerun             = mdrunOptions.rerun;
 705
 706     // Handle task-assignment related user options.
 707     EmulateGpuNonbonded emulateGpuNonbonded =
 708             (getenv("GMX_EMULATE_GPU") != nullptr ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
 709
 710     std::vector<int> userGpuTaskAssignment;
 711     try
 712     {
 713         userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
 714     }
 715     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 716     auto       nonbondedTarget = findTaskTarget(nbpu_opt);
 717     auto       pmeTarget       = findTaskTarget(pme_opt);
 718     auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
 719     auto       bondedTarget    = findTaskTarget(bonded_opt);
 720     auto       updateTarget    = findTaskTarget(update_opt);
 721     PmeRunMode pmeRunMode      = PmeRunMode::None;
 722
 723     FILE* fplog = nullptr;
 724     // If we are appending, we don't write log output because we need
 725     // to check that the old log file matches what the checkpoint file
 726     // expects. Otherwise, we should start to write log output now if
 727     // there is a file ready for it.
 728     if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
 729     {
 730         fplog = gmx_fio_getfp(logFileHandle);
 731     }
 732     const bool       isSimulationMasterRank = findIsSimulationMasterRank(ms, communicator);
 733     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
 734     gmx::MDLogger    mdlog(logOwner.logger());
 735
 736     // TODO The thread-MPI master rank makes a working
 737     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
 738     // after the threads have been launched. This works because no use
 739     // is made of that communicator until after the execution paths
 740     // have rejoined. But it is likely that we can improve the way
 741     // this is expressed, e.g. by expressly running detection only the
 742     // master rank for thread-MPI, rather than relying on the mutex
 743     // and reference count.
 744     PhysicalNodeCommunicator physicalNodeComm(communicator, gmx_physicalnode_id_hash());
 745     hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
 746
 747     gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo);
 748
 749     std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
 750
 751     // Print citation requests after all software/hardware printing
 752     pleaseCiteGromacs(fplog);
 753
 754     // TODO Replace this by unique_ptr once t_inputrec is C++
 755     t_inputrec               inputrecInstance;
 756     t_inputrec*              inputrec = nullptr;
 757     std::unique_ptr<t_state> globalState;
 758
 759     auto partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
 760
 761     if (isSimulationMasterRank)
 762     {
 763         /* Only the master rank has the global state */
 764         globalState = std::make_unique<t_state>();
 765
 766         /* Read (nearly) all data required for the simulation
 767          * and keep the partly serialized tpr contents to send to other ranks later
 768          */
 769         *partialDeserializedTpr = read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()),
 770                                                  &inputrecInstance, globalState.get(), &mtop);
 771         inputrec                = &inputrecInstance;
 772     }
 773
 774     /* Check and update the hardware options for internal consistency */
 775     checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks,
 776                                   inputrec);
 777
 778     if (GMX_THREAD_MPI && isSimulationMasterRank)
 779     {
 780         bool useGpuForNonbonded = false;
 781         bool useGpuForPme       = false;
 782         try
 783         {
 784             GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
 785
 786             // If the user specified the number of ranks, then we must
 787             // respect that, but in default mode, we need to allow for
 788             // the number of GPUs to choose the number of ranks.
 789             auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 790             useGpuForNonbonded         = decideWhetherToUseGpusForNonbondedWithThreadMpi(
 791                     nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
 792                     canUseGpuForNonbonded,
 793                     gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
 794                     hw_opt.nthreads_tmpi);
 795             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
 796                     useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
 797                     *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
 798         }
 799         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 800
 801         /* Determine how many thread-MPI ranks to start.
 802          *
 803          * TODO Over-writing the user-supplied value here does
 804          * prevent any possible subsequent checks from working
 805          * correctly. */
 806         hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, &hw_opt, gpuIdsToUse, useGpuForNonbonded,
 807                                                 useGpuForPme, inputrec, &mtop, mdlog, doMembed);
 808
 809         // Now start the threads for thread MPI.
 810         spawnThreads(hw_opt.nthreads_tmpi);
 811         // The spawned threads enter mdrunner() and execution of
 812         // master and spawned threads joins at the end of this block.
 813         physicalNodeComm = PhysicalNodeCommunicator(communicator, gmx_physicalnode_id_hash());
 814     }
 815
 816     GMX_RELEASE_ASSERT(communicator == MPI_COMM_WORLD, "Must have valid world communicator");
 817     CommrecHandle crHandle = init_commrec(communicator, ms);
 818     t_commrec*    cr       = crHandle.get();
 819     GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
 820
 821     if (PAR(cr))
 822     {
 823         /* now broadcast everything to the non-master nodes/threads: */
 824         if (!isSimulationMasterRank)
 825         {
 826             inputrec = &inputrecInstance;
 827         }
 828         init_parallel(cr, inputrec, &mtop, partialDeserializedTpr.get());
 829     }
 830     GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
 831     partialDeserializedTpr.reset(nullptr);
 832
 833     // Now the number of ranks is known to all ranks, and each knows
 834     // the inputrec read by the master rank. The ranks can now all run
 835     // the task-deciding functions and will agree on the result
 836     // without needing to communicate.
 837     //
 838     // TODO Should we do the communication in debug mode to support
 839     // having an assertion?
 840     const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
 841
 842     // Note that these variables describe only their own node.
 843     //
 844     // Note that when bonded interactions run on a GPU they always run
 845     // alongside a nonbonded task, so do not influence task assignment
 846     // even though they affect the force calculation workload.
 847     bool useGpuForNonbonded = false;
 848     bool useGpuForPme       = false;
 849     bool useGpuForBonded    = false;
 850     bool useGpuForUpdate    = false;
 851     bool gpusWereDetected   = hwinfo->ngpu_compatible_tot > 0;
 852     try
 853     {
 854         // It's possible that there are different numbers of GPUs on
 855         // different nodes, which is the user's responsibility to
 856         // handle. If unsuitable, we will notice that during task
 857         // assignment.
 858         auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 859         useGpuForNonbonded         = decideWhetherToUseGpusForNonbonded(
 860                 nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
 861                 gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
 862         useGpuForPme = decideWhetherToUseGpusForPme(
 863                 useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, mtop,
 864                 cr->nnodes, domdecOptions.numPmeRanks, gpusWereDetected);
 865         auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr)
 866                                   && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
 867         useGpuForBonded = decideWhetherToUseGpusForBonded(
 868                 useGpuForNonbonded, useGpuForPme, bondedTarget, canUseGpuForBonded,
 869                 EVDW_PME(inputrec->vdwtype), EEL_PME_EWALD(inputrec->coulombtype),
 870                 domdecOptions.numPmeRanks, gpusWereDetected);
 871
 872         pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
 873         if (pmeRunMode == PmeRunMode::GPU)
 874         {
 875             if (pmeFftTarget == TaskTarget::Cpu)
 876             {
 877                 pmeRunMode = PmeRunMode::Mixed;
 878             }
 879         }
 880         else if (pmeFftTarget == TaskTarget::Gpu)
 881         {
 882             gmx_fatal(FARGS,
 883                       "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME "
 884                       "on CPU you should not be using -pmefft.");
 885         }
 886     }
 887     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 888
 889     // Initialize development feature flags that enabled by environment variable
 890     // and report those features that are enabled.
 891     const DevelopmentFeatureFlags devFlags =
 892             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
 893
 894     // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
 895     //       decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
 896     // NOTE: When the simulationWork is constructed, the useGpuForUpdate overrides the devFlags.enableGpuBufferOps.
 897     try
 898     {
 899         useGpuForUpdate = decideWhetherToUseGpuForUpdate(
 900                 devFlags.forceGpuUpdateDefaultOn, useDomainDecomposition, useGpuForPme,
 901                 useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec,
 902                 gmx_mtop_interaction_count(mtop, IF_VSITE) > 0, doEssentialDynamics,
 903                 gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0);
 904     }
 905     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 906
 907
 908     // Build restraints.
 909     // TODO: hide restraint implementation details from Mdrunner.
 910     // There is nothing unique about restraints at this point as far as the
 911     // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
 912     // factory functions from the SimulationContext on which to call mdModules_->add().
 913     // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
 914     for (auto&& restraint : restraintManager_->getRestraints())
 915     {
 916         auto module = RestraintMDModule::create(restraint, restraint->sites());
 917         mdModules_->add(std::move(module));
 918     }
 919
 920     // TODO: Error handling
 921     mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
 922     const auto& mdModulesNotifier = mdModules_->notifier().notifier_;
 923
 924     if (inputrec->internalParameters != nullptr)
 925     {
 926         mdModulesNotifier.notify(*inputrec->internalParameters);
 927     }
 928
 929     if (fplog != nullptr)
 930     {
 931         pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
 932         fprintf(fplog, "\n");
 933     }
 934
 935     if (SIMMASTER(cr))
 936     {
 937         /* In rerun, set velocities to zero if present */
 938         if (doRerun && ((globalState->flags & (1 << estV)) != 0))
 939         {
 940             // rerun does not use velocities
 941             GMX_LOG(mdlog.info)
 942                     .asParagraph()
 943                     .appendText(
 944                             "Rerun trajectory contains velocities. Rerun does only evaluate "
 945                             "potential energy and forces. The velocities will be ignored.");
 946             for (int i = 0; i < globalState->natoms; i++)
 947             {
 948                 clear_rvec(globalState->v[i]);
 949             }
 950             globalState->flags &= ~(1 << estV);
 951         }
 952
 953         /* now make sure the state is initialized and propagated */
 954         set_state_entries(globalState.get(), inputrec);
 955     }
 956
 957     /* NM and TPI parallelize over force/energy calculations, not atoms,
 958      * so we need to initialize and broadcast the global state.
 959      */
 960     if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
 961     {
 962         if (!MASTER(cr))
 963         {
 964             globalState = std::make_unique<t_state>();
 965         }
 966         broadcastStateWithoutDynamics(cr, globalState.get());
 967     }
 968
 969     /* A parallel command line option consistency check that we can
 970        only do after any threads have started. */
 971     if (!PAR(cr)
 972         && (domdecOptions.numCells[XX] > 1 || domdecOptions.numCells[YY] > 1
 973             || domdecOptions.numCells[ZZ] > 1 || domdecOptions.numPmeRanks > 0))
 974     {
 975         gmx_fatal(FARGS,
 976                   "The -dd or -npme option request a parallel simulation, "
 977 #if !GMX_MPI
 978                   "but %s was compiled without threads or MPI enabled",
 979                   output_env_get_program_display_name(oenv));
 980 #elif GMX_THREAD_MPI
 981                   "but the number of MPI-threads (option -ntmpi) is not set or is 1");
 982 #else
 983                   "but %s was not started through mpirun/mpiexec or only one rank was requested "
 984                   "through mpirun/mpiexec",
 985                   output_env_get_program_display_name(oenv));
 986 #endif
 987     }
 988
 989     if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 990     {
 991         gmx_fatal(FARGS,
 992                   "The .mdp file specified an energy mininization or normal mode algorithm, and "
 993                   "these are not compatible with mdrun -rerun");
 994     }
 995
 996     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
 997     {
 998         if (domdecOptions.numPmeRanks > 0)
 999         {
1000             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
1001                                  "PME-only ranks are requested, but the system does not use PME "
1002                                  "for electrostatics or LJ");
1003         }
1004
1005         domdecOptions.numPmeRanks = 0;
1006     }
1007
1008     if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
1009     {
1010         /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
1011          * improve performance with many threads per GPU, since our OpenMP
1012          * scaling is bad, but it's difficult to automate the setup.
1013          */
1014         domdecOptions.numPmeRanks = 0;
1015     }
1016     if (useGpuForPme)
1017     {
1018         if (domdecOptions.numPmeRanks < 0)
1019         {
1020             domdecOptions.numPmeRanks = 0;
1021             // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
1022         }
1023         else
1024         {
1025             GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1,
1026                                "PME GPU decomposition is not supported");
1027         }
1028     }
1029
1030 #if GMX_FAHCORE
1031     if (MASTER(cr))
1032     {
1033         fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
1034     }
1035 #endif
1036
1037     /* NMR restraints must be initialized before load_checkpoint,
1038      * since with time averaging the history is added to t_state.
1039      * For proper consistency check we therefore need to extend
1040      * t_state here.
1041      * So the PME-only nodes (if present) will also initialize
1042      * the distance restraints.
1043      */
1044     snew(fcd, 1);
1045
1046     /* This needs to be called before read_checkpoint to extend the state */
1047     init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
1048
1049     init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
1050
1051     auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
1052
1053     ObservablesHistory observablesHistory = {};
1054
1055     if (startingBehavior != StartingBehavior::NewSimulation)
1056     {
1057         /* Check if checkpoint file exists before doing continuation.
1058          * This way we can use identical input options for the first and subsequent runs...
1059          */
1060         if (mdrunOptions.numStepsCommandline > -2)
1061         {
1062             /* Temporarily set the number of steps to unlimited to avoid
1063              * triggering the nsteps check in load_checkpoint().
1064              * This hack will go away soon when the -nsteps option is removed.
1065              */
1066             inputrec->nsteps = -1;
1067         }
1068
1069         load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
1070                         logFileHandle, cr, domdecOptions.numCells, inputrec, globalState.get(),
1071                         &observablesHistory, mdrunOptions.reproducible, mdModules_->notifier());
1072
1073         if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
1074         {
1075             // Now we can start normal logging to the truncated log file.
1076             fplog = gmx_fio_getfp(logFileHandle);
1077             prepareLogAppending(fplog);
1078             logOwner = buildLogger(fplog, MASTER(cr));
1079             mdlog    = logOwner.logger();
1080         }
1081     }
1082
1083     if (mdrunOptions.numStepsCommandline > -2)
1084     {
1085         GMX_LOG(mdlog.info)
1086                 .asParagraph()
1087                 .appendText(
1088                         "The -nsteps functionality is deprecated, and may be removed in a future "
1089                         "version. "
1090                         "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp "
1091                         "file field.");
1092     }
1093     /* override nsteps with value set on the commandline */
1094     override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
1095
1096     if (SIMMASTER(cr))
1097     {
1098         copy_mat(globalState->box, box);
1099     }
1100
1101     if (PAR(cr))
1102     {
1103         gmx_bcast(sizeof(box), box, cr);
1104     }
1105
1106     if (inputrec->cutoff_scheme != ecutsVERLET)
1107     {
1108         gmx_fatal(FARGS,
1109                   "This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
1110                   "Verlet scheme, or use an earlier version of GROMACS if necessary.");
1111     }
1112     /* Update rlist and nstlist. */
1113     prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
1114                           useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
1115                           *hwinfo->cpuInfo);
1116
1117     const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded);
1118     // This builder is necessary while we have multi-part construction
1119     // of DD. Before DD is constructed, we use the existence of
1120     // the builder object to indicate that further construction of DD
1121     // is needed.
1122     std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
1123     if (useDomainDecomposition)
1124     {
1125         ddBuilder = std::make_unique<DomainDecompositionBuilder>(
1126                 mdlog, cr, domdecOptions, mdrunOptions, prefer1DAnd1PulseDD, mtop, *inputrec, box,
1127                 positionsFromStatePointer(globalState.get()));
1128     }
1129     else
1130     {
1131         /* PME, if used, is done on all nodes with 1D decomposition */
1132         cr->npmenodes = 0;
1133         cr->duty      = (DUTY_PP | DUTY_PME);
1134
1135         if (inputrec->ePBC == epbcSCREW)
1136         {
1137             gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
1138         }
1139     }
1140
1141     // Produce the task assignment for this rank.
1142     GpuTaskAssignmentsBuilder gpuTaskAssignmentsBuilder;
1143     GpuTaskAssignments        gpuTaskAssignments = gpuTaskAssignmentsBuilder.build(
1144             gpuIdsToUse, userGpuTaskAssignment, *hwinfo, cr, ms, physicalNodeComm, nonbondedTarget,
1145             pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded, useGpuForPme,
1146             thisRankHasDuty(cr, DUTY_PP),
1147             // TODO cr->duty & DUTY_PME should imply that a PME
1148             // algorithm is active, but currently does not.
1149             EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
1150
1151     const bool printHostName = (cr->nnodes > 1);
1152     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode);
1153
1154     // If the user chose a task assignment, give them some hints
1155     // where appropriate.
1156     if (!userGpuTaskAssignment.empty())
1157     {
1158         gpuTaskAssignments.logPerformanceHints(mdlog, ssize(gpuIdsToUse));
1159     }
1160
1161     // Get the device handles for the modules, nullptr when no task is assigned.
1162     gmx_device_info_t* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
1163     gmx_device_info_t* pmeDeviceInfo       = gpuTaskAssignments.initPmeDevice();
1164
1165     // TODO Initialize GPU streams here.
1166
1167     // TODO Currently this is always built, yet DD partition code
1168     // checks if it is built before using it. Probably it should
1169     // become an MDModule that is made only when another module
1170     // requires it (e.g. pull, CompEl, density fitting), so that we
1171     // don't update the local atom sets unilaterally every step.
1172     LocalAtomSetManager atomSets;
1173     if (ddBuilder)
1174     {
1175         // TODO Pass the GPU streams to ddBuilder to use in buffer
1176         // transfers (e.g. halo exchange)
1177         cr->dd = ddBuilder->build(&atomSets);
1178         // The builder's job is done, so destruct it
1179         ddBuilder.reset(nullptr);
1180         // Note that local state still does not exist yet.
1181     }
1182
1183     if (PAR(cr))
1184     {
1185         /* After possible communicator splitting in make_dd_communicators.
1186          * we can set up the intra/inter node communication.
1187          */
1188         gmx_setup_nodecomm(fplog, cr);
1189     }
1190
1191 #if GMX_MPI
1192     if (isMultiSim(ms))
1193     {
1194         GMX_LOG(mdlog.warning)
1195                 .asParagraph()
1196                 .appendTextFormatted(
1197                         "This is simulation %d out of %d running as a composite GROMACS\n"
1198                         "multi-simulation job. Setup for this simulation:\n",
1199                         ms->sim, ms->nsim);
1200     }
1201     GMX_LOG(mdlog.warning)
1202             .appendTextFormatted("Using %d MPI %s\n", cr->nnodes,
1203 #    if GMX_THREAD_MPI
1204                                  cr->nnodes == 1 ? "thread" : "threads"
1205 #    else
1206                                  cr->nnodes == 1 ? "process" : "processes"
1207 #    endif
1208             );
1209     fflush(stderr);
1210 #endif
1211
1212     // If mdrun -pin auto honors any affinity setting that already
1213     // exists. If so, it is nice to provide feedback about whether
1214     // that existing affinity setting was from OpenMP or something
1215     // else, so we run this code both before and after we initialize
1216     // the OpenMP support.
1217     gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1218     /* Check and update the number of OpenMP threads requested */
1219     checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
1220                                             pmeRunMode, mtop, *inputrec);
1221
1222     gmx_omp_nthreads_init(mdlog, cr, hwinfo->nthreads_hw_avail, physicalNodeComm.size_,
1223                           hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
1224
1225     // Enable FP exception detection, but not in
1226     // Release mode and not for compilers with known buggy FP
1227     // exception support (clang with any optimization) or suspected
1228     // buggy FP exception support (gcc 7.* with optimization).
1229 #if !defined NDEBUG                                                                         \
1230         && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
1231              && defined __OPTIMIZE__)
1232     const bool bEnableFPE = true;
1233 #else
1234     const bool bEnableFPE = false;
1235 #endif
1236     // FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
1237     if (bEnableFPE)
1238     {
1239         gmx_feenableexcept();
1240     }
1241
1242     /* Now that we know the setup is consistent, check for efficiency */
1243     check_resource_division_efficiency(hwinfo, gpuTaskAssignments.thisRankHasAnyGpuTask(),
1244                                        mdrunOptions.ntompOptionIsSet, cr, mdlog);
1245
1246     /* getting number of PP/PME threads on this MPI / tMPI rank.
1247        PME: env variable should be read only on one node to make sure it is
1248        identical everywhere;
1249      */
1250     const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
1251                                                                   : gmx_omp_nthreads_get(emntPME);
1252     checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo->hardwareTopology,
1253                                   physicalNodeComm, mdlog);
1254
1255     // Enable Peer access between GPUs where available
1256     // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
1257     // any of the GPU communication features are active.
1258     if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
1259         && (devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm))
1260     {
1261         setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
1262     }
1263
1264     if (hw_opt.threadAffinity != ThreadAffinity::Off)
1265     {
1266         /* Before setting affinity, check whether the affinity has changed
1267          * - which indicates that probably the OpenMP library has changed it
1268          * since we first checked).
1269          */
1270         gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1271
1272         int numThreadsOnThisNode, intraNodeThreadOffset;
1273         analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
1274                                  &intraNodeThreadOffset);
1275
1276         /* Set the CPU affinity */
1277         gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, numThreadsOnThisRank,
1278                                 numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
1279     }
1280
1281     if (mdrunOptions.timingOptions.resetStep > -1)
1282     {
1283         GMX_LOG(mdlog.info)
1284                 .asParagraph()
1285                 .appendText(
1286                         "The -resetstep functionality is deprecated, and may be removed in a "
1287                         "future version.");
1288     }
1289     wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
1290
1291     if (PAR(cr))
1292     {
1293         /* Master synchronizes its value of reset_counters with all nodes
1294          * including PME only nodes */
1295         int64_t reset_counters = wcycle_get_reset_counters(wcycle);
1296         gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
1297         wcycle_set_reset_counters(wcycle, reset_counters);
1298     }
1299
1300     // Membrane embedding must be initialized before we call init_forcerec()
1301     if (doMembed)
1302     {
1303         if (MASTER(cr))
1304         {
1305             fprintf(stderr, "Initializing membed");
1306         }
1307         /* Note that membed cannot work in parallel because mtop is
1308          * changed here. Fix this if we ever want to make it run with
1309          * multiple ranks. */
1310         membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec,
1311                              globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
1312     }
1313
1314     const bool                   thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
1315     std::unique_ptr<MDAtoms>     mdAtoms;
1316     std::unique_ptr<gmx_vsite_t> vsite;
1317
1318     t_nrnb nrnb;
1319     if (thisRankHasDuty(cr, DUTY_PP))
1320     {
1321         mdModulesNotifier.notify(*cr);
1322         mdModulesNotifier.notify(&atomSets);
1323         mdModulesNotifier.notify(PeriodicBoundaryConditionType{ inputrec->ePBC });
1324         mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
1325         /* Initiate forcerecord */
1326         fr                 = new t_forcerec;
1327         fr->forceProviders = mdModules_->initForceProviders();
1328         init_forcerec(fplog, mdlog, fr, fcd, inputrec, &mtop, cr, box,
1329                       opt2fn("-table", filenames.size(), filenames.data()),
1330                       opt2fn("-tablep", filenames.size(), filenames.data()),
1331                       opt2fns("-tableb", filenames.size(), filenames.data()), *hwinfo,
1332                       nonbondedDeviceInfo, useGpuForBonded,
1333                       pmeRunMode == PmeRunMode::GPU && !thisRankHasDuty(cr, DUTY_PME), pforce, wcycle);
1334
1335         // TODO Move this to happen during domain decomposition setup,
1336         // once stream and event handling works well with that.
1337         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
1338         if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
1339         {
1340             GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
1341                                "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
1342             void* streamLocal =
1343                     Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
1344             void* streamNonLocal =
1345                     Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
1346             GMX_LOG(mdlog.warning)
1347                     .asParagraph()
1348                     .appendTextFormatted(
1349                             "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
1350                             "GMX_GPU_DD_COMMS environment variable.");
1351             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
1352                     cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
1353         }
1354
1355         /* Initialize the mdAtoms structure.
1356          * mdAtoms is not filled with atom data,
1357          * as this can not be done now with domain decomposition.
1358          */
1359         mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
1360         if (globalState && thisRankHasPmeGpuTask)
1361         {
1362             // The pinning of coordinates in the global state object works, because we only use
1363             // PME on GPU without DD or on a separate PME rank, and because the local state pointer
1364             // points to the global state object without DD.
1365             // FIXME: MD and EM separately set up the local state - this should happen in the same
1366             // function, which should also perform the pinning.
1367             changePinningPolicy(&globalState->x, pme_get_pinning_policy());
1368         }
1369
1370         /* Initialize the virtual site communication */
1371         vsite = initVsite(mtop, cr);
1372
1373         calc_shifts(box, fr->shift_vec);
1374
1375         /* With periodic molecules the charge groups should be whole at start up
1376          * and the virtual sites should not be far from their proper positions.
1377          */
1378         if (!inputrec->bContinuation && MASTER(cr) && !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1379         {
1380             /* Make molecules whole at start of run */
1381             if (fr->ePBC != epbcNONE)
1382             {
1383                 do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
1384             }
1385             if (vsite)
1386             {
1387                 /* Correct initial vsite positions are required
1388                  * for the initial distribution in the domain decomposition
1389                  * and for the initial shell prediction.
1390                  */
1391                 constructVsitesGlobal(mtop, globalState->x);
1392             }
1393         }
1394
1395         if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
1396         {
1397             ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
1398             ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
1399         }
1400     }
1401     else
1402     {
1403         /* This is a PME only node */
1404
1405         GMX_ASSERT(globalState == nullptr,
1406                    "We don't need the state on a PME only rank and expect it to be unitialized");
1407
1408         ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
1409         ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
1410     }
1411
1412     gmx_pme_t* sepPmeData = nullptr;
1413     // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
1414     GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr),
1415                "Double-checking that only PME-only ranks have no forcerec");
1416     gmx_pme_t*& pmedata = fr ? fr->pmedata : sepPmeData;
1417
1418     // TODO should live in ewald module once its testing is improved
1419     //
1420     // Later, this program could contain kernels that might be later
1421     // re-used as auto-tuning progresses, or subsequent simulations
1422     // are invoked.
1423     PmeGpuProgramStorage pmeGpuProgram;
1424     if (thisRankHasPmeGpuTask)
1425     {
1426         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
1427     }
1428
1429     /* Initiate PME if necessary,
1430      * either on all nodes or on dedicated PME nodes only. */
1431     if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
1432     {
1433         if (mdAtoms && mdAtoms->mdatoms())
1434         {
1435             nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
1436             if (EVDW_PME(inputrec->vdwtype))
1437             {
1438                 nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed;
1439             }
1440         }
1441         if (cr->npmenodes > 0)
1442         {
1443             /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
1444             gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
1445             gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
1446         }
1447
1448         if (thisRankHasDuty(cr, DUTY_PME))
1449         {
1450             try
1451             {
1452                 pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
1453                                        nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
1454                                        ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
1455                                        nullptr, pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
1456             }
1457             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1458         }
1459     }
1460
1461
1462     if (EI_DYNAMICS(inputrec->eI))
1463     {
1464         /* Turn on signal handling on all nodes */
1465         /*
1466          * (A user signal from the PME nodes (if any)
1467          * is communicated to the PP nodes.
1468          */
1469         signal_handler_install();
1470     }
1471
1472     pull_t* pull_work = nullptr;
1473     if (thisRankHasDuty(cr, DUTY_PP))
1474     {
1475         /* Assumes uniform use of the number of OpenMP threads */
1476         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
1477
1478         if (inputrec->bPull)
1479         {
1480             /* Initialize pull code */
1481             pull_work = init_pull(fplog, inputrec->pull, inputrec, &mtop, cr, &atomSets,
1482                                   inputrec->fepvals->init_lambda);
1483             if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
1484             {
1485                 initPullHistory(pull_work, &observablesHistory);
1486             }
1487             if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
1488             {
1489                 init_pull_output_files(pull_work, filenames.size(), filenames.data(), oenv, startingBehavior);
1490             }
1491         }
1492
1493         std::unique_ptr<EnforcedRotation> enforcedRotation;
1494         if (inputrec->bRot)
1495         {
1496             /* Initialize enforced rotation code */
1497             enforcedRotation =
1498                     init_rot(fplog, inputrec, filenames.size(), filenames.data(), cr, &atomSets,
1499                              globalState.get(), &mtop, oenv, mdrunOptions, startingBehavior);
1500         }
1501
1502         t_swap* swap = nullptr;
1503         if (inputrec->eSwapCoords != eswapNO)
1504         {
1505             /* Initialize ion swapping code */
1506             swap = init_swapcoords(fplog, inputrec,
1507                                    opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
1508                                    &mtop, globalState.get(), &observablesHistory, cr, &atomSets,
1509                                    oenv, mdrunOptions, startingBehavior);
1510         }
1511
1512         /* Let makeConstraints know whether we have essential dynamics constraints. */
1513         auto constr = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics, fplog,
1514                                       *mdAtoms->mdatoms(), cr, ms, &nrnb, wcycle, fr->bMolPBC);
1515
1516         /* Energy terms and groups */
1517         gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
1518                              inputrec->fepvals->n_lambda);
1519
1520         /* Kinetic energy data */
1521         gmx_ekindata_t ekind;
1522         init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
1523
1524         /* Set up interactive MD (IMD) */
1525         auto imdSession =
1526                 makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
1527                                MASTER(cr) ? globalState->x.rvec_array() : nullptr, filenames.size(),
1528                                filenames.data(), oenv, mdrunOptions.imdOptions, startingBehavior);
1529
1530         if (DOMAINDECOMP(cr))
1531         {
1532             GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
1533             /* This call is not included in init_domain_decomposition mainly
1534              * because fr->cginfo_mb is set later.
1535              */
1536             dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
1537                             domdecOptions.checkBondedInteractions, fr->cginfo_mb);
1538         }
1539
1540         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
1541                 false, inputrec, doRerun, vsite.get(), ms, replExParams, fcd,
1542                 static_cast<int>(filenames.size()), filenames.data(), &observablesHistory, membed);
1543
1544         const bool useModularSimulator = inputIsCompatibleWithModularSimulator
1545                                          && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
1546
1547         // TODO This is not the right place to manage the lifetime of
1548         // this data structure, but currently it's the easiest way to
1549         // make it work.
1550         MdrunScheduleWorkload runScheduleWork;
1551         // Also populates the simulation constant workload description.
1552         runScheduleWork.simulationWork = createSimulationWorkload(
1553                 useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate,
1554                 devFlags.enableGpuBufferOps, devFlags.enableGpuHaloExchange,
1555                 devFlags.enableGpuPmePPComm, haveEwaldSurfaceContribution(*inputrec));
1556
1557         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
1558         if (gpusWereDetected
1559             && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
1560                 || runScheduleWork.simulationWork.useGpuBufferOps))
1561         {
1562             const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
1563             const void* localStream =
1564                     fr->nbv->gpu_nbv != nullptr
1565                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
1566                             : nullptr;
1567             const void* nonLocalStream =
1568                     fr->nbv->gpu_nbv != nullptr
1569                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
1570                             : nullptr;
1571             const void*        deviceContext = pme_gpu_get_device_context(fr->pmedata);
1572             const int          paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
1573             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
1574                                                       ? GpuApiCallBehavior::Async
1575                                                       : GpuApiCallBehavior::Sync;
1576
1577             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
1578                     pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize);
1579             fr->stateGpu = stateGpu.get();
1580         }
1581
1582         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
1583         SimulatorBuilder simulatorBuilder;
1584
1585         // build and run simulator object based on user-input
1586         auto simulator = simulatorBuilder.build(
1587                 inputIsCompatibleWithModularSimulator, fplog, cr, ms, mdlog,
1588                 static_cast<int>(filenames.size()), filenames.data(), oenv, mdrunOptions,
1589                 startingBehavior, vsite.get(), constr.get(),
1590                 enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, deform.get(),
1591                 mdModules_->outputProvider(), mdModules_->notifier(), inputrec, imdSession.get(),
1592                 pull_work, swap, &mtop, fcd, globalState.get(), &observablesHistory, mdAtoms.get(),
1593                 &nrnb, wcycle, fr, &enerd, &ekind, &runScheduleWork, replExParams, membed,
1594                 walltime_accounting, std::move(stopHandlerBuilder_), doRerun);
1595         simulator->run();
1596
1597         if (fr->pmePpCommGpu)
1598         {
1599             // destroy object since it is no longer required. (This needs to be done while the GPU context still exists.)
1600             fr->pmePpCommGpu.reset();
1601         }
1602
1603         if (inputrec->bPull)
1604         {
1605             finish_pull(pull_work);
1606         }
1607         finish_swapcoords(swap);
1608     }
1609     else
1610     {
1611         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
1612         /* do PME only */
1613         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
1614         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
1615     }
1616
1617     wallcycle_stop(wcycle, ewcRUN);
1618
1619     /* Finish up, write some stuff
1620      * if rerunMD, don't write last frame again
1621      */
1622     finish_run(fplog, mdlog, cr, inputrec, &nrnb, wcycle, walltime_accounting,
1623                fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
1624
1625     // clean up cycle counter
1626     wallcycle_destroy(wcycle);
1627
1628     // Free PME data
1629     if (pmedata)
1630     {
1631         gmx_pme_destroy(pmedata);
1632         pmedata = nullptr;
1633     }
1634
1635     // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
1636     // before we destroy the GPU context(s) in free_gpu_resources().
1637     // Pinned buffers are associated with contexts in CUDA.
1638     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
1639     mdAtoms.reset(nullptr);
1640     globalState.reset(nullptr);
1641     mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
1642
1643     /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
1644     free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
1645     free_gpu(nonbondedDeviceInfo);
1646     free_gpu(pmeDeviceInfo);
1647     done_forcerec(fr, mtop.molblock.size());
1648     sfree(fcd);
1649
1650     if (doMembed)
1651     {
1652         free_membed(membed);
1653     }
1654
1655     /* Does what it says */
1656     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
1657     walltime_accounting_destroy(walltime_accounting);
1658
1659     // Ensure log file content is written
1660     if (logFileHandle)
1661     {
1662         gmx_fio_flush(logFileHandle);
1663     }
1664
1665     /* Reset FPEs (important for unit tests) by disabling them. Assumes no
1666      * exceptions were enabled before function was called. */
1667     if (bEnableFPE)
1668     {
1669         gmx_fedisableexcept();
1670     }
1671
1672     auto rc = static_cast<int>(gmx_get_stop_condition());
1673
1674 #if GMX_THREAD_MPI
1675     /* we need to join all threads. The sub-threads join when they
1676        exit this function, but the master thread needs to be told to
1677        wait for that. */
1678     if (PAR(cr) && MASTER(cr))
1679     {
1680         tMPI_Finalize();
1681     }
1682 #endif
1683     return rc;
1684 }
1685
1686 Mdrunner::~Mdrunner()
1687 {
1688     // Clean up of the Manager.
1689     // This will end up getting called on every thread-MPI rank, which is unnecessary,
1690     // but okay as long as threads synchronize some time before adding or accessing
1691     // a new set of restraints.
1692     if (restraintManager_)
1693     {
1694         restraintManager_->clear();
1695         GMX_ASSERT(restraintManager_->countRestraints() == 0,
1696                    "restraints added during runner life time should be cleared at runner "
1697                    "destruction.");
1698     }
1699 };
1700
1701 void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller, const std::string& name)
1702 {
1703     GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
1704     // Not sure if this should be logged through the md logger or something else,
1705     // but it is helpful to have some sort of INFO level message sent somewhere.
1706     //    std::cout << "Registering restraint named " << name << std::endl;
1707
1708     // When multiple restraints are used, it may be wasteful to register them separately.
1709     // Maybe instead register an entire Restraint Manager as a force provider.
1710     restraintManager_->addToSpec(std::move(puller), name);
1711 }
1712
1713 Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules) : mdModules_(std::move(mdModules)) {}
1714
1715 Mdrunner::Mdrunner(Mdrunner&&) noexcept = default;
1716
1717 //NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
1718 Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
1719
1720 class Mdrunner::BuilderImplementation
1721 {
1722 public:
1723     BuilderImplementation() = delete;
1724     BuilderImplementation(std::unique_ptr<MDModules> mdModules, compat::not_null<SimulationContext*> context);
1725     ~BuilderImplementation();
1726
1727     BuilderImplementation& setExtraMdrunOptions(const MdrunOptions& options,
1728                                                 real                forceWarningThreshold,
1729                                                 StartingBehavior    startingBehavior);
1730
1731     void addDomdec(const DomdecOptions& options);
1732
1733     void addVerletList(int nstlist);
1734
1735     void addReplicaExchange(const ReplicaExchangeParameters& params);
1736
1737     void addNonBonded(const char* nbpu_opt);
1738
1739     void addPME(const char* pme_opt_, const char* pme_fft_opt_);
1740
1741     void addBondedTaskAssignment(const char* bonded_opt);
1742
1743     void addUpdateTaskAssignment(const char* update_opt);
1744
1745     void addHardwareOptions(const gmx_hw_opt_t& hardwareOptions);
1746
1747     void addFilenames(ArrayRef<const t_filenm> filenames);
1748
1749     void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
1750
1751     void addLogFile(t_fileio* logFileHandle);
1752
1753     void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
1754
1755     Mdrunner build();
1756
1757 private:
1758     // Default parameters copied from runner.h
1759     // \todo Clarify source(s) of default parameters.
1760
1761     const char* nbpu_opt_    = nullptr;
1762     const char* pme_opt_     = nullptr;
1763     const char* pme_fft_opt_ = nullptr;
1764     const char* bonded_opt_  = nullptr;
1765     const char* update_opt_  = nullptr;
1766
1767     MdrunOptions mdrunOptions_;
1768
1769     DomdecOptions domdecOptions_;
1770
1771     ReplicaExchangeParameters replicaExchangeParameters_;
1772
1773     //! Command-line override for the duration of a neighbor list with the Verlet scheme.
1774     int nstlist_ = 0;
1775
1776     //! Multisim communicator handle.
1777     gmx_multisim_t* multiSimulation_;
1778
1779     //! mdrun communicator
1780     MPI_Comm communicator_ = MPI_COMM_NULL;
1781
1782     //! Print a warning if any force is larger than this (in kJ/mol nm).
1783     real forceWarningThreshold_ = -1;
1784
1785     //! Whether the simulation will start afresh, or restart with/without appending.
1786     StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
1787
1788     //! The modules that comprise the functionality of mdrun.
1789     std::unique_ptr<MDModules> mdModules_;
1790
1791     //! \brief Parallelism information.
1792     gmx_hw_opt_t hardwareOptions_;
1793
1794     //! filename options for simulation.
1795     ArrayRef<const t_filenm> filenames_;
1796
1797     /*! \brief Handle to output environment.
1798      *
1799      * \todo gmx_output_env_t needs lifetime management.
1800      */
1801     gmx_output_env_t* outputEnvironment_ = nullptr;
1802
1803     /*! \brief Non-owning handle to MD log file.
1804      *
1805      * \todo Context should own output facilities for client.
1806      * \todo Improve log file handle management.
1807      * \internal
1808      * Code managing the FILE* relies on the ability to set it to
1809      * nullptr to check whether the filehandle is valid.
1810      */
1811     t_fileio* logFileHandle_ = nullptr;
1812
1813     /*!
1814      * \brief Builder for simulation stop signal handler.
1815      */
1816     std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
1817 };
1818
1819 Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
1820                                                        compat::not_null<SimulationContext*> context) :
1821     mdModules_(std::move(mdModules))
1822 {
1823     communicator_    = context->communicator_;
1824     multiSimulation_ = context->multiSimulation_.get();
1825 }
1826
1827 Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
1828
1829 Mdrunner::BuilderImplementation&
1830 Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions&    options,
1831                                                       const real             forceWarningThreshold,
1832                                                       const StartingBehavior startingBehavior)
1833 {
1834     mdrunOptions_          = options;
1835     forceWarningThreshold_ = forceWarningThreshold;
1836     startingBehavior_      = startingBehavior;
1837     return *this;
1838 }
1839
1840 void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions& options)
1841 {
1842     domdecOptions_ = options;
1843 }
1844
1845 void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
1846 {
1847     nstlist_ = nstlist;
1848 }
1849
1850 void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters& params)
1851 {
1852     replicaExchangeParameters_ = params;
1853 }
1854
1855 Mdrunner Mdrunner::BuilderImplementation::build()
1856 {
1857     auto newRunner = Mdrunner(std::move(mdModules_));
1858
1859     newRunner.mdrunOptions     = mdrunOptions_;
1860     newRunner.pforce           = forceWarningThreshold_;
1861     newRunner.startingBehavior = startingBehavior_;
1862     newRunner.domdecOptions    = domdecOptions_;
1863
1864     // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
1865     newRunner.hw_opt = hardwareOptions_;
1866
1867     // No invariant to check. This parameter exists to optionally override other behavior.
1868     newRunner.nstlist_cmdline = nstlist_;
1869
1870     newRunner.replExParams = replicaExchangeParameters_;
1871
1872     newRunner.filenames = filenames_;
1873
1874     newRunner.communicator = communicator_;
1875
1876     // nullptr is a valid value for the multisim handle
1877     newRunner.ms = multiSimulation_;
1878
1879     // \todo Clarify ownership and lifetime management for gmx_output_env_t
1880     // \todo Update sanity checking when output environment has clearly specified invariants.
1881     // Initialization and default values for oenv are not well specified in the current version.
1882     if (outputEnvironment_)
1883     {
1884         newRunner.oenv = outputEnvironment_;
1885     }
1886     else
1887     {
1888         GMX_THROW(gmx::APIError(
1889                 "MdrunnerBuilder::addOutputEnvironment() is required before build()"));
1890     }
1891
1892     newRunner.logFileHandle = logFileHandle_;
1893
1894     if (nbpu_opt_)
1895     {
1896         newRunner.nbpu_opt = nbpu_opt_;
1897     }
1898     else
1899     {
1900         GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
1901     }
1902
1903     if (pme_opt_ && pme_fft_opt_)
1904     {
1905         newRunner.pme_opt     = pme_opt_;
1906         newRunner.pme_fft_opt = pme_fft_opt_;
1907     }
1908     else
1909     {
1910         GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
1911     }
1912
1913     if (bonded_opt_)
1914     {
1915         newRunner.bonded_opt = bonded_opt_;
1916     }
1917     else
1918     {
1919         GMX_THROW(gmx::APIError(
1920                 "MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
1921     }
1922
1923     if (update_opt_)
1924     {
1925         newRunner.update_opt = update_opt_;
1926     }
1927     else
1928     {
1929         GMX_THROW(gmx::APIError(
1930                 "MdrunnerBuilder::addUpdateTaskAssignment() is required before build()  "));
1931     }
1932
1933
1934     newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
1935
1936     if (stopHandlerBuilder_)
1937     {
1938         newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
1939     }
1940     else
1941     {
1942         newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
1943     }
1944
1945     return newRunner;
1946 }
1947
1948 void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
1949 {
1950     nbpu_opt_ = nbpu_opt;
1951 }
1952
1953 void Mdrunner::BuilderImplementation::addPME(const char* pme_opt, const char* pme_fft_opt)
1954 {
1955     pme_opt_     = pme_opt;
1956     pme_fft_opt_ = pme_fft_opt;
1957 }
1958
1959 void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
1960 {
1961     bonded_opt_ = bonded_opt;
1962 }
1963
1964 void Mdrunner::BuilderImplementation::addUpdateTaskAssignment(const char* update_opt)
1965 {
1966     update_opt_ = update_opt;
1967 }
1968
1969 void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
1970 {
1971     hardwareOptions_ = hardwareOptions;
1972 }
1973
1974 void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
1975 {
1976     filenames_ = filenames;
1977 }
1978
1979 void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
1980 {
1981     outputEnvironment_ = outputEnvironment;
1982 }
1983
1984 void Mdrunner::BuilderImplementation::addLogFile(t_fileio* logFileHandle)
1985 {
1986     logFileHandle_ = logFileHandle;
1987 }
1988
1989 void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
1990 {
1991     stopHandlerBuilder_ = std::move(builder);
1992 }
1993
1994 MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
1995                                  compat::not_null<SimulationContext*> context) :
1996     impl_{ std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context) }
1997 {
1998 }
1999
2000 MdrunnerBuilder::~MdrunnerBuilder() = default;
2001
2002 MdrunnerBuilder& MdrunnerBuilder::addSimulationMethod(const MdrunOptions&    options,
2003                                                       real                   forceWarningThreshold,
2004                                                       const StartingBehavior startingBehavior)
2005 {
2006     impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
2007     return *this;
2008 }
2009
2010 MdrunnerBuilder& MdrunnerBuilder::addDomainDecomposition(const DomdecOptions& options)
2011 {
2012     impl_->addDomdec(options);
2013     return *this;
2014 }
2015
2016 MdrunnerBuilder& MdrunnerBuilder::addNeighborList(int nstlist)
2017 {
2018     impl_->addVerletList(nstlist);
2019     return *this;
2020 }
2021
2022 MdrunnerBuilder& MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters& params)
2023 {
2024     impl_->addReplicaExchange(params);
2025     return *this;
2026 }
2027
2028 MdrunnerBuilder& MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
2029 {
2030     impl_->addNonBonded(nbpu_opt);
2031     return *this;
2032 }
2033
2034 MdrunnerBuilder& MdrunnerBuilder::addElectrostatics(const char* pme_opt, const char* pme_fft_opt)
2035 {
2036     // The builder method may become more general in the future, but in this version,
2037     // parameters for PME electrostatics are both required and the only parameters
2038     // available.
2039     if (pme_opt && pme_fft_opt)
2040     {
2041         impl_->addPME(pme_opt, pme_fft_opt);
2042     }
2043     else
2044     {
2045         GMX_THROW(
2046                 gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
2047     }
2048     return *this;
2049 }
2050
2051 MdrunnerBuilder& MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
2052 {
2053     impl_->addBondedTaskAssignment(bonded_opt);
2054     return *this;
2055 }
2056
2057 MdrunnerBuilder& MdrunnerBuilder::addUpdateTaskAssignment(const char* update_opt)
2058 {
2059     impl_->addUpdateTaskAssignment(update_opt);
2060     return *this;
2061 }
2062
2063 Mdrunner MdrunnerBuilder::build()
2064 {
2065     return impl_->build();
2066 }
2067
2068 MdrunnerBuilder& MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
2069 {
2070     impl_->addHardwareOptions(hardwareOptions);
2071     return *this;
2072 }
2073
2074 MdrunnerBuilder& MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
2075 {
2076     impl_->addFilenames(filenames);
2077     return *this;
2078 }
2079
2080 MdrunnerBuilder& MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
2081 {
2082     impl_->addOutputEnvironment(outputEnvironment);
2083     return *this;
2084 }
2085
2086 MdrunnerBuilder& MdrunnerBuilder::addLogFile(t_fileio* logFileHandle)
2087 {
2088     impl_->addLogFile(logFileHandle);
2089     return *this;
2090 }
2091
2092 MdrunnerBuilder& MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
2093 {
2094     impl_->addStopHandlerBuilder(std::move(builder));
2095     return *this;
2096 }
2097
2098 MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder&&) noexcept = default;
2099
2100 MdrunnerBuilder& MdrunnerBuilder::operator=(MdrunnerBuilder&&) noexcept = default;
2101
2102 } // namespace gmx