src/gromacs/mdrun/runner.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2011-2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief Implements the MD runner routine calling all integrators.
  40  *
  41  * \author David van der Spoel <david.vanderspoel@icm.uu.se>
  42  * \ingroup module_mdrun
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "runner.h"
  47
  48 #include "config.h"
  49
  50 #include <cassert>
  51 #include <cinttypes>
  52 #include <csignal>
  53 #include <cstdlib>
  54 #include <cstring>
  55
  56 #include <algorithm>
  57 #include <memory>
  58
  59 #include "gromacs/commandline/filenm.h"
  60 #include "gromacs/domdec/builder.h"
  61 #include "gromacs/domdec/domdec.h"
  62 #include "gromacs/domdec/domdec_struct.h"
  63 #include "gromacs/domdec/gpuhaloexchange.h"
  64 #include "gromacs/domdec/localatomsetmanager.h"
  65 #include "gromacs/domdec/partition.h"
  66 #include "gromacs/ewald/ewald_utils.h"
  67 #include "gromacs/ewald/pme.h"
  68 #include "gromacs/ewald/pme_gpu_program.h"
  69 #include "gromacs/ewald/pme_pp_comm_gpu.h"
  70 #include "gromacs/fileio/checkpoint.h"
  71 #include "gromacs/fileio/gmxfio.h"
  72 #include "gromacs/fileio/oenv.h"
  73 #include "gromacs/fileio/tpxio.h"
  74 #include "gromacs/gmxlib/network.h"
  75 #include "gromacs/gmxlib/nrnb.h"
  76 #include "gromacs/gpu_utils/gpu_utils.h"
  77 #include "gromacs/hardware/cpuinfo.h"
  78 #include "gromacs/hardware/detecthardware.h"
  79 #include "gromacs/hardware/printhardware.h"
  80 #include "gromacs/imd/imd.h"
  81 #include "gromacs/listed_forces/disre.h"
  82 #include "gromacs/listed_forces/gpubonded.h"
  83 #include "gromacs/listed_forces/orires.h"
  84 #include "gromacs/math/functions.h"
  85 #include "gromacs/math/utilities.h"
  86 #include "gromacs/math/vec.h"
  87 #include "gromacs/mdlib/boxdeformation.h"
  88 #include "gromacs/mdlib/broadcaststructs.h"
  89 #include "gromacs/mdlib/calc_verletbuf.h"
  90 #include "gromacs/mdlib/dispersioncorrection.h"
  91 #include "gromacs/mdlib/enerdata_utils.h"
  92 #include "gromacs/mdlib/force.h"
  93 #include "gromacs/mdlib/forcerec.h"
  94 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  95 #include "gromacs/mdlib/makeconstraints.h"
  96 #include "gromacs/mdlib/md_support.h"
  97 #include "gromacs/mdlib/mdatoms.h"
  98 #include "gromacs/mdlib/membed.h"
  99 #include "gromacs/mdlib/qmmm.h"
 100 #include "gromacs/mdlib/sighandler.h"
 101 #include "gromacs/mdlib/stophandler.h"
 102 #include "gromacs/mdrun/mdmodules.h"
 103 #include "gromacs/mdrun/simulationcontext.h"
 104 #include "gromacs/mdrunutility/handlerestart.h"
 105 #include "gromacs/mdrunutility/logging.h"
 106 #include "gromacs/mdrunutility/multisim.h"
 107 #include "gromacs/mdrunutility/printtime.h"
 108 #include "gromacs/mdrunutility/threadaffinity.h"
 109 #include "gromacs/mdtypes/commrec.h"
 110 #include "gromacs/mdtypes/enerdata.h"
 111 #include "gromacs/mdtypes/fcdata.h"
 112 #include "gromacs/mdtypes/group.h"
 113 #include "gromacs/mdtypes/inputrec.h"
 114 #include "gromacs/mdtypes/md_enums.h"
 115 #include "gromacs/mdtypes/mdrunoptions.h"
 116 #include "gromacs/mdtypes/observableshistory.h"
 117 #include "gromacs/mdtypes/simulation_workload.h"
 118 #include "gromacs/mdtypes/state.h"
 119 #include "gromacs/mdtypes/state_propagator_data_gpu.h"
 120 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 121 #include "gromacs/nbnxm/nbnxm.h"
 122 #include "gromacs/nbnxm/pairlist_tuning.h"
 123 #include "gromacs/pbcutil/pbc.h"
 124 #include "gromacs/pulling/output.h"
 125 #include "gromacs/pulling/pull.h"
 126 #include "gromacs/pulling/pull_rotation.h"
 127 #include "gromacs/restraint/manager.h"
 128 #include "gromacs/restraint/restraintmdmodule.h"
 129 #include "gromacs/restraint/restraintpotential.h"
 130 #include "gromacs/swap/swapcoords.h"
 131 #include "gromacs/taskassignment/decidegpuusage.h"
 132 #include "gromacs/taskassignment/decidesimulationworkload.h"
 133 #include "gromacs/taskassignment/resourcedivision.h"
 134 #include "gromacs/taskassignment/taskassignment.h"
 135 #include "gromacs/taskassignment/usergpuids.h"
 136 #include "gromacs/timing/gpu_timing.h"
 137 #include "gromacs/timing/wallcycle.h"
 138 #include "gromacs/timing/wallcyclereporting.h"
 139 #include "gromacs/topology/mtop_util.h"
 140 #include "gromacs/trajectory/trajectoryframe.h"
 141 #include "gromacs/utility/basenetwork.h"
 142 #include "gromacs/utility/cstringutil.h"
 143 #include "gromacs/utility/exceptions.h"
 144 #include "gromacs/utility/fatalerror.h"
 145 #include "gromacs/utility/filestream.h"
 146 #include "gromacs/utility/gmxassert.h"
 147 #include "gromacs/utility/gmxmpi.h"
 148 #include "gromacs/utility/keyvaluetree.h"
 149 #include "gromacs/utility/logger.h"
 150 #include "gromacs/utility/loggerbuilder.h"
 151 #include "gromacs/utility/mdmodulenotification.h"
 152 #include "gromacs/utility/physicalnodecommunicator.h"
 153 #include "gromacs/utility/pleasecite.h"
 154 #include "gromacs/utility/programcontext.h"
 155 #include "gromacs/utility/smalloc.h"
 156 #include "gromacs/utility/stringutil.h"
 157
 158 #include "isimulator.h"
 159 #include "replicaexchange.h"
 160 #include "simulatorbuilder.h"
 161
 162 #if GMX_FAHCORE
 163 #    include "corewrap.h"
 164 #endif
 165
 166 namespace gmx
 167 {
 168
 169 /*! \brief Structure that holds boolean flags corresponding to the development
 170  *        features present enabled through environment variables.
 171  *
 172  */
 173 struct DevelopmentFeatureFlags
 174 {
 175     //! True if the Buffer ops development feature is enabled
 176     // TODO: when the trigger of the buffer ops offload is fully automated this should go away
 177     bool enableGpuBufferOps = false;
 178     //! If true, forces 'mdrun -update auto' default to 'gpu'
 179     bool forceGpuUpdateDefaultOn = false;
 180     //! True if the GPU halo exchange development feature is enabled
 181     bool enableGpuHaloExchange = false;
 182     //! True if the PME PP direct communication GPU development feature is enabled
 183     bool enableGpuPmePPComm = false;
 184 };
 185
 186 /*! \brief Manage any development feature flag variables encountered
 187  *
 188  * The use of dev features indicated by environment variables is
 189  * logged in order to ensure that runs with such features enabled can
 190  * be identified from their log and standard output. Any cross
 191  * dependencies are also checked, and if unsatisfied, a fatal error
 192  * issued.
 193  *
 194  * Note that some development features overrides are applied already here:
 195  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
 196  *
 197  * \param[in]  mdlog                Logger object.
 198  * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
 199  * \param[in]  useGpuForPme         True if the PME task is offloaded in this run.
 200  * \returns                         The object populated with development feature flags.
 201  */
 202 static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
 203                                                          const bool           useGpuForNonbonded,
 204                                                          const bool           useGpuForPme)
 205 {
 206     DevelopmentFeatureFlags devFlags;
 207
 208     // Some builds of GCC 5 give false positive warnings that these
 209     // getenv results are ignored when clearly they are used.
 210 #pragma GCC diagnostic push
 211 #pragma GCC diagnostic ignored "-Wunused-result"
 212     devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
 213                                   && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
 214     devFlags.forceGpuUpdateDefaultOn = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr);
 215     devFlags.enableGpuHaloExchange =
 216             (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 217     devFlags.enableGpuPmePPComm =
 218             (getenv("GMX_GPU_PME_PP_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 219 #pragma GCC diagnostic pop
 220
 221     if (devFlags.enableGpuBufferOps)
 222     {
 223         GMX_LOG(mdlog.warning)
 224                 .asParagraph()
 225                 .appendTextFormatted(
 226                         "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the "
 227                         "GMX_USE_GPU_BUFFER_OPS environment variable.");
 228     }
 229
 230     if (devFlags.enableGpuHaloExchange)
 231     {
 232         if (useGpuForNonbonded)
 233         {
 234             if (!devFlags.enableGpuBufferOps)
 235             {
 236                 gmx_fatal(FARGS,
 237                           "Cannot enable GPU halo exchange without GPU buffer operations, set "
 238                           "GMX_USE_GPU_BUFFER_OPS=1\n");
 239             }
 240             GMX_LOG(mdlog.warning)
 241                     .asParagraph()
 242                     .appendTextFormatted(
 243                             "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
 244                             "GMX_GPU_DD_COMMS environment variable.");
 245         }
 246         else
 247         {
 248             GMX_LOG(mdlog.warning)
 249                     .asParagraph()
 250                     .appendTextFormatted(
 251                             "NOTE: GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
 252                             "halo exchange' feature will not be enabled as nonbonded interactions "
 253                             "are not offloaded.");
 254             devFlags.enableGpuHaloExchange = false;
 255         }
 256     }
 257
 258     if (devFlags.forceGpuUpdateDefaultOn)
 259     {
 260         GMX_LOG(mdlog.warning)
 261                 .asParagraph()
 262                 .appendTextFormatted(
 263                         "NOTE: This run will default to '-update gpu' as requested by the "
 264                         "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable.");
 265     }
 266
 267     if (devFlags.enableGpuPmePPComm)
 268     {
 269         if (useGpuForPme)
 270         {
 271             GMX_LOG(mdlog.warning)
 272                     .asParagraph()
 273                     .appendTextFormatted(
 274                             "NOTE: This run uses the 'GPU PME-PP communications' feature, enabled "
 275                             "by the GMX_GPU_PME_PP_COMMS environment variable.");
 276         }
 277         else
 278         {
 279             GMX_LOG(mdlog.warning)
 280                     .asParagraph()
 281                     .appendTextFormatted(
 282                             "NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
 283                             "'GPU PME-PP communications' feature was not enabled as PME is not "
 284                             "offloaded to the GPU.");
 285             devFlags.enableGpuPmePPComm = false;
 286         }
 287     }
 288
 289     return devFlags;
 290 }
 291
 292 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
 293  *
 294  * Used to ensure that the master thread does not modify mdrunner during copy
 295  * on the spawned threads. */
 296 static void threadMpiMdrunnerAccessBarrier()
 297 {
 298 #if GMX_THREAD_MPI
 299     MPI_Barrier(MPI_COMM_WORLD);
 300 #endif
 301 }
 302
 303 Mdrunner Mdrunner::cloneOnSpawnedThread() const
 304 {
 305     auto newRunner = Mdrunner(std::make_unique<MDModules>());
 306
 307     // All runners in the same process share a restraint manager resource because it is
 308     // part of the interface to the client code, which is associated only with the
 309     // original thread. Handles to the same resources can be obtained by copy.
 310     {
 311         newRunner.restraintManager_ = std::make_unique<RestraintManager>(*restraintManager_);
 312     }
 313
 314     // Copy members of master runner.
 315     // \todo Replace with builder when Simulation context and/or runner phases are better defined.
 316     // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
 317     newRunner.hw_opt    = hw_opt;
 318     newRunner.filenames = filenames;
 319
 320     newRunner.oenv            = oenv;
 321     newRunner.mdrunOptions    = mdrunOptions;
 322     newRunner.domdecOptions   = domdecOptions;
 323     newRunner.nbpu_opt        = nbpu_opt;
 324     newRunner.pme_opt         = pme_opt;
 325     newRunner.pme_fft_opt     = pme_fft_opt;
 326     newRunner.bonded_opt      = bonded_opt;
 327     newRunner.update_opt      = update_opt;
 328     newRunner.nstlist_cmdline = nstlist_cmdline;
 329     newRunner.replExParams    = replExParams;
 330     newRunner.pforce          = pforce;
 331     // Give the spawned thread the newly created valid communicator
 332     // for the simulation.
 333     newRunner.communicator        = MPI_COMM_WORLD;
 334     newRunner.ms                  = ms;
 335     newRunner.startingBehavior    = startingBehavior;
 336     newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
 337
 338     threadMpiMdrunnerAccessBarrier();
 339
 340     return newRunner;
 341 }
 342
 343 /*! \brief The callback used for running on spawned threads.
 344  *
 345  * Obtains the pointer to the master mdrunner object from the one
 346  * argument permitted to the thread-launch API call, copies it to make
 347  * a new runner for this thread, reinitializes necessary data, and
 348  * proceeds to the simulation. */
 349 static void mdrunner_start_fn(const void* arg)
 350 {
 351     try
 352     {
 353         auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
 354         /* copy the arg list to make sure that it's thread-local. This
 355            doesn't copy pointed-to items, of course; fnm, cr and fplog
 356            are reset in the call below, all others should be const. */
 357         gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
 358         mdrunner.mdrunner();
 359     }
 360     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 361 }
 362
 363
 364 void Mdrunner::spawnThreads(int numThreadsToLaunch)
 365 {
 366 #if GMX_THREAD_MPI
 367     /* now spawn new threads that start mdrunner_start_fn(), while
 368        the main thread returns. Thread affinity is handled later. */
 369     if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn,
 370                      static_cast<const void*>(this))
 371         != TMPI_SUCCESS)
 372     {
 373         GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
 374     }
 375
 376     // Give the master thread the newly created valid communicator for
 377     // the simulation.
 378     communicator = MPI_COMM_WORLD;
 379     threadMpiMdrunnerAccessBarrier();
 380 #else
 381     GMX_UNUSED_VALUE(numThreadsToLaunch);
 382     GMX_UNUSED_VALUE(mdrunner_start_fn);
 383 #endif
 384 }
 385
 386 } // namespace gmx
 387
 388 /*! \brief Initialize variables for Verlet scheme simulation */
 389 static void prepare_verlet_scheme(FILE*               fplog,
 390                                   t_commrec*          cr,
 391                                   t_inputrec*         ir,
 392                                   int                 nstlist_cmdline,
 393                                   const gmx_mtop_t*   mtop,
 394                                   const matrix        box,
 395                                   bool                makeGpuPairList,
 396                                   const gmx::CpuInfo& cpuinfo)
 397 {
 398     /* For NVE simulations, we will retain the initial list buffer */
 399     if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
 400     {
 401         /* Update the Verlet buffer size for the current run setup */
 402
 403         /* Here we assume SIMD-enabled kernels are being used. But as currently
 404          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 405          * and 4x2 gives a larger buffer than 4x4, this is ok.
 406          */
 407         ListSetupType listType =
 408                 (makeGpuPairList ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
 409         VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
 410
 411         const real rlist_new =
 412                 calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
 413
 414         if (rlist_new != ir->rlist)
 415         {
 416             if (fplog != nullptr)
 417             {
 418                 fprintf(fplog,
 419                         "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 420                         ir->rlist, rlist_new, listSetup.cluster_size_i, listSetup.cluster_size_j);
 421             }
 422             ir->rlist = rlist_new;
 423         }
 424     }
 425
 426     if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
 427     {
 428         gmx_fatal(FARGS, "Can not set nstlist without %s",
 429                   !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
 430     }
 431
 432     if (EI_DYNAMICS(ir->eI))
 433     {
 434         /* Set or try nstlist values */
 435         increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
 436     }
 437 }
 438
 439 /*! \brief Override the nslist value in inputrec
 440  *
 441  * with value passed on the command line (if any)
 442  */
 443 static void override_nsteps_cmdline(const gmx::MDLogger& mdlog, int64_t nsteps_cmdline, t_inputrec* ir)
 444 {
 445     assert(ir);
 446
 447     /* override with anything else than the default -2 */
 448     if (nsteps_cmdline > -2)
 449     {
 450         char sbuf_steps[STEPSTRSIZE];
 451         char sbuf_msg[STRLEN];
 452
 453         ir->nsteps = nsteps_cmdline;
 454         if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
 455         {
 456             sprintf(sbuf_msg,
 457                     "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
 458                     gmx_step_str(nsteps_cmdline, sbuf_steps), fabs(nsteps_cmdline * ir->delta_t));
 459         }
 460         else
 461         {
 462             sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
 463                     gmx_step_str(nsteps_cmdline, sbuf_steps));
 464         }
 465
 466         GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg);
 467     }
 468     else if (nsteps_cmdline < -2)
 469     {
 470         gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %" PRId64, nsteps_cmdline);
 471     }
 472     /* Do nothing if nsteps_cmdline == -2 */
 473 }
 474
 475 namespace gmx
 476 {
 477
 478 /*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings.
 479  *
 480  * If not, and if a warning may be issued, logs a warning about
 481  * falling back to CPU code. With thread-MPI, only the first
 482  * call to this function should have \c issueWarning true. */
 483 static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger& mdlog, const t_inputrec& ir, bool issueWarning)
 484 {
 485     bool        gpuIsUseful = true;
 486     std::string warning;
 487
 488     if (ir.opts.ngener - ir.nwall > 1)
 489     {
 490         /* The GPU code does not support more than one energy group.
 491          * If the user requested GPUs explicitly, a fatal error is given later.
 492          */
 493         gpuIsUseful = false;
 494         warning =
 495                 "Multiple energy groups is not implemented for GPUs, falling back to the CPU. "
 496                 "For better performance, run on the GPU without energy groups and then do "
 497                 "gmx mdrun -rerun option on the trajectory with an energy group .tpr file.";
 498     }
 499
 500     if (EI_TPI(ir.eI))
 501     {
 502         gpuIsUseful = false;
 503         warning     = "TPI is not implemented for GPUs.";
 504     }
 505
 506     if (!gpuIsUseful && issueWarning)
 507     {
 508         GMX_LOG(mdlog.warning).asParagraph().appendText(warning);
 509     }
 510
 511     return gpuIsUseful;
 512 }
 513
 514 //! Initializes the logger for mdrun.
 515 static gmx::LoggerOwner buildLogger(FILE* fplog, const bool isSimulationMasterRank)
 516 {
 517     gmx::LoggerBuilder builder;
 518     if (fplog != nullptr)
 519     {
 520         builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog);
 521     }
 522     if (isSimulationMasterRank)
 523     {
 524         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
 525     }
 526     return builder.build();
 527 }
 528
 529 //! Make a TaskTarget from an mdrun argument string.
 530 static TaskTarget findTaskTarget(const char* optionString)
 531 {
 532     TaskTarget returnValue = TaskTarget::Auto;
 533
 534     if (strncmp(optionString, "auto", 3) == 0)
 535     {
 536         returnValue = TaskTarget::Auto;
 537     }
 538     else if (strncmp(optionString, "cpu", 3) == 0)
 539     {
 540         returnValue = TaskTarget::Cpu;
 541     }
 542     else if (strncmp(optionString, "gpu", 3) == 0)
 543     {
 544         returnValue = TaskTarget::Gpu;
 545     }
 546     else
 547     {
 548         GMX_ASSERT(false, "Option string should have been checked for sanity already");
 549     }
 550
 551     return returnValue;
 552 }
 553
 554 //! Finish run, aggregate data to print performance info.
 555 static void finish_run(FILE*                     fplog,
 556                        const gmx::MDLogger&      mdlog,
 557                        const t_commrec*          cr,
 558                        const t_inputrec*         inputrec,
 559                        t_nrnb                    nrnb[],
 560                        gmx_wallcycle_t           wcycle,
 561                        gmx_walltime_accounting_t walltime_accounting,
 562                        nonbonded_verlet_t*       nbv,
 563                        const gmx_pme_t*          pme,
 564                        gmx_bool                  bWriteStat)
 565 {
 566     double delta_t = 0;
 567     double nbfs = 0, mflop = 0;
 568     double elapsed_time, elapsed_time_over_all_ranks, elapsed_time_over_all_threads,
 569             elapsed_time_over_all_threads_over_all_ranks;
 570     /* Control whether it is valid to print a report. Only the
 571        simulation master may print, but it should not do so if the run
 572        terminated e.g. before a scheduled reset step. This is
 573        complicated by the fact that PME ranks are unaware of the
 574        reason why they were sent a pmerecvqxFINISH. To avoid
 575        communication deadlocks, we always do the communication for the
 576        report, even if we've decided not to write the report, because
 577        how long it takes to finish the run is not important when we've
 578        decided not to report on the simulation performance.
 579
 580        Further, we only report performance for dynamical integrators,
 581        because those are the only ones for which we plan to
 582        consider doing any optimizations. */
 583     bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
 584
 585     if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
 586     {
 587         GMX_LOG(mdlog.warning)
 588                 .asParagraph()
 589                 .appendText("Simulation ended prematurely, no performance report will be written.");
 590         printReport = false;
 591     }
 592
 593     t_nrnb*                 nrnb_tot;
 594     std::unique_ptr<t_nrnb> nrnbTotalStorage;
 595     if (cr->nnodes > 1)
 596     {
 597         nrnbTotalStorage = std::make_unique<t_nrnb>();
 598         nrnb_tot         = nrnbTotalStorage.get();
 599 #if GMX_MPI
 600         MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 601 #endif
 602     }
 603     else
 604     {
 605         nrnb_tot = nrnb;
 606     }
 607
 608     elapsed_time = walltime_accounting_get_time_since_reset(walltime_accounting);
 609     elapsed_time_over_all_threads =
 610             walltime_accounting_get_time_since_reset_over_all_threads(walltime_accounting);
 611     if (cr->nnodes > 1)
 612     {
 613 #if GMX_MPI
 614         /* reduce elapsed_time over all MPI ranks in the current simulation */
 615         MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM,
 616                       cr->mpi_comm_mysim);
 617         elapsed_time_over_all_ranks /= cr->nnodes;
 618         /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
 619          * current simulation. */
 620         MPI_Allreduce(&elapsed_time_over_all_threads, &elapsed_time_over_all_threads_over_all_ranks,
 621                       1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
 622 #endif
 623     }
 624     else
 625     {
 626         elapsed_time_over_all_ranks                  = elapsed_time;
 627         elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
 628     }
 629
 630     if (printReport)
 631     {
 632         print_flop(fplog, nrnb_tot, &nbfs, &mflop);
 633     }
 634
 635     if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
 636     {
 637         print_dd_statistics(cr, inputrec, fplog);
 638     }
 639
 640     /* TODO Move the responsibility for any scaling by thread counts
 641      * to the code that handled the thread region, so that there's a
 642      * mechanism to keep cycle counting working during the transition
 643      * to task parallelism. */
 644     int nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 645     int nthreads_pme = gmx_omp_nthreads_get(emntPME);
 646     wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP),
 647                                    nthreads_pp, nthreads_pme);
 648     auto cycle_sum(wallcycle_sum(cr, wcycle));
 649
 650     if (printReport)
 651     {
 652         auto nbnxn_gpu_timings =
 653                 (nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
 654         gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
 655
 656         if (pme_gpu_task_enabled(pme))
 657         {
 658             pme_gpu_get_timings(pme, &pme_gpu_timings);
 659         }
 660         wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
 661                         elapsed_time_over_all_ranks, wcycle, cycle_sum, nbnxn_gpu_timings,
 662                         &pme_gpu_timings);
 663
 664         if (EI_DYNAMICS(inputrec->eI))
 665         {
 666             delta_t = inputrec->delta_t;
 667         }
 668
 669         if (fplog)
 670         {
 671             print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
 672                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 673                        delta_t, nbfs, mflop);
 674         }
 675         if (bWriteStat)
 676         {
 677             print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
 678                        walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
 679                        delta_t, nbfs, mflop);
 680         }
 681     }
 682 }
 683
 684 int Mdrunner::mdrunner()
 685 {
 686     matrix                    box;
 687     t_forcerec*               fr               = nullptr;
 688     t_fcdata*                 fcd              = nullptr;
 689     real                      ewaldcoeff_q     = 0;
 690     real                      ewaldcoeff_lj    = 0;
 691     int                       nChargePerturbed = -1, nTypePerturbed = 0;
 692     gmx_wallcycle_t           wcycle;
 693     gmx_walltime_accounting_t walltime_accounting = nullptr;
 694     gmx_membed_t*             membed              = nullptr;
 695     gmx_hw_info_t*            hwinfo              = nullptr;
 696
 697     /* CAUTION: threads may be started later on in this function, so
 698        cr doesn't reflect the final parallel state right now */
 699     gmx_mtop_t mtop;
 700
 701     /* TODO: inputrec should tell us whether we use an algorithm, not a file option */
 702     const bool doEssentialDynamics = opt2bSet("-ei", filenames.size(), filenames.data());
 703     const bool doMembed            = opt2bSet("-membed", filenames.size(), filenames.data());
 704     const bool doRerun             = mdrunOptions.rerun;
 705
 706     // Handle task-assignment related user options.
 707     EmulateGpuNonbonded emulateGpuNonbonded =
 708             (getenv("GMX_EMULATE_GPU") != nullptr ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No);
 709
 710     std::vector<int> userGpuTaskAssignment;
 711     try
 712     {
 713         userGpuTaskAssignment = parseUserTaskAssignmentString(hw_opt.userGpuTaskAssignment);
 714     }
 715     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 716     auto       nonbondedTarget = findTaskTarget(nbpu_opt);
 717     auto       pmeTarget       = findTaskTarget(pme_opt);
 718     auto       pmeFftTarget    = findTaskTarget(pme_fft_opt);
 719     auto       bondedTarget    = findTaskTarget(bonded_opt);
 720     auto       updateTarget    = findTaskTarget(update_opt);
 721     PmeRunMode pmeRunMode      = PmeRunMode::None;
 722
 723     FILE* fplog = nullptr;
 724     // If we are appending, we don't write log output because we need
 725     // to check that the old log file matches what the checkpoint file
 726     // expects. Otherwise, we should start to write log output now if
 727     // there is a file ready for it.
 728     if (logFileHandle != nullptr && startingBehavior != StartingBehavior::RestartWithAppending)
 729     {
 730         fplog = gmx_fio_getfp(logFileHandle);
 731     }
 732     const bool       isSimulationMasterRank = findIsSimulationMasterRank(ms, communicator);
 733     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
 734     gmx::MDLogger    mdlog(logOwner.logger());
 735
 736     // TODO The thread-MPI master rank makes a working
 737     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
 738     // after the threads have been launched. This works because no use
 739     // is made of that communicator until after the execution paths
 740     // have rejoined. But it is likely that we can improve the way
 741     // this is expressed, e.g. by expressly running detection only the
 742     // master rank for thread-MPI, rather than relying on the mutex
 743     // and reference count.
 744     PhysicalNodeCommunicator physicalNodeComm(communicator, gmx_physicalnode_id_hash());
 745     hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
 746
 747     gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo);
 748
 749     std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
 750
 751     // Print citation requests after all software/hardware printing
 752     pleaseCiteGromacs(fplog);
 753
 754     // TODO Replace this by unique_ptr once t_inputrec is C++
 755     t_inputrec               inputrecInstance;
 756     t_inputrec*              inputrec = nullptr;
 757     std::unique_ptr<t_state> globalState;
 758
 759     auto partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
 760
 761     if (isSimulationMasterRank)
 762     {
 763         /* Only the master rank has the global state */
 764         globalState = std::make_unique<t_state>();
 765
 766         /* Read (nearly) all data required for the simulation
 767          * and keep the partly serialized tpr contents to send to other ranks later
 768          */
 769         *partialDeserializedTpr = read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()),
 770                                                  &inputrecInstance, globalState.get(), &mtop);
 771         inputrec                = &inputrecInstance;
 772     }
 773
 774     /* Check and update the hardware options for internal consistency */
 775     checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks,
 776                                   inputrec);
 777
 778     if (GMX_THREAD_MPI && isSimulationMasterRank)
 779     {
 780         bool useGpuForNonbonded = false;
 781         bool useGpuForPme       = false;
 782         try
 783         {
 784             GMX_RELEASE_ASSERT(inputrec != nullptr, "Keep the compiler happy");
 785
 786             // If the user specified the number of ranks, then we must
 787             // respect that, but in default mode, we need to allow for
 788             // the number of GPUs to choose the number of ranks.
 789             auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 790             useGpuForNonbonded         = decideWhetherToUseGpusForNonbondedWithThreadMpi(
 791                     nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
 792                     canUseGpuForNonbonded,
 793                     gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
 794                     hw_opt.nthreads_tmpi);
 795             useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
 796                     useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
 797                     *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
 798         }
 799         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 800
 801         /* Determine how many thread-MPI ranks to start.
 802          *
 803          * TODO Over-writing the user-supplied value here does
 804          * prevent any possible subsequent checks from working
 805          * correctly. */
 806         hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, &hw_opt, gpuIdsToUse, useGpuForNonbonded,
 807                                                 useGpuForPme, inputrec, &mtop, mdlog, doMembed);
 808
 809         // Now start the threads for thread MPI.
 810         spawnThreads(hw_opt.nthreads_tmpi);
 811         // The spawned threads enter mdrunner() and execution of
 812         // master and spawned threads joins at the end of this block.
 813         physicalNodeComm = PhysicalNodeCommunicator(communicator, gmx_physicalnode_id_hash());
 814     }
 815
 816     GMX_RELEASE_ASSERT(communicator == MPI_COMM_WORLD, "Must have valid world communicator");
 817     CommrecHandle crHandle = init_commrec(communicator, ms);
 818     t_commrec*    cr       = crHandle.get();
 819     GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
 820
 821     if (PAR(cr))
 822     {
 823         /* now broadcast everything to the non-master nodes/threads: */
 824         if (!isSimulationMasterRank)
 825         {
 826             inputrec = &inputrecInstance;
 827         }
 828         init_parallel(cr, inputrec, &mtop, partialDeserializedTpr.get());
 829     }
 830     GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
 831     partialDeserializedTpr.reset(nullptr);
 832
 833     // Now the number of ranks is known to all ranks, and each knows
 834     // the inputrec read by the master rank. The ranks can now all run
 835     // the task-deciding functions and will agree on the result
 836     // without needing to communicate.
 837     //
 838     // TODO Should we do the communication in debug mode to support
 839     // having an assertion?
 840     const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
 841
 842     // Note that these variables describe only their own node.
 843     //
 844     // Note that when bonded interactions run on a GPU they always run
 845     // alongside a nonbonded task, so do not influence task assignment
 846     // even though they affect the force calculation workload.
 847     bool useGpuForNonbonded = false;
 848     bool useGpuForPme       = false;
 849     bool useGpuForBonded    = false;
 850     bool useGpuForUpdate    = false;
 851     bool gpusWereDetected   = hwinfo->ngpu_compatible_tot > 0;
 852     try
 853     {
 854         // It's possible that there are different numbers of GPUs on
 855         // different nodes, which is the user's responsibility to
 856         // handle. If unsuitable, we will notice that during task
 857         // assignment.
 858         auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
 859         useGpuForNonbonded         = decideWhetherToUseGpusForNonbonded(
 860                 nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
 861                 gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
 862         useGpuForPme = decideWhetherToUseGpusForPme(
 863                 useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, mtop,
 864                 cr->nnodes, domdecOptions.numPmeRanks, gpusWereDetected);
 865         auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr)
 866                                   && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
 867         useGpuForBonded = decideWhetherToUseGpusForBonded(
 868                 useGpuForNonbonded, useGpuForPme, bondedTarget, canUseGpuForBonded,
 869                 EVDW_PME(inputrec->vdwtype), EEL_PME_EWALD(inputrec->coulombtype),
 870                 domdecOptions.numPmeRanks, gpusWereDetected);
 871
 872         pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
 873         if (pmeRunMode == PmeRunMode::GPU)
 874         {
 875             if (pmeFftTarget == TaskTarget::Cpu)
 876             {
 877                 pmeRunMode = PmeRunMode::Mixed;
 878             }
 879         }
 880         else if (pmeFftTarget == TaskTarget::Gpu)
 881         {
 882             gmx_fatal(FARGS,
 883                       "Assigning FFTs to GPU requires PME to be assigned to GPU as well. With PME "
 884                       "on CPU you should not be using -pmefft.");
 885         }
 886     }
 887     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 888
 889     // Initialize development feature flags that enabled by environment variable
 890     // and report those features that are enabled.
 891     const DevelopmentFeatureFlags devFlags =
 892             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
 893
 894     // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
 895     //       decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
 896     // NOTE: When the simulationWork is constructed, the useGpuForUpdate overrides the devFlags.enableGpuBufferOps.
 897     try
 898     {
 899         useGpuForUpdate = decideWhetherToUseGpuForUpdate(
 900                 devFlags.forceGpuUpdateDefaultOn, useDomainDecomposition, useGpuForPme,
 901                 useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec,
 902                 gmx_mtop_interaction_count(mtop, IF_VSITE) > 0, doEssentialDynamics,
 903                 gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0);
 904     }
 905     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 906
 907
 908     // Build restraints.
 909     // TODO: hide restraint implementation details from Mdrunner.
 910     // There is nothing unique about restraints at this point as far as the
 911     // Mdrunner is concerned. The Mdrunner should just be getting a sequence of
 912     // factory functions from the SimulationContext on which to call mdModules_->add().
 913     // TODO: capture all restraints into a single RestraintModule, passed to the runner builder.
 914     for (auto&& restraint : restraintManager_->getRestraints())
 915     {
 916         auto module = RestraintMDModule::create(restraint, restraint->sites());
 917         mdModules_->add(std::move(module));
 918     }
 919
 920     // TODO: Error handling
 921     mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
 922     const auto& mdModulesNotifier = mdModules_->notifier().notifier_;
 923
 924     if (inputrec->internalParameters != nullptr)
 925     {
 926         mdModulesNotifier.notify(*inputrec->internalParameters);
 927     }
 928
 929     if (fplog != nullptr)
 930     {
 931         pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
 932         fprintf(fplog, "\n");
 933     }
 934
 935     if (SIMMASTER(cr))
 936     {
 937         /* In rerun, set velocities to zero if present */
 938         if (doRerun && ((globalState->flags & (1 << estV)) != 0))
 939         {
 940             // rerun does not use velocities
 941             GMX_LOG(mdlog.info)
 942                     .asParagraph()
 943                     .appendText(
 944                             "Rerun trajectory contains velocities. Rerun does only evaluate "
 945                             "potential energy and forces. The velocities will be ignored.");
 946             for (int i = 0; i < globalState->natoms; i++)
 947             {
 948                 clear_rvec(globalState->v[i]);
 949             }
 950             globalState->flags &= ~(1 << estV);
 951         }
 952
 953         /* now make sure the state is initialized and propagated */
 954         set_state_entries(globalState.get(), inputrec);
 955     }
 956
 957     /* NM and TPI parallelize over force/energy calculations, not atoms,
 958      * so we need to initialize and broadcast the global state.
 959      */
 960     if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
 961     {
 962         if (!MASTER(cr))
 963         {
 964             globalState = std::make_unique<t_state>();
 965         }
 966         broadcastStateWithoutDynamics(cr, globalState.get());
 967     }
 968
 969     /* A parallel command line option consistency check that we can
 970        only do after any threads have started. */
 971     if (!PAR(cr)
 972         && (domdecOptions.numCells[XX] > 1 || domdecOptions.numCells[YY] > 1
 973             || domdecOptions.numCells[ZZ] > 1 || domdecOptions.numPmeRanks > 0))
 974     {
 975         gmx_fatal(FARGS,
 976                   "The -dd or -npme option request a parallel simulation, "
 977 #if !GMX_MPI
 978                   "but %s was compiled without threads or MPI enabled",
 979                   output_env_get_program_display_name(oenv));
 980 #elif GMX_THREAD_MPI
 981                   "but the number of MPI-threads (option -ntmpi) is not set or is 1");
 982 #else
 983                   "but %s was not started through mpirun/mpiexec or only one rank was requested "
 984                   "through mpirun/mpiexec",
 985                   output_env_get_program_display_name(oenv));
 986 #endif
 987     }
 988
 989     if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 990     {
 991         gmx_fatal(FARGS,
 992                   "The .mdp file specified an energy mininization or normal mode algorithm, and "
 993                   "these are not compatible with mdrun -rerun");
 994     }
 995
 996     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
 997     {
 998         if (domdecOptions.numPmeRanks > 0)
 999         {
1000             gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
1001                                  "PME-only ranks are requested, but the system does not use PME "
1002                                  "for electrostatics or LJ");
1003         }
1004
1005         domdecOptions.numPmeRanks = 0;
1006     }
1007
1008     if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
1009     {
1010         /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
1011          * improve performance with many threads per GPU, since our OpenMP
1012          * scaling is bad, but it's difficult to automate the setup.
1013          */
1014         domdecOptions.numPmeRanks = 0;
1015     }
1016     if (useGpuForPme)
1017     {
1018         if (domdecOptions.numPmeRanks < 0)
1019         {
1020             domdecOptions.numPmeRanks = 0;
1021             // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
1022         }
1023         else
1024         {
1025             GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1,
1026                                "PME GPU decomposition is not supported");
1027         }
1028     }
1029
1030 #if GMX_FAHCORE
1031     if (MASTER(cr))
1032     {
1033         fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
1034     }
1035 #endif
1036
1037     /* NMR restraints must be initialized before load_checkpoint,
1038      * since with time averaging the history is added to t_state.
1039      * For proper consistency check we therefore need to extend
1040      * t_state here.
1041      * So the PME-only nodes (if present) will also initialize
1042      * the distance restraints.
1043      */
1044     snew(fcd, 1);
1045
1046     /* This needs to be called before read_checkpoint to extend the state */
1047     init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
1048
1049     init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
1050
1051     auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
1052
1053     ObservablesHistory observablesHistory = {};
1054
1055     if (startingBehavior != StartingBehavior::NewSimulation)
1056     {
1057         /* Check if checkpoint file exists before doing continuation.
1058          * This way we can use identical input options for the first and subsequent runs...
1059          */
1060         if (mdrunOptions.numStepsCommandline > -2)
1061         {
1062             /* Temporarily set the number of steps to unlimited to avoid
1063              * triggering the nsteps check in load_checkpoint().
1064              * This hack will go away soon when the -nsteps option is removed.
1065              */
1066             inputrec->nsteps = -1;
1067         }
1068
1069         load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
1070                         logFileHandle, cr, domdecOptions.numCells, inputrec, globalState.get(),
1071                         &observablesHistory, mdrunOptions.reproducible, mdModules_->notifier());
1072
1073         if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
1074         {
1075             // Now we can start normal logging to the truncated log file.
1076             fplog = gmx_fio_getfp(logFileHandle);
1077             prepareLogAppending(fplog);
1078             logOwner = buildLogger(fplog, MASTER(cr));
1079             mdlog    = logOwner.logger();
1080         }
1081     }
1082
1083     if (mdrunOptions.numStepsCommandline > -2)
1084     {
1085         GMX_LOG(mdlog.info)
1086                 .asParagraph()
1087                 .appendText(
1088                         "The -nsteps functionality is deprecated, and may be removed in a future "
1089                         "version. "
1090                         "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp "
1091                         "file field.");
1092     }
1093     /* override nsteps with value set on the commandline */
1094     override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
1095
1096     if (SIMMASTER(cr))
1097     {
1098         copy_mat(globalState->box, box);
1099     }
1100
1101     if (PAR(cr))
1102     {
1103         gmx_bcast(sizeof(box), box, cr);
1104     }
1105
1106     if (inputrec->cutoff_scheme != ecutsVERLET)
1107     {
1108         gmx_fatal(FARGS,
1109                   "This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
1110                   "Verlet scheme, or use an earlier version of GROMACS if necessary.");
1111     }
1112     /* Update rlist and nstlist. */
1113     prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
1114                           useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
1115                           *hwinfo->cpuInfo);
1116
1117     const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded);
1118     // This builder is necessary while we have multi-part construction
1119     // of DD. Before DD is constructed, we use the existence of
1120     // the builder object to indicate that further construction of DD
1121     // is needed.
1122     std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
1123     if (useDomainDecomposition)
1124     {
1125         ddBuilder = std::make_unique<DomainDecompositionBuilder>(
1126                 mdlog, cr, domdecOptions, mdrunOptions, prefer1DAnd1PulseDD, mtop, *inputrec, box,
1127                 positionsFromStatePointer(globalState.get()));
1128     }
1129     else
1130     {
1131         /* PME, if used, is done on all nodes with 1D decomposition */
1132         cr->npmenodes = 0;
1133         cr->duty      = (DUTY_PP | DUTY_PME);
1134
1135         if (inputrec->ePBC == epbcSCREW)
1136         {
1137             gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
1138         }
1139     }
1140
1141     // Produce the task assignment for this rank.
1142     GpuTaskAssignmentsBuilder gpuTaskAssignmentsBuilder;
1143     GpuTaskAssignments        gpuTaskAssignments = gpuTaskAssignmentsBuilder.build(
1144             gpuIdsToUse, userGpuTaskAssignment, *hwinfo, cr, ms, physicalNodeComm, nonbondedTarget,
1145             pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded, useGpuForPme,
1146             thisRankHasDuty(cr, DUTY_PP),
1147             // TODO cr->duty & DUTY_PME should imply that a PME
1148             // algorithm is active, but currently does not.
1149             EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
1150
1151     const bool printHostName = (cr->nnodes > 1);
1152     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode);
1153
1154     // If the user chose a task assignment, give them some hints
1155     // where appropriate.
1156     if (!userGpuTaskAssignment.empty())
1157     {
1158         gpuTaskAssignments.logPerformanceHints(mdlog, ssize(gpuIdsToUse));
1159     }
1160
1161     // Get the device handles for the modules, nullptr when no task is assigned.
1162     gmx_device_info_t* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
1163     gmx_device_info_t* pmeDeviceInfo       = gpuTaskAssignments.initPmeDevice();
1164
1165     // TODO Initialize GPU streams here.
1166
1167     // TODO Currently this is always built, yet DD partition code
1168     // checks if it is built before using it. Probably it should
1169     // become an MDModule that is made only when another module
1170     // requires it (e.g. pull, CompEl, density fitting), so that we
1171     // don't update the local atom sets unilaterally every step.
1172     LocalAtomSetManager atomSets;
1173     if (ddBuilder)
1174     {
1175         // TODO Pass the GPU streams to ddBuilder to use in buffer
1176         // transfers (e.g. halo exchange)
1177         cr->dd = ddBuilder->build(&atomSets);
1178         // The builder's job is done, so destruct it
1179         ddBuilder.reset(nullptr);
1180         // Note that local state still does not exist yet.
1181     }
1182
1183     if (PAR(cr))
1184     {
1185         /* After possible communicator splitting in make_dd_communicators.
1186          * we can set up the intra/inter node communication.
1187          */
1188         gmx_setup_nodecomm(fplog, cr);
1189     }
1190
1191 #if GMX_MPI
1192     if (isMultiSim(ms))
1193     {
1194         GMX_LOG(mdlog.warning)
1195                 .asParagraph()
1196                 .appendTextFormatted(
1197                         "This is simulation %d out of %d running as a composite GROMACS\n"
1198                         "multi-simulation job. Setup for this simulation:\n",
1199                         ms->sim, ms->nsim);
1200     }
1201     GMX_LOG(mdlog.warning)
1202             .appendTextFormatted("Using %d MPI %s\n", cr->nnodes,
1203 #    if GMX_THREAD_MPI
1204                                  cr->nnodes == 1 ? "thread" : "threads"
1205 #    else
1206                                  cr->nnodes == 1 ? "process" : "processes"
1207 #    endif
1208             );
1209     fflush(stderr);
1210 #endif
1211
1212     // If mdrun -pin auto honors any affinity setting that already
1213     // exists. If so, it is nice to provide feedback about whether
1214     // that existing affinity setting was from OpenMP or something
1215     // else, so we run this code both before and after we initialize
1216     // the OpenMP support.
1217     gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1218     /* Check and update the number of OpenMP threads requested */
1219     checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
1220                                             pmeRunMode, mtop, *inputrec);
1221
1222     gmx_omp_nthreads_init(mdlog, cr, hwinfo->nthreads_hw_avail, physicalNodeComm.size_,
1223                           hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
1224
1225     // Enable FP exception detection, but not in
1226     // Release mode and not for compilers with known buggy FP
1227     // exception support (clang with any optimization) or suspected
1228     // buggy FP exception support (gcc 7.* with optimization).
1229 #if !defined NDEBUG                                                                         \
1230         && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
1231              && defined __OPTIMIZE__)
1232     const bool bEnableFPE = true;
1233 #else
1234     const bool bEnableFPE = false;
1235 #endif
1236     // FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
1237     if (bEnableFPE)
1238     {
1239         gmx_feenableexcept();
1240     }
1241
1242     /* Now that we know the setup is consistent, check for efficiency */
1243     check_resource_division_efficiency(hwinfo, gpuTaskAssignments.thisRankHasAnyGpuTask(),
1244                                        mdrunOptions.ntompOptionIsSet, cr, mdlog);
1245
1246     /* getting number of PP/PME threads on this MPI / tMPI rank.
1247        PME: env variable should be read only on one node to make sure it is
1248        identical everywhere;
1249      */
1250     const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
1251                                                                   : gmx_omp_nthreads_get(emntPME);
1252     checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo->hardwareTopology,
1253                                   physicalNodeComm, mdlog);
1254
1255     // Enable Peer access between GPUs where available
1256     // Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
1257     // any of the GPU communication features are active.
1258     if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
1259         && (devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm))
1260     {
1261         setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
1262     }
1263
1264     if (hw_opt.threadAffinity != ThreadAffinity::Off)
1265     {
1266         /* Before setting affinity, check whether the affinity has changed
1267          * - which indicates that probably the OpenMP library has changed it
1268          * since we first checked).
1269          */
1270         gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1271
1272         int numThreadsOnThisNode, intraNodeThreadOffset;
1273         analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
1274                                  &intraNodeThreadOffset);
1275
1276         /* Set the CPU affinity */
1277         gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, numThreadsOnThisRank,
1278                                 numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
1279     }
1280
1281     if (mdrunOptions.timingOptions.resetStep > -1)
1282     {
1283         GMX_LOG(mdlog.info)
1284                 .asParagraph()
1285                 .appendText(
1286                         "The -resetstep functionality is deprecated, and may be removed in a "
1287                         "future version.");
1288     }
1289     wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
1290
1291     if (PAR(cr))
1292     {
1293         /* Master synchronizes its value of reset_counters with all nodes
1294          * including PME only nodes */
1295         int64_t reset_counters = wcycle_get_reset_counters(wcycle);
1296         gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
1297         wcycle_set_reset_counters(wcycle, reset_counters);
1298     }
1299
1300     // Membrane embedding must be initialized before we call init_forcerec()
1301     if (doMembed)
1302     {
1303         if (MASTER(cr))
1304         {
1305             fprintf(stderr, "Initializing membed");
1306         }
1307         /* Note that membed cannot work in parallel because mtop is
1308          * changed here. Fix this if we ever want to make it run with
1309          * multiple ranks. */
1310         membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec,
1311                              globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
1312     }
1313
1314     const bool                   thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
1315     std::unique_ptr<MDAtoms>     mdAtoms;
1316     std::unique_ptr<gmx_vsite_t> vsite;
1317
1318     t_nrnb nrnb;
1319     if (thisRankHasDuty(cr, DUTY_PP))
1320     {
1321         mdModulesNotifier.notify(*cr);
1322         mdModulesNotifier.notify(&atomSets);
1323         mdModulesNotifier.notify(PeriodicBoundaryConditionType{ inputrec->ePBC });
1324         mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
1325         /* Initiate forcerecord */
1326         fr                 = new t_forcerec;
1327         fr->forceProviders = mdModules_->initForceProviders();
1328         init_forcerec(fplog, mdlog, fr, fcd, inputrec, &mtop, cr, box,
1329                       opt2fn("-table", filenames.size(), filenames.data()),
1330                       opt2fn("-tablep", filenames.size(), filenames.data()),
1331                       opt2fns("-tableb", filenames.size(), filenames.data()), *hwinfo,
1332                       nonbondedDeviceInfo, useGpuForBonded,
1333                       pmeRunMode == PmeRunMode::GPU && !thisRankHasDuty(cr, DUTY_PME), pforce, wcycle);
1334
1335         // TODO Move this to happen during domain decomposition setup,
1336         // once stream and event handling works well with that.
1337         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
1338         if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
1339         {
1340             GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
1341                                "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
1342             void* streamLocal =
1343                     Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
1344             void* streamNonLocal =
1345                     Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
1346             GMX_LOG(mdlog.warning)
1347                     .asParagraph()
1348                     .appendTextFormatted(
1349                             "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
1350                             "GMX_GPU_DD_COMMS environment variable.");
1351             cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
1352                     cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
1353         }
1354
1355         /* Initialize the mdAtoms structure.
1356          * mdAtoms is not filled with atom data,
1357          * as this can not be done now with domain decomposition.
1358          */
1359         mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
1360         if (globalState && thisRankHasPmeGpuTask)
1361         {
1362             // The pinning of coordinates in the global state object works, because we only use
1363             // PME on GPU without DD or on a separate PME rank, and because the local state pointer
1364             // points to the global state object without DD.
1365             // FIXME: MD and EM separately set up the local state - this should happen in the same
1366             // function, which should also perform the pinning.
1367             changePinningPolicy(&globalState->x, pme_get_pinning_policy());
1368         }
1369
1370         /* Initialize the virtual site communication */
1371         vsite = initVsite(mtop, cr);
1372
1373         calc_shifts(box, fr->shift_vec);
1374
1375         /* With periodic molecules the charge groups should be whole at start up
1376          * and the virtual sites should not be far from their proper positions.
1377          */
1378         if (!inputrec->bContinuation && MASTER(cr) && !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1379         {
1380             /* Make molecules whole at start of run */
1381             if (fr->ePBC != epbcNONE)
1382             {
1383                 do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
1384             }
1385             if (vsite)
1386             {
1387                 /* Correct initial vsite positions are required
1388                  * for the initial distribution in the domain decomposition
1389                  * and for the initial shell prediction.
1390                  */
1391                 constructVsitesGlobal(mtop, globalState->x);
1392             }
1393         }
1394
1395         if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
1396         {
1397             ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
1398             ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
1399         }
1400     }
1401     else
1402     {
1403         /* This is a PME only node */
1404
1405         GMX_ASSERT(globalState == nullptr,
1406                    "We don't need the state on a PME only rank and expect it to be unitialized");
1407
1408         ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
1409         ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
1410     }
1411
1412     gmx_pme_t* sepPmeData = nullptr;
1413     // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
1414     GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr),
1415                "Double-checking that only PME-only ranks have no forcerec");
1416     gmx_pme_t*& pmedata = fr ? fr->pmedata : sepPmeData;
1417
1418     // TODO should live in ewald module once its testing is improved
1419     //
1420     // Later, this program could contain kernels that might be later
1421     // re-used as auto-tuning progresses, or subsequent simulations
1422     // are invoked.
1423     PmeGpuProgramStorage pmeGpuProgram;
1424     if (thisRankHasPmeGpuTask)
1425     {
1426         pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
1427     }
1428
1429     /* Initiate PME if necessary,
1430      * either on all nodes or on dedicated PME nodes only. */
1431     if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
1432     {
1433         if (mdAtoms && mdAtoms->mdatoms())
1434         {
1435             nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
1436             if (EVDW_PME(inputrec->vdwtype))
1437             {
1438                 nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed;
1439             }
1440         }
1441         if (cr->npmenodes > 0)
1442         {
1443             /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
1444             gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
1445             gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
1446         }
1447
1448         if (thisRankHasDuty(cr, DUTY_PME))
1449         {
1450             try
1451             {
1452                 pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
1453                                        nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
1454                                        ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
1455                                        nullptr, pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
1456             }
1457             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
1458         }
1459     }
1460
1461
1462     if (EI_DYNAMICS(inputrec->eI))
1463     {
1464         /* Turn on signal handling on all nodes */
1465         /*
1466          * (A user signal from the PME nodes (if any)
1467          * is communicated to the PP nodes.
1468          */
1469         signal_handler_install();
1470     }
1471
1472     pull_t* pull_work = nullptr;
1473     if (thisRankHasDuty(cr, DUTY_PP))
1474     {
1475         /* Assumes uniform use of the number of OpenMP threads */
1476         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
1477
1478         if (inputrec->bPull)
1479         {
1480             /* Initialize pull code */
1481             pull_work = init_pull(fplog, inputrec->pull, inputrec, &mtop, cr, &atomSets,
1482                                   inputrec->fepvals->init_lambda);
1483             if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
1484             {
1485                 initPullHistory(pull_work, &observablesHistory);
1486             }
1487             if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
1488             {
1489                 init_pull_output_files(pull_work, filenames.size(), filenames.data(), oenv, startingBehavior);
1490             }
1491         }
1492
1493         std::unique_ptr<EnforcedRotation> enforcedRotation;
1494         if (inputrec->bRot)
1495         {
1496             /* Initialize enforced rotation code */
1497             enforcedRotation =
1498                     init_rot(fplog, inputrec, filenames.size(), filenames.data(), cr, &atomSets,
1499                              globalState.get(), &mtop, oenv, mdrunOptions, startingBehavior);
1500         }
1501
1502         t_swap* swap = nullptr;
1503         if (inputrec->eSwapCoords != eswapNO)
1504         {
1505             /* Initialize ion swapping code */
1506             swap = init_swapcoords(fplog, inputrec,
1507                                    opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
1508                                    &mtop, globalState.get(), &observablesHistory, cr, &atomSets,
1509                                    oenv, mdrunOptions, startingBehavior);
1510         }
1511
1512         /* Let makeConstraints know whether we have essential dynamics constraints. */
1513         auto constr = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics, fplog,
1514                                       *mdAtoms->mdatoms(), cr, ms, &nrnb, wcycle, fr->bMolPBC);
1515
1516         /* Energy terms and groups */
1517         gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
1518                              inputrec->fepvals->n_lambda);
1519
1520         /* Kinetic energy data */
1521         gmx_ekindata_t ekind;
1522         init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
1523
1524         /* Set up interactive MD (IMD) */
1525         auto imdSession =
1526                 makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
1527                                MASTER(cr) ? globalState->x.rvec_array() : nullptr, filenames.size(),
1528                                filenames.data(), oenv, mdrunOptions.imdOptions, startingBehavior);
1529
1530         if (DOMAINDECOMP(cr))
1531         {
1532             GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
1533             /* This call is not included in init_domain_decomposition mainly
1534              * because fr->cginfo_mb is set later.
1535              */
1536             dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
1537                             domdecOptions.checkBondedInteractions, fr->cginfo_mb);
1538         }
1539
1540         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
1541                 false, inputrec, doRerun, vsite.get(), ms, replExParams, fcd,
1542                 static_cast<int>(filenames.size()), filenames.data(), &observablesHistory, membed);
1543
1544         const bool useModularSimulator = inputIsCompatibleWithModularSimulator
1545                                          && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
1546
1547         // TODO This is not the right place to manage the lifetime of
1548         // this data structure, but currently it's the easiest way to
1549         // make it work.
1550         MdrunScheduleWorkload runScheduleWork;
1551         // Also populates the simulation constant workload description.
1552         runScheduleWork.simulationWork = createSimulationWorkload(
1553                 useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate,
1554                 devFlags.enableGpuBufferOps, devFlags.enableGpuHaloExchange,
1555                 devFlags.enableGpuPmePPComm, haveEwaldSurfaceContribution(*inputrec));
1556
1557         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
1558         if (gpusWereDetected
1559             && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
1560                 || runScheduleWork.simulationWork.useGpuBufferOps))
1561         {
1562             const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
1563             const void* localStream =
1564                     fr->nbv->gpu_nbv != nullptr
1565                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
1566                             : nullptr;
1567             const void* nonLocalStream =
1568                     fr->nbv->gpu_nbv != nullptr
1569                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
1570                             : nullptr;
1571             const void*        deviceContext = pme_gpu_get_device_context(fr->pmedata);
1572             const int          paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
1573             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
1574                                                       ? GpuApiCallBehavior::Async
1575                                                       : GpuApiCallBehavior::Sync;
1576
1577             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
1578                     pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize);
1579             fr->stateGpu = stateGpu.get();
1580         }
1581
1582         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
1583         SimulatorBuilder simulatorBuilder;
1584
1585         // build and run simulator object based on user-input
1586         auto simulator = simulatorBuilder.build(
1587                 inputIsCompatibleWithModularSimulator, fplog, cr, ms, mdlog,
1588                 static_cast<int>(filenames.size()), filenames.data(), oenv, mdrunOptions,
1589                 startingBehavior, vsite.get(), constr.get(),
1590                 enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, deform.get(),
1591                 mdModules_->outputProvider(), mdModules_->notifier(), inputrec, imdSession.get(),
1592                 pull_work, swap, &mtop, fcd, globalState.get(), &observablesHistory, mdAtoms.get(),
1593                 &nrnb, wcycle, fr, &enerd, &ekind, &runScheduleWork, replExParams, membed,
1594                 walltime_accounting, std::move(stopHandlerBuilder_), doRerun);
1595         simulator->run();
1596
1597         if (inputrec->bPull)
1598         {
1599             finish_pull(pull_work);
1600         }
1601         finish_swapcoords(swap);
1602     }
1603     else
1604     {
1605         GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
1606         /* do PME only */
1607         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
1608         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
1609     }
1610
1611     wallcycle_stop(wcycle, ewcRUN);
1612
1613     /* Finish up, write some stuff
1614      * if rerunMD, don't write last frame again
1615      */
1616     finish_run(fplog, mdlog, cr, inputrec, &nrnb, wcycle, walltime_accounting,
1617                fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
1618
1619     // clean up cycle counter
1620     wallcycle_destroy(wcycle);
1621
1622     // Free PME data
1623     if (pmedata)
1624     {
1625         gmx_pme_destroy(pmedata);
1626         pmedata = nullptr;
1627     }
1628
1629     // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
1630     // before we destroy the GPU context(s) in free_gpu().
1631     // Pinned buffers are associated with contexts in CUDA.
1632     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
1633     mdAtoms.reset(nullptr);
1634     globalState.reset(nullptr);
1635     mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
1636     /* Free pinned buffers in *fr */
1637     delete fr;
1638     fr = nullptr;
1639
1640     if (hwinfo->gpu_info.n_dev > 0)
1641     {
1642         /* stop the GPU profiler (only CUDA) */
1643         stopGpuProfiler();
1644     }
1645
1646     /* With tMPI we need to wait for all ranks to finish deallocation before
1647      * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
1648      * GPU and context.
1649      *
1650      * This is not a concern in OpenCL where we use one context per rank which
1651      * is freed in nbnxn_gpu_free().
1652      *
1653      * Note: it is safe to not call the barrier on the ranks which do not use GPU,
1654      * but it is easier and more futureproof to call it on the whole node.
1655      *
1656      * Note that this function needs to be called even if GPUs are not used
1657      * in this run because the PME ranks have no knowledge of whether GPUs
1658      * are used or not, but all ranks need to enter the barrier below.
1659      * \todo Remove this physical node barrier after making sure
1660      * that it's not needed anymore (with a shared GPU run).
1661      */
1662     if (GMX_THREAD_MPI)
1663     {
1664         physicalNodeComm.barrier();
1665     }
1666
1667     free_gpu(nonbondedDeviceInfo);
1668     free_gpu(pmeDeviceInfo);
1669     sfree(fcd);
1670
1671     if (doMembed)
1672     {
1673         free_membed(membed);
1674     }
1675
1676     /* Does what it says */
1677     print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
1678     walltime_accounting_destroy(walltime_accounting);
1679
1680     // Ensure log file content is written
1681     if (logFileHandle)
1682     {
1683         gmx_fio_flush(logFileHandle);
1684     }
1685
1686     /* Reset FPEs (important for unit tests) by disabling them. Assumes no
1687      * exceptions were enabled before function was called. */
1688     if (bEnableFPE)
1689     {
1690         gmx_fedisableexcept();
1691     }
1692
1693     auto rc = static_cast<int>(gmx_get_stop_condition());
1694
1695 #if GMX_THREAD_MPI
1696     /* we need to join all threads. The sub-threads join when they
1697        exit this function, but the master thread needs to be told to
1698        wait for that. */
1699     if (PAR(cr) && MASTER(cr))
1700     {
1701         tMPI_Finalize();
1702     }
1703 #endif
1704     return rc;
1705 }
1706
1707 Mdrunner::~Mdrunner()
1708 {
1709     // Clean up of the Manager.
1710     // This will end up getting called on every thread-MPI rank, which is unnecessary,
1711     // but okay as long as threads synchronize some time before adding or accessing
1712     // a new set of restraints.
1713     if (restraintManager_)
1714     {
1715         restraintManager_->clear();
1716         GMX_ASSERT(restraintManager_->countRestraints() == 0,
1717                    "restraints added during runner life time should be cleared at runner "
1718                    "destruction.");
1719     }
1720 };
1721
1722 void Mdrunner::addPotential(std::shared_ptr<gmx::IRestraintPotential> puller, const std::string& name)
1723 {
1724     GMX_ASSERT(restraintManager_, "Mdrunner must have a restraint manager.");
1725     // Not sure if this should be logged through the md logger or something else,
1726     // but it is helpful to have some sort of INFO level message sent somewhere.
1727     //    std::cout << "Registering restraint named " << name << std::endl;
1728
1729     // When multiple restraints are used, it may be wasteful to register them separately.
1730     // Maybe instead register an entire Restraint Manager as a force provider.
1731     restraintManager_->addToSpec(std::move(puller), name);
1732 }
1733
1734 Mdrunner::Mdrunner(std::unique_ptr<MDModules> mdModules) : mdModules_(std::move(mdModules)) {}
1735
1736 Mdrunner::Mdrunner(Mdrunner&&) noexcept = default;
1737
1738 //NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
1739 Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
1740
1741 class Mdrunner::BuilderImplementation
1742 {
1743 public:
1744     BuilderImplementation() = delete;
1745     BuilderImplementation(std::unique_ptr<MDModules> mdModules, compat::not_null<SimulationContext*> context);
1746     ~BuilderImplementation();
1747
1748     BuilderImplementation& setExtraMdrunOptions(const MdrunOptions& options,
1749                                                 real                forceWarningThreshold,
1750                                                 StartingBehavior    startingBehavior);
1751
1752     void addDomdec(const DomdecOptions& options);
1753
1754     void addVerletList(int nstlist);
1755
1756     void addReplicaExchange(const ReplicaExchangeParameters& params);
1757
1758     void addNonBonded(const char* nbpu_opt);
1759
1760     void addPME(const char* pme_opt_, const char* pme_fft_opt_);
1761
1762     void addBondedTaskAssignment(const char* bonded_opt);
1763
1764     void addUpdateTaskAssignment(const char* update_opt);
1765
1766     void addHardwareOptions(const gmx_hw_opt_t& hardwareOptions);
1767
1768     void addFilenames(ArrayRef<const t_filenm> filenames);
1769
1770     void addOutputEnvironment(gmx_output_env_t* outputEnvironment);
1771
1772     void addLogFile(t_fileio* logFileHandle);
1773
1774     void addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder);
1775
1776     Mdrunner build();
1777
1778 private:
1779     // Default parameters copied from runner.h
1780     // \todo Clarify source(s) of default parameters.
1781
1782     const char* nbpu_opt_    = nullptr;
1783     const char* pme_opt_     = nullptr;
1784     const char* pme_fft_opt_ = nullptr;
1785     const char* bonded_opt_  = nullptr;
1786     const char* update_opt_  = nullptr;
1787
1788     MdrunOptions mdrunOptions_;
1789
1790     DomdecOptions domdecOptions_;
1791
1792     ReplicaExchangeParameters replicaExchangeParameters_;
1793
1794     //! Command-line override for the duration of a neighbor list with the Verlet scheme.
1795     int nstlist_ = 0;
1796
1797     //! Multisim communicator handle.
1798     gmx_multisim_t* multiSimulation_;
1799
1800     //! mdrun communicator
1801     MPI_Comm communicator_ = MPI_COMM_NULL;
1802
1803     //! Print a warning if any force is larger than this (in kJ/mol nm).
1804     real forceWarningThreshold_ = -1;
1805
1806     //! Whether the simulation will start afresh, or restart with/without appending.
1807     StartingBehavior startingBehavior_ = StartingBehavior::NewSimulation;
1808
1809     //! The modules that comprise the functionality of mdrun.
1810     std::unique_ptr<MDModules> mdModules_;
1811
1812     //! \brief Parallelism information.
1813     gmx_hw_opt_t hardwareOptions_;
1814
1815     //! filename options for simulation.
1816     ArrayRef<const t_filenm> filenames_;
1817
1818     /*! \brief Handle to output environment.
1819      *
1820      * \todo gmx_output_env_t needs lifetime management.
1821      */
1822     gmx_output_env_t* outputEnvironment_ = nullptr;
1823
1824     /*! \brief Non-owning handle to MD log file.
1825      *
1826      * \todo Context should own output facilities for client.
1827      * \todo Improve log file handle management.
1828      * \internal
1829      * Code managing the FILE* relies on the ability to set it to
1830      * nullptr to check whether the filehandle is valid.
1831      */
1832     t_fileio* logFileHandle_ = nullptr;
1833
1834     /*!
1835      * \brief Builder for simulation stop signal handler.
1836      */
1837     std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
1838 };
1839
1840 Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
1841                                                        compat::not_null<SimulationContext*> context) :
1842     mdModules_(std::move(mdModules))
1843 {
1844     communicator_    = context->communicator_;
1845     multiSimulation_ = context->multiSimulation_.get();
1846 }
1847
1848 Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
1849
1850 Mdrunner::BuilderImplementation&
1851 Mdrunner::BuilderImplementation::setExtraMdrunOptions(const MdrunOptions&    options,
1852                                                       const real             forceWarningThreshold,
1853                                                       const StartingBehavior startingBehavior)
1854 {
1855     mdrunOptions_          = options;
1856     forceWarningThreshold_ = forceWarningThreshold;
1857     startingBehavior_      = startingBehavior;
1858     return *this;
1859 }
1860
1861 void Mdrunner::BuilderImplementation::addDomdec(const DomdecOptions& options)
1862 {
1863     domdecOptions_ = options;
1864 }
1865
1866 void Mdrunner::BuilderImplementation::addVerletList(int nstlist)
1867 {
1868     nstlist_ = nstlist;
1869 }
1870
1871 void Mdrunner::BuilderImplementation::addReplicaExchange(const ReplicaExchangeParameters& params)
1872 {
1873     replicaExchangeParameters_ = params;
1874 }
1875
1876 Mdrunner Mdrunner::BuilderImplementation::build()
1877 {
1878     auto newRunner = Mdrunner(std::move(mdModules_));
1879
1880     newRunner.mdrunOptions     = mdrunOptions_;
1881     newRunner.pforce           = forceWarningThreshold_;
1882     newRunner.startingBehavior = startingBehavior_;
1883     newRunner.domdecOptions    = domdecOptions_;
1884
1885     // \todo determine an invariant to check or confirm that all gmx_hw_opt_t objects are valid
1886     newRunner.hw_opt = hardwareOptions_;
1887
1888     // No invariant to check. This parameter exists to optionally override other behavior.
1889     newRunner.nstlist_cmdline = nstlist_;
1890
1891     newRunner.replExParams = replicaExchangeParameters_;
1892
1893     newRunner.filenames = filenames_;
1894
1895     newRunner.communicator = communicator_;
1896
1897     // nullptr is a valid value for the multisim handle
1898     newRunner.ms = multiSimulation_;
1899
1900     // \todo Clarify ownership and lifetime management for gmx_output_env_t
1901     // \todo Update sanity checking when output environment has clearly specified invariants.
1902     // Initialization and default values for oenv are not well specified in the current version.
1903     if (outputEnvironment_)
1904     {
1905         newRunner.oenv = outputEnvironment_;
1906     }
1907     else
1908     {
1909         GMX_THROW(gmx::APIError(
1910                 "MdrunnerBuilder::addOutputEnvironment() is required before build()"));
1911     }
1912
1913     newRunner.logFileHandle = logFileHandle_;
1914
1915     if (nbpu_opt_)
1916     {
1917         newRunner.nbpu_opt = nbpu_opt_;
1918     }
1919     else
1920     {
1921         GMX_THROW(gmx::APIError("MdrunnerBuilder::addNonBonded() is required before build()"));
1922     }
1923
1924     if (pme_opt_ && pme_fft_opt_)
1925     {
1926         newRunner.pme_opt     = pme_opt_;
1927         newRunner.pme_fft_opt = pme_fft_opt_;
1928     }
1929     else
1930     {
1931         GMX_THROW(gmx::APIError("MdrunnerBuilder::addElectrostatics() is required before build()"));
1932     }
1933
1934     if (bonded_opt_)
1935     {
1936         newRunner.bonded_opt = bonded_opt_;
1937     }
1938     else
1939     {
1940         GMX_THROW(gmx::APIError(
1941                 "MdrunnerBuilder::addBondedTaskAssignment() is required before build()"));
1942     }
1943
1944     if (update_opt_)
1945     {
1946         newRunner.update_opt = update_opt_;
1947     }
1948     else
1949     {
1950         GMX_THROW(gmx::APIError(
1951                 "MdrunnerBuilder::addUpdateTaskAssignment() is required before build()  "));
1952     }
1953
1954
1955     newRunner.restraintManager_ = std::make_unique<gmx::RestraintManager>();
1956
1957     if (stopHandlerBuilder_)
1958     {
1959         newRunner.stopHandlerBuilder_ = std::move(stopHandlerBuilder_);
1960     }
1961     else
1962     {
1963         newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>();
1964     }
1965
1966     return newRunner;
1967 }
1968
1969 void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
1970 {
1971     nbpu_opt_ = nbpu_opt;
1972 }
1973
1974 void Mdrunner::BuilderImplementation::addPME(const char* pme_opt, const char* pme_fft_opt)
1975 {
1976     pme_opt_     = pme_opt;
1977     pme_fft_opt_ = pme_fft_opt;
1978 }
1979
1980 void Mdrunner::BuilderImplementation::addBondedTaskAssignment(const char* bonded_opt)
1981 {
1982     bonded_opt_ = bonded_opt;
1983 }
1984
1985 void Mdrunner::BuilderImplementation::addUpdateTaskAssignment(const char* update_opt)
1986 {
1987     update_opt_ = update_opt;
1988 }
1989
1990 void Mdrunner::BuilderImplementation::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
1991 {
1992     hardwareOptions_ = hardwareOptions;
1993 }
1994
1995 void Mdrunner::BuilderImplementation::addFilenames(ArrayRef<const t_filenm> filenames)
1996 {
1997     filenames_ = filenames;
1998 }
1999
2000 void Mdrunner::BuilderImplementation::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
2001 {
2002     outputEnvironment_ = outputEnvironment;
2003 }
2004
2005 void Mdrunner::BuilderImplementation::addLogFile(t_fileio* logFileHandle)
2006 {
2007     logFileHandle_ = logFileHandle;
2008 }
2009
2010 void Mdrunner::BuilderImplementation::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
2011 {
2012     stopHandlerBuilder_ = std::move(builder);
2013 }
2014
2015 MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules>           mdModules,
2016                                  compat::not_null<SimulationContext*> context) :
2017     impl_{ std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context) }
2018 {
2019 }
2020
2021 MdrunnerBuilder::~MdrunnerBuilder() = default;
2022
2023 MdrunnerBuilder& MdrunnerBuilder::addSimulationMethod(const MdrunOptions&    options,
2024                                                       real                   forceWarningThreshold,
2025                                                       const StartingBehavior startingBehavior)
2026 {
2027     impl_->setExtraMdrunOptions(options, forceWarningThreshold, startingBehavior);
2028     return *this;
2029 }
2030
2031 MdrunnerBuilder& MdrunnerBuilder::addDomainDecomposition(const DomdecOptions& options)
2032 {
2033     impl_->addDomdec(options);
2034     return *this;
2035 }
2036
2037 MdrunnerBuilder& MdrunnerBuilder::addNeighborList(int nstlist)
2038 {
2039     impl_->addVerletList(nstlist);
2040     return *this;
2041 }
2042
2043 MdrunnerBuilder& MdrunnerBuilder::addReplicaExchange(const ReplicaExchangeParameters& params)
2044 {
2045     impl_->addReplicaExchange(params);
2046     return *this;
2047 }
2048
2049 MdrunnerBuilder& MdrunnerBuilder::addNonBonded(const char* nbpu_opt)
2050 {
2051     impl_->addNonBonded(nbpu_opt);
2052     return *this;
2053 }
2054
2055 MdrunnerBuilder& MdrunnerBuilder::addElectrostatics(const char* pme_opt, const char* pme_fft_opt)
2056 {
2057     // The builder method may become more general in the future, but in this version,
2058     // parameters for PME electrostatics are both required and the only parameters
2059     // available.
2060     if (pme_opt && pme_fft_opt)
2061     {
2062         impl_->addPME(pme_opt, pme_fft_opt);
2063     }
2064     else
2065     {
2066         GMX_THROW(
2067                 gmx::InvalidInputError("addElectrostatics() arguments must be non-null pointers."));
2068     }
2069     return *this;
2070 }
2071
2072 MdrunnerBuilder& MdrunnerBuilder::addBondedTaskAssignment(const char* bonded_opt)
2073 {
2074     impl_->addBondedTaskAssignment(bonded_opt);
2075     return *this;
2076 }
2077
2078 MdrunnerBuilder& MdrunnerBuilder::addUpdateTaskAssignment(const char* update_opt)
2079 {
2080     impl_->addUpdateTaskAssignment(update_opt);
2081     return *this;
2082 }
2083
2084 Mdrunner MdrunnerBuilder::build()
2085 {
2086     return impl_->build();
2087 }
2088
2089 MdrunnerBuilder& MdrunnerBuilder::addHardwareOptions(const gmx_hw_opt_t& hardwareOptions)
2090 {
2091     impl_->addHardwareOptions(hardwareOptions);
2092     return *this;
2093 }
2094
2095 MdrunnerBuilder& MdrunnerBuilder::addFilenames(ArrayRef<const t_filenm> filenames)
2096 {
2097     impl_->addFilenames(filenames);
2098     return *this;
2099 }
2100
2101 MdrunnerBuilder& MdrunnerBuilder::addOutputEnvironment(gmx_output_env_t* outputEnvironment)
2102 {
2103     impl_->addOutputEnvironment(outputEnvironment);
2104     return *this;
2105 }
2106
2107 MdrunnerBuilder& MdrunnerBuilder::addLogFile(t_fileio* logFileHandle)
2108 {
2109     impl_->addLogFile(logFileHandle);
2110     return *this;
2111 }
2112
2113 MdrunnerBuilder& MdrunnerBuilder::addStopHandlerBuilder(std::unique_ptr<StopHandlerBuilder> builder)
2114 {
2115     impl_->addStopHandlerBuilder(std::move(builder));
2116     return *this;
2117 }
2118
2119 MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder&&) noexcept = default;
2120
2121 MdrunnerBuilder& MdrunnerBuilder::operator=(MdrunnerBuilder&&) noexcept = default;
2122
2123 } // namespace gmx