src/gromacs/taskassignment/taskassignment.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief
  37  * Defines helper and factory functionality for task assignment.
  38  *
  39  * Note that the GPU ID assignment could potentially match many
  40  * different kinds of simulation setups, including ranks from multiple
  41  * simulations, ranks from the same simulation, and/or ranks with duty
  42  * only for particular tasks (e.g. PME-only ranks). Which GPU ID
  43  * assignments are valid will naturally depend on the other run-time
  44  * options given to mdrun, and the current capabilities of the
  45  * implementation.
  46  *
  47  * \author Mark Abraham <mark.j.abraham@gmail.com>
  48  * \ingroup module_taskassignment
  49  */
  50 #include "gmxpre.h"
  51
  52 #include "taskassignment.h"
  53
  54 #include "config.h"
  55
  56 #include <algorithm>
  57 #include <exception>
  58 #include <string>
  59 #include <vector>
  60
  61 #include "gromacs/domdec/domdec.h"
  62 #include "gromacs/gmxlib/network.h"
  63 #include "gromacs/hardware/device_management.h"
  64 #include "gromacs/hardware/hw_info.h"
  65 #include "gromacs/mdrunutility/multisim.h"
  66 #include "gromacs/mdtypes/commrec.h"
  67 #include "gromacs/taskassignment/usergpuids.h"
  68 #include "gromacs/utility/cstringutil.h"
  69 #include "gromacs/utility/exceptions.h"
  70 #include "gromacs/utility/fatalerror.h"
  71 #include "gromacs/utility/gmxassert.h"
  72 #include "gromacs/utility/gmxmpi.h"
  73 #include "gromacs/utility/logger.h"
  74 #include "gromacs/utility/physicalnodecommunicator.h"
  75 #include "gromacs/utility/stringutil.h"
  76 #include "gromacs/utility/sysinfo.h"
  77
  78 #include "findallgputasks.h"
  79 #include "reportgpuusage.h"
  80
  81 namespace gmx
  82 {
  83
  84 namespace
  85 {
  86
  87 /*! \brief Build data structure of types of GPU tasks on a rank,
  88  * together with the mapped GPU device IDs, for all GPU tasks on all
  89  * the ranks of this node.
  90  *
  91  * \param[in]   gpuTasksOnRanksOfThisNode  For each rank on this node, the set of tasks
  92  *                                         that are eligible to run on GPUs.
  93  * \param[in]   gpuIds                     The user-supplied GPU IDs.
  94  */
  95 std::vector<GpuTaskAssignment> buildTaskAssignment(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode,
  96                                                    ArrayRef<const int>    gpuIds)
  97 {
  98     std::vector<GpuTaskAssignment> gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode.size());
  99
 100     // Loop over the ranks on this node, and the tasks on each
 101     // rank. For each task, take the next device ID from those
 102     // provided by the user, to build a vector of mappings of task to
 103     // ID, for each rank on this node. Note that if there have not
 104     // been any GPU tasks identified, then gpuIds can be empty.
 105     auto currentGpuId            = gpuIds.begin();
 106     auto gpuTaskAssignmentOnRank = gpuTaskAssignmentOnRanksOfThisNode.begin();
 107     for (const auto& gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
 108     {
 109         gpuTaskAssignmentOnRank->reserve(gpuTasksOnRank.size());
 110         for (const auto& gpuTaskType : gpuTasksOnRank)
 111         {
 112             GMX_RELEASE_ASSERT(currentGpuId != gpuIds.end(), "Indexing out of range for GPU tasks");
 113             gpuTaskAssignmentOnRank->push_back({ gpuTaskType, *currentGpuId });
 114             ++currentGpuId;
 115         }
 116         GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank->size() == gpuTasksOnRank.size(),
 117                            "Mismatch in number of GPU tasks on a rank with the number of elements "
 118                            "in the resulting task assignment");
 119         ++gpuTaskAssignmentOnRank;
 120     }
 121
 122     return gpuTaskAssignmentOnRanksOfThisNode;
 123 }
 124
 125 /*! \brief Return whether a GPU device is shared between any ranks.
 126  *
 127  * Sharing GPUs among multiple ranks is possible via either user or
 128  * automated selection. */
 129 bool isAnyGpuSharedBetweenRanks(ArrayRef<const GpuTaskAssignment> gpuTaskAssignments)
 130 {
 131     // Loop over all ranks i, looking on all higher ranks j whether
 132     // any tasks on them share GPU device IDs.
 133     //
 134     // TODO Should this functionality also consider whether tasks on
 135     // the same rank are sharing a device?
 136     for (size_t i = 0; i < gpuTaskAssignments.size(); ++i)
 137     {
 138         for (const auto& taskOnRankI : gpuTaskAssignments[i])
 139         {
 140             for (size_t j = i + 1; j < gpuTaskAssignments.size(); ++j)
 141             {
 142                 for (const auto& taskOnRankJ : gpuTaskAssignments[j])
 143                 {
 144                     if (taskOnRankI.deviceId_ == taskOnRankJ.deviceId_)
 145                     {
 146                         return true;
 147                     }
 148                 }
 149             }
 150         }
 151     }
 152     return false;
 153 }
 154
 155 } // namespace
 156
 157 void GpuTaskAssignments::logPerformanceHints(const MDLogger& mdlog, size_t numAvailableDevicesOnThisNode)
 158 {
 159     if (numAvailableDevicesOnThisNode > numGpuTasksOnThisNode_)
 160     {
 161         /* TODO In principle, this warning could be warranted only on
 162          * some nodes, but we lack the infrastructure to do a good job
 163          * of reporting that. */
 164         GMX_LOG(mdlog.warning)
 165                 .asParagraph()
 166                 .appendText(
 167                         "NOTE: You assigned the GPU tasks on a node such that some GPUs "
 168                         "available on that node are unused, which might not be optimal.");
 169     }
 170
 171     if (isAnyGpuSharedBetweenRanks(assignmentForAllRanksOnThisNode_))
 172     {
 173         GMX_LOG(mdlog.warning)
 174                 .asParagraph()
 175                 .appendText(
 176                         "NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good "
 177                         "idea if you have measured the performance of alternatives.");
 178     }
 179 }
 180
 181 namespace
 182 {
 183
 184 //! Counts all the GPU tasks on this node.
 185 size_t countGpuTasksOnThisNode(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode)
 186 {
 187     size_t numGpuTasksOnThisNode = 0;
 188     for (const auto& gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
 189     {
 190         numGpuTasksOnThisNode += gpuTasksOnRank.size();
 191     }
 192     return numGpuTasksOnThisNode;
 193 }
 194
 195 /*! \brief Return on each rank the total count over all ranks of all
 196  * simulations. */
 197 int countOverAllRanks(MPI_Comm comm, int countOnThisRank)
 198 {
 199     int sum;
 200 #if GMX_MPI
 201     int numRanks;
 202     MPI_Comm_size(comm, &numRanks);
 203     if (numRanks > 1)
 204     {
 205         MPI_Allreduce(&countOnThisRank, &sum, 1, MPI_INT, MPI_SUM, comm);
 206     }
 207     else
 208 #else
 209     GMX_UNUSED_VALUE(comm);
 210 #endif
 211     {
 212         sum = countOnThisRank;
 213     }
 214
 215     return sum;
 216 }
 217
 218 /*! \brief Barrier over all rank in \p comm */
 219 void barrierOverAllRanks(MPI_Comm comm)
 220 {
 221 #if GMX_MPI
 222     int numRanks;
 223     MPI_Comm_size(comm, &numRanks);
 224     if (numRanks > 1)
 225     {
 226         MPI_Barrier(comm);
 227     }
 228 #else
 229     GMX_UNUSED_VALUE(comm);
 230 #endif
 231 }
 232
 233 } // namespace
 234
 235 GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;
 236
 237 GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const gmx::ArrayRef<const int> availableDevices,
 238                                                     const gmx::ArrayRef<const int> userGpuTaskAssignment,
 239                                                     const gmx_hw_info_t&           hardwareInfo,
 240                                                     MPI_Comm                       gromacsWorldComm,
 241                                                     const PhysicalNodeCommunicator& physicalNodeComm,
 242                                                     const TaskTarget                nonbondedTarget,
 243                                                     const TaskTarget                pmeTarget,
 244                                                     const TaskTarget                bondedTarget,
 245                                                     const TaskTarget                updateTarget,
 246                                                     const bool useGpuForNonbonded,
 247                                                     const bool useGpuForPme,
 248                                                     bool       rankHasPpTask,
 249                                                     bool       rankHasPmeTask)
 250 {
 251     size_t               numRanksOnThisNode = physicalNodeComm.size_;
 252     std::vector<GpuTask> gpuTasksOnThisRank = findGpuTasksOnThisRank(!availableDevices.empty(),
 253                                                                      nonbondedTarget,
 254                                                                      pmeTarget,
 255                                                                      bondedTarget,
 256                                                                      updateTarget,
 257                                                                      useGpuForNonbonded,
 258                                                                      useGpuForPme,
 259                                                                      rankHasPpTask,
 260                                                                      rankHasPmeTask);
 261     /* Communicate among ranks on this node to find each task that can
 262      * be executed on a GPU, on each rank. */
 263     auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, physicalNodeComm);
 264     size_t numGpuTasksOnThisNode   = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
 265
 266     std::exception_ptr             exceptionPtr;
 267     std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode;
 268     std::vector<int>               deviceIdAssignment;
 269     try
 270     {
 271         // Use the GPU IDs from the user if they supplied
 272         // them. Otherwise, choose from the compatible GPUs.
 273         //
 274         // GPU ID assignment strings, if provided, cover all the ranks
 275         // on a node. If nodes or the process placement on them are
 276         // heterogeneous, then the GMX_GPU_ID environment variable
 277         // must be set by a user who also wishes to direct GPU ID
 278         // assignment.  Thus this implementation of task assignment
 279         // can assume it has a GPU ID assignment appropriate for the
 280         // node upon which its process is running.
 281         //
 282         // Valid GPU ID assignments are `an ordered set of digits that
 283         // identify GPU device IDs (e.g. as understood by the GPU
 284         // runtime, and subject to environment modification such as
 285         // with CUDA_VISIBLE_DEVICES) that will be used for the
 286         // GPU-suitable tasks on all of the ranks of that node.
 287         std::vector<int> generatedGpuIds;
 288         if (userGpuTaskAssignment.empty())
 289         {
 290             ArrayRef<const int> compatibleGpusToUse = availableDevices;
 291
 292             // enforce the single device/rank restriction
 293             if (numRanksOnThisNode == 1 && !compatibleGpusToUse.empty())
 294             {
 295                 compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
 296             }
 297
 298             // When doing automated assignment of GPU tasks to GPU
 299             // IDs, even if we have more than one kind of GPU task, we
 300             // do a simple round-robin assignment. That's not ideal,
 301             // but we don't have any way to do a better job reliably.
 302             generatedGpuIds = makeGpuIds(compatibleGpusToUse, numGpuTasksOnThisNode);
 303
 304             if ((numGpuTasksOnThisNode > availableDevices.size())
 305                 && (numGpuTasksOnThisNode % availableDevices.size() != 0))
 306             {
 307                 // TODO Decorating the message with hostname should be
 308                 // the job of an error-reporting module.
 309                 char host[STRLEN];
 310                 gmx_gethostname(host, STRLEN);
 311
 312                 GMX_THROW(InconsistentInputError(formatString(
 313                         "There were %zu GPU tasks found on node %s, but %zu GPUs were "
 314                         "available. If the GPUs are equivalent, then it is usually best "
 315                         "to have a number of tasks that is a multiple of the number of GPUs. "
 316                         "You should reconsider your GPU task assignment, "
 317                         "number of ranks, or your use of the -nb, -pme, and -npme options, "
 318                         "perhaps after measuring the performance you can get.",
 319                         numGpuTasksOnThisNode,
 320                         host,
 321                         availableDevices.size())));
 322             }
 323             deviceIdAssignment = generatedGpuIds;
 324         }
 325         else
 326         {
 327             if (numGpuTasksOnThisNode != userGpuTaskAssignment.size())
 328             {
 329                 // TODO Decorating the message with hostname should be
 330                 // the job of an error-reporting module.
 331                 char host[STRLEN];
 332                 gmx_gethostname(host, STRLEN);
 333
 334                 GMX_THROW(InconsistentInputError(formatString(
 335                         "There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
 336                         "identified, and these must match. Reconsider your GPU task assignment, "
 337                         "number of ranks, or your use of the -nb, -pme, and -npme options.",
 338                         userGpuTaskAssignment.size(),
 339                         host,
 340                         numGpuTasksOnThisNode)));
 341             }
 342             // Did the user choose compatible GPUs?
 343             checkUserGpuIds(hardwareInfo.deviceInfoList, availableDevices, userGpuTaskAssignment);
 344
 345             deviceIdAssignment = gmx::copyOf(userGpuTaskAssignment);
 346         }
 347         taskAssignmentOnRanksOfThisNode =
 348                 buildTaskAssignment(gpuTasksOnRanksOfThisNode, deviceIdAssignment);
 349     }
 350     catch (...)
 351     {
 352         exceptionPtr = std::current_exception();
 353     }
 354     int countOfExceptionsOnThisRank = int(bool(exceptionPtr));
 355     int countOfExceptionsOverAllRanks = countOverAllRanks(gromacsWorldComm, countOfExceptionsOnThisRank);
 356
 357     // Avoid all ranks spamming the error stream
 358     //
 359     // TODO improve this so that unique errors on different ranks
 360     // are all reported on one rank.
 361     if (countOfExceptionsOnThisRank > 0 && physicalNodeComm.rank_ == 0)
 362     {
 363         try
 364         {
 365             if (exceptionPtr)
 366             {
 367                 std::rethrow_exception(exceptionPtr);
 368             }
 369         }
 370         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 371     }
 372     // TODO Global barrier so that MPI runtimes can
 373     // organize an orderly shutdown if one of the ranks has had to
 374     // issue a fatal error above. When we have MPI-aware error
 375     // handling and reporting, this should be improved (perhaps
 376     // centralized there).
 377     barrierOverAllRanks(gromacsWorldComm);
 378     if (countOfExceptionsOverAllRanks > 0)
 379     {
 380         gmx_fatal(FARGS,
 381                   "Exiting because task assignment failed. If there is no descriptive error "
 382                   "message in the terminal output, please report this failure as a bug.");
 383     }
 384
 385     // TODO There is no check that mdrun -nb gpu or -pme gpu or
 386     // -gpu_id is actually being implemented such that nonbonded tasks
 387     // are being run on compatible GPUs, on all applicable ranks. That
 388     // would require communication.
 389
 390     GpuTaskAssignments gpuTaskAssignments(hardwareInfo);
 391     gpuTaskAssignments.assignmentForAllRanksOnThisNode_ = taskAssignmentOnRanksOfThisNode;
 392     gpuTaskAssignments.indexOfThisRank_                 = physicalNodeComm.rank_;
 393     gpuTaskAssignments.numGpuTasksOnThisNode_           = numGpuTasksOnThisNode;
 394     gpuTaskAssignments.numRanksOnThisNode_              = numRanksOnThisNode;
 395     gpuTaskAssignments.deviceIdsAssigned_               = deviceIdAssignment;
 396     std::sort(gpuTaskAssignments.deviceIdsAssigned_.begin(), gpuTaskAssignments.deviceIdsAssigned_.end());
 397     gpuTaskAssignments.deviceIdsAssigned_.erase(unique(gpuTaskAssignments.deviceIdsAssigned_.begin(),
 398                                                        gpuTaskAssignments.deviceIdsAssigned_.end()),
 399                                                 gpuTaskAssignments.deviceIdsAssigned_.end());
 400     return gpuTaskAssignments;
 401 }
 402
 403 GpuTaskAssignments::GpuTaskAssignments(const gmx_hw_info_t& hardwareInfo) :
 404     hardwareInfo_(hardwareInfo)
 405 {
 406 }
 407
 408 void GpuTaskAssignments::reportGpuUsage(const MDLogger& mdlog,
 409                                         bool            printHostName,
 410                                         bool            useGpuForBonded,
 411                                         PmeRunMode      pmeRunMode,
 412                                         bool            useGpuForUpdate)
 413 {
 414     gmx::reportGpuUsage(mdlog,
 415                         assignmentForAllRanksOnThisNode_,
 416                         numGpuTasksOnThisNode_,
 417                         numRanksOnThisNode_,
 418                         printHostName,
 419                         useGpuForBonded,
 420                         pmeRunMode,
 421                         useGpuForUpdate);
 422 }
 423
 424 /*! \brief Function for whether the task of \c mapping has value \c TaskType.
 425  *
 426  * \param[in] mapping  Current GPU task mapping.
 427  * \returns If \c TaskType task was assigned to the \c mapping.
 428  */
 429 template<GpuTask TaskType>
 430 static bool hasTaskType(const GpuTaskMapping& mapping)
 431 {
 432     return mapping.task_ == TaskType;
 433 }
 434
 435 /*! \brief Function for whether the \c mapping has the GPU PME or Nonbonded task.
 436  *
 437  * \param[in] mapping  Current GPU task mapping.
 438  * \returns If PME on Nonbonded GPU task was assigned to this mapping.
 439  */
 440 static bool hasPmeOrNonbondedTask(const GpuTaskMapping& mapping)
 441 {
 442     return hasTaskType<GpuTask::Pme>(mapping) || hasTaskType<GpuTask::Nonbonded>(mapping);
 443 }
 444
 445 DeviceInformation* GpuTaskAssignments::initDevice(int* deviceId) const
 446 {
 447     DeviceInformation*       deviceInfo        = nullptr;
 448     const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];
 449
 450     // This works because only one task of each type per rank is currently permitted.
 451     auto gpuTaskMapping =
 452             std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasPmeOrNonbondedTask);
 453
 454     if (gpuTaskMapping != gpuTaskAssignment.end())
 455     {
 456         *deviceId  = gpuTaskMapping->deviceId_;
 457         deviceInfo = hardwareInfo_.deviceInfoList[*deviceId].get();
 458         setActiveDevice(*deviceInfo);
 459     }
 460     return deviceInfo;
 461 }
 462
 463 bool GpuTaskAssignments::thisRankHasPmeGpuTask() const
 464 {
 465     const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];
 466
 467     auto pmeGpuTaskMapping = std::find_if(
 468             gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
 469     const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
 470
 471     return thisRankHasPmeGpuTask;
 472 }
 473
 474 bool GpuTaskAssignments::thisRankHasAnyGpuTask() const
 475 {
 476     const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];
 477
 478     const bool thisRankHasAnyGpuTask = !gpuTaskAssignment.empty();
 479     return thisRankHasAnyGpuTask;
 480 }
 481
 482 } // namespace gmx