2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * Defines helper and factory functionality for task assignment.
39 * Note that the GPU ID assignment could potentially match many
40 * different kinds of simulation setups, including ranks from multiple
41 * simulations, ranks from the same simulation, and/or ranks with duty
42 * only for particular tasks (e.g. PME-only ranks). Which GPU ID
43 * assignments are valid will naturally depend on the other run-time
44 * options given to mdrun, and the current capabilities of the
47 * \author Mark Abraham <mark.j.abraham@gmail.com>
48 * \ingroup module_taskassignment
52 #include "taskassignment.h"
61 #include "gromacs/domdec/domdec.h"
62 #include "gromacs/gmxlib/network.h"
63 #include "gromacs/hardware/device_management.h"
64 #include "gromacs/hardware/hw_info.h"
65 #include "gromacs/mdrunutility/multisim.h"
66 #include "gromacs/mdtypes/commrec.h"
67 #include "gromacs/taskassignment/usergpuids.h"
68 #include "gromacs/utility/cstringutil.h"
69 #include "gromacs/utility/exceptions.h"
70 #include "gromacs/utility/fatalerror.h"
71 #include "gromacs/utility/gmxassert.h"
72 #include "gromacs/utility/gmxmpi.h"
73 #include "gromacs/utility/logger.h"
74 #include "gromacs/utility/physicalnodecommunicator.h"
75 #include "gromacs/utility/stringutil.h"
76 #include "gromacs/utility/sysinfo.h"
78 #include "findallgputasks.h"
79 #include "reportgpuusage.h"
87 /*! \brief Build data structure of types of GPU tasks on a rank,
88 * together with the mapped GPU device IDs, for all GPU tasks on all
89 * the ranks of this node.
91 * \param[in] gpuTasksOnRanksOfThisNode For each rank on this node, the set of tasks
92 * that are eligible to run on GPUs.
93 * \param[in] gpuIds The user-supplied GPU IDs.
95 std::vector<GpuTaskAssignment> buildTaskAssignment(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode,
96 ArrayRef<const int> gpuIds)
98 std::vector<GpuTaskAssignment> gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode.size());
100 // Loop over the ranks on this node, and the tasks on each
101 // rank. For each task, take the next device ID from those
102 // provided by the user, to build a vector of mappings of task to
103 // ID, for each rank on this node. Note that if there have not
104 // been any GPU tasks identified, then gpuIds can be empty.
105 auto currentGpuId = gpuIds.begin();
106 auto gpuTaskAssignmentOnRank = gpuTaskAssignmentOnRanksOfThisNode.begin();
107 for (const auto& gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
109 gpuTaskAssignmentOnRank->reserve(gpuTasksOnRank.size());
110 for (const auto& gpuTaskType : gpuTasksOnRank)
112 GMX_RELEASE_ASSERT(currentGpuId != gpuIds.end(), "Indexing out of range for GPU tasks");
113 gpuTaskAssignmentOnRank->push_back({ gpuTaskType, *currentGpuId });
116 GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank->size() == gpuTasksOnRank.size(),
117 "Mismatch in number of GPU tasks on a rank with the number of elements "
118 "in the resulting task assignment");
119 ++gpuTaskAssignmentOnRank;
122 return gpuTaskAssignmentOnRanksOfThisNode;
125 /*! \brief Return whether a GPU device is shared between any ranks.
127 * Sharing GPUs among multiple ranks is possible via either user or
128 * automated selection. */
129 bool isAnyGpuSharedBetweenRanks(ArrayRef<const GpuTaskAssignment> gpuTaskAssignments)
131 // Loop over all ranks i, looking on all higher ranks j whether
132 // any tasks on them share GPU device IDs.
134 // TODO Should this functionality also consider whether tasks on
135 // the same rank are sharing a device?
136 for (size_t i = 0; i < gpuTaskAssignments.size(); ++i)
138 for (const auto& taskOnRankI : gpuTaskAssignments[i])
140 for (size_t j = i + 1; j < gpuTaskAssignments.size(); ++j)
142 for (const auto& taskOnRankJ : gpuTaskAssignments[j])
144 if (taskOnRankI.deviceId_ == taskOnRankJ.deviceId_)
157 void GpuTaskAssignments::logPerformanceHints(const MDLogger& mdlog, size_t numAvailableDevicesOnThisNode)
159 if (numAvailableDevicesOnThisNode > numGpuTasksOnThisNode_)
161 /* TODO In principle, this warning could be warranted only on
162 * some nodes, but we lack the infrastructure to do a good job
163 * of reporting that. */
164 GMX_LOG(mdlog.warning)
167 "NOTE: You assigned the GPU tasks on a node such that some GPUs "
168 "available on that node are unused, which might not be optimal.");
171 if (isAnyGpuSharedBetweenRanks(assignmentForAllRanksOnThisNode_))
173 GMX_LOG(mdlog.warning)
176 "NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good "
177 "idea if you have measured the performance of alternatives.");
184 //! Counts all the GPU tasks on this node.
185 size_t countGpuTasksOnThisNode(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode)
187 size_t numGpuTasksOnThisNode = 0;
188 for (const auto& gpuTasksOnRank : gpuTasksOnRanksOfThisNode)
190 numGpuTasksOnThisNode += gpuTasksOnRank.size();
192 return numGpuTasksOnThisNode;
195 /*! \brief Return on each rank the total count over all ranks of all
197 int countOverAllRanks(MPI_Comm comm, int countOnThisRank)
202 MPI_Comm_size(comm, &numRanks);
205 MPI_Allreduce(&countOnThisRank, &sum, 1, MPI_INT, MPI_SUM, comm);
209 GMX_UNUSED_VALUE(comm);
212 sum = countOnThisRank;
218 /*! \brief Barrier over all rank in \p comm */
219 void barrierOverAllRanks(MPI_Comm comm)
223 MPI_Comm_size(comm, &numRanks);
229 GMX_UNUSED_VALUE(comm);
235 GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;
237 GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const gmx::ArrayRef<const int> availableDevices,
238 const gmx::ArrayRef<const int> userGpuTaskAssignment,
239 const gmx_hw_info_t& hardwareInfo,
240 MPI_Comm gromacsWorldComm,
241 const PhysicalNodeCommunicator& physicalNodeComm,
242 const TaskTarget nonbondedTarget,
243 const TaskTarget pmeTarget,
244 const TaskTarget bondedTarget,
245 const TaskTarget updateTarget,
246 const bool useGpuForNonbonded,
247 const bool useGpuForPme,
251 size_t numRanksOnThisNode = physicalNodeComm.size_;
252 std::vector<GpuTask> gpuTasksOnThisRank = findGpuTasksOnThisRank(!availableDevices.empty(),
261 /* Communicate among ranks on this node to find each task that can
262 * be executed on a GPU, on each rank. */
263 auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, physicalNodeComm);
264 size_t numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
266 std::exception_ptr exceptionPtr;
267 std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode;
268 std::vector<int> deviceIdAssignment;
271 // Use the GPU IDs from the user if they supplied
272 // them. Otherwise, choose from the compatible GPUs.
274 // GPU ID assignment strings, if provided, cover all the ranks
275 // on a node. If nodes or the process placement on them are
276 // heterogeneous, then the GMX_GPU_ID environment variable
277 // must be set by a user who also wishes to direct GPU ID
278 // assignment. Thus this implementation of task assignment
279 // can assume it has a GPU ID assignment appropriate for the
280 // node upon which its process is running.
282 // Valid GPU ID assignments are `an ordered set of digits that
283 // identify GPU device IDs (e.g. as understood by the GPU
284 // runtime, and subject to environment modification such as
285 // with CUDA_VISIBLE_DEVICES) that will be used for the
286 // GPU-suitable tasks on all of the ranks of that node.
287 std::vector<int> generatedGpuIds;
288 if (userGpuTaskAssignment.empty())
290 ArrayRef<const int> compatibleGpusToUse = availableDevices;
292 // enforce the single device/rank restriction
293 if (numRanksOnThisNode == 1 && !compatibleGpusToUse.empty())
295 compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
298 // When doing automated assignment of GPU tasks to GPU
299 // IDs, even if we have more than one kind of GPU task, we
300 // do a simple round-robin assignment. That's not ideal,
301 // but we don't have any way to do a better job reliably.
302 generatedGpuIds = makeGpuIds(compatibleGpusToUse, numGpuTasksOnThisNode);
304 if ((numGpuTasksOnThisNode > availableDevices.size())
305 && (numGpuTasksOnThisNode % availableDevices.size() != 0))
307 // TODO Decorating the message with hostname should be
308 // the job of an error-reporting module.
310 gmx_gethostname(host, STRLEN);
312 GMX_THROW(InconsistentInputError(formatString(
313 "There were %zu GPU tasks found on node %s, but %zu GPUs were "
314 "available. If the GPUs are equivalent, then it is usually best "
315 "to have a number of tasks that is a multiple of the number of GPUs. "
316 "You should reconsider your GPU task assignment, "
317 "number of ranks, or your use of the -nb, -pme, and -npme options, "
318 "perhaps after measuring the performance you can get.",
319 numGpuTasksOnThisNode,
321 availableDevices.size())));
323 deviceIdAssignment = generatedGpuIds;
327 if (numGpuTasksOnThisNode != userGpuTaskAssignment.size())
329 // TODO Decorating the message with hostname should be
330 // the job of an error-reporting module.
332 gmx_gethostname(host, STRLEN);
334 GMX_THROW(InconsistentInputError(formatString(
335 "There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were "
336 "identified, and these must match. Reconsider your GPU task assignment, "
337 "number of ranks, or your use of the -nb, -pme, and -npme options.",
338 userGpuTaskAssignment.size(),
340 numGpuTasksOnThisNode)));
342 // Did the user choose compatible GPUs?
343 checkUserGpuIds(hardwareInfo.deviceInfoList, availableDevices, userGpuTaskAssignment);
345 deviceIdAssignment = gmx::copyOf(userGpuTaskAssignment);
347 taskAssignmentOnRanksOfThisNode =
348 buildTaskAssignment(gpuTasksOnRanksOfThisNode, deviceIdAssignment);
352 exceptionPtr = std::current_exception();
354 int countOfExceptionsOnThisRank = int(bool(exceptionPtr));
355 int countOfExceptionsOverAllRanks = countOverAllRanks(gromacsWorldComm, countOfExceptionsOnThisRank);
357 // Avoid all ranks spamming the error stream
359 // TODO improve this so that unique errors on different ranks
360 // are all reported on one rank.
361 if (countOfExceptionsOnThisRank > 0 && physicalNodeComm.rank_ == 0)
367 std::rethrow_exception(exceptionPtr);
370 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
372 // TODO Global barrier so that MPI runtimes can
373 // organize an orderly shutdown if one of the ranks has had to
374 // issue a fatal error above. When we have MPI-aware error
375 // handling and reporting, this should be improved (perhaps
376 // centralized there).
377 barrierOverAllRanks(gromacsWorldComm);
378 if (countOfExceptionsOverAllRanks > 0)
381 "Exiting because task assignment failed. If there is no descriptive error "
382 "message in the terminal output, please report this failure as a bug.");
385 // TODO There is no check that mdrun -nb gpu or -pme gpu or
386 // -gpu_id is actually being implemented such that nonbonded tasks
387 // are being run on compatible GPUs, on all applicable ranks. That
388 // would require communication.
390 GpuTaskAssignments gpuTaskAssignments(hardwareInfo);
391 gpuTaskAssignments.assignmentForAllRanksOnThisNode_ = taskAssignmentOnRanksOfThisNode;
392 gpuTaskAssignments.indexOfThisRank_ = physicalNodeComm.rank_;
393 gpuTaskAssignments.numGpuTasksOnThisNode_ = numGpuTasksOnThisNode;
394 gpuTaskAssignments.numRanksOnThisNode_ = numRanksOnThisNode;
395 gpuTaskAssignments.deviceIdsAssigned_ = deviceIdAssignment;
396 std::sort(gpuTaskAssignments.deviceIdsAssigned_.begin(), gpuTaskAssignments.deviceIdsAssigned_.end());
397 gpuTaskAssignments.deviceIdsAssigned_.erase(unique(gpuTaskAssignments.deviceIdsAssigned_.begin(),
398 gpuTaskAssignments.deviceIdsAssigned_.end()),
399 gpuTaskAssignments.deviceIdsAssigned_.end());
400 return gpuTaskAssignments;
403 GpuTaskAssignments::GpuTaskAssignments(const gmx_hw_info_t& hardwareInfo) :
404 hardwareInfo_(hardwareInfo)
408 void GpuTaskAssignments::reportGpuUsage(const MDLogger& mdlog,
410 bool useGpuForBonded,
411 PmeRunMode pmeRunMode,
412 bool useGpuForUpdate)
414 gmx::reportGpuUsage(mdlog,
415 assignmentForAllRanksOnThisNode_,
416 numGpuTasksOnThisNode_,
424 /*! \brief Function for whether the task of \c mapping has value \c TaskType.
426 * \param[in] mapping Current GPU task mapping.
427 * \returns If \c TaskType task was assigned to the \c mapping.
429 template<GpuTask TaskType>
430 static bool hasTaskType(const GpuTaskMapping& mapping)
432 return mapping.task_ == TaskType;
435 /*! \brief Function for whether the \c mapping has the GPU PME or Nonbonded task.
437 * \param[in] mapping Current GPU task mapping.
438 * \returns If PME on Nonbonded GPU task was assigned to this mapping.
440 static bool hasPmeOrNonbondedTask(const GpuTaskMapping& mapping)
442 return hasTaskType<GpuTask::Pme>(mapping) || hasTaskType<GpuTask::Nonbonded>(mapping);
445 DeviceInformation* GpuTaskAssignments::initDevice(int* deviceId) const
447 DeviceInformation* deviceInfo = nullptr;
448 const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];
450 // This works because only one task of each type per rank is currently permitted.
451 auto gpuTaskMapping =
452 std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasPmeOrNonbondedTask);
454 if (gpuTaskMapping != gpuTaskAssignment.end())
456 *deviceId = gpuTaskMapping->deviceId_;
457 deviceInfo = hardwareInfo_.deviceInfoList[*deviceId].get();
458 setActiveDevice(*deviceInfo);
463 bool GpuTaskAssignments::thisRankHasPmeGpuTask() const
465 const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];
467 auto pmeGpuTaskMapping = std::find_if(
468 gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType<GpuTask::Pme>);
469 const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end());
471 return thisRankHasPmeGpuTask;
474 bool GpuTaskAssignments::thisRankHasAnyGpuTask() const
476 const GpuTaskAssignment& gpuTaskAssignment = assignmentForAllRanksOnThisNode_[indexOfThisRank_];
478 const bool thisRankHasAnyGpuTask = !gpuTaskAssignment.empty();
479 return thisRankHasAnyGpuTask;