src/gromacs/taskassignment/findallgputasks.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief
  37  * Defines routine for collecting all GPU tasks found on ranks of a node.
  38  *
  39  * \author Mark Abraham <mark.j.abraham@gmail.com>
  40  * \ingroup module_taskassignment
  41  */
  42 #include "gmxpre.h"
  43
  44 #include "findallgputasks.h"
  45
  46 #include "config.h"
  47
  48 #include <numeric>
  49 #include <vector>
  50
  51 #include "gromacs/taskassignment/decidegpuusage.h"
  52 #include "gromacs/taskassignment/taskassignment.h"
  53 #include "gromacs/utility/arrayref.h"
  54 #include "gromacs/utility/exceptions.h"
  55 #include "gromacs/utility/fatalerror.h"
  56 #include "gromacs/utility/gmxassert.h"
  57 #include "gromacs/utility/gmxmpi.h"
  58 #include "gromacs/utility/physicalnodecommunicator.h"
  59
  60 namespace gmx
  61 {
  62
  63 std::vector<GpuTask> findGpuTasksOnThisRank(const bool       haveGpusOnThisPhysicalNode,
  64                                             const TaskTarget nonbondedTarget,
  65                                             const TaskTarget pmeTarget,
  66                                             const TaskTarget bondedTarget,
  67                                             const TaskTarget updateTarget,
  68                                             const bool       useGpuForNonbonded,
  69                                             const bool       useGpuForPme,
  70                                             const bool       rankHasPpTask,
  71                                             const bool       rankHasPmeTask)
  72 {
  73     std::vector<GpuTask> gpuTasksOnThisRank;
  74     if (rankHasPpTask)
  75     {
  76         if (useGpuForNonbonded)
  77         {
  78             // Note that any bonded tasks on a GPU always accompany a
  79             // non-bonded task.
  80             if (haveGpusOnThisPhysicalNode)
  81             {
  82                 gpuTasksOnThisRank.push_back(GpuTask::Nonbonded);
  83             }
  84             else if (nonbondedTarget == TaskTarget::Gpu)
  85             {
  86                 gmx_fatal(FARGS,
  87                           "Cannot run short-ranged nonbonded interactions on a GPU because no GPU "
  88                           "is detected.");
  89             }
  90             else if (bondedTarget == TaskTarget::Gpu)
  91             {
  92                 gmx_fatal(FARGS,
  93                           "Cannot run bonded interactions on a GPU because no GPU is detected.");
  94             }
  95             else if (updateTarget == TaskTarget::Gpu)
  96             {
  97                 gmx_fatal(FARGS,
  98                           "Cannot run coordinate update on a GPU because no GPU is detected.");
  99             }
 100         }
 101     }
 102     if (rankHasPmeTask)
 103     {
 104         if (useGpuForPme)
 105         {
 106             if (haveGpusOnThisPhysicalNode)
 107             {
 108                 gpuTasksOnThisRank.push_back(GpuTask::Pme);
 109             }
 110             else if (pmeTarget == TaskTarget::Gpu)
 111             {
 112                 gmx_fatal(FARGS, "Cannot run PME on a GPU because no GPU is detected.");
 113             }
 114         }
 115     }
 116     return gpuTasksOnThisRank;
 117 }
 118
 119 namespace
 120 {
 121
 122 //! Constant used to help minimize preprocessing of code.
 123 constexpr bool g_usingMpi = GMX_MPI;
 124
 125 //! Helper function to prepare to all-gather the vector of non-bonded tasks on this node.
 126 std::vector<int> allgather(const int& input, int numRanks, MPI_Comm communicator)
 127 {
 128     std::vector<int> result(numRanks);
 129     if (g_usingMpi && numRanks > 1)
 130     {
 131         // TODO This works as an MPI_Allgather, but thread-MPI does
 132         // not implement that. It's only intra-node communication, and
 133         // happens rarely, so not worth optimizing (yet). Also
 134         // thread-MPI segfaults with 1 rank.
 135 #if GMX_MPI
 136         int root = 0;
 137         // Calling a C API with the const T * from data() doesn't seem
 138         // to compile warning-free with all versions of MPI headers.
 139         //
 140         // TODO Make an allgather template to deal with this nonsense.
 141         MPI_Gather(const_cast<int*>(&input), 1, MPI_INT, const_cast<int*>(result.data()), 1, MPI_INT, root, communicator);
 142         MPI_Bcast(const_cast<int*>(result.data()), result.size(), MPI_INT, root, communicator);
 143 #else
 144         GMX_UNUSED_VALUE(communicator);
 145 #endif
 146     }
 147     else
 148     {
 149         result[0] = input;
 150     }
 151
 152     return result;
 153 }
 154
 155 //! Helper function to compute allgatherv displacements.
 156 std::vector<int> computeDisplacements(ArrayRef<const int> extentOnEachRank, int numRanks)
 157 {
 158     std::vector<int> displacements(numRanks + 1);
 159     displacements[0] = 0;
 160     std::partial_sum(
 161             std::begin(extentOnEachRank), std::end(extentOnEachRank), std::begin(displacements) + 1);
 162     return displacements;
 163 }
 164
 165 //! Helper function to all-gather the vector of all GPU tasks on ranks of this node.
 166 std::vector<GpuTask> allgatherv(ArrayRef<const GpuTask> input,
 167                                 ArrayRef<const int>     extentOnEachRank,
 168                                 ArrayRef<const int>     displacementForEachRank,
 169                                 MPI_Comm                communicator)
 170 {
 171     // Now allocate the vector and do the allgatherv
 172     int totalExtent = displacementForEachRank.back();
 173
 174     std::vector<GpuTask> result;
 175     result.reserve(totalExtent);
 176     if (g_usingMpi && extentOnEachRank.size() > 1 && totalExtent > 0)
 177     {
 178         result.resize(totalExtent);
 179         // TODO This works as an MPI_Allgatherv, but thread-MPI does
 180         // not implement that. It's only intra-node communication, and
 181         // happens rarely, so not worth optimizing (yet). Also
 182         // thread-MPI segfaults with 1 rank and with zero totalExtent.
 183 #if GMX_MPI
 184         int root = 0;
 185         // Calling a C API with the const T * from data() doesn't seem to compile reliably.
 186         // TODO Make an allgatherv template to deal with this nonsense.
 187         MPI_Gatherv(const_cast<GpuTask*>(input.data()),
 188                     input.size(),
 189                     MPI_INT,
 190                     const_cast<GpuTask*>(result.data()),
 191                     const_cast<int*>(extentOnEachRank.data()),
 192                     const_cast<int*>(displacementForEachRank.data()),
 193                     MPI_INT,
 194                     root,
 195                     communicator);
 196         MPI_Bcast(const_cast<GpuTask*>(result.data()), result.size(), MPI_INT, root, communicator);
 197 #else
 198         GMX_UNUSED_VALUE(communicator);
 199 #endif
 200     }
 201     else
 202     {
 203         for (const auto& gpuTask : input)
 204         {
 205             result.push_back(gpuTask);
 206         }
 207     }
 208     return result;
 209 }
 210
 211 } // namespace
 212
 213 /*! \brief Returns container of all tasks on all ranks of this node
 214  * that are eligible for GPU execution.
 215  *
 216  * Perform all necessary communication for preparing for task
 217  * assignment. Separating this aspect makes it possible to unit test
 218  * the logic of task assignment. */
 219 GpuTasksOnRanks findAllGpuTasksOnThisNode(ArrayRef<const GpuTask>         gpuTasksOnThisRank,
 220                                           const PhysicalNodeCommunicator& physicalNodeComm)
 221 {
 222     int      numRanksOnThisNode = physicalNodeComm.size_;
 223     MPI_Comm communicator       = physicalNodeComm.comm_;
 224     // Find out how many GPU tasks are on each rank on this node.
 225     auto numGpuTasksOnEachRankOfThisNode =
 226             allgather(gpuTasksOnThisRank.size(), numRanksOnThisNode, communicator);
 227
 228     /* Collect on each rank of this node a vector describing all
 229      * GPU tasks on this node, in ascending order of rank. This
 230      * requires a vector allgather. The displacements indicate where
 231      * the GPU tasks on each rank of this node start and end within
 232      * the vector. */
 233     auto displacementsForEachRank =
 234             computeDisplacements(numGpuTasksOnEachRankOfThisNode, numRanksOnThisNode);
 235     auto gpuTasksOnThisNode = allgatherv(
 236             gpuTasksOnThisRank, numGpuTasksOnEachRankOfThisNode, displacementsForEachRank, communicator);
 237
 238     /* Next, we re-use the displacements to break up the vector
 239      * of GPU tasks into something that can be indexed like
 240      * gpuTasks[rankIndex][taskIndex]. */
 241     GpuTasksOnRanks gpuTasksOnRanksOfThisNode;
 242     // TODO This would be nicer if we had a good abstraction for "pair
 243     // of iterators that point to adjacent container elements" or
 244     // "iterator that points to the first of a pair of valid adjacent
 245     // container elements, or end".
 246     GMX_ASSERT(displacementsForEachRank.size() > 1,
 247                "Even with one rank, there's always both a start and end displacement");
 248     auto currentDisplacementIt = displacementsForEachRank.begin();
 249     auto nextDisplacementIt    = currentDisplacementIt + 1;
 250     do
 251     {
 252         gpuTasksOnRanksOfThisNode.emplace_back(std::vector<GpuTask>());
 253         for (auto taskOnThisRankIndex = *currentDisplacementIt; taskOnThisRankIndex != *nextDisplacementIt;
 254              ++taskOnThisRankIndex)
 255         {
 256             gpuTasksOnRanksOfThisNode.back().push_back(gpuTasksOnThisNode[taskOnThisRankIndex]);
 257         }
 258
 259         currentDisplacementIt = nextDisplacementIt;
 260         ++nextDisplacementIt;
 261     } while (nextDisplacementIt != displacementsForEachRank.end());
 262
 263     return gpuTasksOnRanksOfThisNode;
 264 }
 265
 266 } // namespace gmx