src/gromacs/taskassignment/findallgputasks.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief
  37  * Defines routine for collecting all GPU tasks found on ranks of a node.
  38  *
  39  * \author Mark Abraham <mark.j.abraham@gmail.com>
  40  * \ingroup module_taskassignment
  41  */
  42 #include "gmxpre.h"
  43
  44 #include "findallgputasks.h"
  45
  46 #include "config.h"
  47
  48 #include <numeric>
  49 #include <vector>
  50
  51 #include "gromacs/taskassignment/decidegpuusage.h"
  52 #include "gromacs/taskassignment/taskassignment.h"
  53 #include "gromacs/utility/arrayref.h"
  54 #include "gromacs/utility/exceptions.h"
  55 #include "gromacs/utility/fatalerror.h"
  56 #include "gromacs/utility/gmxassert.h"
  57 #include "gromacs/utility/gmxmpi.h"
  58 #include "gromacs/utility/physicalnodecommunicator.h"
  59
  60 namespace gmx
  61 {
  62
  63 std::vector<GpuTask> findGpuTasksOnThisRank(const bool       haveGpusOnThisPhysicalNode,
  64                                             const TaskTarget nonbondedTarget,
  65                                             const TaskTarget pmeTarget,
  66                                             const TaskTarget bondedTarget,
  67                                             const TaskTarget updateTarget,
  68                                             const bool       useGpuForNonbonded,
  69                                             const bool       useGpuForPme,
  70                                             const bool       rankHasPpTask,
  71                                             const bool       rankHasPmeTask)
  72 {
  73     std::vector<GpuTask> gpuTasksOnThisRank;
  74     if (rankHasPpTask)
  75     {
  76         if (useGpuForNonbonded)
  77         {
  78             // Note that any bonded tasks on a GPU always accompany a
  79             // non-bonded task.
  80             if (haveGpusOnThisPhysicalNode)
  81             {
  82                 gpuTasksOnThisRank.push_back(GpuTask::Nonbonded);
  83             }
  84             else if (nonbondedTarget == TaskTarget::Gpu)
  85             {
  86                 gmx_fatal(FARGS,
  87                           "Cannot run short-ranged nonbonded interactions on a GPU because no GPU "
  88                           "is detected.");
  89             }
  90             else if (bondedTarget == TaskTarget::Gpu)
  91             {
  92                 gmx_fatal(FARGS,
  93                           "Cannot run bonded interactions on a GPU because no GPU is detected.");
  94             }
  95             else if (updateTarget == TaskTarget::Gpu)
  96             {
  97                 gmx_fatal(FARGS,
  98                           "Cannot run coordinate update on a GPU because no GPU is detected.");
  99             }
 100         }
 101     }
 102     if (rankHasPmeTask)
 103     {
 104         if (useGpuForPme)
 105         {
 106             if (haveGpusOnThisPhysicalNode)
 107             {
 108                 gpuTasksOnThisRank.push_back(GpuTask::Pme);
 109             }
 110             else if (pmeTarget == TaskTarget::Gpu)
 111             {
 112                 gmx_fatal(FARGS, "Cannot run PME on a GPU because no GPU is detected.");
 113             }
 114         }
 115     }
 116     return gpuTasksOnThisRank;
 117 }
 118
 119 namespace
 120 {
 121
 122 //! Constant used to help minimize preprocessing of code.
 123 constexpr bool g_usingMpi = GMX_MPI;
 124
 125 //! Helper function to prepare to all-gather the vector of non-bonded tasks on this node.
 126 std::vector<int> allgather(const int& input, int numRanks, MPI_Comm communicator)
 127 {
 128     std::vector<int> result(numRanks);
 129     if (g_usingMpi && numRanks > 1)
 130     {
 131         // TODO This works as an MPI_Allgather, but thread-MPI does
 132         // not implement that. It's only intra-node communication, and
 133         // happens rarely, so not worth optimizing (yet). Also
 134         // thread-MPI segfaults with 1 rank.
 135 #if GMX_MPI
 136         int root = 0;
 137         // Calling a C API with the const T * from data() doesn't seem
 138         // to compile warning-free with all versions of MPI headers.
 139         //
 140         // TODO Make an allgather template to deal with this nonsense.
 141         MPI_Gather(const_cast<int*>(&input), 1, MPI_INT, const_cast<int*>(result.data()), 1,
 142                    MPI_INT, root, communicator);
 143         MPI_Bcast(const_cast<int*>(result.data()), result.size(), MPI_INT, root, communicator);
 144 #else
 145         GMX_UNUSED_VALUE(communicator);
 146 #endif
 147     }
 148     else
 149     {
 150         result[0] = input;
 151     }
 152
 153     return result;
 154 }
 155
 156 //! Helper function to compute allgatherv displacements.
 157 std::vector<int> computeDisplacements(ArrayRef<const int> extentOnEachRank, int numRanks)
 158 {
 159     std::vector<int> displacements(numRanks + 1);
 160     displacements[0] = 0;
 161     std::partial_sum(std::begin(extentOnEachRank), std::end(extentOnEachRank),
 162                      std::begin(displacements) + 1);
 163     return displacements;
 164 }
 165
 166 //! Helper function to all-gather the vector of all GPU tasks on ranks of this node.
 167 std::vector<GpuTask> allgatherv(ArrayRef<const GpuTask> input,
 168                                 ArrayRef<const int>     extentOnEachRank,
 169                                 ArrayRef<const int>     displacementForEachRank,
 170                                 MPI_Comm                communicator)
 171 {
 172     // Now allocate the vector and do the allgatherv
 173     int totalExtent = displacementForEachRank.back();
 174
 175     std::vector<GpuTask> result;
 176     result.reserve(totalExtent);
 177     if (g_usingMpi && extentOnEachRank.size() > 1 && totalExtent > 0)
 178     {
 179         result.resize(totalExtent);
 180         // TODO This works as an MPI_Allgatherv, but thread-MPI does
 181         // not implement that. It's only intra-node communication, and
 182         // happens rarely, so not worth optimizing (yet). Also
 183         // thread-MPI segfaults with 1 rank and with zero totalExtent.
 184 #if GMX_MPI
 185         int root = 0;
 186         // Calling a C API with the const T * from data() doesn't seem to compile reliably.
 187         // TODO Make an allgatherv template to deal with this nonsense.
 188         MPI_Gatherv(const_cast<GpuTask*>(input.data()), input.size(), MPI_INT,
 189                     const_cast<GpuTask*>(result.data()), const_cast<int*>(extentOnEachRank.data()),
 190                     const_cast<int*>(displacementForEachRank.data()), MPI_INT, root, communicator);
 191         MPI_Bcast(const_cast<GpuTask*>(result.data()), result.size(), MPI_INT, root, communicator);
 192 #else
 193         GMX_UNUSED_VALUE(communicator);
 194 #endif
 195     }
 196     else
 197     {
 198         for (const auto& gpuTask : input)
 199         {
 200             result.push_back(gpuTask);
 201         }
 202     }
 203     return result;
 204 }
 205
 206 } // namespace
 207
 208 /*! \brief Returns container of all tasks on all ranks of this node
 209  * that are eligible for GPU execution.
 210  *
 211  * Perform all necessary communication for preparing for task
 212  * assignment. Separating this aspect makes it possible to unit test
 213  * the logic of task assignment. */
 214 GpuTasksOnRanks findAllGpuTasksOnThisNode(ArrayRef<const GpuTask>         gpuTasksOnThisRank,
 215                                           const PhysicalNodeCommunicator& physicalNodeComm)
 216 {
 217     int      numRanksOnThisNode = physicalNodeComm.size_;
 218     MPI_Comm communicator       = physicalNodeComm.comm_;
 219     // Find out how many GPU tasks are on each rank on this node.
 220     auto numGpuTasksOnEachRankOfThisNode =
 221             allgather(gpuTasksOnThisRank.size(), numRanksOnThisNode, communicator);
 222
 223     /* Collect on each rank of this node a vector describing all
 224      * GPU tasks on this node, in ascending order of rank. This
 225      * requires a vector allgather. The displacements indicate where
 226      * the GPU tasks on each rank of this node start and end within
 227      * the vector. */
 228     auto displacementsForEachRank =
 229             computeDisplacements(numGpuTasksOnEachRankOfThisNode, numRanksOnThisNode);
 230     auto gpuTasksOnThisNode = allgatherv(gpuTasksOnThisRank, numGpuTasksOnEachRankOfThisNode,
 231                                          displacementsForEachRank, communicator);
 232
 233     /* Next, we re-use the displacements to break up the vector
 234      * of GPU tasks into something that can be indexed like
 235      * gpuTasks[rankIndex][taskIndex]. */
 236     GpuTasksOnRanks gpuTasksOnRanksOfThisNode;
 237     // TODO This would be nicer if we had a good abstraction for "pair
 238     // of iterators that point to adjacent container elements" or
 239     // "iterator that points to the first of a pair of valid adjacent
 240     // container elements, or end".
 241     GMX_ASSERT(displacementsForEachRank.size() > 1,
 242                "Even with one rank, there's always both a start and end displacement");
 243     auto currentDisplacementIt = displacementsForEachRank.begin();
 244     auto nextDisplacementIt    = currentDisplacementIt + 1;
 245     do
 246     {
 247         gpuTasksOnRanksOfThisNode.emplace_back(std::vector<GpuTask>());
 248         for (auto taskOnThisRankIndex = *currentDisplacementIt;
 249              taskOnThisRankIndex != *nextDisplacementIt; ++taskOnThisRankIndex)
 250         {
 251             gpuTasksOnRanksOfThisNode.back().push_back(gpuTasksOnThisNode[taskOnThisRankIndex]);
 252         }
 253
 254         currentDisplacementIt = nextDisplacementIt;
 255         ++nextDisplacementIt;
 256     } while (nextDisplacementIt != displacementsForEachRank.end());
 257
 258     return gpuTasksOnRanksOfThisNode;
 259 }
 260
 261 } // namespace gmx