src/gromacs/taskassignment/findallgputasks.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief
  37  * Defines routine for collecting all GPU tasks found on ranks of a node.
  38  *
  39  * \author Mark Abraham <mark.j.abraham@gmail.com>
  40  * \ingroup module_taskassignment
  41  */
  42 #include "gmxpre.h"
  43
  44 #include "findallgputasks.h"
  45
  46 #include "config.h"
  47
  48 #include <numeric>
  49 #include <vector>
  50
  51 #include "gromacs/utility/exceptions.h"
  52 #include "gromacs/utility/gmxassert.h"
  53 #include "gromacs/utility/gmxmpi.h"
  54 #include "gromacs/utility/physicalnodecommunicator.h"
  55
  56 namespace gmx
  57 {
  58
  59 namespace
  60 {
  61
  62 //! Constant used to help minimize preprocessing of code.
  63 constexpr bool g_usingMpi = GMX_MPI;
  64
  65 //! Helper function to prepare to all-gather the vector of non-bonded tasks on this node.
  66 std::vector<int> allgather(const int &input,
  67                            int        numRanks,
  68                            MPI_Comm   communicator)
  69 {
  70     std::vector<int> result(numRanks);
  71     if (g_usingMpi && numRanks > 1)
  72     {
  73         // TODO This works as an MPI_Allgather, but thread-MPI does
  74         // not implement that. It's only intra-node communication, and
  75         // happens rarely, so not worth optimizing (yet). Also
  76         // thread-MPI segfaults with 1 rank.
  77 #if GMX_MPI
  78         int root = 0;
  79         // Calling a C API with the const T * from data() doesn't seem
  80         // to compile warning-free with all versions of MPI headers.
  81         //
  82         // TODO Make an allgather template to deal with this nonsense.
  83         MPI_Gather(const_cast<int *>(&input),
  84                    1,
  85                    MPI_INT,
  86                    const_cast<int *>(result.data()),
  87                    1,
  88                    MPI_INT,
  89                    root,
  90                    communicator);
  91         MPI_Bcast(const_cast<int *>(result.data()),
  92                   result.size(),
  93                   MPI_INT,
  94                   root,
  95                   communicator);
  96 #else
  97         GMX_UNUSED_VALUE(communicator);
  98 #endif
  99     }
 100     else
 101     {
 102         result[0] = input;
 103     }
 104
 105     return result;
 106 }
 107
 108 //! Helper function to compute allgatherv displacements.
 109 std::vector<int> computeDisplacements(ArrayRef<const int> extentOnEachRank,
 110                                       int                 numRanks)
 111 {
 112     std::vector<int> displacements(numRanks + 1);
 113     displacements[0] = 0;
 114     std::partial_sum(std::begin(extentOnEachRank), std::end(extentOnEachRank), std::begin(displacements) + 1);
 115     return displacements;
 116 }
 117
 118 //! Helper function to all-gather the vector of all GPU tasks on ranks of this node.
 119 std::vector<GpuTask> allgatherv(ArrayRef<const GpuTask> input,
 120                                 ArrayRef<const int>     extentOnEachRank,
 121                                 ArrayRef<const int>     displacementForEachRank,
 122                                 MPI_Comm                communicator)
 123 {
 124     // Now allocate the vector and do the allgatherv
 125     int                  totalExtent = displacementForEachRank.back();
 126
 127     std::vector<GpuTask> result;
 128     result.reserve(totalExtent);
 129     if (g_usingMpi && extentOnEachRank.size() > 1 && totalExtent > 0)
 130     {
 131         result.resize(totalExtent);
 132         // TODO This works as an MPI_Allgatherv, but thread-MPI does
 133         // not implement that. It's only intra-node communication, and
 134         // happens rarely, so not worth optimizing (yet). Also
 135         // thread-MPI segfaults with 1 rank and with zero totalExtent.
 136 #if GMX_MPI
 137         int root = 0;
 138         // Calling a C API with the const T * from data() doesn't seem to compile reliably.
 139         // TODO Make an allgatherv template to deal with this nonsense.
 140         MPI_Gatherv(const_cast<GpuTask *>(input.data()),
 141                     input.size(),
 142                     MPI_INT,
 143                     const_cast<GpuTask *>(result.data()),
 144                     const_cast<int *>(extentOnEachRank.data()),
 145                     const_cast<int *>(displacementForEachRank.data()),
 146                     MPI_INT,
 147                     root,
 148                     communicator);
 149         MPI_Bcast(const_cast<GpuTask *>(result.data()),
 150                   result.size(),
 151                   MPI_INT,
 152                   root,
 153                   communicator);
 154 #else
 155         GMX_UNUSED_VALUE(communicator);
 156 #endif
 157     }
 158     else
 159     {
 160         for (const auto &gpuTask : input)
 161         {
 162             result.push_back(gpuTask);
 163         }
 164     }
 165     return result;
 166 }
 167
 168 }   // namespace
 169
 170 /*! \brief Returns container of all tasks on all ranks of this node
 171  * that are eligible for GPU execution.
 172  *
 173  * Perform all necessary communication for preparing for task
 174  * assignment. Separating this aspect makes it possible to unit test
 175  * the logic of task assignment. */
 176 GpuTasksOnRanks
 177 findAllGpuTasksOnThisNode(ArrayRef<const GpuTask>         gpuTasksOnThisRank,
 178                           const PhysicalNodeCommunicator &physicalNodeComm)
 179 {
 180     int      numRanksOnThisNode = physicalNodeComm.size_;
 181     MPI_Comm communicator       = physicalNodeComm.comm_;
 182     // Find out how many GPU tasks are on each rank on this node.
 183     auto     numGpuTasksOnEachRankOfThisNode =
 184         allgather(gpuTasksOnThisRank.size(), numRanksOnThisNode, communicator);
 185
 186     /* Collect on each rank of this node a vector describing all
 187      * GPU tasks on this node, in ascending order of rank. This
 188      * requires a vector allgather. The displacements indicate where
 189      * the GPU tasks on each rank of this node start and end within
 190      * the vector. */
 191     auto displacementsForEachRank = computeDisplacements(numGpuTasksOnEachRankOfThisNode, numRanksOnThisNode);
 192     auto gpuTasksOnThisNode       = allgatherv(gpuTasksOnThisRank, numGpuTasksOnEachRankOfThisNode,
 193                                                displacementsForEachRank, communicator);
 194
 195     /* Next, we re-use the displacements to break up the vector
 196      * of GPU tasks into something that can be indexed like
 197      * gpuTasks[rankIndex][taskIndex]. */
 198     GpuTasksOnRanks gpuTasksOnRanksOfThisNode;
 199     // TODO This would be nicer if we had a good abstraction for "pair
 200     // of iterators that point to adjacent container elements" or
 201     // "iterator that points to the first of a pair of valid adjacent
 202     // container elements, or end".
 203     GMX_ASSERT(displacementsForEachRank.size() > 1, "Even with one rank, there's always both a start and end displacement");
 204     auto currentDisplacementIt = displacementsForEachRank.begin();
 205     auto nextDisplacementIt    = currentDisplacementIt + 1;
 206     do
 207     {
 208         gpuTasksOnRanksOfThisNode.emplace_back(std::vector<GpuTask>());
 209         for (auto taskOnThisRankIndex = *currentDisplacementIt; taskOnThisRankIndex != *nextDisplacementIt; ++taskOnThisRankIndex)
 210         {
 211             gpuTasksOnRanksOfThisNode.back().push_back(gpuTasksOnThisNode[taskOnThisRankIndex]);
 212         }
 213
 214         currentDisplacementIt = nextDisplacementIt;
 215         ++nextDisplacementIt;
 216     }
 217     while (nextDisplacementIt != displacementsForEachRank.end());
 218
 219     return gpuTasksOnRanksOfThisNode;
 220 }
 221
 222 }  // namespace gmx