src/gromacs/hardware/hardwaretopology.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \internal \file
  38  * \brief
  39  * Implements gmx::HardwareTopology.
  40  *
  41  * \author Erik Lindahl <erik.lindahl@gmail.com>
  42  * \ingroup module_hardware
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include "hardwaretopology.h"
  48
  49 #include "config.h"
  50
  51 #include <cstdio>
  52
  53 #include <algorithm>
  54 #include <functional>
  55 #include <limits>
  56 #include <utility>
  57 #include <vector>
  58
  59 #if GMX_USE_HWLOC
  60 #    include <hwloc.h>
  61 #endif
  62
  63 #include "gromacs/hardware/cpuinfo.h"
  64 #include "gromacs/utility/gmxassert.h"
  65
  66 #ifdef HAVE_UNISTD_H
  67 #    include <unistd.h> // sysconf()
  68 #endif
  69 #if GMX_NATIVE_WINDOWS
  70 #    include <windows.h> // GetSystemInfo()
  71 #endif
  72
  73 //! Convenience macro to help us avoid ifdefs each time we use sysconf
  74 #if !defined(_SC_NPROCESSORS_ONLN) && defined(_SC_NPROC_ONLN)
  75 #    define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
  76 #endif
  77
  78 namespace gmx
  79 {
  80
  81 namespace
  82 {
  83
  84 /*****************************************************************************
  85  *                                                                           *
  86  *   Utility functions for extracting hardware topology from CpuInfo object  *
  87  *                                                                           *
  88  *****************************************************************************/
  89
  90 /*! \brief Initialize machine data from basic information in cpuinfo
  91  *
  92  *  \param  machine      Machine tree structure where information will be assigned
  93  *                       if the cpuinfo object contains topology information.
  94  *  \param  supportLevel If topology information is available in CpuInfo,
  95  *                       this will be updated to reflect the amount of
  96  *                       information written to the machine structure.
  97  */
  98 void parseCpuInfo(HardwareTopology::Machine* machine, HardwareTopology::SupportLevel* supportLevel)
  99 {
 100     CpuInfo cpuInfo(CpuInfo::detect());
 101
 102     if (!cpuInfo.logicalProcessors().empty())
 103     {
 104         int nSockets   = 0;
 105         int nCores     = 0;
 106         int nHwThreads = 0;
 107
 108         // Copy the logical processor information from cpuinfo
 109         for (auto& l : cpuInfo.logicalProcessors())
 110         {
 111             machine->logicalProcessors.push_back(
 112                     { l.socketRankInMachine, l.coreRankInSocket, l.hwThreadRankInCore, -1 });
 113             nSockets   = std::max(nSockets, l.socketRankInMachine);
 114             nCores     = std::max(nCores, l.coreRankInSocket);
 115             nHwThreads = std::max(nHwThreads, l.hwThreadRankInCore);
 116         }
 117
 118         // Fill info form sockets/cores/hwthreads
 119         int socketId   = 0;
 120         int coreId     = 0;
 121         int hwThreadId = 0;
 122
 123         machine->sockets.resize(nSockets + 1);
 124         for (auto& s : machine->sockets)
 125         {
 126             s.id = socketId++;
 127             s.cores.resize(nCores + 1);
 128             for (auto& c : s.cores)
 129             {
 130                 c.id         = coreId++;
 131                 c.numaNodeId = -1; // No numa information
 132                 c.hwThreads.resize(nHwThreads + 1);
 133                 for (auto& t : c.hwThreads)
 134                 {
 135                     t.id                 = hwThreadId++;
 136                     t.logicalProcessorId = -1; // set as unassigned for now
 137                 }
 138             }
 139         }
 140
 141         // Fill the logical processor id in the right place
 142         for (std::size_t i = 0; i < machine->logicalProcessors.size(); i++)
 143         {
 144             const HardwareTopology::LogicalProcessor& l = machine->logicalProcessors[i];
 145             machine->sockets[l.socketRankInMachine]
 146                     .cores[l.coreRankInSocket]
 147                     .hwThreads[l.hwThreadRankInCore]
 148                     .logicalProcessorId = static_cast<int>(i);
 149         }
 150         machine->logicalProcessorCount = machine->logicalProcessors.size();
 151         *supportLevel                  = HardwareTopology::SupportLevel::Basic;
 152     }
 153     else
 154     {
 155         *supportLevel = HardwareTopology::SupportLevel::None;
 156     }
 157 }
 158
 159 #if GMX_USE_HWLOC
 160
 161 #    if HWLOC_API_VERSION < 0x00010b00
 162 #        define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET
 163 #        define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
 164 #    endif
 165
// Preprocessor flag that is 1 when the hwloc API is version 2.x.x and 0 when it is 1.x.x
 167 #    if HWLOC_API_VERSION >= 0x00020000
 168 #        define GMX_HWLOC_API_VERSION_IS_2XX 1
 169 #        if GMX_HWLOC_API_VERSION < 0x00020000
 170 #            error "HWLOC library major version set during configuration is 1, but currently using version 2 headers"
 171 #        endif
 172 #    else
 173 #        define GMX_HWLOC_API_VERSION_IS_2XX 0
 174 #        if GMX_HWLOC_API_VERSION >= 0x00020000
 175 #            error "HWLOC library major version set during configuration is 2, but currently using version 1 headers"
 176 #        endif
 177 #    endif
 178
 179 /*****************************************************************************
 180  *                                                                           *
 181  *   Utility functions for extracting hardware topology from hwloc library   *
 182  *                                                                           *
 183  *****************************************************************************/
 184
 185 // Compatibility function for accessing hwloc_obj_t object memory with different API versions of hwloc
 186 std::size_t getHwLocObjectMemory(const hwloc_obj* obj)
 187 {
 188 #    if GMX_HWLOC_API_VERSION_IS_2XX
 189     return obj->total_memory;
 190 #    else
 191     return obj->memory.total_memory;
 192 #    endif
 193 }
 194
 195 /*! \brief Return vector of all descendants of a given type in hwloc topology
 196  *
 197  *  \param topo  hwloc topology handle that has been initialized and loaded
 198  *  \param obj   Non-null hwloc object.
 199  *  \param type  hwloc object type to find. The routine will only search
 200  *               on levels below obj.
 201  *
 202  *  \return vector containing all the objects of given type that are
 203  *          descendants of the provided object. If no objects of this type
 204  *          were found, the vector will be empty.
 205  */
 206 std::vector<const hwloc_obj*> getHwLocDescendantsByType(const hwloc_topology*  topo,
 207                                                         const hwloc_obj*       obj,
 208                                                         const hwloc_obj_type_t type)
 209 {
 210     GMX_RELEASE_ASSERT(obj, "NULL hwloc object provided to getHwLocDescendantsByType()");
 211
 212     std::vector<const hwloc_obj*> v;
 213
 214     if (obj->type == type)
 215     {
 216         v.push_back(obj);
 217     }
 218     // Go through children; if this object has no children obj->arity is 0,
 219     // and we'll return an empty vector.
 220     hwloc_obj_t tempNode = nullptr;
 221     while ((tempNode = hwloc_get_next_child(const_cast<hwloc_topology_t>(topo),
 222                                             const_cast<hwloc_obj_t>(obj), tempNode))
 223            != nullptr)
 224     {
 225         std::vector<const hwloc_obj*> v2 = getHwLocDescendantsByType(topo, tempNode, type);
 226         v.insert(v.end(), v2.begin(), v2.end());
 227     }
 228     return v;
 229 }
 230
 231 /*! \brief Read information about sockets, cores and threads from hwloc topology
 232  *
 233  *  \param topo    hwloc topology handle that has been initialized and loaded
 234  *  \param machine Pointer to the machine structure in the HardwareTopology
 235  *                 class, where the tree of sockets/cores/threads will be written.
 236  *
 237  *  \return If all the data is found
 238  */
 239 bool parseHwLocSocketsCoresThreads(hwloc_topology_t topo, HardwareTopology::Machine* machine)
 240 {
 241     const hwloc_obj*              root         = hwloc_get_root_obj(topo);
 242     std::vector<const hwloc_obj*> hwlocSockets = getHwLocDescendantsByType(topo, root, HWLOC_OBJ_PACKAGE);
 243
 244     machine->logicalProcessorCount = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);
 245     machine->logicalProcessors.resize(machine->logicalProcessorCount);
 246     machine->sockets.resize(hwlocSockets.size());
 247
 248     bool topologyOk = !hwlocSockets.empty(); // Fail if we have no sockets in machine
 249
 250     for (std::size_t i = 0; i < hwlocSockets.size() && topologyOk; i++)
 251     {
 252         // Assign information about this socket
 253         machine->sockets[i].id = hwlocSockets[i]->logical_index;
 254
 255         // Get children (cores)
 256         std::vector<const hwloc_obj*> hwlocCores =
 257                 getHwLocDescendantsByType(topo, hwlocSockets[i], HWLOC_OBJ_CORE);
 258         machine->sockets[i].cores.resize(hwlocCores.size());
 259
 260         topologyOk = topologyOk && !hwlocCores.empty(); // Fail if we have no cores in socket
 261
 262         // Loop over child cores
 263         for (std::size_t j = 0; j < hwlocCores.size() && topologyOk; j++)
 264         {
 265             // Assign information about this core
 266             machine->sockets[i].cores[j].id         = hwlocCores[j]->logical_index;
 267             machine->sockets[i].cores[j].numaNodeId = -1;
 268
 269             // Get children (hwthreads)
 270             std::vector<const hwloc_obj*> hwlocPUs =
 271                     getHwLocDescendantsByType(topo, hwlocCores[j], HWLOC_OBJ_PU);
 272             machine->sockets[i].cores[j].hwThreads.resize(hwlocPUs.size());
 273
 274             topologyOk = topologyOk && !hwlocPUs.empty(); // Fail if we have no hwthreads in core
 275
 276             // Loop over child hwthreads
 277             for (std::size_t k = 0; k < hwlocPUs.size() && topologyOk; k++)
 278             {
 279                 // Assign information about this hwthread
 280                 std::size_t logicalProcessorId               = hwlocPUs[k]->os_index;
 281                 machine->sockets[i].cores[j].hwThreads[k].id = hwlocPUs[k]->logical_index;
 282                 machine->sockets[i].cores[j].hwThreads[k].logicalProcessorId = logicalProcessorId;
 283
 284                 if (logicalProcessorId < machine->logicalProcessors.size())
 285                 {
 286                     // Cross-assign data for this hwthread to the logicalprocess vector
 287                     machine->logicalProcessors[logicalProcessorId].socketRankInMachine =
 288                             static_cast<int>(i);
 289                     machine->logicalProcessors[logicalProcessorId].coreRankInSocket =
 290                             static_cast<int>(j);
 291                     machine->logicalProcessors[logicalProcessorId].hwThreadRankInCore =
 292                             static_cast<int>(k);
 293                     machine->logicalProcessors[logicalProcessorId].numaNodeId = -1;
 294                 }
 295                 else
 296                 {
 297                     topologyOk = false;
 298                 }
 299             }
 300         }
 301     }
 302
 303     if (!topologyOk)
 304     {
 305         machine->logicalProcessors.clear();
 306         machine->sockets.clear();
 307     }
 308     return topologyOk;
 309 }
 310
 311 /*! \brief Read cache information from hwloc topology
 312  *
 313  *  \param topo    hwloc topology handle that has been initialized and loaded
 314  *  \param machine Pointer to the machine structure in the HardwareTopology
 315  *                 class, where cache data will be filled.
 316  *
 317  *  \return If any cache data is found
 318  */
 319 bool parseHwLocCache(hwloc_topology_t topo, HardwareTopology::Machine* machine)
 320 {
 321     // Parse caches up to L5
 322     for (int cachelevel : { 1, 2, 3, 4, 5 })
 323     {
 324         int depth = hwloc_get_cache_type_depth(topo, cachelevel, HWLOC_OBJ_CACHE_DATA);
 325
 326         if (depth >= 0)
 327         {
 328             hwloc_obj_t cache = hwloc_get_next_obj_by_depth(topo, depth, nullptr);
 329             if (cache != nullptr)
 330             {
 331                 std::vector<const hwloc_obj*> hwThreads =
 332                         getHwLocDescendantsByType(topo, cache, HWLOC_OBJ_PU);
 333
 334                 machine->caches.push_back({ static_cast<int>(cache->attr->cache.depth),
 335                                             static_cast<std::size_t>(cache->attr->cache.size),
 336                                             static_cast<int>(cache->attr->cache.linesize),
 337                                             static_cast<int>(cache->attr->cache.associativity),
 338                                             std::max<int>(hwThreads.size(), 1) });
 339             }
 340         }
 341     }
 342     return !machine->caches.empty();
 343 }
 344
 345
 346 /*! \brief Read numa information from hwloc topology
 347  *
 348  *  \param topo    hwloc topology handle that has been initialized and loaded
 349  *  \param machine Pointer to the machine structure in the HardwareTopology
 350  *                 class, where numa information will be filled.
 351  *
 352  *  Hwloc should virtually always be able to detect numa information, but if
 353  *  there is only a single numa node in the system it is not reported at all.
 354  *  In this case we create a single numa node covering all cores.
 355  *
 356  *  This function uses the basic socket/core/thread information detected by
 357  *  parseHwLocSocketsCoresThreads(), which means that routine must have
 358  *  completed successfully before calling this one. If this is not the case,
 359  *  you will get an error return code.
 360  *
 361  *  \return If the data found makes sense (either in the numa node or the
 362  *          entire machine)
 363  */
 364 bool parseHwLocNuma(hwloc_topology_t topo, HardwareTopology::Machine* machine)
 365 {
 366     const hwloc_obj*              root = hwloc_get_root_obj(topo);
 367     std::vector<const hwloc_obj*> hwlocNumaNodes =
 368             getHwLocDescendantsByType(topo, root, HWLOC_OBJ_NUMANODE);
 369     bool topologyOk = true;
 370
 371     if (!hwlocNumaNodes.empty())
 372     {
 373         machine->numa.nodes.resize(hwlocNumaNodes.size());
 374
 375         for (std::size_t i = 0; i < hwlocNumaNodes.size(); i++)
 376         {
 377             machine->numa.nodes[i].id     = hwlocNumaNodes[i]->logical_index;
 378             machine->numa.nodes[i].memory = getHwLocObjectMemory(hwlocNumaNodes[i]);
 379
 380             machine->numa.nodes[i].logicalProcessorId.clear();
 381
 382             // Get list of PUs in this numa node. Get from numa node if v1.x.x, get from numa node's parent if 2.x.x
 383 #    if GMX_HWLOC_API_VERSION_IS_2XX
 384             std::vector<const hwloc_obj*> hwlocPUs =
 385                     getHwLocDescendantsByType(topo, hwlocNumaNodes[i]->parent, HWLOC_OBJ_PU);
 386 #    else
 387             std::vector<const hwloc_obj*> hwlocPUs =
 388                     getHwLocDescendantsByType(topo, hwlocNumaNodes[i], HWLOC_OBJ_PU);
 389 #    endif
 390             for (auto& p : hwlocPUs)
 391             {
 392                 machine->numa.nodes[i].logicalProcessorId.push_back(p->os_index);
 393
 394                 GMX_RELEASE_ASSERT(p->os_index < machine->logicalProcessors.size(),
 395                                    "OS index of PU in hwloc larger than processor count");
 396
 397                 machine->logicalProcessors[p->os_index].numaNodeId = static_cast<int>(i);
 398                 std::size_t s = machine->logicalProcessors[p->os_index].socketRankInMachine;
 399                 std::size_t c = machine->logicalProcessors[p->os_index].coreRankInSocket;
 400
 401                 GMX_RELEASE_ASSERT(s < machine->sockets.size(),
 402                                    "Socket index in logicalProcessors larger than socket count");
 403                 GMX_RELEASE_ASSERT(c < machine->sockets[s].cores.size(),
 404                                    "Core index in logicalProcessors larger than core count");
 405                 // Set numaNodeId in core too
 406                 machine->sockets[s].cores[c].numaNodeId = i;
 407             }
 408         }
 409         // Getting the distance matrix
 410 #    if GMX_HWLOC_API_VERSION_IS_2XX
 411         // with hwloc api v. 2.x.x, distances are no longer directly accessible. Need to retrieve and release hwloc_distances_s object
 412         // In addition, there can now be multiple types of distances, ie latency, bandwidth. We look only for latency, but have to check
 413         // if multiple distance matrices are returned.
 414
 415         // If only 1 numa node exists, the v2.x.x hwloc api won't have a distances matrix, set manually
 416         if (hwlocNumaNodes.size() == 1)
 417         {
 418             machine->numa.relativeLatency = { { 1.0 } };
 419         }
 420         else
 421         {
 422             hwloc_distances_s* dist;
 423             // Set the number of distance matrices to return (1 in our case, but hwloc 2.x.x allows
 424             // for multiple distances types and therefore multiple distance matrices)
 425             unsigned nr = 1;
 426             hwloc_distances_get(topo, &nr, &dist, HWLOC_DISTANCES_KIND_MEANS_LATENCY, 0);
 427             // If no distances were found, nr will be 0, otherwise distances will be populated with
 428             // 1 hwloc_distances_s object
 429             if (nr > 0 && dist->nbobjs == hwlocNumaNodes.size())
 430             {
 431
 432                 machine->numa.relativeLatency.resize(dist->nbobjs);
 433                 for (std::size_t i = 0; i < dist->nbobjs; i++)
 434                 {
 435                     machine->numa.relativeLatency[i].resize(dist->nbobjs);
 436                     for (std::size_t j = 0; j < dist->nbobjs; j++)
 437                     {
 438                         machine->numa.relativeLatency[i][j] = dist->values[i * dist->nbobjs + j];
 439                     }
 440                 }
 441             }
 442             else
 443             {
 444                 topologyOk = false;
 445             }
 446             hwloc_distances_release(topo, dist);
 447         }
 448
 449         // hwloc-2.x provides latencies as integers, but to make things more similar to the case of
 450         // a single numa node as well as hwloc-1.x, we rescale to relative floating-point values and
 451         // also set the largest relative latency value.
 452
 453         // find smallest value in matrix
 454         float minLatency = std::numeric_limits<float>::max(); // large number
 455         float maxLatency = std::numeric_limits<float>::min(); // 0.0
 456         for (const auto& v : machine->numa.relativeLatency)
 457         {
 458             auto result = std::minmax_element(v.begin(), v.end());
 459             minLatency  = std::min(minLatency, *result.first);
 460             maxLatency  = std::max(maxLatency, *result.second);
 461         }
 462
 463         // assign stuff
 464         for (auto& v : machine->numa.relativeLatency)
 465         {
 466             std::transform(v.begin(), v.end(), v.begin(),
 467                            std::bind(std::multiplies<float>(), std::placeholders::_1, 1.0 / minLatency));
 468         }
 469         machine->numa.baseLatency = 1.0; // latencies still do not have any units in hwloc-2.x
 470         machine->numa.maxRelativeLatency = maxLatency / minLatency;
 471
 472 #    else  // GMX_HWLOC_API_VERSION_IS_2XX == false, hwloc api is 1.x.x
 473         int                             depth = hwloc_get_type_depth(topo, HWLOC_OBJ_NUMANODE);
 474         const struct hwloc_distances_s* dist = hwloc_get_whole_distance_matrix_by_depth(topo, depth);
 475         if (dist != nullptr && dist->nbobjs == hwlocNumaNodes.size())
 476         {
 477             machine->numa.baseLatency        = dist->latency_base;
 478             machine->numa.maxRelativeLatency = dist->latency_max;
 479             machine->numa.relativeLatency.resize(dist->nbobjs);
 480             for (std::size_t i = 0; i < dist->nbobjs; i++)
 481             {
 482                 machine->numa.relativeLatency[i].resize(dist->nbobjs);
 483                 for (std::size_t j = 0; j < dist->nbobjs; j++)
 484                 {
 485                     machine->numa.relativeLatency[i][j] = dist->latency[i * dist->nbobjs + j];
 486                 }
 487             }
 488         }
 489         else
 490         {
 491             topologyOk = false;
 492         }
 493 #    endif // end GMX_HWLOC_API_VERSION_IS_2XX == false
 494     }
 495     else
 496     // Deals with the case of no numa nodes found.
 497 #    if GMX_HWLOC_API_VERSION_IS_2XX
 498     // If the hwloc version is 2.x.x, and there is no numa node, something went wrong
 499     {
 500         topologyOk = false;
 501     }
 502 #    else
 503     {
 504         // No numa nodes found. Use the entire machine as a numa node.
 505         // Note that this should only be the case with hwloc api v 1.x.x,
 506         // a numa node is assigned to the machine by default in v 2.x.x
 507         const hwloc_obj* const hwlocMachine = hwloc_get_next_obj_by_type(topo, HWLOC_OBJ_MACHINE, nullptr);
 508
 509         if (hwlocMachine != nullptr)
 510         {
 511             machine->numa.nodes.resize(1);
 512             machine->numa.nodes[0].id        = 0;
 513             machine->numa.nodes[0].memory    = hwlocMachine->memory.total_memory;
 514             machine->numa.baseLatency        = 10;
 515             machine->numa.maxRelativeLatency = 1;
 516             machine->numa.relativeLatency    = { { 1.0 } };
 517
 518             for (int i = 0; i < machine->logicalProcessorCount; i++)
 519             {
 520                 machine->numa.nodes[0].logicalProcessorId.push_back(i);
 521             }
 522             for (auto& l : machine->logicalProcessors)
 523             {
 524                 l.numaNodeId = 0;
 525             }
 526             for (auto& s : machine->sockets)
 527             {
 528                 for (auto& c : s.cores)
 529                 {
 530                     c.numaNodeId = 0;
 531                 }
 532             }
 533         }
 534         else
 535         {
 536             topologyOk = false;
 537         }
 538     }
 539 #    endif // end if not GMX_HWLOC_API_VERSION_IS_2XX
 540     if (!topologyOk)
 541     {
 542         machine->numa.nodes.clear();
 543     }
 544     return topologyOk;
 545 }
 546
 547 /*! \brief Read PCI device information from hwloc topology
 548  *
 549  *  \param topo    hwloc topology handle that has been initialized and loaded
 550  *  \param machine Pointer to the machine structure in the HardwareTopology
 551  *                 class, where PCI device information will be filled.
 552  * *
 553  *  \return If any devices were found
 554  */
 555 bool parseHwLocDevices(hwloc_topology_t topo, HardwareTopology::Machine* machine)
 556 {
 557     const hwloc_obj*              root = hwloc_get_root_obj(topo);
 558     std::vector<const hwloc_obj*> pcidevs = getHwLocDescendantsByType(topo, root, HWLOC_OBJ_PCI_DEVICE);
 559
 560     for (auto& p : pcidevs)
 561     {
 562 #    if GMX_HWLOC_API_VERSION_IS_2XX
 563         const hwloc_obj* ancestor = nullptr;
 564         // Numa nodes not directly part of tree. Walk up the tree until we find an ancestor with a numa node
 565         hwloc_obj_t parent = p->parent;
 566         while (parent && !parent->memory_arity)
 567         {
 568             parent = parent->parent;
 569         }
 570         if (parent)
 571         {
 572             ancestor = parent->memory_first_child;
 573         }
 574 #    else  // GMX_HWLOC_API_VERSION_IS_2XX = false, api v 1.x.x
 575         // numa nodes are normal part of tree, can use hwloc ancestor function
 576         const hwloc_obj* const ancestor =
 577                 hwloc_get_ancestor_obj_by_type(topo, HWLOC_OBJ_NUMANODE, const_cast<hwloc_obj_t>(p));
 578 #    endif // end if GMX_HWLOC_API_VERSION_IS_2XX
 579         int numaId;
 580         if (ancestor != nullptr)
 581         {
 582             numaId = ancestor->logical_index;
 583         }
 584         else
 585         {
 586             // If we only have a single numa node we belong to it, otherwise set it to -1 (unknown)
 587             numaId = (machine->numa.nodes.size() == 1) ? 0 : -1;
 588         }
 589
 590         GMX_RELEASE_ASSERT(p->attr, "Attributes should not be NULL for hwloc PCI object");
 591
 592         machine->devices.push_back({ p->attr->pcidev.vendor_id, p->attr->pcidev.device_id,
 593                                      p->attr->pcidev.class_id, p->attr->pcidev.domain,
 594                                      p->attr->pcidev.bus, p->attr->pcidev.dev, p->attr->pcidev.func,
 595                                      numaId });
 596     }
 597     return !pcidevs.empty();
 598 }
 599
 600 void parseHwLoc(HardwareTopology::Machine* machine, HardwareTopology::SupportLevel* supportLevel, bool* isThisSystem)
 601 {
 602     hwloc_topology_t topo;
 603
 604     // Initialize a hwloc object, set flags to request IO device information too,
 605     // try to load the topology, and get the root object. If either step fails,
 606     // return that we do not have any support at all from hwloc.
 607     if (hwloc_topology_init(&topo) != 0)
 608     {
 609         hwloc_topology_destroy(topo);
 610         return; // SupportLevel::None.
 611     }
 612
 613     // Flags to look for io devices
 614 #    if GMX_HWLOC_API_VERSION_IS_2XX
 615     GMX_RELEASE_ASSERT(
 616             (hwloc_get_api_version() >= 0x20000),
 617             "Mismatch between hwloc headers and library, using v2 headers with v1 library");
 618     hwloc_topology_set_io_types_filter(topo, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
 619 #    else
 620     GMX_RELEASE_ASSERT(
 621             (hwloc_get_api_version() < 0x20000),
 622             "Mismatch between hwloc headers and library, using v1 headers with v2 library");
 623     hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
 624 #    endif
 625
 626     if (hwloc_topology_load(topo) != 0 || hwloc_get_root_obj(topo) == nullptr)
 627     {
 628         hwloc_topology_destroy(topo);
 629         return; // SupportLevel::None.
 630     }
 631
 632     // If we get here, we can get a valid root object for the topology
 633     *isThisSystem = hwloc_topology_is_thissystem(topo) != 0;
 634
 635     // Parse basic information about sockets, cores, and hardware threads
 636     if (parseHwLocSocketsCoresThreads(topo, machine))
 637     {
 638         *supportLevel = HardwareTopology::SupportLevel::Basic;
 639     }
 640     else
 641     {
 642         hwloc_topology_destroy(topo);
 643         return; // SupportLevel::None.
 644     }
 645
 646     // Get information about cache and numa nodes
 647     if (parseHwLocCache(topo, machine) && parseHwLocNuma(topo, machine))
 648     {
 649         *supportLevel = HardwareTopology::SupportLevel::Full;
 650     }
 651     else
 652     {
 653         hwloc_topology_destroy(topo);
 654         return; // SupportLevel::Basic.
 655     }
 656
 657     // PCI devices
 658     if (parseHwLocDevices(topo, machine))
 659     {
 660         *supportLevel = HardwareTopology::SupportLevel::FullWithDevices;
 661     }
 662
 663     hwloc_topology_destroy(topo);
 664     // SupportLevel::Full or SupportLevel::FullWithDevices.
 665 }
 666
 667 #endif
 668
 669 /*! \brief Try to detect the number of logical processors.
 670  *
 671  *  \return The number of hardware processing units, or 0 if it fails.
 672  */
 673 int detectLogicalProcessorCount()
 674 {
 675     int count = 0;
 676
 677     {
 678 #if GMX_NATIVE_WINDOWS
 679         // Windows
 680         SYSTEM_INFO sysinfo;
 681         GetSystemInfo(&sysinfo);
 682         count = sysinfo.dwNumberOfProcessors;
 683 #elif defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN)
 684         // We are probably on Unix. Check if we have the argument to use before executing any calls
 685         count = sysconf(_SC_NPROCESSORS_ONLN);
 686 #else
 687         count = 0; // Neither windows nor Unix.
 688 #endif
 689     }
 690
 691     return count;
 692 }
 693
 694 } // namespace
 695
 696 // static
 697 HardwareTopology HardwareTopology::detect()
 698 {
 699     HardwareTopology result;
 700
 701 #if GMX_USE_HWLOC
 702     parseHwLoc(&result.machine_, &result.supportLevel_, &result.isThisSystem_);
 703 #endif
 704
 705     // If something went wrong in hwloc (or if it was not present) we might
 706     // have more information in cpuInfo
 707     if (result.supportLevel_ < SupportLevel::Basic)
 708     {
 709         // There might be topology information in cpuInfo
 710         parseCpuInfo(&result.machine_, &result.supportLevel_);
 711     }
 712     // If we did not manage to get anything from either hwloc or cpuInfo, find the cpu count at least
 713     if (result.supportLevel_ == SupportLevel::None)
 714     {
 715         // No topology information; try to detect the number of logical processors at least
 716         result.machine_.logicalProcessorCount = detectLogicalProcessorCount();
 717         if (result.machine_.logicalProcessorCount > 0)
 718         {
 719             result.supportLevel_ = SupportLevel::LogicalProcessorCount;
 720         }
 721     }
 722     return result;
 723 }
 724
 725 HardwareTopology::Machine::Machine()
 726 {
 727     logicalProcessorCount   = 0;
 728     numa.baseLatency        = 0.0;
 729     numa.maxRelativeLatency = 0.0;
 730 }
 731
 732
 733 HardwareTopology::HardwareTopology() :
 734     supportLevel_(SupportLevel::None),
 735     machine_(),
 736     isThisSystem_(true)
 737 {
 738 }
 739
 740 HardwareTopology::HardwareTopology(int logicalProcessorCount) :
 741     supportLevel_(SupportLevel::None),
 742     machine_(),
 743     isThisSystem_(true)
 744 {
 745     if (logicalProcessorCount > 0)
 746     {
 747         machine_.logicalProcessorCount = logicalProcessorCount;
 748         supportLevel_                  = SupportLevel::LogicalProcessorCount;
 749     }
 750 }
 751
 752 int HardwareTopology::numberOfCores() const
 753 {
 754     if (supportLevel() >= SupportLevel::Basic)
 755     {
 756         // We assume all sockets have the same number of cores as socket 0.
 757         // Since topology information is present, we can assume there is at least one socket.
 758         return machine().sockets.size() * machine().sockets[0].cores.size();
 759     }
 760     else if (supportLevel() >= SupportLevel::LogicalProcessorCount)
 761     {
 762         return machine().logicalProcessorCount;
 763     }
 764     else
 765     {
 766         return 0;
 767     }
 768 }
 769
 770 } // namespace gmx