src/gromacs/hardware/cpuinfo.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012-2018, The GROMACS development team.
   5  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \internal \file
  38  * \brief
  39  * Implements gmx::CpuInfo.
  40  *
  41  * We need to be able to compile this file in stand-alone mode to use basic
  42  * CPU feature detection to set the SIMD acceleration and similar things in
  43  * CMake, while we still want to use more features that enable topology
  44  * detection when config.h is present.
  45  *
  46  * We solve this by skipping the advanced stuff when the preprocessor
  47  * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
  48  * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
  49  * support it is not possible to perform the actual detection on Linux/Mac.
  50  * Since these macros are specific to this file, they do not use the GMX prefix.
  51  *
  52  * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
  53  * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
  54  * x86, and for this we rely on including config.h.
  55  *
  56  * \author Erik Lindahl <erik.lindahl@gmail.com>
  57  * \ingroup module_hardware
  58  */
  59
  60 #ifndef GMX_CPUINFO_STANDALONE
  61 #    include "gmxpre.h"
  62 #endif
  63
  64 #include "cpuinfo.h"
  65
  66 #ifndef GMX_CPUINFO_STANDALONE
  67 #    include "config.h"
  68 #else
  69 #    define GMX_NATIVE_WINDOWS 0
  70 #endif
  71
  72 #if defined _MSC_VER
  73 #    include <intrin.h> // __cpuid()
  74 #endif
  75
  76 #if GMX_NATIVE_WINDOWS
  77 #    include <windows.h> // sysinfo(), necessary for topology stuff
  78 #endif
  79
  80 #ifdef HAVE_SCHED_H
  81 #    include <sched.h> // sched_getaffinity(), sched_setaffinity()
  82 #endif
  83 #ifdef HAVE_UNISTD_H
  84 #    include <unistd.h> // sysconf()
  85 #endif
  86
  87 #include <cctype>
  88 #include <cstdint> // uint32_t in X86 processor name code
  89 #include <cstdlib>
  90
  91 #include <algorithm>
  92 #include <fstream>
  93 #include <map>
  94 #include <set>
  95 #include <sstream>
  96 #include <string>
  97
  98 #ifdef GMX_CPUINFO_STANDALONE
  99 #    define gmx_unused
 100 #else
 101 #    include "gromacs/utility/basedefinitions.h"
 102 #endif
 103
 104 #include "architecture.h"
 105
 106 namespace gmx
 107 {
 108
 109 namespace
 110 {
 111
 112 /*! \cond internal */
 113
 114 /******************************************************************************
 115  *                                                                            *
 116  *   Utility functions to make this file independent of the GROMACS library   *
 117  *                                                                            *
 118  ******************************************************************************/
 119
 120 /*! \brief Remove initial and trailing whitespace from string
 121  *
 122  *  \param s  Pointer to string where whitespace will be removed
 123  */
 124 void trimString(std::string* s)
 125 {
 126     // heading
 127     s->erase(s->begin(),
 128              std::find_if(s->begin(), s->end(), [](char& c) -> bool { return std::isspace(c) == 0; }));
 129     // trailing
 130     s->erase(
 131             std::find_if(s->rbegin(), s->rend(), [](char& c) -> bool { return std::isspace(c) == 0; })
 132                     .base(),
 133             s->end());
 134 }
 135
 136
 137 /******************************************************************************
 138  *                                                                            *
 139  *                         x86 detection functions                            *
 140  *                                                                            *
 141  ******************************************************************************/
 142
 143 /*! \brief execute x86 cpuid instructions with custom level and extended level
 144  *
 145  *  \param level   The main cpuid level (input argument for eax register)
 146  *  \param ecxval  Extended level (input argument for ecx register)
 147  *  \param eax     Output in eax register
 148  *  \param ebx     Output in ebx register
 149  *  \param ecx     Output in ecx register
 150  *  \param edx     Output in edx register
 151  *
 152  *  \return 0 on success, or non-zero if the instruction could not execute.
 153  */
 154 int executeX86CpuID(unsigned int gmx_unused level,
 155                     unsigned int gmx_unused ecxval,
 156                     unsigned int*           eax,
 157                     unsigned int*           ebx,
 158                     unsigned int*           ecx,
 159                     unsigned int*           edx)
 160 {
 161     if (c_architecture == Architecture::X86)
 162     {
 163 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
 164
 165         // any compiler that understands gcc inline assembly
 166         *eax = level;
 167         *ecx = ecxval;
 168         *ebx = 0;
 169         *edx = 0;
 170
 171 #    if GMX_IS_X86_32 && defined(__PIC__)
 172         // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
 173         __asm__ __volatile__(
 174                 "xchgl %%ebx, %1  \n\t"
 175                 "cpuid            \n\t"
 176                 "xchgl %%ebx, %1  \n\t"
 177                 : "+a"(*eax), "+r"(*ebx), "+c"(*ecx), "+d"(*edx));
 178 #    elif GMX_IS_X86_64
 179         // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
 180         __asm__ __volatile__("cpuid            \n\t"
 181                              : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx));
 182 #    else
 183         // Not a normal x86, which could happen when a compiler
 184         // targetting non-x86 pretends to be GCC.
 185 #    endif
 186         return 0;
 187
 188 #elif defined _MSC_VER
 189
 190         // MSVC (and icc on windows) on ia32 or x86-64
 191         int cpuInfo[4];
 192         __cpuidex(cpuInfo, level, ecxval);
 193         *eax = static_cast<unsigned int>(cpuInfo[0]);
 194         *ebx = static_cast<unsigned int>(cpuInfo[1]);
 195         *ecx = static_cast<unsigned int>(cpuInfo[2]);
 196         *edx = static_cast<unsigned int>(cpuInfo[3]);
 197         return 0;
 198
 199 #else
 200
 201         // We are on x86, but without compiler support for cpuid if we get here
 202         *eax = 0;
 203         *ebx = 0;
 204         *ecx = 0;
 205         *edx = 0;
 206         return 1;
 207
 208 #endif // check for inline asm on x86
 209     }
 210     else
 211     {
 212         // We are not on x86
 213         *eax = 0;
 214         *ebx = 0;
 215         *ecx = 0;
 216         *edx = 0;
 217         return 1;
 218     }
 219 }
 220
 221
 222 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
 223  *
 224  *  If support for the cpuid instruction is present, we check for Intel,
 225  *  AMD or Hygon vendors
 226  *
 227  *  \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd,
 228  *          gmx::CpuInfl::Vendor::Hygon, . If neither Intel, Amd  nor
 229  *          Hygon can be identified, or if the code fails to execute,
 230  *          gmx::CpuInfo::Vendor::Unknown is returned.
 231  */
 232 CpuInfo::Vendor detectX86Vendor()
 233 {
 234     unsigned int    eax, ebx, ecx, edx;
 235     CpuInfo::Vendor v = CpuInfo::Vendor::Unknown;
 236
 237     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) == 0)
 238     {
 239         if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
 240         {
 241             v = CpuInfo::Vendor::Intel; // ebx=='uneG', ecx=='letn', edx=='Ieni'
 242         }
 243         else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
 244         {
 245             v = CpuInfo::Vendor::Amd; // ebx=='htuA', ecx=='DMAc', edx=='itne'
 246         }
 247         else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
 248         {
 249             v = CpuInfo::Vendor::Hygon; // ebx=='ogyH', ecx=='eniu', edx=='neGn'
 250         }
 251     }
 252     return v;
 253 }
 254
 255 /*! \brief Detect second AVX-512 FMA from the processor name
 256  *
 257  * Should only be called for processors already determined to support AVX-512.
 258  *
 259  *  \param [in] brand     x86 processor name
 260  *  \param [in] model     x86 model
 261  *  \return               True if second FMA present
 262  */
 263 bool detectProcCpuInfoSecondAvx512FMA(const std::string& brand, int model)
 264 {
 265     // Skylake server
 266     if (model == 0x55)
 267     {
 268         // detect Xeon
 269         if (brand.find("Xeon") == 9)
 270         {
 271             // detect Silver or Bronze or specific models
 272             if (brand.find("Silver") == 17 || brand.find("Bronze") == 17
 273                 || (brand.find('W') == 17 && brand.find('0') == 21)   // detect Xeon W 210x
 274                 || (brand.find('D') == 17 && brand.find("21") == 19)) // detect Xeon D 2xxx
 275             {
 276                 return false;
 277             }
 278             // detect Gold 5xxx - can be corrected once Cooper Lake is added
 279             else if (brand.find("Gold") == 17 && brand.find('5') == 22)
 280             {
 281                 return (brand.find("53") == 22 || // detect Cooper Lake
 282                         brand.find("22") == 24);  // detect 5[12]22
 283             }
 284         }
 285         return true;
 286     }
 287     // Cannon Lake client
 288     if (model == 0x66)
 289     {
 290         return false;
 291     }
 292     // Ice Lake client
 293     if (model == 0x7d || model == 0x7e)
 294     {
 295         return false;
 296     }
 297     // This is the right default...
 298     return true;
 299 }
 300
 301 /*! \brief Simple utility function to set/clear feature in a set
 302  *
 303  *  \param featureSet    Pointer to the feature set to update
 304  *  \param feature       The specific feature to set/clear
 305  *  \param registerValue Register value (returned from cpuid)
 306  *  \param bit           Bit to check in registerValue. The feature will be
 307  *                       added to the featureSet if this bit is set.
 308  *
 309  *  \note Nothing is done if the bit is not set. In particular, this will not
 310  *        erase anything if the feature already exists in the set.
 311  */
 312 void setFeatureFromBit(std::set<CpuInfo::Feature>* featureSet,
 313                        CpuInfo::Feature            feature,
 314                        unsigned int                registerValue,
 315                        unsigned char               bit)
 316 {
 317     if (registerValue & (1 << bit))
 318     {
 319         featureSet->insert(feature);
 320     }
 321 }
 322
 323 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
 324  *
 325  *  \param[out] brand      String where to write the x86 brand string
 326  *  \param[out] family     Major version of processor
 327  *  \param[out] model      Middle version of processor
 328  *  \param[out] stepping   Minor version of processor
 329  *  \param[out] features   Feature set where supported features are inserted
 330  */
 331 void detectX86Features(std::string* brand, int* family, int* model, int* stepping, std::set<CpuInfo::Feature>* features)
 332 {
 333     unsigned int eax, ebx, ecx, edx;
 334
 335     // Return if we cannot execute any levels
 336     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) != 0)
 337     {
 338         return;
 339     }
 340     unsigned int maxStdLevel = eax;
 341
 342     if (maxStdLevel >= 0x1)
 343     {
 344         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 345
 346         *family   = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
 347         *model    = ((eax & 0x000f0000) >> 12) + ((eax & 0x000000f0) >> 4);
 348         *stepping = (eax & 0x0000000f);
 349
 350         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse3, ecx, 0);
 351         setFeatureFromBit(features, CpuInfo::Feature::X86_Pclmuldq, ecx, 1);
 352         setFeatureFromBit(features, CpuInfo::Feature::X86_Ssse3, ecx, 9);
 353         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma, ecx, 12);
 354         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx16, ecx, 13);
 355         setFeatureFromBit(features, CpuInfo::Feature::X86_Pdcm, ecx, 15);
 356         setFeatureFromBit(features, CpuInfo::Feature::X86_Pcid, ecx, 17);
 357         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_1, ecx, 19);
 358         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_2, ecx, 20);
 359         setFeatureFromBit(features, CpuInfo::Feature::X86_X2Apic, ecx, 21);
 360         setFeatureFromBit(features, CpuInfo::Feature::X86_Popcnt, ecx, 23);
 361         setFeatureFromBit(features, CpuInfo::Feature::X86_Tdt, ecx, 24);
 362         setFeatureFromBit(features, CpuInfo::Feature::X86_Aes, ecx, 25);
 363         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx, ecx, 28);
 364         setFeatureFromBit(features, CpuInfo::Feature::X86_F16C, ecx, 29);
 365         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdrnd, ecx, 30);
 366
 367         setFeatureFromBit(features, CpuInfo::Feature::X86_Pse, edx, 3);
 368         setFeatureFromBit(features, CpuInfo::Feature::X86_Msr, edx, 5);
 369         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx8, edx, 8);
 370         setFeatureFromBit(features, CpuInfo::Feature::X86_Apic, edx, 9);
 371         setFeatureFromBit(features, CpuInfo::Feature::X86_Cmov, edx, 15);
 372         setFeatureFromBit(features, CpuInfo::Feature::X86_Clfsh, edx, 19);
 373         setFeatureFromBit(features, CpuInfo::Feature::X86_Mmx, edx, 23);
 374         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse2, edx, 26);
 375         setFeatureFromBit(features, CpuInfo::Feature::X86_Htt, edx, 28);
 376     }
 377
 378     // Check whether Hyper-threading is really possible to enable in the hardware,
 379     // not just technically supported by this generation of processors
 380     if ((features->count(CpuInfo::Feature::X86_Htt) != 0U) && maxStdLevel >= 0x4)
 381     {
 382         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 383         unsigned int maxLogicalCores = (ebx >> 16) & 0x0ff;
 384         executeX86CpuID(0x4, 0, &eax, &ebx, &ecx, &edx);
 385         unsigned int maxPhysicalCores = ((eax >> 26) & 0x3f) + 1;
 386         if (maxLogicalCores / maxPhysicalCores < 2)
 387         {
 388             features->erase(CpuInfo::Feature::X86_Htt);
 389         }
 390     }
 391
 392     if (executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx) != 0)
 393     {
 394         // No point in continuing if we don't support any extended levels
 395         return;
 396     }
 397     unsigned int maxExtLevel = eax;
 398
 399     if (maxExtLevel >= 0x80000001)
 400     {
 401         executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 402
 403         setFeatureFromBit(features, CpuInfo::Feature::X86_Lahf, ecx, 0);
 404         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4A, ecx, 6);
 405         setFeatureFromBit(features, CpuInfo::Feature::X86_MisalignSse, ecx, 7);
 406         setFeatureFromBit(features, CpuInfo::Feature::X86_Xop, ecx, 11);
 407         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma4, ecx, 16);
 408         setFeatureFromBit(features, CpuInfo::Feature::X86_PDPE1GB, edx, 26);
 409         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdtscp, edx, 27);
 410     }
 411
 412     if (maxExtLevel >= 0x80000005)
 413     {
 414         // Get the x86 CPU brand string (3 levels, 16 bytes in each)
 415         brand->clear();
 416         for (unsigned int level = 0x80000002; level < 0x80000005; level++)
 417         {
 418             executeX86CpuID(level, 0, &eax, &ebx, &ecx, &edx);
 419             // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
 420             brand->append(reinterpret_cast<const char*>(&eax), sizeof(eax));
 421             brand->append(reinterpret_cast<const char*>(&ebx), sizeof(ebx));
 422             brand->append(reinterpret_cast<const char*>(&ecx), sizeof(ecx));
 423             brand->append(reinterpret_cast<const char*>(&edx), sizeof(edx));
 424         }
 425         trimString(brand);
 426     }
 427
 428     if (maxStdLevel >= 0x7)
 429     {
 430         executeX86CpuID(0x7, 0, &eax, &ebx, &ecx, &edx);
 431
 432         setFeatureFromBit(features, CpuInfo::Feature::X86_Hle, ebx, 4);
 433         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx2, ebx, 5);
 434         setFeatureFromBit(features, CpuInfo::Feature::X86_Rtm, ebx, 11);
 435         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512F, ebx, 16);
 436         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512PF, ebx, 26);
 437         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512ER, ebx, 27);
 438         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512CD, ebx, 28);
 439         setFeatureFromBit(features, CpuInfo::Feature::X86_Sha, ebx, 29);
 440         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BW, ebx, 30);
 441         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512VL, ebx, 31);
 442
 443         executeX86CpuID(0x7, 0x1, &eax, &ebx, &ecx, &edx);
 444         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BF16, eax, 5);
 445
 446         if (features->count(CpuInfo::Feature::X86_Avx512F) != 0)
 447         {
 448             // Only checking if the CPU supports AVX-512. There is no CPUID bit for this.
 449             if (detectProcCpuInfoSecondAvx512FMA(*brand, *model))
 450             {
 451                 features->insert(CpuInfo::Feature::X86_Avx512secondFMA);
 452             }
 453         }
 454     }
 455
 456
 457     if (maxExtLevel >= 0x80000007)
 458     {
 459         executeX86CpuID(0x80000007, 0, &eax, &ebx, &ecx, &edx);
 460
 461         setFeatureFromBit(features, CpuInfo::Feature::X86_NonstopTsc, edx, 8);
 462     }
 463 }
 464
 465
 466 /*! \brief Return a vector with x86 APIC IDs for all threads
 467  *
 468  *  \param haveX2Apic  True if the processors supports x2APIC, otherwise vanilla APIC.
 469  *
 470  *  \returns A new std::vector of unsigned integer APIC IDs, one for each
 471  *           logical processor in the system.
 472  */
 473 std::vector<unsigned int> detectX86ApicIDs(bool gmx_unused haveX2Apic)
 474 {
 475     std::vector<unsigned int> apicID;
 476
 477     // We cannot just ask for all APIC IDs, but must force execution on each
 478     // hardware thread and extract the APIC id there.
 479 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
 480     unsigned int eax, ebx, ecx, edx;
 481     unsigned int nApic = sysconf(_SC_NPROCESSORS_ONLN);
 482     cpu_set_t    saveCpuSet;
 483     cpu_set_t    cpuSet;
 484     sched_getaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
 485     CPU_ZERO(&cpuSet);
 486     for (unsigned int i = 0; i < nApic; i++)
 487     {
 488         CPU_SET(i, &cpuSet);
 489         sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
 490         if (haveX2Apic)
 491         {
 492             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 493             apicID.push_back(edx);
 494         }
 495         else
 496         {
 497             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 498             apicID.push_back(ebx >> 24);
 499         }
 500         CPU_CLR(i, &cpuSet);
 501     }
 502     sched_setaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
 503 #elif GMX_NATIVE_WINDOWS
 504     unsigned int eax, ebx, ecx, edx;
 505     SYSTEM_INFO sysinfo;
 506     GetSystemInfo(&sysinfo);
 507     unsigned int nApic = sysinfo.dwNumberOfProcessors;
 508     unsigned int saveAffinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 509     for (DWORD_PTR i = 0; i < nApic; i++)
 510     {
 511         SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1) << i));
 512         Sleep(0);
 513         if (haveX2Apic)
 514         {
 515             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 516             apicID.push_back(edx);
 517         }
 518         else
 519         {
 520             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 521             apicID.push_back(ebx >> 24);
 522         }
 523     }
 524     SetThreadAffinityMask(GetCurrentThread(), saveAffinity);
 525 #endif
 526     return apicID;
 527 }
 528
 529
 530 /*! \brief Utility to renumber indices extracted from APIC IDs
 531  *
 532  * \param v  Vector with unsigned integer indices
 533  *
 534  * This routine returns the number of unique different elements found in the vector,
 535  * and renumbers these starting from 0. For example, the vector {0,1,2,8,9,10,8,9,10,0,1,2}
 536  * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
 537  * number of unique elements.
 538  */
 539 void renumberIndex(std::vector<unsigned int>* v)
 540 {
 541     std::vector<unsigned int> sortedV(*v);
 542     std::sort(sortedV.begin(), sortedV.end());
 543
 544     std::vector<unsigned int> uniqueSortedV(sortedV);
 545     auto                      it = std::unique(uniqueSortedV.begin(), uniqueSortedV.end());
 546     uniqueSortedV.resize(std::distance(uniqueSortedV.begin(), it));
 547
 548     for (std::size_t i = 0; i < uniqueSortedV.size(); i++)
 549     {
 550         unsigned int val = uniqueSortedV[i];
 551         std::replace_if(v->begin(),
 552                         v->end(),
 553                         [val](unsigned int& c) -> bool { return c == val; },
 554                         static_cast<unsigned int>(i));
 555     }
 556 }
 557
 558 /*! \brief The layout of the bits in the APIC ID */
 559 struct ApicIdLayout
 560 {
 561     unsigned int hwThreadBits; //!< The number of least significant bits for hw-threads
 562     unsigned int coreBits;     //!< The number of core bits following the  hw-thread bits
 563 };
 564
 565 /*! \brief Detect the APIC ID layout for x2APIC
 566  */
 567 ApicIdLayout detectX2ApicIdLayout()
 568 {
 569     ApicIdLayout layout;
 570
 571     unsigned int eax;
 572     unsigned int ebx;
 573     unsigned int ecx;
 574     unsigned int edx;
 575     executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 576     layout.hwThreadBits = eax & 0x1f;
 577     executeX86CpuID(0xb, 1, &eax, &ebx, &ecx, &edx);
 578     layout.coreBits = (eax & 0x1f) - layout.hwThreadBits;
 579
 580     return layout;
 581 }
 582
 583 /*! \brief Detect the APIC ID layout for standard APIC or xAPIC on AMD
 584  *
 585  * \param[in] maxExtLevel  The largest CPUID extended function input value supported by the processor implementation
 586  */
 587 ApicIdLayout detectAmdApicIdLayout(unsigned int maxExtLevel)
 588 {
 589     ApicIdLayout layout;
 590
 591     unsigned int eax;
 592     unsigned int ebx;
 593     unsigned int ecx;
 594     unsigned int edx;
 595     executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 596     int family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
 597     executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 598     bool haveExtendedTopology = (ecx & (1 << 22)) != 0U;
 599
 600     // NOTE: Here we assume 1 thread per core, unless we have family >= 17h
 601     layout.hwThreadBits = 0;
 602     if (family >= 0x17 && haveExtendedTopology && maxExtLevel >= 0x8000001e)
 603     {
 604         executeX86CpuID(0x8000001e, 1, &eax, &ebx, &ecx, &edx);
 605         int numThreadsPerCore = ((ebx >> 8) & 0xff) + 1;
 606         // NOTE: The AMD documentation only specifies the layout of apicid
 607         //       when we have 1 or 2 threads per core.
 608         while (numThreadsPerCore > (1 << layout.hwThreadBits))
 609         {
 610             layout.hwThreadBits++;
 611         }
 612     }
 613
 614     // Get number of core bits in apic ID - try modern extended method first
 615     executeX86CpuID(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 616     layout.coreBits = (ecx >> 12) & 0xf;
 617     if (layout.coreBits == 0)
 618     {
 619         // Legacy method for old single/dual core AMD CPUs
 620         int i = ecx & 0xf;
 621         while (i >> layout.coreBits)
 622         {
 623             layout.coreBits++;
 624         }
 625     }
 626
 627     return layout;
 628 }
 629
 630 /*! \brief Try to detect basic CPU topology information using x86 cpuid
 631  *
 632  *  If x2APIC support is present, this is our first choice, otherwise we
 633  *  attempt to use old vanilla APIC.
 634  *
 635  *  \return A new vector of entries with socket, core, hwthread information
 636  *          for each logical processor.
 637  */
 638 std::vector<CpuInfo::LogicalProcessor> detectX86LogicalProcessors()
 639 {
 640     unsigned int eax;
 641     unsigned int ebx;
 642     unsigned int ecx;
 643     unsigned int edx;
 644     unsigned int maxStdLevel;
 645     unsigned int maxExtLevel;
 646     bool         haveApic;
 647     bool         haveX2Apic;
 648
 649     std::vector<CpuInfo::LogicalProcessor> logicalProcessors;
 650
 651     // Find largest standard & extended level input values allowed
 652     executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx);
 653     maxStdLevel = eax;
 654     executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 655     maxExtLevel = eax;
 656
 657     if (maxStdLevel >= 0x1)
 658     {
 659         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 660         haveX2Apic = ((ecx & (1 << 21)) != 0U) && maxStdLevel >= 0xb;
 661         haveApic   = ((edx & (1 << 9)) != 0U) && maxExtLevel >= 0x80000008;
 662     }
 663     else
 664     {
 665         haveX2Apic = false;
 666         haveApic   = false;
 667     }
 668
 669     if (haveX2Apic || haveApic)
 670     {
 671         ApicIdLayout layout;
 672         // Get bits for cores and hardware threads
 673         if (haveX2Apic)
 674         {
 675             layout = detectX2ApicIdLayout();
 676         }
 677         else // haveApic
 678         {
 679             if (detectX86Vendor() == CpuInfo::Vendor::Amd || detectX86Vendor() == CpuInfo::Vendor::Hygon)
 680             {
 681                 layout = detectAmdApicIdLayout(maxExtLevel);
 682
 683                 if (layout.hwThreadBits > 1)
 684                 {
 685                     // At the time of writing this code we do not know what
 686                     // to do with more than 2 threads, so return empty.
 687                     return logicalProcessors;
 688                 }
 689             }
 690             else
 691             {
 692                 // We do not know the APIC ID layout, return empty.
 693                 return logicalProcessors;
 694             }
 695         }
 696
 697         std::vector<unsigned int> apicID = detectX86ApicIDs(haveX2Apic);
 698
 699         if (!apicID.empty())
 700         {
 701             // APIC IDs can be buggy, and it is always a mess. Typically more bits are
 702             // reserved than needed, and the numbers might not increment by 1 even in
 703             // a single socket or core. Extract, renumber, and check that things make sense.
 704             unsigned int              hwThreadMask = (1 << layout.hwThreadBits) - 1;
 705             unsigned int              coreMask     = (1 << layout.coreBits) - 1;
 706             std::vector<unsigned int> hwThreadRanks;
 707             std::vector<unsigned int> coreRanks;
 708             std::vector<unsigned int> socketRanks;
 709
 710             for (auto a : apicID)
 711             {
 712                 hwThreadRanks.push_back(static_cast<int>(a & hwThreadMask));
 713                 coreRanks.push_back(static_cast<int>((a >> layout.hwThreadBits) & coreMask));
 714                 socketRanks.push_back(static_cast<int>(a >> (layout.coreBits + layout.hwThreadBits)));
 715             }
 716
 717             renumberIndex(&hwThreadRanks);
 718             renumberIndex(&coreRanks);
 719             renumberIndex(&socketRanks);
 720
 721             unsigned int hwThreadRankSize =
 722                     1 + *std::max_element(hwThreadRanks.begin(), hwThreadRanks.end());
 723             unsigned int coreRankSize = 1 + *std::max_element(coreRanks.begin(), coreRanks.end());
 724             unsigned int socketRankSize = 1 + *std::max_element(socketRanks.begin(), socketRanks.end());
 725
 726             if (socketRankSize * coreRankSize * hwThreadRankSize == apicID.size())
 727             {
 728                 // Alright, everything looks consistent, so put it in the result
 729                 for (std::size_t i = 0; i < apicID.size(); i++)
 730                 {
 731                     // While the internal APIC IDs are always unsigned integers, we also cast to
 732                     // plain integers for the externally exposed vectors, since that will make
 733                     // it possible to use '-1' for invalid entries in the future.
 734                     logicalProcessors.push_back(
 735                             { int(socketRanks[i]), int(coreRanks[i]), int(hwThreadRanks[i]) });
 736                 }
 737             }
 738         }
 739     }
 740     return logicalProcessors; // Will only have contents if everything worked
 741 }
 742
 743
 744 /******************************************************************************
 745  *                                                                            *
 746  *              Generic Linux detection by parsing /proc/cpuinfo              *
 747  *                                                                            *
 748  ******************************************************************************/
 749
 750 /*! \brief Parse /proc/cpuinfo into a simple string map
 751  *
 752  * This routine will read the contents of /proc/cpuinfo, and for each
 753  * line that is not empty we will assign the (trimmed) string to the right of
 754  * the colon as a key, and the left-hand side as the value in the map.
 755  * For multi-processor systems where lines are repeated the latter lines will
 756  * overwrite the first occurrence.
 757  *
 758  * \return New map with the contents. If the file is not available, the returned
 759  *         map will be empty.
 760  */
 761 std::map<std::string, std::string> parseProcCpuInfo()
 762 {
 763     std::ifstream                      procCpuInfo("/proc/cpuinfo");
 764     std::string                        line;
 765     std::map<std::string, std::string> cpuInfo;
 766
 767     while (std::getline(procCpuInfo, line))
 768     {
 769         if (!line.empty())
 770         {
 771             std::stringstream iss(line);
 772             std::string       key;
 773             std::string       val;
 774             std::getline(iss, key, ':'); // part before colon
 775             std::getline(iss, val);      // part after colon
 776             trimString(&key);
 777             trimString(&val);
 778             // put it in the map. This will overwrite previous processors, but we don't care.
 779             cpuInfo[key] = val;
 780         }
 781     }
 782     return cpuInfo;
 783 }
 784
 785
 786 /*! \brief Try to detect vendor from /proc/cpuinfo
 787  *
 788  *  \param cpuInfo  Map returned from parseProcCpuinfo()
 789  *
 790  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 791  *  they begin with the name of a standard vendor. If the file cannot be read
 792  *  or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
 793  */
 794 CpuInfo::Vendor detectProcCpuInfoVendor(const std::map<std::string, std::string>& cpuInfo)
 795 {
 796     const std::map<std::string, CpuInfo::Vendor> testVendors = {
 797         { "GenuineIntel", CpuInfo::Vendor::Intel },
 798         { "Intel", CpuInfo::Vendor::Intel },
 799         { "AuthenticAmd", CpuInfo::Vendor::Amd },
 800         { "AMD", CpuInfo::Vendor::Amd },
 801         { "ARM", CpuInfo::Vendor::Arm },
 802         { "AArch64", CpuInfo::Vendor::Arm },
 803         { "Fujitsu", CpuInfo::Vendor::Fujitsu },
 804         { "IBM", CpuInfo::Vendor::Ibm },
 805         { "POWER", CpuInfo::Vendor::Ibm },
 806         { "Oracle", CpuInfo::Vendor::Oracle },
 807         { "HygonGenuine", CpuInfo::Vendor::Hygon },
 808         { "Hygon", CpuInfo::Vendor::Hygon },
 809     };
 810
 811     // For each label in /proc/cpuinfo, compare the value to the name in the
 812     // testNames map above, and if it's a match return the vendor.
 813     for (const auto& l : { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
 814     {
 815         if (cpuInfo.count(l) != 0U)
 816         {
 817             // there was a line with this left-hand side in /proc/cpuinfo
 818             const std::string& s1 = cpuInfo.at(l);
 819
 820             for (const auto& t : testVendors)
 821             {
 822                 const std::string& s2 = t.first;
 823
 824                 // If the entire name we are testing (s2) matches the first part of
 825                 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
 826                 if (std::equal(s2.begin(), s2.end(), s1.begin(), [](const char& x, const char& y) -> bool {
 827                         return tolower(x) == tolower(y);
 828                     }))
 829                 {
 830                     return t.second;
 831                 }
 832             }
 833         }
 834     }
 835     return CpuInfo::Vendor::Unknown;
 836 }
 837
 838
 839 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
 840  *
 841  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
 842  *  \param[out] brand      String where to write the brand string
 843  *  \param[out] features   Feature set where supported features are inserted
 844  *
 845  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 846  *  we can find the processor name and features. It is likely fragile.
 847  */
 848 void detectProcCpuInfoIbm(const std::map<std::string, std::string>& cpuInfo,
 849                           std::string*                              brand,
 850                           std::set<CpuInfo::Feature>*               features)
 851 {
 852     // Get brand string from 'cpu' label if present, otherwise 'Processor'
 853     if (cpuInfo.count("cpu") != 0U)
 854     {
 855         *brand = cpuInfo.at("cpu");
 856     }
 857     else if (cpuInfo.count("Processor") != 0U)
 858     {
 859         *brand = cpuInfo.at("Processor");
 860     }
 861
 862     if (brand->find("A2") != std::string::npos)
 863     {
 864         // If the processor identification contains "A2", this is BlueGene/Q with QPX
 865         features->insert(CpuInfo::Feature::Ibm_Qpx);
 866     }
 867
 868     for (const auto& l : { "model name", "model", "Processor", "cpu" })
 869     {
 870         if (cpuInfo.count(l) != 0U)
 871         {
 872             std::string s1 = cpuInfo.at(l);
 873             std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
 874
 875             if (s1.find("altivec") != std::string::npos)
 876             {
 877                 features->insert(CpuInfo::Feature::Ibm_Vmx);
 878                 // If this is a power6, we only have VMX. All later processors have VSX.
 879                 if (s1.find("power6") == std::string::npos)
 880                 {
 881                     features->insert(CpuInfo::Feature::Ibm_Vsx);
 882                 }
 883             }
 884         }
 885     }
 886 }
 887
 888
 889 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
 890  *
 891  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
 892  *  \param[out] brand      String where to write the brand string
 893  *  \param[out] family     Major version of processor
 894  *  \param[out] model      Middle version of processor
 895  *  \param[out] stepping   Minor version of processor
 896  *  \param[out] features   Feature set where supported features are inserted
 897  *
 898  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 899  *  we can find the processor name and features. It is likely fragile.
 900  */
 901 void detectProcCpuInfoArm(const std::map<std::string, std::string>& cpuInfo,
 902                           std::string*                              brand,
 903                           int*                                      family,
 904                           int*                                      model,
 905                           int*                                      stepping,
 906                           std::set<CpuInfo::Feature>*               features)
 907 {
 908     if (cpuInfo.count("Processor") != 0U)
 909     {
 910         *brand = cpuInfo.at("Processor");
 911     }
 912     else if (cpuInfo.count("model name") != 0U)
 913     {
 914         *brand = cpuInfo.at("model name");
 915     }
 916
 917     if (cpuInfo.count("CPU architecture") != 0U)
 918     {
 919         *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
 920         // For some 64-bit CPUs it appears to say 'AArch64' instead
 921         if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
 922         {
 923             *family = 8; // fragile - no idea how a future ARMv9 will be represented in this case
 924         }
 925     }
 926     if (cpuInfo.count("CPU variant") != 0U)
 927     {
 928         *model = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
 929     }
 930     if (cpuInfo.count("CPU revision") != 0U)
 931     {
 932         *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
 933     }
 934
 935     if (cpuInfo.count("Features") != 0U)
 936     {
 937         const std::string& s = cpuInfo.at("Features");
 938         if (s.find("neon") != std::string::npos)
 939         {
 940             features->insert(CpuInfo::Feature::Arm_Neon);
 941         }
 942         if (s.find("asimd") != std::string::npos)
 943         {
 944             // At least Jetson TX1 runs a 32-bit environment by default, although
 945             // the kernel is 64-bits, and reports asimd feature flags. We cannot
 946             // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
 947             if (sizeof(void*) == 8)
 948             {
 949                 features->insert(CpuInfo::Feature::Arm_NeonAsimd);
 950             }
 951         }
 952         if (s.find("sve") != std::string::npos)
 953         {
 954             features->insert(CpuInfo::Feature::Arm_Sve);
 955         }
 956     }
 957 }
 958
 959
 960 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
 961  *
 962  *  \param[out] vendor     Detected hardware vendor
 963  *  \param[out] brand      String where to write the brand string
 964  *  \param[out] family     Major version of processor
 965  *  \param[out] model      Middle version of processor
 966  *  \param[out] stepping   Minor version of processor
 967  *  \param[out] features   Feature set where supported features are inserted
 968  *
 969  *  This routine reads the /proc/cpuinfo file into a map and calls subroutines
 970  *  that attempt to parse by matching keys and values to known strings. It is
 971  *  much more fragile than our x86 detection, but it does not depend on
 972  *  specific system calls, intrinsics or assembly instructions.
 973  */
 974 void detectProcCpuInfo(CpuInfo::Vendor*            vendor,
 975                        std::string*                brand,
 976                        int*                        family,
 977                        int*                        model,
 978                        int*                        stepping,
 979                        std::set<CpuInfo::Feature>* features)
 980 {
 981     std::map<std::string, std::string> cpuInfo = parseProcCpuInfo();
 982
 983     if (*vendor == CpuInfo::Vendor::Unknown)
 984     {
 985         *vendor = detectProcCpuInfoVendor(cpuInfo);
 986     }
 987
 988     // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
 989     // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
 990     // To handle this slightly better we use one subroutine per vendor.
 991     switch (*vendor)
 992     {
 993         case CpuInfo::Vendor::Ibm: detectProcCpuInfoIbm(cpuInfo, brand, features); break;
 994
 995         case CpuInfo::Vendor::Arm:
 996             detectProcCpuInfoArm(cpuInfo, brand, family, model, stepping, features);
 997             break;
 998
 999         default:
1000             // We only have a single check for fujitsu for now
1001 #ifdef __HPC_ACE__
1002             features->insert(CpuInfo::Feature::Fujitsu_HpcAce);
1003 #endif
1004             break;
1005     }
1006 }
1007 /*! \endcond */
1008 } // namespace
1009
1010
1011 // static
1012 CpuInfo CpuInfo::detect()
1013 {
1014     CpuInfo result;
1015
1016     if (c_architecture == Architecture::X86)
1017     {
1018         result.vendor_ = detectX86Vendor();
1019
1020         if (result.vendor_ == CpuInfo::Vendor::Intel)
1021         {
1022             result.features_.insert(CpuInfo::Feature::X86_Intel);
1023         }
1024         else if (result.vendor_ == CpuInfo::Vendor::Amd)
1025         {
1026             result.features_.insert(CpuInfo::Feature::X86_Amd);
1027         }
1028         else if (result.vendor_ == CpuInfo::Vendor::Hygon)
1029         {
1030             result.features_.insert(CpuInfo::Feature::X86_Hygon);
1031         }
1032         detectX86Features(
1033                 &result.brandString_, &result.family_, &result.model_, &result.stepping_, &result.features_);
1034         result.logicalProcessors_ = detectX86LogicalProcessors();
1035     }
1036     else
1037     {
1038         // Not x86
1039         if (c_architecture == Architecture::Arm)
1040         {
1041             result.vendor_ = CpuInfo::Vendor::Arm;
1042         }
1043         else if (c_architecture == Architecture::PowerPC)
1044         {
1045             result.vendor_ = CpuInfo::Vendor::Ibm;
1046         }
1047
1048 #if defined __aarch64__ || (defined _M_ARM && _M_ARM >= 8)
1049         result.features_.insert(Feature::Arm_Neon);      // ARMv8 always has Neon
1050         result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
1051 #endif
1052 #if defined __arch64__ && defined __ARM_FEATURE_SVE
1053         result.features_.insert(Feature::Arm_Sve);
1054 #endif
1055
1056 #if defined sun
1057         result.vendor_ = CpuInfo::Vendor::Oracle;
1058 #endif
1059
1060         // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
1061         // is set to a known value this routine will not overwrite it.
1062         detectProcCpuInfo(&result.vendor_,
1063                           &result.brandString_,
1064                           &result.family_,
1065                           &result.model_,
1066                           &result.stepping_,
1067                           &result.features_);
1068     }
1069
1070     if (!result.logicalProcessors_.empty())
1071     {
1072         result.supportLevel_ = CpuInfo::SupportLevel::LogicalProcessorInfo;
1073     }
1074     else if (!result.features_.empty())
1075     {
1076         result.supportLevel_ = CpuInfo::SupportLevel::Features;
1077     }
1078     else if (result.vendor_ != CpuInfo::Vendor::Unknown
1079              || result.brandString_ != "Unknown CPU brand")
1080     {
1081         result.supportLevel_ = CpuInfo::SupportLevel::Name;
1082     }
1083     else
1084     {
1085         result.supportLevel_ = CpuInfo::SupportLevel::None;
1086     }
1087
1088     return result;
1089 }
1090
1091 CpuInfo::CpuInfo() :
1092     vendor_(CpuInfo::Vendor::Unknown),
1093     brandString_("Unknown CPU brand"),
1094     family_(0),
1095     model_(0),
1096     stepping_(0)
1097 {
1098 }
1099
1100 const std::string& CpuInfo::vendorString() const
1101 {
1102     static const std::map<Vendor, std::string> vendorStrings = {
1103         { Vendor::Unknown, "Unknown vendor" }, { Vendor::Intel, "Intel" }, { Vendor::Amd, "AMD" },
1104         { Vendor::Fujitsu, "Fujitsu" },        { Vendor::Ibm, "IBM" },     { Vendor::Arm, "ARM" },
1105         { Vendor::Oracle, "Oracle" },          { Vendor::Hygon, "Hygon" },
1106     };
1107
1108     return vendorStrings.at(vendor_);
1109 }
1110
1111
1112 const std::string& CpuInfo::featureString(Feature f)
1113 {
1114     static const std::map<Feature, std::string> featureStrings = {
1115         { Feature::X86_Aes, "aes" },
1116         { Feature::X86_Amd, "amd" },
1117         { Feature::X86_Apic, "apic" },
1118         { Feature::X86_Avx, "avx" },
1119         { Feature::X86_Avx2, "avx2" },
1120         { Feature::X86_Avx512F, "avx512f" },
1121         { Feature::X86_Avx512PF, "avx512pf" },
1122         { Feature::X86_Avx512ER, "avx512er" },
1123         { Feature::X86_Avx512CD, "avx512cd" },
1124         { Feature::X86_Avx512BW, "avx512bw" },
1125         { Feature::X86_Avx512VL, "avx512vl" },
1126         { Feature::X86_Avx512BF16, "avx512bf16" },
1127         { Feature::X86_Avx512secondFMA, "avx512secondFMA" },
1128         { Feature::X86_Clfsh, "clfsh" },
1129         { Feature::X86_Cmov, "cmov" },
1130         { Feature::X86_Cx8, "cx8" },
1131         { Feature::X86_Cx16, "cx16" },
1132         { Feature::X86_F16C, "f16c" },
1133         { Feature::X86_Fma, "fma" },
1134         { Feature::X86_Fma4, "fma4" },
1135         { Feature::X86_Hle, "hle" },
1136         { Feature::X86_Htt, "htt" },
1137         { Feature::X86_Intel, "intel" },
1138         { Feature::X86_Lahf, "lahf" },
1139         { Feature::X86_MisalignSse, "misalignsse" },
1140         { Feature::X86_Mmx, "mmx" },
1141         { Feature::X86_Msr, "msr" },
1142         { Feature::X86_NonstopTsc, "nonstop_tsc" },
1143         { Feature::X86_Pcid, "pcid" },
1144         { Feature::X86_Pclmuldq, "pclmuldq" },
1145         { Feature::X86_Pdcm, "pdcm" },
1146         { Feature::X86_PDPE1GB, "pdpe1gb" },
1147         { Feature::X86_Popcnt, "popcnt" },
1148         { Feature::X86_Pse, "pse" },
1149         { Feature::X86_Rdrnd, "rdrnd" },
1150         { Feature::X86_Rdtscp, "rdtscp" },
1151         { Feature::X86_Rtm, "rtm" },
1152         { Feature::X86_Sha, "sha" },
1153         { Feature::X86_Sse2, "sse2" },
1154         { Feature::X86_Sse3, "sse3" },
1155         { Feature::X86_Sse4A, "sse4a" },
1156         { Feature::X86_Sse4_1, "sse4.1" },
1157         { Feature::X86_Sse4_2, "sse4.2" },
1158         { Feature::X86_Ssse3, "ssse3" },
1159         { Feature::X86_Tdt, "tdt" },
1160         { Feature::X86_X2Apic, "x2apic" },
1161         { Feature::X86_Xop, "xop" },
1162         { Feature::Arm_Neon, "neon" },
1163         { Feature::Arm_NeonAsimd, "neon_asimd" },
1164         { Feature::Arm_Sve, "sve" },
1165         { Feature::Ibm_Qpx, "qpx" },
1166         { Feature::Ibm_Vmx, "vmx" },
1167         { Feature::Ibm_Vsx, "vsx" },
1168         { Feature::Fujitsu_HpcAce, "hpc-ace" },
1169         { Feature::X86_Hygon, "hygon" }
1170     };
1171     return featureStrings.at(f);
1172 }
1173
1174
1175 bool cpuIsX86Nehalem(const CpuInfo& cpuInfo)
1176 {
1177     return (cpuInfo.vendor() == CpuInfo::Vendor::Intel && cpuInfo.family() == 6
1178             && (cpuInfo.model() == 0x2E || cpuInfo.model() == 0x1A || cpuInfo.model() == 0x1E
1179                 || cpuInfo.model() == 0x2F || cpuInfo.model() == 0x2C || cpuInfo.model() == 0x25));
1180 }
1181
1182 bool cpuIsAmdZen1(const CpuInfo& cpuInfo)
1183 {
1184     /* Both Zen/Zen+/Zen2 have family==23
1185      * Model numbers for Zen:
1186      * 1)  Naples, Whitehaven, Summit Ridge, and Snowy Owl;
1187      * 17) Raven Ridge.
1188      * Model numbers for Zen+:
1189      * 8)  Pinnacle Ridge;
1190      * 24) Picasso.
1191      * Hygon got license for Zen1, but not Zen2 (https://www.tomshardware.com/news/amd-zen-china-x86-ip-license,39573.html)
1192      */
1193     return (cpuInfo.vendor() == CpuInfo::Vendor::Amd && cpuInfo.family() == 23
1194             && (cpuInfo.model() == 1 || cpuInfo.model() == 17 || cpuInfo.model() == 8
1195                 || cpuInfo.model() == 24))
1196            || (cpuInfo.vendor() == CpuInfo::Vendor::Hygon);
1197 }
1198
1199 } // namespace gmx
1200
#ifdef GMX_CPUINFO_STANDALONE
/* Standalone command-line front end, only built when GMX_CPUINFO_STANDALONE
 * is defined.
 *
 * The first argument selects what to print about the detected CPU. Without
 * any argument a usage message is printed and the program exits with code 1.
 * An unrecognized flag prints nothing and the program returns 0.
 */
int main(int argc, char** argv)
{
    if (argc < 2)
    {
        fprintf(stdout,
                "Usage:\n\n%s [flags]\n\n"
                "Available flags:\n"
                "-vendor        Print CPU vendor.\n"
                "-brand         Print CPU brand string.\n"
                "-family        Print CPU family version.\n"
                "-model         Print CPU model version.\n"
                "-stepping      Print CPU stepping version.\n"
                "-features      Print CPU feature flags.\n",
                argv[0]);
        exit(1);
    }

    std::string  arg(argv[1]);
    gmx::CpuInfo cpuInfo(gmx::CpuInfo::detect());

    if (arg == "-vendor")
    {
        printf("%s\n", cpuInfo.vendorString().c_str());
    }
    else if (arg == "-brand")
    {
        printf("%s\n", cpuInfo.brandString().c_str());
    }
    else if (arg == "-family")
    {
        printf("%d\n", cpuInfo.family());
    }
    else if (arg == "-model")
    {
        printf("%d\n", cpuInfo.model());
    }
    else if (arg == "-stepping")
    {
        printf("%d\n", cpuInfo.stepping());
    }
    else if (arg == "-features")
    {
        // Separate the feature strings with spaces. Note that in the
        // GROMACS cmake code, surrounding whitespace is first
        // stripped by the CPU detection routine, and then added back
        // in the code for making the SIMD suggestion.
        for (const auto& f : cpuInfo.featureSet())
        {
            printf("%s ", cpuInfo.featureString(f).c_str());
        }
        printf("\n");
    }
    else if (arg == "-topology")
    {
        // Undocumented debug option, usually not present in standalone version
        for (const auto& t : cpuInfo.logicalProcessors())
        {
            // The ranks are stored as plain (signed) integers, so they must be
            // printed with %d rather than %u.
            printf("%3d %3d %3d\n", t.socketRankInMachine, t.coreRankInSocket, t.hwThreadRankInCore);
        }
    }
    return 0;
}
#endif