src/gromacs/hardware/cpuinfo.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012-2018, The GROMACS development team.
   5  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /*! \internal \file
  38  * \brief
  39  * Implements gmx::CpuInfo.
  40  *
  41  * We need to be able to compile this file in stand-alone mode to use basic
  42  * CPU feature detection to set the SIMD acceleration and similar things in
  43  * CMake, while we still want to use more features that enable topology
  44  * detection when config.h is present.
  45  *
  46  * We solve this by skipping the advanced stuff when the preprocessor
  47  * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
  48  * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
  49  * support it is not possible to perform the actual detection on Linux/Mac.
   50  * Both macros are specific to this file, even though they carry the GMX prefix.
  51  *
  52  * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
  53  * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
   54  * x86, and for this we rely on including config.h.
  55  *
  56  * \author Erik Lindahl <erik.lindahl@gmail.com>
  57  * \ingroup module_hardware
  58  */
  59
  60 #ifndef GMX_CPUINFO_STANDALONE
  61 #    include "gmxpre.h"
  62 #endif
  63
  64 #include "cpuinfo.h"
  65
  66 #ifndef GMX_CPUINFO_STANDALONE
  67 #    include "config.h"
  68 #else
  69 #    define GMX_NATIVE_WINDOWS 0
  70 #endif
  71
  72 #if defined _MSC_VER
  73 #    include <intrin.h> // __cpuid()
  74 #endif
  75
  76 #if GMX_NATIVE_WINDOWS
  77 #    include <windows.h> // sysinfo(), necessary for topology stuff
  78 #endif
  79
  80 #ifdef HAVE_SCHED_H
  81 #    include <sched.h> // sched_getaffinity(), sched_setaffinity()
  82 #endif
  83 #ifdef HAVE_UNISTD_H
  84 #    include <unistd.h> // sysconf()
  85 #endif
  86
  87 #include <cctype>
  88 #include <cstdint> // uint32_t in X86 processor name code
  89 #include <cstdlib>
  90
  91 #include <algorithm>
  92 #include <fstream>
  93 #include <map>
  94 #include <set>
  95 #include <sstream>
  96 #include <string>
  97
  98 #ifdef GMX_CPUINFO_STANDALONE
  99 #    define gmx_unused
 100 #else
 101 #    include "gromacs/utility/basedefinitions.h"
 102 #endif
 103
 104 #include "architecture.h"
 105
 106 namespace gmx
 107 {
 108
 109 namespace
 110 {
 111
 112 /*! \cond internal */
 113
 114 /******************************************************************************
 115  *                                                                            *
 116  *   Utility functions to make this file independent of the GROMACS library   *
 117  *                                                                            *
 118  ******************************************************************************/
 119
 120 /*! \brief Remove initial and trailing whitespace from string
 121  *
 122  *  \param s  Pointer to string where whitespace will be removed
 123  */
 124 void trimString(std::string* s)
 125 {
 126     // heading
 127     s->erase(s->begin(),
 128              std::find_if(s->begin(), s->end(), [](char& c) -> bool { return std::isspace(c) == 0; }));
 129     // trailing
 130     s->erase(
 131             std::find_if(s->rbegin(), s->rend(), [](char& c) -> bool { return std::isspace(c) == 0; })
 132                     .base(),
 133             s->end());
 134 }
 135
 136
 137 /******************************************************************************
 138  *                                                                            *
 139  *                         x86 detection functions                            *
 140  *                                                                            *
 141  ******************************************************************************/
 142
 143 /*! \brief execute x86 cpuid instructions with custom level and extended level
 144  *
 145  *  \param level   The main cpuid level (input argument for eax register)
 146  *  \param ecxval  Extended level (input argument for ecx register)
 147  *  \param eax     Output in eax register
 148  *  \param ebx     Output in ebx register
 149  *  \param ecx     Output in ecx register
 150  *  \param edx     Output in edx register
 151  *
 152  *  \return 0 on success, or non-zero if the instruction could not execute.
 153  */
 154 int executeX86CpuID(unsigned int gmx_unused level,
 155                     unsigned int gmx_unused ecxval,
 156                     unsigned int*           eax,
 157                     unsigned int*           ebx,
 158                     unsigned int*           ecx,
 159                     unsigned int*           edx)
 160 {
 161     if (c_architecture == Architecture::X86)
 162     {
 163 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
 164
 165         // any compiler that understands gcc inline assembly
 166         *eax = level;
 167         *ecx = ecxval;
 168         *ebx = 0;
 169         *edx = 0;
 170
 171 #    if GMX_IS_X86_32 && defined(__PIC__)
 172         // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
 173         __asm__ __volatile__(
 174                 "xchgl %%ebx, %1  \n\t"
 175                 "cpuid            \n\t"
 176                 "xchgl %%ebx, %1  \n\t"
 177                 : "+a"(*eax), "+r"(*ebx), "+c"(*ecx), "+d"(*edx));
 178 #    elif GMX_IS_X86_64
 179         // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
 180         __asm__ __volatile__("cpuid            \n\t"
 181                              : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx));
 182 #    else
 183         // Not a normal x86, which could happen when a compiler
 184         // targetting non-x86 pretends to be GCC.
 185 #    endif
 186         return 0;
 187
 188 #elif defined _MSC_VER
 189
 190         // MSVC (and icc on windows) on ia32 or x86-64
 191         int cpuInfo[4];
 192         __cpuidex(cpuInfo, level, ecxval);
 193         *eax = static_cast<unsigned int>(cpuInfo[0]);
 194         *ebx = static_cast<unsigned int>(cpuInfo[1]);
 195         *ecx = static_cast<unsigned int>(cpuInfo[2]);
 196         *edx = static_cast<unsigned int>(cpuInfo[3]);
 197         return 0;
 198
 199 #else
 200
 201         // We are on x86, but without compiler support for cpuid if we get here
 202         *eax = 0;
 203         *ebx = 0;
 204         *ecx = 0;
 205         *edx = 0;
 206         return 1;
 207
 208 #endif // check for inline asm on x86
 209     }
 210     else
 211     {
 212         // We are not on x86
 213         *eax = 0;
 214         *ebx = 0;
 215         *ecx = 0;
 216         *edx = 0;
 217         return 1;
 218     }
 219 }
 220
 221
 222 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
 223  *
 224  *  If support for the cpuid instruction is present, we check for Intel,
 225  *  AMD or Hygon vendors
 226  *
 227  *  \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd,
 228  *          gmx::CpuInfl::Vendor::Hygon, . If neither Intel, Amd  nor
 229  *          Hygon can be identified, or if the code fails to execute,
 230  *          gmx::CpuInfo::Vendor::Unknown is returned.
 231  */
 232 CpuInfo::Vendor detectX86Vendor()
 233 {
 234     unsigned int    eax, ebx, ecx, edx;
 235     CpuInfo::Vendor v = CpuInfo::Vendor::Unknown;
 236
 237     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) == 0)
 238     {
 239         if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
 240         {
 241             v = CpuInfo::Vendor::Intel; // ebx=='uneG', ecx=='letn', edx=='Ieni'
 242         }
 243         else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
 244         {
 245             v = CpuInfo::Vendor::Amd; // ebx=='htuA', ecx=='DMAc', edx=='itne'
 246         }
 247         else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
 248         {
 249             v = CpuInfo::Vendor::Hygon; // ebx=='ogyH', ecx=='eniu', edx=='neGn'
 250         }
 251     }
 252     return v;
 253 }
 254
 255 /*! \brief Detect second AVX-512 FMA from the processor name
 256  *
 257  * Should only be called for processors already determined to support AVX-512.
 258  *
 259  *  \param [in] brand     x86 processor name
 260  *  \param [in] model     x86 model
 261  *  \return               True if second FMA present
 262  */
 263 bool detectProcCpuInfoSecondAvx512FMA(const std::string& brand, int model)
 264 {
 265     // Skylake server
 266     if (model == 0x55)
 267     {
 268         // detect Xeon
 269         if (brand.find("Xeon") == 9)
 270         {
 271             // detect Silver or Bronze or specific models
 272             if (brand.find("Silver") == 17 || brand.find("Bronze") == 17
 273                 || (brand.find('W') == 17 && brand.find('0') == 21)   // detect Xeon W 210x
 274                 || (brand.find('D') == 17 && brand.find("21") == 19)) // detect Xeon D 2xxx
 275             {
 276                 return false;
 277             }
 278             // detect Gold 5xxx - can be corrected once Cooper Lake is added
 279             else if (brand.find("Gold") == 17 && brand.find('5') == 22)
 280             {
 281                 return (brand.find("53") == 22 || // detect Cooper Lake
 282                         brand.find("22") == 24);  // detect 5[12]22
 283             }
 284         }
 285         return true;
 286     }
 287     // Cannon Lake client
 288     if (model == 0x66)
 289     {
 290         return false;
 291     }
 292     // Ice Lake client
 293     if (model == 0x7d || model == 0x7e)
 294     {
 295         return false;
 296     }
 297     // This is the right default...
 298     return true;
 299 }
 300
 301 /*! \brief Simple utility function to set/clear feature in a set
 302  *
 303  *  \param featureSet    Pointer to the feature set to update
 304  *  \param feature       The specific feature to set/clear
 305  *  \param registerValue Register value (returned from cpuid)
 306  *  \param bit           Bit to check in registerValue. The feature will be
 307  *                       added to the featureSet if this bit is set.
 308  *
 309  *  \note Nothing is done if the bit is not set. In particular, this will not
 310  *        erase anything if the feature already exists in the set.
 311  */
 312 void setFeatureFromBit(std::set<CpuInfo::Feature>* featureSet,
 313                        CpuInfo::Feature            feature,
 314                        unsigned int                registerValue,
 315                        unsigned char               bit)
 316 {
 317     if (registerValue & (1 << bit))
 318     {
 319         featureSet->insert(feature);
 320     }
 321 }
 322
 323 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
 324  *
 325  *  \param[out] brand      String where to write the x86 brand string
 326  *  \param[out] family     Major version of processor
 327  *  \param[out] model      Middle version of processor
 328  *  \param[out] stepping   Minor version of processor
 329  *  \param[out] features   Feature set where supported features are inserted
 330  */
 331 void detectX86Features(std::string* brand, int* family, int* model, int* stepping, std::set<CpuInfo::Feature>* features)
 332 {
 333     unsigned int eax, ebx, ecx, edx;
 334
 335     // Return if we cannot execute any levels
 336     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) != 0)
 337     {
 338         return;
 339     }
 340     unsigned int maxStdLevel = eax;
 341
 342     if (maxStdLevel >= 0x1)
 343     {
 344         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 345
 346         *family   = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
 347         *model    = ((eax & 0x000f0000) >> 12) + ((eax & 0x000000f0) >> 4);
 348         *stepping = (eax & 0x0000000f);
 349
 350         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse3, ecx, 0);
 351         setFeatureFromBit(features, CpuInfo::Feature::X86_Pclmuldq, ecx, 1);
 352         setFeatureFromBit(features, CpuInfo::Feature::X86_Ssse3, ecx, 9);
 353         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma, ecx, 12);
 354         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx16, ecx, 13);
 355         setFeatureFromBit(features, CpuInfo::Feature::X86_Pdcm, ecx, 15);
 356         setFeatureFromBit(features, CpuInfo::Feature::X86_Pcid, ecx, 17);
 357         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_1, ecx, 19);
 358         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_2, ecx, 20);
 359         setFeatureFromBit(features, CpuInfo::Feature::X86_X2Apic, ecx, 21);
 360         setFeatureFromBit(features, CpuInfo::Feature::X86_Popcnt, ecx, 23);
 361         setFeatureFromBit(features, CpuInfo::Feature::X86_Tdt, ecx, 24);
 362         setFeatureFromBit(features, CpuInfo::Feature::X86_Aes, ecx, 25);
 363         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx, ecx, 28);
 364         setFeatureFromBit(features, CpuInfo::Feature::X86_F16C, ecx, 29);
 365         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdrnd, ecx, 30);
 366
 367         setFeatureFromBit(features, CpuInfo::Feature::X86_Pse, edx, 3);
 368         setFeatureFromBit(features, CpuInfo::Feature::X86_Msr, edx, 5);
 369         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx8, edx, 8);
 370         setFeatureFromBit(features, CpuInfo::Feature::X86_Apic, edx, 9);
 371         setFeatureFromBit(features, CpuInfo::Feature::X86_Cmov, edx, 15);
 372         setFeatureFromBit(features, CpuInfo::Feature::X86_Clfsh, edx, 19);
 373         setFeatureFromBit(features, CpuInfo::Feature::X86_Mmx, edx, 23);
 374         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse2, edx, 26);
 375         setFeatureFromBit(features, CpuInfo::Feature::X86_Htt, edx, 28);
 376     }
 377
 378     // Check whether Hyper-threading is really possible to enable in the hardware,
 379     // not just technically supported by this generation of processors
 380     if ((features->count(CpuInfo::Feature::X86_Htt) != 0U) && maxStdLevel >= 0x4)
 381     {
 382         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 383         unsigned int maxLogicalCores = (ebx >> 16) & 0x0ff;
 384         executeX86CpuID(0x4, 0, &eax, &ebx, &ecx, &edx);
 385         unsigned int maxPhysicalCores = ((eax >> 26) & 0x3f) + 1;
 386         if (maxLogicalCores / maxPhysicalCores < 2)
 387         {
 388             features->erase(CpuInfo::Feature::X86_Htt);
 389         }
 390     }
 391
 392     if (executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx) != 0)
 393     {
 394         // No point in continuing if we don't support any extended levels
 395         return;
 396     }
 397     unsigned int maxExtLevel = eax;
 398
 399     if (maxExtLevel >= 0x80000001)
 400     {
 401         executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 402
 403         setFeatureFromBit(features, CpuInfo::Feature::X86_Lahf, ecx, 0);
 404         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4A, ecx, 6);
 405         setFeatureFromBit(features, CpuInfo::Feature::X86_MisalignSse, ecx, 7);
 406         setFeatureFromBit(features, CpuInfo::Feature::X86_Xop, ecx, 11);
 407         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma4, ecx, 16);
 408         setFeatureFromBit(features, CpuInfo::Feature::X86_PDPE1GB, edx, 26);
 409         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdtscp, edx, 27);
 410     }
 411
 412     if (maxExtLevel >= 0x80000005)
 413     {
 414         // Get the x86 CPU brand string (3 levels, 16 bytes in each)
 415         brand->clear();
 416         for (unsigned int level = 0x80000002; level < 0x80000005; level++)
 417         {
 418             executeX86CpuID(level, 0, &eax, &ebx, &ecx, &edx);
 419             // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
 420             brand->append(reinterpret_cast<const char*>(&eax), sizeof(eax));
 421             brand->append(reinterpret_cast<const char*>(&ebx), sizeof(ebx));
 422             brand->append(reinterpret_cast<const char*>(&ecx), sizeof(ecx));
 423             brand->append(reinterpret_cast<const char*>(&edx), sizeof(edx));
 424         }
 425         trimString(brand);
 426     }
 427
 428     if (maxStdLevel >= 0x7)
 429     {
 430         executeX86CpuID(0x7, 0, &eax, &ebx, &ecx, &edx);
 431
 432         setFeatureFromBit(features, CpuInfo::Feature::X86_Hle, ebx, 4);
 433         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx2, ebx, 5);
 434         setFeatureFromBit(features, CpuInfo::Feature::X86_Rtm, ebx, 11);
 435         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512F, ebx, 16);
 436         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512PF, ebx, 26);
 437         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512ER, ebx, 27);
 438         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512CD, ebx, 28);
 439         setFeatureFromBit(features, CpuInfo::Feature::X86_Sha, ebx, 29);
 440         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BW, ebx, 30);
 441         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512VL, ebx, 31);
 442
 443         executeX86CpuID(0x7, 0x1, &eax, &ebx, &ecx, &edx);
 444         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BF16, eax, 5);
 445
 446         if (features->count(CpuInfo::Feature::X86_Avx512F) != 0)
 447         {
 448             // Only checking if the CPU supports AVX-512. There is no CPUID bit for this.
 449             if (detectProcCpuInfoSecondAvx512FMA(*brand, *model))
 450             {
 451                 features->insert(CpuInfo::Feature::X86_Avx512secondFMA);
 452             }
 453         }
 454     }
 455
 456
 457     if (maxExtLevel >= 0x80000007)
 458     {
 459         executeX86CpuID(0x80000007, 0, &eax, &ebx, &ecx, &edx);
 460
 461         setFeatureFromBit(features, CpuInfo::Feature::X86_NonstopTsc, edx, 8);
 462     }
 463 }
 464
 465
 466 /*! \brief Return a vector with x86 APIC IDs for all threads
 467  *
 468  *  \param haveX2Apic  True if the processors supports x2APIC, otherwise vanilla APIC.
 469  *
 470  *  \returns A new std::vector of unsigned integer APIC IDs, one for each
 471  *           logical processor in the system.
 472  */
 473 std::vector<unsigned int> detectX86ApicIDs(bool gmx_unused haveX2Apic)
 474 {
 475     std::vector<unsigned int> apicID;
 476
 477     // We cannot just ask for all APIC IDs, but must force execution on each
 478     // hardware thread and extract the APIC id there.
 479 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
 480     unsigned int eax, ebx, ecx, edx;
 481     unsigned int nApic = sysconf(_SC_NPROCESSORS_ONLN);
 482     cpu_set_t    saveCpuSet;
 483     cpu_set_t    cpuSet;
 484     sched_getaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
 485     CPU_ZERO(&cpuSet);
 486     for (unsigned int i = 0; i < nApic; i++)
 487     {
 488         CPU_SET(i, &cpuSet);
 489         sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
 490         if (haveX2Apic)
 491         {
 492             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 493             apicID.push_back(edx);
 494         }
 495         else
 496         {
 497             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 498             apicID.push_back(ebx >> 24);
 499         }
 500         CPU_CLR(i, &cpuSet);
 501     }
 502     sched_setaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
 503 #elif GMX_NATIVE_WINDOWS
 504     unsigned int eax, ebx, ecx, edx;
 505     SYSTEM_INFO sysinfo;
 506     GetSystemInfo(&sysinfo);
 507     unsigned int nApic = sysinfo.dwNumberOfProcessors;
 508     unsigned int saveAffinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 509     for (DWORD_PTR i = 0; i < nApic; i++)
 510     {
 511         SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1) << i));
 512         Sleep(0);
 513         if (haveX2Apic)
 514         {
 515             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 516             apicID.push_back(edx);
 517         }
 518         else
 519         {
 520             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 521             apicID.push_back(ebx >> 24);
 522         }
 523     }
 524     SetThreadAffinityMask(GetCurrentThread(), saveAffinity);
 525 #endif
 526     return apicID;
 527 }
 528
 529
 530 /*! \brief Utility to renumber indices extracted from APIC IDs
 531  *
 532  * \param v  Vector with unsigned integer indices
 533  *
 534  * This routine returns the number of unique different elements found in the vector,
 535  * and renumbers these starting from 0. For example, the vector {0,1,2,8,9,10,8,9,10,0,1,2}
 536  * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
 537  * number of unique elements.
 538  */
 539 void renumberIndex(std::vector<unsigned int>* v)
 540 {
 541     std::vector<unsigned int> sortedV(*v);
 542     std::sort(sortedV.begin(), sortedV.end());
 543
 544     std::vector<unsigned int> uniqueSortedV(sortedV);
 545     auto                      it = std::unique(uniqueSortedV.begin(), uniqueSortedV.end());
 546     uniqueSortedV.resize(std::distance(uniqueSortedV.begin(), it));
 547
 548     for (std::size_t i = 0; i < uniqueSortedV.size(); i++)
 549     {
 550         unsigned int val = uniqueSortedV[i];
 551         std::replace_if(
 552                 v->begin(),
 553                 v->end(),
 554                 [val](unsigned int& c) -> bool { return c == val; },
 555                 static_cast<unsigned int>(i));
 556     }
 557 }
 558
 559 /*! \brief The layout of the bits in the APIC ID */
 560 struct ApicIdLayout
 561 {
 562     unsigned int hwThreadBits; //!< The number of least significant bits for hw-threads
 563     unsigned int coreBits;     //!< The number of core bits following the  hw-thread bits
 564 };
 565
 566 /*! \brief Detect the APIC ID layout for x2APIC
 567  */
 568 ApicIdLayout detectX2ApicIdLayout()
 569 {
 570     ApicIdLayout layout;
 571
 572     unsigned int eax;
 573     unsigned int ebx;
 574     unsigned int ecx;
 575     unsigned int edx;
 576     executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
 577     layout.hwThreadBits = eax & 0x1f;
 578     executeX86CpuID(0xb, 1, &eax, &ebx, &ecx, &edx);
 579     layout.coreBits = (eax & 0x1f) - layout.hwThreadBits;
 580
 581     return layout;
 582 }
 583
 584 /*! \brief Detect the APIC ID layout for standard APIC or xAPIC on AMD
 585  *
 586  * \param[in] maxExtLevel  The largest CPUID extended function input value supported by the processor implementation
 587  */
 588 ApicIdLayout detectAmdApicIdLayout(unsigned int maxExtLevel)
 589 {
 590     ApicIdLayout layout;
 591
 592     unsigned int eax;
 593     unsigned int ebx;
 594     unsigned int ecx;
 595     unsigned int edx;
 596     executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 597     int family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
 598     executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 599     bool haveExtendedTopology = (ecx & (1 << 22)) != 0U;
 600
 601     // NOTE: Here we assume 1 thread per core, unless we have family >= 17h
 602     layout.hwThreadBits = 0;
 603     if (family >= 0x17 && haveExtendedTopology && maxExtLevel >= 0x8000001e)
 604     {
 605         executeX86CpuID(0x8000001e, 1, &eax, &ebx, &ecx, &edx);
 606         int numThreadsPerCore = ((ebx >> 8) & 0xff) + 1;
 607         // NOTE: The AMD documentation only specifies the layout of apicid
 608         //       when we have 1 or 2 threads per core.
 609         while (numThreadsPerCore > (1 << layout.hwThreadBits))
 610         {
 611             layout.hwThreadBits++;
 612         }
 613     }
 614
 615     // Get number of core bits in apic ID - try modern extended method first
 616     executeX86CpuID(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 617     layout.coreBits = (ecx >> 12) & 0xf;
 618     if (layout.coreBits == 0)
 619     {
 620         // Legacy method for old single/dual core AMD CPUs
 621         int i = ecx & 0xf;
 622         while (i >> layout.coreBits)
 623         {
 624             layout.coreBits++;
 625         }
 626     }
 627
 628     return layout;
 629 }
 630
 631 /*! \brief Try to detect basic CPU topology information using x86 cpuid
 632  *
 633  *  If x2APIC support is present, this is our first choice, otherwise we
 634  *  attempt to use old vanilla APIC.
 635  *
 636  *  \return A new vector of entries with socket, core, hwthread information
 637  *          for each logical processor.
 638  */
 639 std::vector<CpuInfo::LogicalProcessor> detectX86LogicalProcessors()
 640 {
 641     unsigned int eax;
 642     unsigned int ebx;
 643     unsigned int ecx;
 644     unsigned int edx;
 645     unsigned int maxStdLevel;
 646     unsigned int maxExtLevel;
 647     bool         haveApic;
 648     bool         haveX2Apic;
 649
 650     std::vector<CpuInfo::LogicalProcessor> logicalProcessors;
 651
 652     // Find largest standard & extended level input values allowed
 653     executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx);
 654     maxStdLevel = eax;
 655     executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 656     maxExtLevel = eax;
 657
 658     if (maxStdLevel >= 0x1)
 659     {
 660         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
 661         haveX2Apic = ((ecx & (1 << 21)) != 0U) && maxStdLevel >= 0xb;
 662         haveApic   = ((edx & (1 << 9)) != 0U) && maxExtLevel >= 0x80000008;
 663     }
 664     else
 665     {
 666         haveX2Apic = false;
 667         haveApic   = false;
 668     }
 669
 670     if (haveX2Apic || haveApic)
 671     {
 672         ApicIdLayout layout;
 673         // Get bits for cores and hardware threads
 674         if (haveX2Apic)
 675         {
 676             layout = detectX2ApicIdLayout();
 677         }
 678         else // haveApic
 679         {
 680             if (detectX86Vendor() == CpuInfo::Vendor::Amd || detectX86Vendor() == CpuInfo::Vendor::Hygon)
 681             {
 682                 layout = detectAmdApicIdLayout(maxExtLevel);
 683
 684                 if (layout.hwThreadBits > 1)
 685                 {
 686                     // At the time of writing this code we do not know what
 687                     // to do with more than 2 threads, so return empty.
 688                     return logicalProcessors;
 689                 }
 690             }
 691             else
 692             {
 693                 // We do not know the APIC ID layout, return empty.
 694                 return logicalProcessors;
 695             }
 696         }
 697
 698         std::vector<unsigned int> apicID = detectX86ApicIDs(haveX2Apic);
 699
 700         if (!apicID.empty())
 701         {
 702             // APIC IDs can be buggy, and it is always a mess. Typically more bits are
 703             // reserved than needed, and the numbers might not increment by 1 even in
 704             // a single socket or core. Extract, renumber, and check that things make sense.
 705             unsigned int              hwThreadMask = (1 << layout.hwThreadBits) - 1;
 706             unsigned int              coreMask     = (1 << layout.coreBits) - 1;
 707             std::vector<unsigned int> hwThreadRanks;
 708             std::vector<unsigned int> coreRanks;
 709             std::vector<unsigned int> socketRanks;
 710
 711             for (auto a : apicID)
 712             {
 713                 hwThreadRanks.push_back(static_cast<int>(a & hwThreadMask));
 714                 coreRanks.push_back(static_cast<int>((a >> layout.hwThreadBits) & coreMask));
 715                 socketRanks.push_back(static_cast<int>(a >> (layout.coreBits + layout.hwThreadBits)));
 716             }
 717
 718             renumberIndex(&hwThreadRanks);
 719             renumberIndex(&coreRanks);
 720             renumberIndex(&socketRanks);
 721
 722             unsigned int hwThreadRankSize =
 723                     1 + *std::max_element(hwThreadRanks.begin(), hwThreadRanks.end());
 724             unsigned int coreRankSize = 1 + *std::max_element(coreRanks.begin(), coreRanks.end());
 725             unsigned int socketRankSize = 1 + *std::max_element(socketRanks.begin(), socketRanks.end());
 726
 727             if (socketRankSize * coreRankSize * hwThreadRankSize == apicID.size())
 728             {
 729                 // Alright, everything looks consistent, so put it in the result
 730                 for (std::size_t i = 0; i < apicID.size(); i++)
 731                 {
 732                     // While the internal APIC IDs are always unsigned integers, we also cast to
 733                     // plain integers for the externally exposed vectors, since that will make
 734                     // it possible to use '-1' for invalid entries in the future.
 735                     logicalProcessors.push_back(
 736                             { int(socketRanks[i]), int(coreRanks[i]), int(hwThreadRanks[i]) });
 737                 }
 738             }
 739         }
 740     }
 741     return logicalProcessors; // Will only have contents if everything worked
 742 }
 743
 744
 745 /******************************************************************************
 746  *                                                                            *
 747  *              Generic Linux detection by parsing /proc/cpuinfo              *
 748  *                                                                            *
 749  ******************************************************************************/
 750
 751 /*! \brief Parse /proc/cpuinfo into a simple string map
 752  *
 753  * This routine will read the contents of /proc/cpuinfo, and for each
 754  * line that is not empty we will assign the (trimmed) string to the right of
 755  * the colon as a key, and the left-hand side as the value in the map.
 756  * For multi-processor systems where lines are repeated the latter lines will
 757  * overwrite the first occurrence.
 758  *
 759  * \return New map with the contents. If the file is not available, the returned
 760  *         map will be empty.
 761  */
 762 std::map<std::string, std::string> parseProcCpuInfo()
 763 {
 764     std::ifstream                      procCpuInfo("/proc/cpuinfo");
 765     std::string                        line;
 766     std::map<std::string, std::string> cpuInfo;
 767
 768     while (std::getline(procCpuInfo, line))
 769     {
 770         if (!line.empty())
 771         {
 772             std::stringstream iss(line);
 773             std::string       key;
 774             std::string       val;
 775             std::getline(iss, key, ':'); // part before colon
 776             std::getline(iss, val);      // part after colon
 777             trimString(&key);
 778             trimString(&val);
 779             // put it in the map. This will overwrite previous processors, but we don't care.
 780             cpuInfo[key] = val;
 781         }
 782     }
 783     return cpuInfo;
 784 }
 785
 786
 787 /*! \brief Try to detect vendor from /proc/cpuinfo
 788  *
 789  *  \param cpuInfo  Map returned from parseProcCpuinfo()
 790  *
 791  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 792  *  they begin with the name of a standard vendor. If the file cannot be read
 793  *  or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
 794  */
 795 CpuInfo::Vendor detectProcCpuInfoVendor(const std::map<std::string, std::string>& cpuInfo)
 796 {
 797     const std::map<std::string, CpuInfo::Vendor> testVendors = {
 798         { "GenuineIntel", CpuInfo::Vendor::Intel },
 799         { "Intel", CpuInfo::Vendor::Intel },
 800         { "AuthenticAmd", CpuInfo::Vendor::Amd },
 801         { "AMD", CpuInfo::Vendor::Amd },
 802         { "ARM", CpuInfo::Vendor::Arm },
 803         { "AArch64", CpuInfo::Vendor::Arm },
 804         { "Fujitsu", CpuInfo::Vendor::Fujitsu },
 805         { "IBM", CpuInfo::Vendor::Ibm },
 806         { "POWER", CpuInfo::Vendor::Ibm },
 807         { "Oracle", CpuInfo::Vendor::Oracle },
 808         { "HygonGenuine", CpuInfo::Vendor::Hygon },
 809         { "Hygon", CpuInfo::Vendor::Hygon },
 810     };
 811
 812     // For each label in /proc/cpuinfo, compare the value to the name in the
 813     // testNames map above, and if it's a match return the vendor.
 814     for (const auto& l : { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
 815     {
 816         if (cpuInfo.count(l) != 0U)
 817         {
 818             // there was a line with this left-hand side in /proc/cpuinfo
 819             const std::string& s1 = cpuInfo.at(l);
 820
 821             for (const auto& t : testVendors)
 822             {
 823                 const std::string& s2 = t.first;
 824
 825                 // If the entire name we are testing (s2) matches the first part of
 826                 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
 827                 if (std::equal(s2.begin(), s2.end(), s1.begin(), [](const char& x, const char& y) -> bool {
 828                         return tolower(x) == tolower(y);
 829                     }))
 830                 {
 831                     return t.second;
 832                 }
 833             }
 834         }
 835     }
 836     return CpuInfo::Vendor::Unknown;
 837 }
 838
 839
 840 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
 841  *
 842  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
 843  *  \param[out] brand      String where to write the brand string
 844  *  \param[out] features   Feature set where supported features are inserted
 845  *
 846  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 847  *  we can find the processor name and features. It is likely fragile.
 848  */
 849 void detectProcCpuInfoIbm(const std::map<std::string, std::string>& cpuInfo,
 850                           std::string*                              brand,
 851                           std::set<CpuInfo::Feature>*               features)
 852 {
 853     // Get brand string from 'cpu' label if present, otherwise 'Processor'
 854     if (cpuInfo.count("cpu") != 0U)
 855     {
 856         *brand = cpuInfo.at("cpu");
 857     }
 858     else if (cpuInfo.count("Processor") != 0U)
 859     {
 860         *brand = cpuInfo.at("Processor");
 861     }
 862
 863     if (brand->find("A2") != std::string::npos)
 864     {
 865         // If the processor identification contains "A2", this is BlueGene/Q with QPX
 866         features->insert(CpuInfo::Feature::Ibm_Qpx);
 867     }
 868
 869     for (const auto& l : { "model name", "model", "Processor", "cpu" })
 870     {
 871         if (cpuInfo.count(l) != 0U)
 872         {
 873             std::string s1 = cpuInfo.at(l);
 874             std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
 875
 876             if (s1.find("altivec") != std::string::npos)
 877             {
 878                 features->insert(CpuInfo::Feature::Ibm_Vmx);
 879                 // If this is a power6, we only have VMX. All later processors have VSX.
 880                 if (s1.find("power6") == std::string::npos)
 881                 {
 882                     features->insert(CpuInfo::Feature::Ibm_Vsx);
 883                 }
 884             }
 885         }
 886     }
 887 }
 888
 889
 890 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
 891  *
 892  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
 893  *  \param[out] brand      String where to write the brand string
 894  *  \param[out] family     Major version of processor
 895  *  \param[out] model      Middle version of processor
 896  *  \param[out] stepping   Minor version of processor
 897  *  \param[out] features   Feature set where supported features are inserted
 898  *
 899  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
 900  *  we can find the processor name and features. It is likely fragile.
 901  */
 902 void detectProcCpuInfoArm(const std::map<std::string, std::string>& cpuInfo,
 903                           std::string*                              brand,
 904                           int*                                      family,
 905                           int*                                      model,
 906                           int*                                      stepping,
 907                           std::set<CpuInfo::Feature>*               features)
 908 {
 909     if (cpuInfo.count("Processor") != 0U)
 910     {
 911         *brand = cpuInfo.at("Processor");
 912     }
 913     else if (cpuInfo.count("model name") != 0U)
 914     {
 915         *brand = cpuInfo.at("model name");
 916     }
 917
 918     if (cpuInfo.count("CPU architecture") != 0U)
 919     {
 920         *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
 921         // For some 64-bit CPUs it appears to say 'AArch64' instead
 922         if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
 923         {
 924             *family = 8; // fragile - no idea how a future ARMv9 will be represented in this case
 925         }
 926     }
 927     if (cpuInfo.count("CPU variant") != 0U)
 928     {
 929         *model = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
 930     }
 931     if (cpuInfo.count("CPU revision") != 0U)
 932     {
 933         *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
 934     }
 935
 936     if (cpuInfo.count("Features") != 0U)
 937     {
 938         const std::string& s = cpuInfo.at("Features");
 939         if (s.find("neon") != std::string::npos)
 940         {
 941             features->insert(CpuInfo::Feature::Arm_Neon);
 942         }
 943         if (s.find("asimd") != std::string::npos)
 944         {
 945             // At least Jetson TX1 runs a 32-bit environment by default, although
 946             // the kernel is 64-bits, and reports asimd feature flags. We cannot
 947             // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
 948             if (sizeof(void*) == 8)
 949             {
 950                 features->insert(CpuInfo::Feature::Arm_NeonAsimd);
 951             }
 952         }
 953         if (s.find("sve") != std::string::npos)
 954         {
 955             features->insert(CpuInfo::Feature::Arm_Sve);
 956         }
 957     }
 958 }
 959
 960
 961 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
 962  *
 963  *  \param[out] vendor     Detected hardware vendor
 964  *  \param[out] brand      String where to write the brand string
 965  *  \param[out] family     Major version of processor
 966  *  \param[out] model      Middle version of processor
 967  *  \param[out] stepping   Minor version of processor
 968  *  \param[out] features   Feature set where supported features are inserted
 969  *
 970  *  This routine reads the /proc/cpuinfo file into a map and calls subroutines
 971  *  that attempt to parse by matching keys and values to known strings. It is
 972  *  much more fragile than our x86 detection, but it does not depend on
 973  *  specific system calls, intrinsics or assembly instructions.
 974  */
 975 void detectProcCpuInfo(CpuInfo::Vendor*            vendor,
 976                        std::string*                brand,
 977                        int*                        family,
 978                        int*                        model,
 979                        int*                        stepping,
 980                        std::set<CpuInfo::Feature>* features)
 981 {
 982     std::map<std::string, std::string> cpuInfo = parseProcCpuInfo();
 983
 984     if (*vendor == CpuInfo::Vendor::Unknown)
 985     {
 986         *vendor = detectProcCpuInfoVendor(cpuInfo);
 987     }
 988
 989     // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
 990     // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
 991     // To handle this slightly better we use one subroutine per vendor.
 992     switch (*vendor)
 993     {
 994         case CpuInfo::Vendor::Ibm: detectProcCpuInfoIbm(cpuInfo, brand, features); break;
 995
 996         case CpuInfo::Vendor::Arm:
 997             detectProcCpuInfoArm(cpuInfo, brand, family, model, stepping, features);
 998             break;
 999
1000         default:
1001             // We only have a single check for fujitsu for now
1002 #ifdef __HPC_ACE__
1003             features->insert(CpuInfo::Feature::Fujitsu_HpcAce);
1004 #endif
1005             break;
1006     }
1007 }
1008 /*! \endcond */
1009 } // namespace
1010
1011
1012 // static
1013 CpuInfo CpuInfo::detect()
1014 {
1015     CpuInfo result;
1016
1017     if (c_architecture == Architecture::X86)
1018     {
1019         result.vendor_ = detectX86Vendor();
1020
1021         if (result.vendor_ == CpuInfo::Vendor::Intel)
1022         {
1023             result.features_.insert(CpuInfo::Feature::X86_Intel);
1024         }
1025         else if (result.vendor_ == CpuInfo::Vendor::Amd)
1026         {
1027             result.features_.insert(CpuInfo::Feature::X86_Amd);
1028         }
1029         else if (result.vendor_ == CpuInfo::Vendor::Hygon)
1030         {
1031             result.features_.insert(CpuInfo::Feature::X86_Hygon);
1032         }
1033         detectX86Features(
1034                 &result.brandString_, &result.family_, &result.model_, &result.stepping_, &result.features_);
1035         result.logicalProcessors_ = detectX86LogicalProcessors();
1036     }
1037     else
1038     {
1039         // Not x86
1040         if (c_architecture == Architecture::Arm)
1041         {
1042             result.vendor_ = CpuInfo::Vendor::Arm;
1043         }
1044         else if (c_architecture == Architecture::PowerPC)
1045         {
1046             result.vendor_ = CpuInfo::Vendor::Ibm;
1047         }
1048
1049 #if defined __aarch64__ || (defined _M_ARM && _M_ARM >= 8)
1050         result.features_.insert(Feature::Arm_Neon);      // ARMv8 always has Neon
1051         result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
1052 #endif
1053 #if defined __arch64__ && defined __ARM_FEATURE_SVE
1054         result.features_.insert(Feature::Arm_Sve);
1055 #endif
1056
1057 #if defined sun
1058         result.vendor_ = CpuInfo::Vendor::Oracle;
1059 #endif
1060
1061         // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
1062         // is set to a known value this routine will not overwrite it.
1063         detectProcCpuInfo(&result.vendor_,
1064                           &result.brandString_,
1065                           &result.family_,
1066                           &result.model_,
1067                           &result.stepping_,
1068                           &result.features_);
1069     }
1070
1071     if (!result.logicalProcessors_.empty())
1072     {
1073         result.supportLevel_ = CpuInfo::SupportLevel::LogicalProcessorInfo;
1074     }
1075     else if (!result.features_.empty())
1076     {
1077         result.supportLevel_ = CpuInfo::SupportLevel::Features;
1078     }
1079     else if (result.vendor_ != CpuInfo::Vendor::Unknown
1080              || result.brandString_ != "Unknown CPU brand")
1081     {
1082         result.supportLevel_ = CpuInfo::SupportLevel::Name;
1083     }
1084     else
1085     {
1086         result.supportLevel_ = CpuInfo::SupportLevel::None;
1087     }
1088
1089     return result;
1090 }
1091
1092 CpuInfo::CpuInfo() :
1093     vendor_(CpuInfo::Vendor::Unknown), brandString_("Unknown CPU brand"), family_(0), model_(0), stepping_(0)
1094 {
1095 }
1096
1097 const std::string& CpuInfo::vendorString() const
1098 {
1099     static const std::map<Vendor, std::string> vendorStrings = {
1100         { Vendor::Unknown, "Unknown vendor" }, { Vendor::Intel, "Intel" }, { Vendor::Amd, "AMD" },
1101         { Vendor::Fujitsu, "Fujitsu" },        { Vendor::Ibm, "IBM" },     { Vendor::Arm, "ARM" },
1102         { Vendor::Oracle, "Oracle" },          { Vendor::Hygon, "Hygon" },
1103     };
1104
1105     return vendorStrings.at(vendor_);
1106 }
1107
1108
1109 const std::string& CpuInfo::featureString(Feature f)
1110 {
1111     static const std::map<Feature, std::string> featureStrings = {
1112         { Feature::X86_Aes, "aes" },
1113         { Feature::X86_Amd, "amd" },
1114         { Feature::X86_Apic, "apic" },
1115         { Feature::X86_Avx, "avx" },
1116         { Feature::X86_Avx2, "avx2" },
1117         { Feature::X86_Avx512F, "avx512f" },
1118         { Feature::X86_Avx512PF, "avx512pf" },
1119         { Feature::X86_Avx512ER, "avx512er" },
1120         { Feature::X86_Avx512CD, "avx512cd" },
1121         { Feature::X86_Avx512BW, "avx512bw" },
1122         { Feature::X86_Avx512VL, "avx512vl" },
1123         { Feature::X86_Avx512BF16, "avx512bf16" },
1124         { Feature::X86_Avx512secondFMA, "avx512secondFMA" },
1125         { Feature::X86_Clfsh, "clfsh" },
1126         { Feature::X86_Cmov, "cmov" },
1127         { Feature::X86_Cx8, "cx8" },
1128         { Feature::X86_Cx16, "cx16" },
1129         { Feature::X86_F16C, "f16c" },
1130         { Feature::X86_Fma, "fma" },
1131         { Feature::X86_Fma4, "fma4" },
1132         { Feature::X86_Hle, "hle" },
1133         { Feature::X86_Htt, "htt" },
1134         { Feature::X86_Intel, "intel" },
1135         { Feature::X86_Lahf, "lahf" },
1136         { Feature::X86_MisalignSse, "misalignsse" },
1137         { Feature::X86_Mmx, "mmx" },
1138         { Feature::X86_Msr, "msr" },
1139         { Feature::X86_NonstopTsc, "nonstop_tsc" },
1140         { Feature::X86_Pcid, "pcid" },
1141         { Feature::X86_Pclmuldq, "pclmuldq" },
1142         { Feature::X86_Pdcm, "pdcm" },
1143         { Feature::X86_PDPE1GB, "pdpe1gb" },
1144         { Feature::X86_Popcnt, "popcnt" },
1145         { Feature::X86_Pse, "pse" },
1146         { Feature::X86_Rdrnd, "rdrnd" },
1147         { Feature::X86_Rdtscp, "rdtscp" },
1148         { Feature::X86_Rtm, "rtm" },
1149         { Feature::X86_Sha, "sha" },
1150         { Feature::X86_Sse2, "sse2" },
1151         { Feature::X86_Sse3, "sse3" },
1152         { Feature::X86_Sse4A, "sse4a" },
1153         { Feature::X86_Sse4_1, "sse4.1" },
1154         { Feature::X86_Sse4_2, "sse4.2" },
1155         { Feature::X86_Ssse3, "ssse3" },
1156         { Feature::X86_Tdt, "tdt" },
1157         { Feature::X86_X2Apic, "x2apic" },
1158         { Feature::X86_Xop, "xop" },
1159         { Feature::Arm_Neon, "neon" },
1160         { Feature::Arm_NeonAsimd, "neon_asimd" },
1161         { Feature::Arm_Sve, "sve" },
1162         { Feature::Ibm_Qpx, "qpx" },
1163         { Feature::Ibm_Vmx, "vmx" },
1164         { Feature::Ibm_Vsx, "vsx" },
1165         { Feature::Fujitsu_HpcAce, "hpc-ace" },
1166         { Feature::X86_Hygon, "hygon" }
1167     };
1168     return featureStrings.at(f);
1169 }
1170
1171
1172 bool cpuIsX86Nehalem(const CpuInfo& cpuInfo)
1173 {
1174     return (cpuInfo.vendor() == CpuInfo::Vendor::Intel && cpuInfo.family() == 6
1175             && (cpuInfo.model() == 0x2E || cpuInfo.model() == 0x1A || cpuInfo.model() == 0x1E
1176                 || cpuInfo.model() == 0x2F || cpuInfo.model() == 0x2C || cpuInfo.model() == 0x25));
1177 }
1178
1179 bool cpuIsAmdZen1(const CpuInfo& cpuInfo)
1180 {
1181     /* Both Zen/Zen+/Zen2 have family==23
1182      * Model numbers for Zen:
1183      * 1)  Naples, Whitehaven, Summit Ridge, and Snowy Owl;
1184      * 17) Raven Ridge.
1185      * Model numbers for Zen+:
1186      * 8)  Pinnacle Ridge;
1187      * 24) Picasso.
1188      * Hygon got license for Zen1, but not Zen2 (https://www.tomshardware.com/news/amd-zen-china-x86-ip-license,39573.html)
1189      */
1190     return (cpuInfo.vendor() == CpuInfo::Vendor::Amd && cpuInfo.family() == 23
1191             && (cpuInfo.model() == 1 || cpuInfo.model() == 17 || cpuInfo.model() == 8
1192                 || cpuInfo.model() == 24))
1193            || (cpuInfo.vendor() == CpuInfo::Vendor::Hygon);
1194 }
1195
1196 } // namespace gmx
1197
#ifdef GMX_CPUINFO_STANDALONE
/* Standalone driver: detect the CPU and print the single property selected
 * by the first command-line flag. Without any flag, the usage text is printed
 * and 1 is returned. An unrecognized flag prints nothing and returns 0.
 */
int main(int argc, char** argv)
{
    if (argc < 2)
    {
        fprintf(stdout,
                "Usage:\n\n%s [flags]\n\n"
                "Available flags:\n"
                "-vendor        Print CPU vendor.\n"
                "-brand         Print CPU brand string.\n"
                "-family        Print CPU family version.\n"
                "-model         Print CPU model version.\n"
                "-stepping      Print CPU stepping version.\n"
                "-features      Print CPU feature flags.\n",
                argv[0]);
        return 1;
    }

    const std::string  arg(argv[1]);
    const gmx::CpuInfo cpuInfo(gmx::CpuInfo::detect());

    if (arg == "-vendor")
    {
        printf("%s\n", cpuInfo.vendorString().c_str());
    }
    else if (arg == "-brand")
    {
        printf("%s\n", cpuInfo.brandString().c_str());
    }
    else if (arg == "-family")
    {
        printf("%d\n", cpuInfo.family());
    }
    else if (arg == "-model")
    {
        printf("%d\n", cpuInfo.model());
    }
    else if (arg == "-stepping")
    {
        printf("%d\n", cpuInfo.stepping());
    }
    else if (arg == "-features")
    {
        // Separate the feature strings with spaces. Note that in the
        // GROMACS cmake code, surrounding whitespace is first
        // stripped by the CPU detection routine, and then added back
        // in the code for making the SIMD suggestion.
        for (const auto& f : cpuInfo.featureSet())
        {
            printf("%s ", cpuInfo.featureString(f).c_str());
        }
        printf("\n");
    }
    else if (arg == "-topology")
    {
        // Undocumented debug option, usually not present in standalone version.
        // The ranks are stored as plain (signed) integers, so they are printed
        // with %d rather than %u to match the argument type.
        for (const auto& t : cpuInfo.logicalProcessors())
        {
            printf("%3d %3d %3d\n", t.socketRankInMachine, t.coreRankInSocket, t.hwThreadRankInCore);
        }
    }
    return 0;
}
#endif