2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
39 * Implements gmx::CpuInfo.
41 * We need to be able to compile this file in stand-alone mode to use basic
42 * CPU feature detection to set the SIMD acceleration and similar things in
43 * CMake, while we still want to use more features that enable topology
44 * detection when config.h is present.
46 * We solve this by skipping the advanced stuff when the preprocessor
47 * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
48 * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
49 * support it is not possible to perform the actual detection on Linux/Mac.
50 * Note that these macros are specific to this file, even though they carry the GMX prefix.
52 * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
53 * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
54 * x86, and for this we rely on including config.h.
56 * \author Erik Lindahl <erik.lindahl@gmail.com>
57 * \ingroup module_hardware
60 #ifndef GMX_CPUINFO_STANDALONE
66 #ifndef GMX_CPUINFO_STANDALONE
69 # define GMX_NATIVE_WINDOWS 0
73 # include <intrin.h> // __cpuid()
76 #if GMX_NATIVE_WINDOWS
77 # include <windows.h> // sysinfo(), necessary for topology stuff
81 # include <sched.h> // sched_getaffinity(), sched_setaffinity()
84 # include <unistd.h> // sysconf()
88 #include <cstdint> // uint32_t in X86 processor name code
98 #ifdef GMX_CPUINFO_STANDALONE
101 # include "gromacs/utility/basedefinitions.h"
104 #include "architecture.h"
112 /*! \cond internal */
114 /******************************************************************************
116 * Utility functions to make this file independent of the GROMACS library *
118 ******************************************************************************/
/*! \brief Remove initial and trailing whitespace from string
 *
 * \param s  Pointer to string where whitespace will be removed
 *
 * Whitespace is classified with std::isspace(). An empty string, or a string
 * that only contains whitespace, ends up empty.
 */
void trimString(std::string* s)
{
    // std::isspace() has undefined behavior for negative arguments other than
    // EOF, so a plain char must be converted to unsigned char before the call.
    const auto isNotSpace = [](char c) -> bool {
        return std::isspace(static_cast<unsigned char>(c)) == 0;
    };

    // heading
    s->erase(s->begin(), std::find_if(s->begin(), s->end(), isNotSpace));
    // trailing
    s->erase(std::find_if(s->rbegin(), s->rend(), isNotSpace).base(), s->end());
}
137 /******************************************************************************
139 * x86 detection functions *
141 ******************************************************************************/
143 /*! \brief Execute the x86 cpuid instruction with custom level and extended level
145 * \param level The main cpuid level (input argument for eax register)
146 * \param ecxval Extended level (input argument for ecx register)
147 * \param eax Output in eax register
148 * \param ebx Output in ebx register
149 * \param ecx Output in ecx register
150 * \param edx Output in edx register
152 * \return 0 on success, or non-zero if the instruction could not execute.
154 int executeX86CpuID(unsigned int gmx_unused level,
155 unsigned int gmx_unused ecxval,
161 if (c_architecture == Architecture::X86)
163 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
165 // any compiler that understands gcc inline assembly
171 # if GMX_IS_X86_32 && defined(__PIC__)
172 // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
// ebx is exchanged with operand %1 (bound to *ebx) both before and after the
// instruction, so ebx itself is restored while its output still lands in *ebx.
173 __asm__ __volatile__(
174 "xchgl %%ebx, %1 \n\t"
176 "xchgl %%ebx, %1 \n\t"
177 : "+a"(*eax), "+r"(*ebx), "+c"(*ecx), "+d"(*edx));
179 // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
180 __asm__ __volatile__("cpuid \n\t"
181 : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx));
183 // Not a normal x86, which could happen when a compiler
184 // targeting non-x86 pretends to be GCC.
188 #elif defined _MSC_VER
190 // MSVC (and icc on windows) on ia32 or x86-64
// __cpuidex() fills cpuInfo[0..3], which are copied out as eax, ebx, ecx, edx in that order.
192 __cpuidex(cpuInfo, level, ecxval);
193 *eax = static_cast<unsigned int>(cpuInfo[0]);
194 *ebx = static_cast<unsigned int>(cpuInfo[1]);
195 *ecx = static_cast<unsigned int>(cpuInfo[2]);
196 *edx = static_cast<unsigned int>(cpuInfo[3]);
201 // We are on x86, but without compiler support for cpuid if we get here
208 #endif // check for inline asm on x86
222 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
224 * If support for the cpuid instruction is present, we check for Intel,
225 * AMD or Hygon vendors.
227 * \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd, or
228 * gmx::CpuInfo::Vendor::Hygon. If neither Intel, Amd nor
229 * Hygon can be identified, or if the code fails to execute,
230 * gmx::CpuInfo::Vendor::Unknown is returned.
232 CpuInfo::Vendor detectX86Vendor()
234 unsigned int eax, ebx, ecx, edx;
235 CpuInfo::Vendor v = CpuInfo::Vendor::Unknown;
// Level 0x0 returns the vendor identification in ebx, ecx and edx; each
// register is compared against four packed ASCII characters.
237 if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) == 0)
239 if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
241 v = CpuInfo::Vendor::Intel; // ebx=='uneG', ecx=='letn', edx=='Ieni'
243 else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
245 v = CpuInfo::Vendor::Amd; // ebx=='htuA', ecx=='DMAc', edx=='itne'
247 else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
249 v = CpuInfo::Vendor::Hygon; // ebx=='ogyH', ecx=='eniu', edx=='neGn'
255 /*! \brief Detect second AVX-512 FMA from the processor name
257 * Should only be called for processors already determined to support AVX-512.
259 * \param [in] brand x86 processor name
260 * \param [in] model x86 model
261 * \return True if second FMA present
263 bool detectProcCpuInfoSecondAvx512FMA(const std::string& brand, int model)
// All brand-string tests below are position dependent: std::string::find()
// must return exactly the listed offset for a test to match.
269 if (brand.find("Xeon") == 9)
271 // detect Silver or Bronze or specific models
272 if (brand.find("Silver") == 17 || brand.find("Bronze") == 17
273 || (brand.find('W') == 17 && brand.find('0') == 21) // detect Xeon W 210x
274 || (brand.find('D') == 17 && brand.find("21") == 19)) // detect Xeon D 2xxx
278 // detect Gold 5xxx - can be corrected once Cooper Lake is added
279 else if (brand.find("Gold") == 17 && brand.find('5') == 22)
281 return (brand.find("53") == 22 || // detect Cooper Lake
282 brand.find("22") == 24); // detect 5[12]22
287 // Cannon Lake client
// NOTE(review): models 0x7d and 0x7e are presumably the Ice Lake client parts - confirm.
293 if (model == 0x7d || model == 0x7e)
297 // This is the right default...
301 /*! \brief Simple utility function to set/clear feature in a set
303 * \param featureSet Pointer to the feature set to update
304 * \param feature The specific feature to set/clear
305 * \param registerValue Register value (returned from cpuid)
306 * \param bit Bit to check in registerValue. The feature will be
307 * added to the featureSet if this bit is set.
309 * \note Nothing is done if the bit is not set. In particular, this will not
310 * erase anything if the feature already exists in the set.
312 void setFeatureFromBit(std::set<CpuInfo::Feature>* featureSet,
313 CpuInfo::Feature feature,
314 unsigned int registerValue,
317 if (registerValue & (1 << bit))
319 featureSet->insert(feature);
323 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
325 * \param[out] brand String where to write the x86 brand string
326 * \param[out] family Major version of processor
327 * \param[out] model Middle version of processor
328 * \param[out] stepping Minor version of processor
329 * \param[out] features Feature set where supported features are inserted
331 void detectX86Features(std::string* brand, int* family, int* model, int* stepping, std::set<CpuInfo::Feature>* features)
333 unsigned int eax, ebx, ecx, edx;
335 // Return if we cannot execute any levels
336 if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) != 0)
// eax from level 0x0 is used as the largest supported standard level
340 unsigned int maxStdLevel = eax;
342 if (maxStdLevel >= 0x1)
344 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
// Family is the sum of eax bits 20-27 and bits 8-11. Model combines
// bits 16-19 (as the upper four bits) with bits 4-7. Stepping is bits 0-3.
346 *family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
347 *model = ((eax & 0x000f0000) >> 12) + ((eax & 0x000000f0) >> 4);
348 *stepping = (eax & 0x0000000f);
// Feature flags reported in ecx of level 0x1
350 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse3, ecx, 0);
351 setFeatureFromBit(features, CpuInfo::Feature::X86_Pclmuldq, ecx, 1);
352 setFeatureFromBit(features, CpuInfo::Feature::X86_Ssse3, ecx, 9);
353 setFeatureFromBit(features, CpuInfo::Feature::X86_Fma, ecx, 12);
354 setFeatureFromBit(features, CpuInfo::Feature::X86_Cx16, ecx, 13);
355 setFeatureFromBit(features, CpuInfo::Feature::X86_Pdcm, ecx, 15);
356 setFeatureFromBit(features, CpuInfo::Feature::X86_Pcid, ecx, 17);
357 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_1, ecx, 19);
358 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_2, ecx, 20);
359 setFeatureFromBit(features, CpuInfo::Feature::X86_X2Apic, ecx, 21);
360 setFeatureFromBit(features, CpuInfo::Feature::X86_Popcnt, ecx, 23);
361 setFeatureFromBit(features, CpuInfo::Feature::X86_Tdt, ecx, 24);
362 setFeatureFromBit(features, CpuInfo::Feature::X86_Aes, ecx, 25);
363 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx, ecx, 28);
364 setFeatureFromBit(features, CpuInfo::Feature::X86_F16C, ecx, 29);
365 setFeatureFromBit(features, CpuInfo::Feature::X86_Rdrnd, ecx, 30);
// Feature flags reported in edx of level 0x1
367 setFeatureFromBit(features, CpuInfo::Feature::X86_Pse, edx, 3);
368 setFeatureFromBit(features, CpuInfo::Feature::X86_Msr, edx, 5);
369 setFeatureFromBit(features, CpuInfo::Feature::X86_Cx8, edx, 8);
370 setFeatureFromBit(features, CpuInfo::Feature::X86_Apic, edx, 9);
371 setFeatureFromBit(features, CpuInfo::Feature::X86_Cmov, edx, 15);
372 setFeatureFromBit(features, CpuInfo::Feature::X86_Clfsh, edx, 19);
373 setFeatureFromBit(features, CpuInfo::Feature::X86_Mmx, edx, 23);
374 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse2, edx, 26);
375 setFeatureFromBit(features, CpuInfo::Feature::X86_Htt, edx, 28);
378 // Check whether Hyper-threading is really possible to enable in the hardware,
379 // not just technically supported by this generation of processors
380 if ((features->count(CpuInfo::Feature::X86_Htt) != 0U) && maxStdLevel >= 0x4)
382 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
383 unsigned int maxLogicalCores = (ebx >> 16) & 0x0ff;
384 executeX86CpuID(0x4, 0, &eax, &ebx, &ecx, &edx);
385 unsigned int maxPhysicalCores = ((eax >> 26) & 0x3f) + 1;
// The Htt feature is withdrawn when there are fewer than two logical cores per physical core
386 if (maxLogicalCores / maxPhysicalCores < 2)
388 features->erase(CpuInfo::Feature::X86_Htt);
392 if (executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx) != 0)
394 // No point in continuing if we don't support any extended levels
// eax from level 0x80000000 is used as the largest supported extended level
397 unsigned int maxExtLevel = eax;
399 if (maxExtLevel >= 0x80000001)
401 executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
403 setFeatureFromBit(features, CpuInfo::Feature::X86_Lahf, ecx, 0);
404 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4A, ecx, 6);
405 setFeatureFromBit(features, CpuInfo::Feature::X86_MisalignSse, ecx, 7);
406 setFeatureFromBit(features, CpuInfo::Feature::X86_Xop, ecx, 11);
407 setFeatureFromBit(features, CpuInfo::Feature::X86_Fma4, ecx, 16);
408 setFeatureFromBit(features, CpuInfo::Feature::X86_PDPE1GB, edx, 26);
409 setFeatureFromBit(features, CpuInfo::Feature::X86_Rdtscp, edx, 27);
412 if (maxExtLevel >= 0x80000005)
414 // Get the x86 CPU brand string (3 levels, 16 bytes in each)
// The loop covers levels 0x80000002, 0x80000003 and 0x80000004
416 for (unsigned int level = 0x80000002; level < 0x80000005; level++)
418 executeX86CpuID(level, 0, &eax, &ebx, &ecx, &edx);
419 // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
420 brand->append(reinterpret_cast<const char*>(&eax), sizeof(eax));
421 brand->append(reinterpret_cast<const char*>(&ebx), sizeof(ebx));
422 brand->append(reinterpret_cast<const char*>(&ecx), sizeof(ecx));
423 brand->append(reinterpret_cast<const char*>(&edx), sizeof(edx));
428 if (maxStdLevel >= 0x7)
// Sub-leaf 0 of level 0x7: feature flags are read from ebx
430 executeX86CpuID(0x7, 0, &eax, &ebx, &ecx, &edx);
432 setFeatureFromBit(features, CpuInfo::Feature::X86_Hle, ebx, 4);
433 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx2, ebx, 5);
434 setFeatureFromBit(features, CpuInfo::Feature::X86_Rtm, ebx, 11);
435 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512F, ebx, 16);
436 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512PF, ebx, 26);
437 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512ER, ebx, 27);
438 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512CD, ebx, 28);
439 setFeatureFromBit(features, CpuInfo::Feature::X86_Sha, ebx, 29);
440 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BW, ebx, 30);
441 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512VL, ebx, 31);
// Sub-leaf 1 of level 0x7: AVX-512 BF16 is read from eax bit 5
443 executeX86CpuID(0x7, 0x1, &eax, &ebx, &ecx, &edx);
444 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BF16, eax, 5);
446 if (features->count(CpuInfo::Feature::X86_Avx512F) != 0)
448 // Only checking if the CPU supports AVX-512. There is no CPUID bit for this.
// The decision is made from the brand string and model number instead
449 if (detectProcCpuInfoSecondAvx512FMA(*brand, *model))
451 features->insert(CpuInfo::Feature::X86_Avx512secondFMA);
457 if (maxExtLevel >= 0x80000007)
459 executeX86CpuID(0x80000007, 0, &eax, &ebx, &ecx, &edx);
461 setFeatureFromBit(features, CpuInfo::Feature::X86_NonstopTsc, edx, 8);
466 /*! \brief Return a vector with x86 APIC IDs for all threads
468 * \param haveX2Apic True if the processors supports x2APIC, otherwise vanilla APIC.
470 * \returns A new std::vector of unsigned integer APIC IDs, one for each
471 * logical processor in the system.
473 std::vector<unsigned int> detectX86ApicIDs(bool gmx_unused haveX2Apic)
475 std::vector<unsigned int> apicID;
477 // We cannot just ask for all APIC IDs, but must force execution on each
478 // hardware thread and extract the APIC id there.
479 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
480 unsigned int eax, ebx, ecx, edx;
481 unsigned int nApic = sysconf(_SC_NPROCESSORS_ONLN);
482 cpu_set_t saveCpuSet;
484 sched_getaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
486 for (unsigned int i = 0; i < nApic; i++)
489 sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
492 executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
493 apicID.push_back(edx);
497 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
498 apicID.push_back(ebx >> 24);
502 sched_setaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
503 #elif GMX_NATIVE_WINDOWS
504 unsigned int eax, ebx, ecx, edx;
506 GetSystemInfo(&sysinfo);
507 unsigned int nApic = sysinfo.dwNumberOfProcessors;
508 unsigned int saveAffinity = SetThreadAffinityMask(GetCurrentThread(), 1);
509 for (DWORD_PTR i = 0; i < nApic; i++)
511 SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1) << i));
515 executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
516 apicID.push_back(edx);
520 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
521 apicID.push_back(ebx >> 24);
524 SetThreadAffinityMask(GetCurrentThread(), saveAffinity);
/*! \brief Utility to renumber indices extracted from APIC IDs
 *
 * \param v  Vector with unsigned integer indices, renumbered in place
 *
 * Each element is replaced by the rank of its value among the distinct values
 * present in the vector, so the result uses the contiguous range starting
 * from 0 while preserving the relative order of the values. For example, the
 * vector {0,1,2,8,9,10,8,9,10,0,1,2} will be rewritten to
 * {0,1,2,3,4,5,3,4,5,0,1,2}. Nothing is returned; for a non-empty vector the
 * number of unique elements is the largest resulting value plus one.
 */
void renumberIndex(std::vector<unsigned int>* v)
{
    // Sorted list of the distinct values; the position of a value in this
    // list is its new index.
    std::vector<unsigned int> uniqueSortedV(*v);
    std::sort(uniqueSortedV.begin(), uniqueSortedV.end());
    uniqueSortedV.erase(std::unique(uniqueSortedV.begin(), uniqueSortedV.end()), uniqueSortedV.end());

    for (unsigned int& value : *v)
    {
        // Every value is present in uniqueSortedV, so lower_bound finds it exactly
        const auto rank = std::lower_bound(uniqueSortedV.begin(), uniqueSortedV.end(), value);
        value           = static_cast<unsigned int>(std::distance(uniqueSortedV.begin(), rank));
    }
}
559 /*! \brief The layout of the bits in the APIC ID, counted from the least significant bit */
562 unsigned int hwThreadBits; //!< The number of least significant bits for hw-threads
563 unsigned int coreBits; //!< The number of core bits following the hw-thread bits
566 /*! \brief Detect the APIC ID layout for x2APIC
568 ApicIdLayout detectX2ApicIdLayout()
// Sub-leaf 0 of level 0xb: the low five bits of eax give the number of hw-thread bits
576 executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
577 layout.hwThreadBits = eax & 0x1f;
// Sub-leaf 1 of level 0xb: the low five bits of eax cover hw-thread and core
// bits together, so the hw-thread bits are subtracted to get the core bits
578 executeX86CpuID(0xb, 1, &eax, &ebx, &ecx, &edx);
579 layout.coreBits = (eax & 0x1f) - layout.hwThreadBits;
584 /*! \brief Detect the APIC ID layout for standard APIC or xAPIC on AMD
586 * \param[in] maxExtLevel The largest CPUID extended function input value supported by the processor implementation
588 ApicIdLayout detectAmdApicIdLayout(unsigned int maxExtLevel)
// Family is computed the same way as in detectX86Features(): eax bits 20-27 plus bits 8-11
596 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
597 int family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
// Bit 22 of ecx from level 0x80000001 is used as the extended-topology flag
598 executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
599 bool haveExtendedTopology = (ecx & (1 << 22)) != 0U;
601 // NOTE: Here we assume 1 thread per core, unless we have family >= 17h
602 layout.hwThreadBits = 0;
603 if (family >= 0x17 && haveExtendedTopology && maxExtLevel >= 0x8000001e)
605 executeX86CpuID(0x8000001e, 1, &eax, &ebx, &ecx, &edx);
606 int numThreadsPerCore = ((ebx >> 8) & 0xff) + 1;
607 // NOTE: The AMD documentation only specifies the layout of the APIC ID
608 // when we have 1 or 2 threads per core.
// Use the smallest number of bits that can represent numThreadsPerCore values
609 while (numThreadsPerCore > (1 << layout.hwThreadBits))
611 layout.hwThreadBits++;
615 // Get number of core bits in apic ID - try modern extended method first
616 executeX86CpuID(0x80000008, 0, &eax, &ebx, &ecx, &edx);
617 layout.coreBits = (ecx >> 12) & 0xf;
// A zero in that field means the core bit count has to be derived differently
618 if (layout.coreBits == 0)
620 // Legacy method for old single/dual core AMD CPUs
622 while (i >> layout.coreBits)
631 /*! \brief Try to detect basic CPU topology information using x86 cpuid
633 * If x2APIC support is present, this is our first choice, otherwise we
634 * attempt to use old vanilla APIC.
636 * \return A new vector of entries with socket, core, hwthread information
637 * for each logical processor. The vector is empty when the topology
638 * could not be determined consistently.
639 std::vector<CpuInfo::LogicalProcessor> detectX86LogicalProcessors()
645 unsigned int maxStdLevel;
646 unsigned int maxExtLevel;
650 std::vector<CpuInfo::LogicalProcessor> logicalProcessors;
652 // Find largest standard & extended level input values allowed
653 executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx);
655 executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx);
658 if (maxStdLevel >= 0x1)
660 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
// x2APIC additionally requires standard level 0xb, and plain APIC requires
// extended level 0x80000008, since those levels are queried for the layout
661 haveX2Apic = ((ecx & (1 << 21)) != 0U) && maxStdLevel >= 0xb;
662 haveApic = ((edx & (1 << 9)) != 0U) && maxExtLevel >= 0x80000008;
670 if (haveX2Apic || haveApic)
673 // Get bits for cores and hardware threads
676 layout = detectX2ApicIdLayout();
// NOTE(review): detectX86Vendor() executes cpuid each time it is called and
// is evaluated up to twice here; the result could be stored in a local.
680 if (detectX86Vendor() == CpuInfo::Vendor::Amd || detectX86Vendor() == CpuInfo::Vendor::Hygon)
682 layout = detectAmdApicIdLayout(maxExtLevel);
684 if (layout.hwThreadBits > 1)
686 // At the time of writing this code we do not know what
687 // to do with more than 2 threads, so return empty.
688 return logicalProcessors;
693 // We do not know the APIC ID layout, return empty.
694 return logicalProcessors;
698 std::vector<unsigned int> apicID = detectX86ApicIDs(haveX2Apic);
702 // APIC IDs can be buggy, and it is always a mess. Typically more bits are
703 // reserved than needed, and the numbers might not increment by 1 even in
704 // a single socket or core. Extract, renumber, and check that things make sense.
705 unsigned int hwThreadMask = (1 << layout.hwThreadBits) - 1;
706 unsigned int coreMask = (1 << layout.coreBits) - 1;
707 std::vector<unsigned int> hwThreadRanks;
708 std::vector<unsigned int> coreRanks;
709 std::vector<unsigned int> socketRanks;
// Split each APIC ID into hw-thread (lowest bits), core (next bits) and socket (remaining bits)
711 for (auto a : apicID)
713 hwThreadRanks.push_back(static_cast<int>(a & hwThreadMask));
714 coreRanks.push_back(static_cast<int>((a >> layout.hwThreadBits) & coreMask));
715 socketRanks.push_back(static_cast<int>(a >> (layout.coreBits + layout.hwThreadBits)));
718 renumberIndex(&hwThreadRanks);
719 renumberIndex(&coreRanks);
720 renumberIndex(&socketRanks);
// After renumbering the values start at 0, so the largest value plus one is the count
722 unsigned int hwThreadRankSize =
723 1 + *std::max_element(hwThreadRanks.begin(), hwThreadRanks.end());
724 unsigned int coreRankSize = 1 + *std::max_element(coreRanks.begin(), coreRanks.end());
725 unsigned int socketRankSize = 1 + *std::max_element(socketRanks.begin(), socketRanks.end());
// The topology is only accepted when sockets x cores x hw-threads accounts for every APIC ID
727 if (socketRankSize * coreRankSize * hwThreadRankSize == apicID.size())
729 // Alright, everything looks consistent, so put it in the result
730 for (std::size_t i = 0; i < apicID.size(); i++)
732 // While the internal APIC IDs are always unsigned integers, we also cast to
733 // plain integers for the externally exposed vectors, since that will make
734 // it possible to use '-1' for invalid entries in the future.
735 logicalProcessors.push_back(
736 { int(socketRanks[i]), int(coreRanks[i]), int(hwThreadRanks[i]) });
741 return logicalProcessors; // Will only have contents if everything worked
745 /******************************************************************************
747 * Generic Linux detection by parsing /proc/cpuinfo *
749 ******************************************************************************/
751 /*! \brief Parse /proc/cpuinfo into a simple string map
753 * This routine will read the contents of /proc/cpuinfo, and for each
754 * line that is not empty we will assign the (trimmed) string to the left of
755 * the colon as a key, and the right-hand side as the value in the map.
756 * For multi-processor systems where lines are repeated the latter lines will
757 * overwrite the first occurrence.
759 * \return New map with the contents. If the file is not available, the returned map will be empty.
762 std::map<std::string, std::string> parseProcCpuInfo()
764 std::ifstream procCpuInfo("/proc/cpuinfo");
766 std::map<std::string, std::string> cpuInfo;
// The file is processed one line at a time
768 while (std::getline(procCpuInfo, line))
772 std::stringstream iss(line);
775 std::getline(iss, key, ':'); // part before colon
776 std::getline(iss, val); // part after colon
779 // put it in the map. This will overwrite previous processors, but we don't care.
787 /*! \brief Try to detect vendor from /proc/cpuinfo
789 * \param cpuInfo Map returned from parseProcCpuinfo()
791 * This routine tries to match a few common labels in /proc/cpuinfo to see if
792 * they begin with the name of a standard vendor. If the file cannot be read
793 * or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
795 CpuInfo::Vendor detectProcCpuInfoVendor(const std::map<std::string, std::string>& cpuInfo)
797 const std::map<std::string, CpuInfo::Vendor> testVendors = {
798 { "GenuineIntel", CpuInfo::Vendor::Intel },
799 { "Intel", CpuInfo::Vendor::Intel },
800 { "AuthenticAmd", CpuInfo::Vendor::Amd },
801 { "AMD", CpuInfo::Vendor::Amd },
802 { "ARM", CpuInfo::Vendor::Arm },
803 { "AArch64", CpuInfo::Vendor::Arm },
804 { "Fujitsu", CpuInfo::Vendor::Fujitsu },
805 { "IBM", CpuInfo::Vendor::Ibm },
806 { "POWER", CpuInfo::Vendor::Ibm },
807 { "Oracle", CpuInfo::Vendor::Oracle },
808 { "HygonGenuine", CpuInfo::Vendor::Hygon },
809 { "Hygon", CpuInfo::Vendor::Hygon },
812 // For each label in /proc/cpuinfo, compare the value to the name in the
813 // testNames map above, and if it's a match return the vendor.
814 for (const auto& l : { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
816 if (cpuInfo.count(l) != 0U)
818 // there was a line with this left-hand side in /proc/cpuinfo
819 const std::string& s1 = cpuInfo.at(l);
821 for (const auto& t : testVendors)
823 const std::string& s2 = t.first;
825 // If the entire name we are testing (s2) matches the first part of
826 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
827 if (std::equal(s2.begin(), s2.end(), s1.begin(), [](const char& x, const char& y) -> bool {
828 return tolower(x) == tolower(y);
836 return CpuInfo::Vendor::Unknown;
840 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
842 * \param cpuInfo Map returned from parseProcCpuinfo()
843 * \param[out] brand String where to write the brand string
844 * \param[out] features Feature set where supported features are inserted
846 * This routine tries to match a few common labels in /proc/cpuinfo to see if
847 * we can find the processor name and features. It is likely fragile.
849 void detectProcCpuInfoIbm(const std::map<std::string, std::string>& cpuInfo,
851 std::set<CpuInfo::Feature>* features)
853 // Get brand string from 'cpu' label if present, otherwise 'Processor'
854 if (cpuInfo.count("cpu") != 0U)
856 *brand = cpuInfo.at("cpu");
858 else if (cpuInfo.count("Processor") != 0U)
860 *brand = cpuInfo.at("Processor");
863 if (brand->find("A2") != std::string::npos)
865 // If the processor identification contains "A2", this is BlueGene/Q with QPX
866 features->insert(CpuInfo::Feature::Ibm_Qpx);
// Search the lower-cased value of each of these labels for feature keywords
869 for (const auto& l : { "model name", "model", "Processor", "cpu" })
871 if (cpuInfo.count(l) != 0U)
873 std::string s1 = cpuInfo.at(l);
874 std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
876 if (s1.find("altivec") != std::string::npos)
878 features->insert(CpuInfo::Feature::Ibm_Vmx);
879 // If this is a power6, we only have VMX. All later processors have VSX.
880 if (s1.find("power6") == std::string::npos)
882 features->insert(CpuInfo::Feature::Ibm_Vsx);
890 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
892 * \param cpuInfo Map returned from parseProcCpuinfo()
893 * \param[out] brand String where to write the brand string
894 * \param[out] family Major version of processor
895 * \param[out] model Middle version of processor
896 * \param[out] stepping Minor version of processor
897 * \param[out] features Feature set where supported features are inserted
899 * This routine tries to match a few common labels in /proc/cpuinfo to see if
900 * we can find the processor name and features. It is likely fragile.
902 void detectProcCpuInfoArm(const std::map<std::string, std::string>& cpuInfo,
907 std::set<CpuInfo::Feature>* features)
// The brand string is taken from 'Processor' if present, otherwise from 'model name'
909 if (cpuInfo.count("Processor") != 0U)
911 *brand = cpuInfo.at("Processor");
913 else if (cpuInfo.count("model name") != 0U)
915 *brand = cpuInfo.at("model name");
918 if (cpuInfo.count("CPU architecture") != 0U)
// Parsed as a decimal number; strtol() returns 0 when no number can be parsed
920 *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
921 // For some 64-bit CPUs it appears to say 'AArch64' instead
922 if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
924 *family = 8; // fragile - no idea how a future ARMv9 will be represented in this case
927 if (cpuInfo.count("CPU variant") != 0U)
// Unlike the other fields, the variant is parsed as a hexadecimal number
929 *model = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
931 if (cpuInfo.count("CPU revision") != 0U)
933 *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
936 if (cpuInfo.count("Features") != 0U)
// Features are detected by substring search in the 'Features' value
938 const std::string& s = cpuInfo.at("Features");
939 if (s.find("neon") != std::string::npos)
941 features->insert(CpuInfo::Feature::Arm_Neon);
943 if (s.find("asimd") != std::string::npos)
945 // At least Jetson TX1 runs a 32-bit environment by default, although
946 // the kernel is 64-bits, and reports asimd feature flags. We cannot
947 // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
948 if (sizeof(void*) == 8)
950 features->insert(CpuInfo::Feature::Arm_NeonAsimd);
953 if (s.find("sve") != std::string::npos)
955 features->insert(CpuInfo::Feature::Arm_Sve);
961 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
963 * \param[in,out] vendor Hardware vendor; only detected here when it is Unknown on entry
964 * \param[out] brand String where to write the brand string
965 * \param[out] family Major version of processor
966 * \param[out] model Middle version of processor
967 * \param[out] stepping Minor version of processor
968 * \param[out] features Feature set where supported features are inserted
970 * This routine reads the /proc/cpuinfo file into a map and calls subroutines
971 * that attempt to parse by matching keys and values to known strings. It is
972 * much more fragile than our x86 detection, but it does not depend on
973 * specific system calls, intrinsics or assembly instructions.
975 void detectProcCpuInfo(CpuInfo::Vendor* vendor,
980 std::set<CpuInfo::Feature>* features)
982 std::map<std::string, std::string> cpuInfo = parseProcCpuInfo();
// A vendor that the caller already determined is not overwritten
984 if (*vendor == CpuInfo::Vendor::Unknown)
986 *vendor = detectProcCpuInfoVendor(cpuInfo);
989 // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
990 // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
991 // To handle this slightly better we use one subroutine per vendor.
994 case CpuInfo::Vendor::Ibm: detectProcCpuInfoIbm(cpuInfo, brand, features); break;
996 case CpuInfo::Vendor::Arm:
997 detectProcCpuInfoArm(cpuInfo, brand, family, model, stepping, features);
1001 // We only have a single check for Fujitsu for now
1003 features->insert(CpuInfo::Feature::Fujitsu_HpcAce);
1013 CpuInfo CpuInfo::detect()
1017 if (c_architecture == Architecture::X86)
1019 result.vendor_ = detectX86Vendor();
1021 if (result.vendor_ == CpuInfo::Vendor::Intel)
1023 result.features_.insert(CpuInfo::Feature::X86_Intel);
1025 else if (result.vendor_ == CpuInfo::Vendor::Amd)
1027 result.features_.insert(CpuInfo::Feature::X86_Amd);
1029 else if (result.vendor_ == CpuInfo::Vendor::Hygon)
1031 result.features_.insert(CpuInfo::Feature::X86_Hygon);
1034 &result.brandString_, &result.family_, &result.model_, &result.stepping_, &result.features_);
1035 result.logicalProcessors_ = detectX86LogicalProcessors();
1040 if (c_architecture == Architecture::Arm)
1042 result.vendor_ = CpuInfo::Vendor::Arm;
1044 else if (c_architecture == Architecture::PowerPC)
1046 result.vendor_ = CpuInfo::Vendor::Ibm;
1049 #if defined __aarch64__ || (defined _M_ARM && _M_ARM >= 8)
1050 result.features_.insert(Feature::Arm_Neon); // ARMv8 always has Neon
1051 result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
1053 #if defined __arch64__ && defined __ARM_FEATURE_SVE
1054 result.features_.insert(Feature::Arm_Sve);
1058 result.vendor_ = CpuInfo::Vendor::Oracle;
1061 // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
1062 // is set to a known value this routine will not overwrite it.
1063 detectProcCpuInfo(&result.vendor_,
1064 &result.brandString_,
1071 if (!result.logicalProcessors_.empty())
1073 result.supportLevel_ = CpuInfo::SupportLevel::LogicalProcessorInfo;
1075 else if (!result.features_.empty())
1077 result.supportLevel_ = CpuInfo::SupportLevel::Features;
1079 else if (result.vendor_ != CpuInfo::Vendor::Unknown
1080 || result.brandString_ != "Unknown CPU brand")
1082 result.supportLevel_ = CpuInfo::SupportLevel::Name;
1086 result.supportLevel_ = CpuInfo::SupportLevel::None;
// Default state: unknown vendor, placeholder brand string and zero
// family/model/stepping. detect() compares brandString_ against the same
// "Unknown CPU brand" literal when deciding the support level.
1092 CpuInfo::CpuInfo() :
1093 vendor_(CpuInfo::Vendor::Unknown), brandString_("Unknown CPU brand"), family_(0), model_(0), stepping_(0)
1097 const std::string& CpuInfo::vendorString() const
1099 static const std::map<Vendor, std::string> vendorStrings = {
1100 { Vendor::Unknown, "Unknown vendor" }, { Vendor::Intel, "Intel" }, { Vendor::Amd, "AMD" },
1101 { Vendor::Fujitsu, "Fujitsu" }, { Vendor::Ibm, "IBM" }, { Vendor::Arm, "ARM" },
1102 { Vendor::Oracle, "Oracle" }, { Vendor::Hygon, "Hygon" },
1105 return vendorStrings.at(vendor_);
1109 const std::string& CpuInfo::featureString(Feature f)
1111 static const std::map<Feature, std::string> featureStrings = {
1112 { Feature::X86_Aes, "aes" },
1113 { Feature::X86_Amd, "amd" },
1114 { Feature::X86_Apic, "apic" },
1115 { Feature::X86_Avx, "avx" },
1116 { Feature::X86_Avx2, "avx2" },
1117 { Feature::X86_Avx512F, "avx512f" },
1118 { Feature::X86_Avx512PF, "avx512pf" },
1119 { Feature::X86_Avx512ER, "avx512er" },
1120 { Feature::X86_Avx512CD, "avx512cd" },
1121 { Feature::X86_Avx512BW, "avx512bw" },
1122 { Feature::X86_Avx512VL, "avx512vl" },
1123 { Feature::X86_Avx512BF16, "avx512bf16" },
1124 { Feature::X86_Avx512secondFMA, "avx512secondFMA" },
1125 { Feature::X86_Clfsh, "clfsh" },
1126 { Feature::X86_Cmov, "cmov" },
1127 { Feature::X86_Cx8, "cx8" },
1128 { Feature::X86_Cx16, "cx16" },
1129 { Feature::X86_F16C, "f16c" },
1130 { Feature::X86_Fma, "fma" },
1131 { Feature::X86_Fma4, "fma4" },
1132 { Feature::X86_Hle, "hle" },
1133 { Feature::X86_Htt, "htt" },
1134 { Feature::X86_Intel, "intel" },
1135 { Feature::X86_Lahf, "lahf" },
1136 { Feature::X86_MisalignSse, "misalignsse" },
1137 { Feature::X86_Mmx, "mmx" },
1138 { Feature::X86_Msr, "msr" },
1139 { Feature::X86_NonstopTsc, "nonstop_tsc" },
1140 { Feature::X86_Pcid, "pcid" },
1141 { Feature::X86_Pclmuldq, "pclmuldq" },
1142 { Feature::X86_Pdcm, "pdcm" },
1143 { Feature::X86_PDPE1GB, "pdpe1gb" },
1144 { Feature::X86_Popcnt, "popcnt" },
1145 { Feature::X86_Pse, "pse" },
1146 { Feature::X86_Rdrnd, "rdrnd" },
1147 { Feature::X86_Rdtscp, "rdtscp" },
1148 { Feature::X86_Rtm, "rtm" },
1149 { Feature::X86_Sha, "sha" },
1150 { Feature::X86_Sse2, "sse2" },
1151 { Feature::X86_Sse3, "sse3" },
1152 { Feature::X86_Sse4A, "sse4a" },
1153 { Feature::X86_Sse4_1, "sse4.1" },
1154 { Feature::X86_Sse4_2, "sse4.2" },
1155 { Feature::X86_Ssse3, "ssse3" },
1156 { Feature::X86_Tdt, "tdt" },
1157 { Feature::X86_X2Apic, "x2apic" },
1158 { Feature::X86_Xop, "xop" },
1159 { Feature::Arm_Neon, "neon" },
1160 { Feature::Arm_NeonAsimd, "neon_asimd" },
1161 { Feature::Arm_Sve, "sve" },
1162 { Feature::Ibm_Qpx, "qpx" },
1163 { Feature::Ibm_Vmx, "vmx" },
1164 { Feature::Ibm_Vsx, "vsx" },
1165 { Feature::Fujitsu_HpcAce, "hpc-ace" },
1166 { Feature::X86_Hygon, "hygon" }
1168 return featureStrings.at(f);
1172 bool cpuIsX86Nehalem(const CpuInfo& cpuInfo)
1174 return (cpuInfo.vendor() == CpuInfo::Vendor::Intel && cpuInfo.family() == 6
1175 && (cpuInfo.model() == 0x2E || cpuInfo.model() == 0x1A || cpuInfo.model() == 0x1E
1176 || cpuInfo.model() == 0x2F || cpuInfo.model() == 0x2C || cpuInfo.model() == 0x25));
1179 bool cpuIsAmdZen1(const CpuInfo& cpuInfo)
1181 /* Both Zen/Zen+/Zen2 have family==23
1182 * Model numbers for Zen:
1183 * 1) Naples, Whitehaven, Summit Ridge, and Snowy Owl;
1185 * Model numbers for Zen+:
1186 * 8) Pinnacle Ridge;
1188 * Hygon got license for Zen1, but not Zen2 (https://www.tomshardware.com/news/amd-zen-china-x86-ip-license,39573.html)
1190 return (cpuInfo.vendor() == CpuInfo::Vendor::Amd && cpuInfo.family() == 23
1191 && (cpuInfo.model() == 1 || cpuInfo.model() == 17 || cpuInfo.model() == 8
1192 || cpuInfo.model() == 24))
1193 || (cpuInfo.vendor() == CpuInfo::Vendor::Hygon);
1198 #ifdef GMX_CPUINFO_STANDALONE
1199 int main(int argc, char** argv)
1204 "Usage:\n\n%s [flags]\n\n"
1205 "Available flags:\n"
1206 "-vendor Print CPU vendor.\n"
1207 "-brand Print CPU brand string.\n"
1208 "-family Print CPU family version.\n"
1209 "-model Print CPU model version.\n"
1210 "-stepping Print CPU stepping version.\n"
1211 "-features Print CPU feature flags.\n",
1216 std::string arg(argv[1]);
1217 gmx::CpuInfo cpuInfo(gmx::CpuInfo::detect());
1219 if (arg == "-vendor")
1221 printf("%s\n", cpuInfo.vendorString().c_str());
1223 else if (arg == "-brand")
1225 printf("%s\n", cpuInfo.brandString().c_str());
1227 else if (arg == "-family")
1229 printf("%d\n", cpuInfo.family());
1231 else if (arg == "-model")
1233 printf("%d\n", cpuInfo.model());
1235 else if (arg == "-stepping")
1237 printf("%d\n", cpuInfo.stepping());
1239 else if (arg == "-features")
1241 // Separate the feature strings with spaces. Note that in the
1242 // GROMACS cmake code, surrounding whitespace is first
1243 // stripped by the CPU detection routine, and then added back
1244 // in the code for making the SIMD suggestion.
1245 for (auto& f : cpuInfo.featureSet())
1247 printf("%s ", cpuInfo.featureString(f).c_str());
1251 else if (arg == "-topology")
1253 // Undocumented debug option, usually not present in standalone version
1254 for (auto& t : cpuInfo.logicalProcessors())
1256 printf("%3u %3u %3u\n", t.socketRankInMachine, t.coreRankInSocket, t.hwThreadRankInCore);