2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
39 * Implements gmx::CpuInfo.
41 * We need to be able to compile this file in stand-alone mode to use basic
42 * CPU feature detection to set the SIMD acceleration and similar things in
43 * CMake, while we still want to use more features that enable topology
44 * detection when config.h is present.
46 * We solve this by skipping the advanced stuff when the preprocessor
47 * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
48 * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
49 * support it is not possible to perform the actual detection on Linux/Mac.
50 * These macros are specific to this file, but they still carry the GMX prefix.
52 * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
53 * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
54 * x86, and for this we rely on including config.h.
56 * \author Erik Lindahl <erik.lindahl@gmail.com>
57 * \ingroup module_hardware
60 #ifndef GMX_CPUINFO_STANDALONE
66 #ifndef GMX_CPUINFO_STANDALONE
69 # define GMX_NATIVE_WINDOWS 0
73 # include <intrin.h> // __cpuid()
76 #if GMX_NATIVE_WINDOWS
77 # include <windows.h> // sysinfo(), necessary for topology stuff
81 # include <sched.h> // sched_getaffinity(), sched_setaffinity()
84 # include <unistd.h> // sysconf()
88 #include <cstdint> // uint32_t in X86 processor name code
98 #ifdef GMX_CPUINFO_STANDALONE
101 # include "gromacs/utility/basedefinitions.h"
104 #include "architecture.h"
112 /*! \cond internal */
114 /******************************************************************************
116 * Utility functions to make this file independent of the GROMACS library *
118 ******************************************************************************/
/*! \brief Remove initial and trailing whitespace from string
 *
 * \param s  Pointer to string where whitespace will be removed
 *
 * The string is modified in place. A string that consists only of
 * whitespace (or is already empty) ends up empty.
 */
void trimString(std::string* s)
{
    // std::isspace() is undefined for negative values other than EOF, so each
    // character is converted to unsigned char before it is classified.
    const auto isNotSpace = [](unsigned char c) -> bool { return std::isspace(c) == 0; };

    // trailing
    s->erase(std::find_if(s->rbegin(), s->rend(), isNotSpace).base(), s->end());
    // heading
    s->erase(s->begin(), std::find_if(s->begin(), s->end(), isNotSpace));
}
137 /******************************************************************************
139 * x86 detection functions *
141 ******************************************************************************/
143 /*! \brief execute x86 cpuid instructions with custom level and extended level
145 * \param level The main cpuid level (input argument for eax register)
146 * \param ecxval Extended level (input argument for ecx register)
147 * \param eax Output in eax register
148 * \param ebx Output in ebx register
149 * \param ecx Output in ecx register
150 * \param edx Output in edx register
152 * \return 0 on success, or non-zero if the instruction could not execute.
154 int executeX86CpuID(unsigned int gmx_unused level,
155 unsigned int gmx_unused ecxval,
161 if (c_architecture == Architecture::X86)
163 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
165 // any compiler that understands gcc inline assembly
171 # if GMX_IS_X86_32 && defined(__PIC__)
172 // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
173 __asm__ __volatile__(
174 "xchgl %%ebx, %1 \n\t"
176 "xchgl %%ebx, %1 \n\t"
177 : "+a"(*eax), "+r"(*ebx), "+c"(*ecx), "+d"(*edx));
179 // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
180 __asm__ __volatile__("cpuid \n\t"
181 : "+a"(*eax), "+b"(*ebx), "+c"(*ecx), "+d"(*edx));
183 // Not a normal x86, which could happen when a compiler
184 // targeting non-x86 pretends to be GCC.
188 #elif defined _MSC_VER
190 // MSVC (and icc on windows) on ia32 or x86-64
192 __cpuidex(cpuInfo, level, ecxval);
193 *eax = static_cast<unsigned int>(cpuInfo[0]);
194 *ebx = static_cast<unsigned int>(cpuInfo[1]);
195 *ecx = static_cast<unsigned int>(cpuInfo[2]);
196 *edx = static_cast<unsigned int>(cpuInfo[3]);
201 // We are on x86, but without compiler support for cpuid if we get here
208 #endif // check for inline asm on x86
222 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
224 * If support for the cpuid instruction is present, we check for Intel,
225 * AMD or Hygon vendors
227 * \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd,
228 * gmx::CpuInfl::Vendor::Hygon, . If neither Intel, Amd nor
229 * Hygon can be identified, or if the code fails to execute,
230 * gmx::CpuInfo::Vendor::Unknown is returned.
232 CpuInfo::Vendor detectX86Vendor()
234 unsigned int eax, ebx, ecx, edx;
235 CpuInfo::Vendor v = CpuInfo::Vendor::Unknown;
237 if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) == 0)
239 if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
241 v = CpuInfo::Vendor::Intel; // ebx=='uneG', ecx=='letn', edx=='Ieni'
243 else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
245 v = CpuInfo::Vendor::Amd; // ebx=='htuA', ecx=='DMAc', edx=='itne'
247 else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
249 v = CpuInfo::Vendor::Hygon; // ebx=='ogyH', ecx=='eniu', edx=='neGn'
255 /*! \brief Detect second AVX-512 FMA from the processor name
257 * Should only be called for processors already determined to support AVX-512.
259 * \param [in] brand x86 processor name
260 * \param [in] model x86 model
261 * \return True if second FMA present
263 bool detectProcCpuInfoSecondAvx512FMA(const std::string& brand, int model)
269 if (brand.find("Xeon") == 9)
271 // detect Silver or Bronze or specific models
272 if (brand.find("Silver") == 17 || brand.find("Bronze") == 17
273 || (brand.find('W') == 17 && brand.find('0') == 21) // detect Xeon W 210x
274 || (brand.find('D') == 17 && brand.find("21") == 19)) // detect Xeon D 2xxx
278 // detect Gold 5xxx - can be corrected once Cooper Lake is added
279 else if (brand.find("Gold") == 17 && brand.find('5') == 22)
281 return (brand.find("53") == 22 || // detect Cooper Lake
282 brand.find("22") == 24); // detect 5[12]22
287 // Cannon Lake client
293 if (model == 0x7d || model == 0x7e)
297 // This is the right default...
301 /*! \brief Simple utility function to set/clear feature in a set
303 * \param featureSet Pointer to the feature set to update
304 * \param feature The specific feature to set/clear
305 * \param registerValue Register value (returned from cpuid)
306 * \param bit Bit to check in registerValue. The feature will be
307 * added to the featureSet if this bit is set.
309 * \note Nothing is done if the bit is not set. In particular, this will not
310 * erase anything if the feature already exists in the set.
312 void setFeatureFromBit(std::set<CpuInfo::Feature>* featureSet,
313 CpuInfo::Feature feature,
314 unsigned int registerValue,
317 if (registerValue & (1 << bit))
319 featureSet->insert(feature);
323 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
325 * \param[out] brand String where to write the x86 brand string
326 * \param[out] family Major version of processor
327 * \param[out] model Middle version of processor
328 * \param[out] stepping Minor version of processor
329 * \param[out] features Feature set where supported features are inserted
331 void detectX86Features(std::string* brand, int* family, int* model, int* stepping, std::set<CpuInfo::Feature>* features)
333 unsigned int eax, ebx, ecx, edx;
335 // Return if we cannot execute any levels
336 if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) != 0)
340 unsigned int maxStdLevel = eax;
342 if (maxStdLevel >= 0x1)
344 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
346 *family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
347 *model = ((eax & 0x000f0000) >> 12) + ((eax & 0x000000f0) >> 4);
348 *stepping = (eax & 0x0000000f);
350 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse3, ecx, 0);
351 setFeatureFromBit(features, CpuInfo::Feature::X86_Pclmuldq, ecx, 1);
352 setFeatureFromBit(features, CpuInfo::Feature::X86_Ssse3, ecx, 9);
353 setFeatureFromBit(features, CpuInfo::Feature::X86_Fma, ecx, 12);
354 setFeatureFromBit(features, CpuInfo::Feature::X86_Cx16, ecx, 13);
355 setFeatureFromBit(features, CpuInfo::Feature::X86_Pdcm, ecx, 15);
356 setFeatureFromBit(features, CpuInfo::Feature::X86_Pcid, ecx, 17);
357 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_1, ecx, 19);
358 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_2, ecx, 20);
359 setFeatureFromBit(features, CpuInfo::Feature::X86_X2Apic, ecx, 21);
360 setFeatureFromBit(features, CpuInfo::Feature::X86_Popcnt, ecx, 23);
361 setFeatureFromBit(features, CpuInfo::Feature::X86_Tdt, ecx, 24);
362 setFeatureFromBit(features, CpuInfo::Feature::X86_Aes, ecx, 25);
363 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx, ecx, 28);
364 setFeatureFromBit(features, CpuInfo::Feature::X86_F16C, ecx, 29);
365 setFeatureFromBit(features, CpuInfo::Feature::X86_Rdrnd, ecx, 30);
367 setFeatureFromBit(features, CpuInfo::Feature::X86_Pse, edx, 3);
368 setFeatureFromBit(features, CpuInfo::Feature::X86_Msr, edx, 5);
369 setFeatureFromBit(features, CpuInfo::Feature::X86_Cx8, edx, 8);
370 setFeatureFromBit(features, CpuInfo::Feature::X86_Apic, edx, 9);
371 setFeatureFromBit(features, CpuInfo::Feature::X86_Cmov, edx, 15);
372 setFeatureFromBit(features, CpuInfo::Feature::X86_Clfsh, edx, 19);
373 setFeatureFromBit(features, CpuInfo::Feature::X86_Mmx, edx, 23);
374 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse2, edx, 26);
375 setFeatureFromBit(features, CpuInfo::Feature::X86_Htt, edx, 28);
378 // Check whether Hyper-threading is really possible to enable in the hardware,
379 // not just technically supported by this generation of processors
380 if ((features->count(CpuInfo::Feature::X86_Htt) != 0U) && maxStdLevel >= 0x4)
382 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
383 unsigned int maxLogicalCores = (ebx >> 16) & 0x0ff;
384 executeX86CpuID(0x4, 0, &eax, &ebx, &ecx, &edx);
385 unsigned int maxPhysicalCores = ((eax >> 26) & 0x3f) + 1;
386 if (maxLogicalCores / maxPhysicalCores < 2)
388 features->erase(CpuInfo::Feature::X86_Htt);
392 if (executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx) != 0)
394 // No point in continuing if we don't support any extended levels
397 unsigned int maxExtLevel = eax;
399 if (maxExtLevel >= 0x80000001)
401 executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
403 setFeatureFromBit(features, CpuInfo::Feature::X86_Lahf, ecx, 0);
404 setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4A, ecx, 6);
405 setFeatureFromBit(features, CpuInfo::Feature::X86_MisalignSse, ecx, 7);
406 setFeatureFromBit(features, CpuInfo::Feature::X86_Xop, ecx, 11);
407 setFeatureFromBit(features, CpuInfo::Feature::X86_Fma4, ecx, 16);
408 setFeatureFromBit(features, CpuInfo::Feature::X86_PDPE1GB, edx, 26);
409 setFeatureFromBit(features, CpuInfo::Feature::X86_Rdtscp, edx, 27);
412 if (maxExtLevel >= 0x80000005)
414 // Get the x86 CPU brand string (3 levels, 16 bytes in each)
416 for (unsigned int level = 0x80000002; level < 0x80000005; level++)
418 executeX86CpuID(level, 0, &eax, &ebx, &ecx, &edx);
419 // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
420 brand->append(reinterpret_cast<const char*>(&eax), sizeof(eax));
421 brand->append(reinterpret_cast<const char*>(&ebx), sizeof(ebx));
422 brand->append(reinterpret_cast<const char*>(&ecx), sizeof(ecx));
423 brand->append(reinterpret_cast<const char*>(&edx), sizeof(edx));
428 if (maxStdLevel >= 0x7)
430 executeX86CpuID(0x7, 0, &eax, &ebx, &ecx, &edx);
432 setFeatureFromBit(features, CpuInfo::Feature::X86_Hle, ebx, 4);
433 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx2, ebx, 5);
434 setFeatureFromBit(features, CpuInfo::Feature::X86_Rtm, ebx, 11);
435 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512F, ebx, 16);
436 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512PF, ebx, 26);
437 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512ER, ebx, 27);
438 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512CD, ebx, 28);
439 setFeatureFromBit(features, CpuInfo::Feature::X86_Sha, ebx, 29);
440 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BW, ebx, 30);
441 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512VL, ebx, 31);
443 executeX86CpuID(0x7, 0x1, &eax, &ebx, &ecx, &edx);
444 setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BF16, eax, 5);
446 if (features->count(CpuInfo::Feature::X86_Avx512F) != 0)
448 // Only checking if the CPU supports AVX-512. There is no CPUID bit for this.
449 if (detectProcCpuInfoSecondAvx512FMA(*brand, *model))
451 features->insert(CpuInfo::Feature::X86_Avx512secondFMA);
457 if (maxExtLevel >= 0x80000007)
459 executeX86CpuID(0x80000007, 0, &eax, &ebx, &ecx, &edx);
461 setFeatureFromBit(features, CpuInfo::Feature::X86_NonstopTsc, edx, 8);
466 /*! \brief Return a vector with x86 APIC IDs for all threads
468 * \param haveX2Apic True if the processors supports x2APIC, otherwise vanilla APIC.
470 * \returns A new std::vector of unsigned integer APIC IDs, one for each
471 * logical processor in the system.
473 std::vector<unsigned int> detectX86ApicIDs(bool gmx_unused haveX2Apic)
475 std::vector<unsigned int> apicID;
477 // We cannot just ask for all APIC IDs, but must force execution on each
478 // hardware thread and extract the APIC id there.
479 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
480 unsigned int eax, ebx, ecx, edx;
481 unsigned int nApic = sysconf(_SC_NPROCESSORS_ONLN);
482 cpu_set_t saveCpuSet;
484 sched_getaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
486 for (unsigned int i = 0; i < nApic; i++)
489 sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
492 executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
493 apicID.push_back(edx);
497 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
498 apicID.push_back(ebx >> 24);
502 sched_setaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
503 #elif GMX_NATIVE_WINDOWS
504 unsigned int eax, ebx, ecx, edx;
506 GetSystemInfo(&sysinfo);
507 unsigned int nApic = sysinfo.dwNumberOfProcessors;
508 unsigned int saveAffinity = SetThreadAffinityMask(GetCurrentThread(), 1);
509 for (DWORD_PTR i = 0; i < nApic; i++)
511 SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1) << i));
515 executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
516 apicID.push_back(edx);
520 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
521 apicID.push_back(ebx >> 24);
524 SetThreadAffinityMask(GetCurrentThread(), saveAffinity);
/*! \brief Utility to renumber indices extracted from APIC IDs
 *
 * \param v  Vector with unsigned integer indices; it is rewritten in place
 *
 *  Every element is replaced by the rank of its value among the distinct
 *  values present in the vector, counting from 0. For example, the vector
 *  {0,1,2,8,9,10,8,9,10,0,1,2} will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}.
 *  Nothing is returned; for a non-empty vector the number of unique elements
 *  is one more than the largest entry after the call (6 in the example).
 */
void renumberIndex(std::vector<unsigned int>* v)
{
    // Sorted list of the distinct values; the position of a value in this
    // list is the new index it gets.
    std::vector<unsigned int> uniqueSortedV(*v);
    std::sort(uniqueSortedV.begin(), uniqueSortedV.end());
    uniqueSortedV.erase(std::unique(uniqueSortedV.begin(), uniqueSortedV.end()), uniqueSortedV.end());

    for (unsigned int& value : *v)
    {
        // Every value of v is present in uniqueSortedV, so lower_bound finds it exactly
        const auto position = std::lower_bound(uniqueSortedV.begin(), uniqueSortedV.end(), value);
        value = static_cast<unsigned int>(std::distance(uniqueSortedV.begin(), position));
    }
}
558 /*! \brief The layout of the bits in the APIC ID */
561 unsigned int hwThreadBits; //!< The number of least significant bits for hw-threads
562 unsigned int coreBits; //!< The number of core bits following the hw-thread bits
565 /*! \brief Detect the APIC ID layout for x2APIC
567 ApicIdLayout detectX2ApicIdLayout()
575 executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
576 layout.hwThreadBits = eax & 0x1f;
577 executeX86CpuID(0xb, 1, &eax, &ebx, &ecx, &edx);
578 layout.coreBits = (eax & 0x1f) - layout.hwThreadBits;
583 /*! \brief Detect the APIC ID layout for standard APIC or xAPIC on AMD
585 * \param[in] maxExtLevel The largest CPUID extended function input value supported by the processor implementation
587 ApicIdLayout detectAmdApicIdLayout(unsigned int maxExtLevel)
595 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
596 int family = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
597 executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
598 bool haveExtendedTopology = (ecx & (1 << 22)) != 0U;
600 // NOTE: Here we assume 1 thread per core, unless we have family >= 17h
601 layout.hwThreadBits = 0;
602 if (family >= 0x17 && haveExtendedTopology && maxExtLevel >= 0x8000001e)
604 executeX86CpuID(0x8000001e, 1, &eax, &ebx, &ecx, &edx);
605 int numThreadsPerCore = ((ebx >> 8) & 0xff) + 1;
606 // NOTE: The AMD documentation only specifies the layout of apicid
607 // when we have 1 or 2 threads per core.
608 while (numThreadsPerCore > (1 << layout.hwThreadBits))
610 layout.hwThreadBits++;
614 // Get number of core bits in apic ID - try modern extended method first
615 executeX86CpuID(0x80000008, 0, &eax, &ebx, &ecx, &edx);
616 layout.coreBits = (ecx >> 12) & 0xf;
617 if (layout.coreBits == 0)
619 // Legacy method for old single/dual core AMD CPUs
621 while (i >> layout.coreBits)
630 /*! \brief Try to detect basic CPU topology information using x86 cpuid
632 * If x2APIC support is present, this is our first choice, otherwise we
633 * attempt to use old vanilla APIC.
635 * \return A new vector of entries with socket, core, hwthread information
636 * for each logical processor.
638 std::vector<CpuInfo::LogicalProcessor> detectX86LogicalProcessors()
644 unsigned int maxStdLevel;
645 unsigned int maxExtLevel;
649 std::vector<CpuInfo::LogicalProcessor> logicalProcessors;
651 // Find largest standard & extended level input values allowed
652 executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx);
654 executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx);
657 if (maxStdLevel >= 0x1)
659 executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
660 haveX2Apic = ((ecx & (1 << 21)) != 0U) && maxStdLevel >= 0xb;
661 haveApic = ((edx & (1 << 9)) != 0U) && maxExtLevel >= 0x80000008;
669 if (haveX2Apic || haveApic)
672 // Get bits for cores and hardware threads
675 layout = detectX2ApicIdLayout();
679 if (detectX86Vendor() == CpuInfo::Vendor::Amd || detectX86Vendor() == CpuInfo::Vendor::Hygon)
681 layout = detectAmdApicIdLayout(maxExtLevel);
683 if (layout.hwThreadBits > 1)
685 // At the time of writing this code we do not know what
686 // to do with more than 2 threads, so return empty.
687 return logicalProcessors;
692 // We do not know the APIC ID layout, return empty.
693 return logicalProcessors;
697 std::vector<unsigned int> apicID = detectX86ApicIDs(haveX2Apic);
701 // APIC IDs can be buggy, and it is always a mess. Typically more bits are
702 // reserved than needed, and the numbers might not increment by 1 even in
703 // a single socket or core. Extract, renumber, and check that things make sense.
704 unsigned int hwThreadMask = (1 << layout.hwThreadBits) - 1;
705 unsigned int coreMask = (1 << layout.coreBits) - 1;
706 std::vector<unsigned int> hwThreadRanks;
707 std::vector<unsigned int> coreRanks;
708 std::vector<unsigned int> socketRanks;
710 for (auto a : apicID)
712 hwThreadRanks.push_back(static_cast<int>(a & hwThreadMask));
713 coreRanks.push_back(static_cast<int>((a >> layout.hwThreadBits) & coreMask));
714 socketRanks.push_back(static_cast<int>(a >> (layout.coreBits + layout.hwThreadBits)));
717 renumberIndex(&hwThreadRanks);
718 renumberIndex(&coreRanks);
719 renumberIndex(&socketRanks);
721 unsigned int hwThreadRankSize =
722 1 + *std::max_element(hwThreadRanks.begin(), hwThreadRanks.end());
723 unsigned int coreRankSize = 1 + *std::max_element(coreRanks.begin(), coreRanks.end());
724 unsigned int socketRankSize = 1 + *std::max_element(socketRanks.begin(), socketRanks.end());
726 if (socketRankSize * coreRankSize * hwThreadRankSize == apicID.size())
728 // Alright, everything looks consistent, so put it in the result
729 for (std::size_t i = 0; i < apicID.size(); i++)
731 // While the internal APIC IDs are always unsigned integers, we also cast to
732 // plain integers for the externally exposed vectors, since that will make
733 // it possible to use '-1' for invalid entries in the future.
734 logicalProcessors.push_back(
735 { int(socketRanks[i]), int(coreRanks[i]), int(hwThreadRanks[i]) });
740 return logicalProcessors; // Will only have contents if everything worked
744 /******************************************************************************
746 * Generic Linux detection by parsing /proc/cpuinfo *
748 ******************************************************************************/
750 /*! \brief Parse /proc/cpuinfo into a simple string map
752 * This routine will read the contents of /proc/cpuinfo, and for each
753 * line that is not empty we will assign the (trimmed) string to the left of
754 * the colon as a key, and the right-hand side as the value in the map.
755 * For multi-processor systems where lines are repeated the latter lines will
756 * overwrite the first occurrence.
758 * \return New map with the contents. If the file is not available, the returned
761 std::map<std::string, std::string> parseProcCpuInfo()
763 std::ifstream procCpuInfo("/proc/cpuinfo");
765 std::map<std::string, std::string> cpuInfo;
767 while (std::getline(procCpuInfo, line))
771 std::stringstream iss(line);
774 std::getline(iss, key, ':'); // part before colon
775 std::getline(iss, val); // part after colon
778 // put it in the map. This will overwrite previous processors, but we don't care.
786 /*! \brief Try to detect vendor from /proc/cpuinfo
788 * \param cpuInfo Map returned from parseProcCpuinfo()
790 * This routine tries to match a few common labels in /proc/cpuinfo to see if
791 * they begin with the name of a standard vendor. If the file cannot be read
792 * or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
794 CpuInfo::Vendor detectProcCpuInfoVendor(const std::map<std::string, std::string>& cpuInfo)
796 const std::map<std::string, CpuInfo::Vendor> testVendors = {
797 { "GenuineIntel", CpuInfo::Vendor::Intel },
798 { "Intel", CpuInfo::Vendor::Intel },
799 { "AuthenticAmd", CpuInfo::Vendor::Amd },
800 { "AMD", CpuInfo::Vendor::Amd },
801 { "ARM", CpuInfo::Vendor::Arm },
802 { "AArch64", CpuInfo::Vendor::Arm },
803 { "Fujitsu", CpuInfo::Vendor::Fujitsu },
804 { "IBM", CpuInfo::Vendor::Ibm },
805 { "POWER", CpuInfo::Vendor::Ibm },
806 { "Oracle", CpuInfo::Vendor::Oracle },
807 { "HygonGenuine", CpuInfo::Vendor::Hygon },
808 { "Hygon", CpuInfo::Vendor::Hygon },
811 // For each label in /proc/cpuinfo, compare the value to the name in the
812 // testNames map above, and if it's a match return the vendor.
813 for (auto& l : { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
815 if (cpuInfo.count(l) != 0U)
817 // there was a line with this left-hand side in /proc/cpuinfo
818 const std::string& s1 = cpuInfo.at(l);
820 for (auto& t : testVendors)
822 const std::string& s2 = t.first;
824 // If the entire name we are testing (s2) matches the first part of
825 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
826 if (std::equal(s2.begin(), s2.end(), s1.begin(), [](const char& x, const char& y) -> bool {
827 return tolower(x) == tolower(y);
835 return CpuInfo::Vendor::Unknown;
839 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
841 * \param cpuInfo Map returned from parseProcCpuinfo()
842 * \param[out] brand String where to write the brand string
843 * \param[out] features Feature set where supported features are inserted
845 * This routine tries to match a few common labels in /proc/cpuinfo to see if
846 * we can find the processor name and features. It is likely fragile.
848 void detectProcCpuInfoIbm(const std::map<std::string, std::string>& cpuInfo,
850 std::set<CpuInfo::Feature>* features)
852 // Get brand string from 'cpu' label if present, otherwise 'Processor'
853 if (cpuInfo.count("cpu") != 0U)
855 *brand = cpuInfo.at("cpu");
857 else if (cpuInfo.count("Processor") != 0U)
859 *brand = cpuInfo.at("Processor");
862 if (brand->find("A2") != std::string::npos)
864 // If the processor identification contains "A2", this is BlueGene/Q with QPX
865 features->insert(CpuInfo::Feature::Ibm_Qpx);
868 for (auto& l : { "model name", "model", "Processor", "cpu" })
870 if (cpuInfo.count(l) != 0U)
872 std::string s1 = cpuInfo.at(l);
873 std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
875 if (s1.find("altivec") != std::string::npos)
877 features->insert(CpuInfo::Feature::Ibm_Vmx);
878 // If this is a power6, we only have VMX. All later processors have VSX.
879 if (s1.find("power6") == std::string::npos)
881 features->insert(CpuInfo::Feature::Ibm_Vsx);
889 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
891 * \param cpuInfo Map returned from parseProcCpuInfo()
892 * \param[out] brand String where to write the brand string
893 * \param[out] family Major version of processor
894 * \param[out] model Middle version of processor
895 * \param[out] stepping Minor version of processor
896 * \param[out] features Feature set where supported features are inserted
898 * This routine tries to match a few common labels in /proc/cpuinfo to see if
899 * we can find the processor name and features. It is likely fragile.
901 void detectProcCpuInfoArm(const std::map<std::string, std::string>& cpuInfo,
906 std::set<CpuInfo::Feature>* features)
908 if (cpuInfo.count("Processor") != 0U)
910 *brand = cpuInfo.at("Processor");
912 else if (cpuInfo.count("model name") != 0U)
914 *brand = cpuInfo.at("model name");
917 if (cpuInfo.count("CPU architecture") != 0U)
919 *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
920 // For some 64-bit CPUs it appears to say 'AArch64' instead
921 if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
923 *family = 8; // fragile - no idea how a future ARMv9 will be represented in this case
926 if (cpuInfo.count("CPU variant") != 0U)
928 *model = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
930 if (cpuInfo.count("CPU revision") != 0U)
932 *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
935 if (cpuInfo.count("Features") != 0U)
937 const std::string& s = cpuInfo.at("Features");
938 if (s.find("neon") != std::string::npos)
940 features->insert(CpuInfo::Feature::Arm_Neon);
942 if (s.find("asimd") != std::string::npos)
944 // At least Jetson TX1 runs a 32-bit environment by default, although
945 // the kernel is 64-bits, and reports asimd feature flags. We cannot
946 // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
947 if (sizeof(void*) == 8)
949 features->insert(CpuInfo::Feature::Arm_NeonAsimd);
952 if (s.find("sve") != std::string::npos)
954 features->insert(CpuInfo::Feature::Arm_Sve);
960 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
962 * \param[out] vendor Detected hardware vendor
963 * \param[out] brand String where to write the brand string
964 * \param[out] family Major version of processor
965 * \param[out] model Middle version of processor
966 * \param[out] stepping Minor version of processor
967 * \param[out] features Feature set where supported features are inserted
969 * This routine reads the /proc/cpuinfo file into a map and calls subroutines
970 * that attempt to parse by matching keys and values to known strings. It is
971 * much more fragile than our x86 detection, but it does not depend on
972 * specific system calls, intrinsics or assembly instructions.
974 void detectProcCpuInfo(CpuInfo::Vendor* vendor,
979 std::set<CpuInfo::Feature>* features)
981 std::map<std::string, std::string> cpuInfo = parseProcCpuInfo();
983 if (*vendor == CpuInfo::Vendor::Unknown)
985 *vendor = detectProcCpuInfoVendor(cpuInfo);
988 // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
989 // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
990 // To handle this slightly better we use one subroutine per vendor.
993 case CpuInfo::Vendor::Ibm: detectProcCpuInfoIbm(cpuInfo, brand, features); break;
995 case CpuInfo::Vendor::Arm:
996 detectProcCpuInfoArm(cpuInfo, brand, family, model, stepping, features);
1000 // We only have a single check for fujitsu for now
1002 features->insert(CpuInfo::Feature::Fujitsu_HpcAce);
1012 CpuInfo CpuInfo::detect()
1016 if (c_architecture == Architecture::X86)
1018 result.vendor_ = detectX86Vendor();
1020 if (result.vendor_ == CpuInfo::Vendor::Intel)
1022 result.features_.insert(CpuInfo::Feature::X86_Intel);
1024 else if (result.vendor_ == CpuInfo::Vendor::Amd)
1026 result.features_.insert(CpuInfo::Feature::X86_Amd);
1028 else if (result.vendor_ == CpuInfo::Vendor::Hygon)
1030 result.features_.insert(CpuInfo::Feature::X86_Hygon);
1033 &result.brandString_, &result.family_, &result.model_, &result.stepping_, &result.features_);
1034 result.logicalProcessors_ = detectX86LogicalProcessors();
1039 if (c_architecture == Architecture::Arm)
1041 result.vendor_ = CpuInfo::Vendor::Arm;
1043 else if (c_architecture == Architecture::PowerPC)
1045 result.vendor_ = CpuInfo::Vendor::Ibm;
1048 #if defined __aarch64__ || (defined _M_ARM && _M_ARM >= 8)
1049 result.features_.insert(Feature::Arm_Neon); // ARMv8 always has Neon
1050 result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
1052 #if defined __arch64__ && defined __ARM_FEATURE_SVE
1053 result.features_.insert(Feature::Arm_Sve);
1057 result.vendor_ = CpuInfo::Vendor::Oracle;
1060 // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
1061 // is set to a known value this routine will not overwrite it.
1062 detectProcCpuInfo(&result.vendor_,
1063 &result.brandString_,
1070 if (!result.logicalProcessors_.empty())
1072 result.supportLevel_ = CpuInfo::SupportLevel::LogicalProcessorInfo;
1074 else if (!result.features_.empty())
1076 result.supportLevel_ = CpuInfo::SupportLevel::Features;
1078 else if (result.vendor_ != CpuInfo::Vendor::Unknown
1079 || result.brandString_ != "Unknown CPU brand")
1081 result.supportLevel_ = CpuInfo::SupportLevel::Name;
1085 result.supportLevel_ = CpuInfo::SupportLevel::None;
// Default constructor: starts from an "unknown CPU" state.
// The vendor is Vendor::Unknown and the brand string is the placeholder
// "Unknown CPU brand"; CpuInfo::detect() compares against that same literal
// when deciding whether a brand name was found, so the two must stay in sync.
// NOTE(review): the remaining member initializers and the constructor body
// are not visible in this listing.
1091 CpuInfo::CpuInfo() :
1092 vendor_(CpuInfo::Vendor::Unknown),
1093 brandString_("Unknown CPU brand"),
// Returns a human-readable name for this object's vendor (vendor_).
//
// The names are kept in a function-local static map, so the returned
// reference remains valid after the call. The lookup uses std::map::at(),
// which throws std::out_of_range for a vendor that has no entry in the table.
// NOTE(review): confirm every Vendor enumerator is listed here when adding one.
1100 const std::string& CpuInfo::vendorString() const
1102 static const std::map<Vendor, std::string> vendorStrings = {
1103 { Vendor::Unknown, "Unknown vendor" }, { Vendor::Intel, "Intel" }, { Vendor::Amd, "AMD" },
1104 { Vendor::Fujitsu, "Fujitsu" }, { Vendor::Ibm, "IBM" }, { Vendor::Arm, "ARM" },
1105 { Vendor::Oracle, "Oracle" }, { Vendor::Hygon, "Hygon" },
1108 return vendorStrings.at(vendor_);
// Returns the short lower-case text name of the CPU feature f.
//
// The names are kept in a function-local static map, so the returned
// reference remains valid after the call. The lookup uses std::map::at(),
// which throws std::out_of_range for a feature that has no entry in the table.
//
// These strings are what the standalone program prints for "-features", and
// the comment there says the GROMACS cmake code consumes that output, so
// treat the spellings as an interface rather than as free-form text.
1112 const std::string& CpuInfo::featureString(Feature f)
1114 static const std::map<Feature, std::string> featureStrings = {
1115 { Feature::X86_Aes, "aes" },
1116 { Feature::X86_Amd, "amd" },
1117 { Feature::X86_Apic, "apic" },
1118 { Feature::X86_Avx, "avx" },
1119 { Feature::X86_Avx2, "avx2" },
1120 { Feature::X86_Avx512F, "avx512f" },
1121 { Feature::X86_Avx512PF, "avx512pf" },
1122 { Feature::X86_Avx512ER, "avx512er" },
1123 { Feature::X86_Avx512CD, "avx512cd" },
1124 { Feature::X86_Avx512BW, "avx512bw" },
1125 { Feature::X86_Avx512VL, "avx512vl" },
1126 { Feature::X86_Avx512BF16, "avx512bf16" },
1127 { Feature::X86_Avx512secondFMA, "avx512secondFMA" },
1128 { Feature::X86_Clfsh, "clfsh" },
1129 { Feature::X86_Cmov, "cmov" },
1130 { Feature::X86_Cx8, "cx8" },
1131 { Feature::X86_Cx16, "cx16" },
1132 { Feature::X86_F16C, "f16c" },
1133 { Feature::X86_Fma, "fma" },
1134 { Feature::X86_Fma4, "fma4" },
1135 { Feature::X86_Hle, "hle" },
1136 { Feature::X86_Htt, "htt" },
1137 { Feature::X86_Intel, "intel" },
1138 { Feature::X86_Lahf, "lahf" },
1139 { Feature::X86_MisalignSse, "misalignsse" },
1140 { Feature::X86_Mmx, "mmx" },
1141 { Feature::X86_Msr, "msr" },
1142 { Feature::X86_NonstopTsc, "nonstop_tsc" },
1143 { Feature::X86_Pcid, "pcid" },
// NOTE(review): the carry-less multiply instruction is usually spelled
// PCLMULQDQ; this string follows the enumerator name instead. Left as-is
// because the text is program output - confirm consumers before renaming.
1144 { Feature::X86_Pclmuldq, "pclmuldq" },
1145 { Feature::X86_Pdcm, "pdcm" },
1146 { Feature::X86_PDPE1GB, "pdpe1gb" },
1147 { Feature::X86_Popcnt, "popcnt" },
1148 { Feature::X86_Pse, "pse" },
1149 { Feature::X86_Rdrnd, "rdrnd" },
1150 { Feature::X86_Rdtscp, "rdtscp" },
1151 { Feature::X86_Rtm, "rtm" },
1152 { Feature::X86_Sha, "sha" },
1153 { Feature::X86_Sse2, "sse2" },
1154 { Feature::X86_Sse3, "sse3" },
1155 { Feature::X86_Sse4A, "sse4a" },
1156 { Feature::X86_Sse4_1, "sse4.1" },
1157 { Feature::X86_Sse4_2, "sse4.2" },
1158 { Feature::X86_Ssse3, "ssse3" },
1159 { Feature::X86_Tdt, "tdt" },
1160 { Feature::X86_X2Apic, "x2apic" },
1161 { Feature::X86_Xop, "xop" },
1162 { Feature::Arm_Neon, "neon" },
1163 { Feature::Arm_NeonAsimd, "neon_asimd" },
1164 { Feature::Arm_Sve, "sve" },
1165 { Feature::Ibm_Qpx, "qpx" },
1166 { Feature::Ibm_Vmx, "vmx" },
1167 { Feature::Ibm_Vsx, "vsx" },
1168 { Feature::Fujitsu_HpcAce, "hpc-ace" },
1169 { Feature::X86_Hygon, "hygon" }
1171 return featureStrings.at(f);
// Returns true when cpuInfo describes an Intel CPU of family 6 whose model
// number is one of 0x1A, 0x1E, 0x25, 0x2C, 0x2E or 0x2F - the models this
// function classifies as Nehalem. Any other vendor, family or model gives false.
1175 bool cpuIsX86Nehalem(const CpuInfo& cpuInfo)
1177 return (cpuInfo.vendor() == CpuInfo::Vendor::Intel && cpuInfo.family() == 6
1178 && (cpuInfo.model() == 0x2E || cpuInfo.model() == 0x1A || cpuInfo.model() == 0x1E
1179 || cpuInfo.model() == 0x2F || cpuInfo.model() == 0x2C || cpuInfo.model() == 0x25));
// Returns true when cpuInfo describes a first-generation Zen (Zen or Zen+) CPU.
//
// Two cases yield true:
//  - vendor AMD, family 23, and model 1, 8, 17 or 24;
//  - vendor Hygon, regardless of family and model (no further check is made;
//    the rationale is given in the comment inside the function).
1182 bool cpuIsAmdZen1(const CpuInfo& cpuInfo)
1184 /* Both Zen/Zen+/Zen2 have family==23
1185 * Model numbers for Zen:
1186 * 1) Naples, Whitehaven, Summit Ridge, and Snowy Owl;
1188 * Model numbers for Zen+:
1189 * 8) Pinnacle Ridge;
1191 * Hygon got license for Zen1, but not Zen2 (https://www.tomshardware.com/news/amd-zen-china-x86-ip-license,39573.html)
1193 return (cpuInfo.vendor() == CpuInfo::Vendor::Amd && cpuInfo.family() == 23
1194 && (cpuInfo.model() == 1 || cpuInfo.model() == 17 || cpuInfo.model() == 8
1195 || cpuInfo.model() == 24))
1196 || (cpuInfo.vendor() == CpuInfo::Vendor::Hygon);
1201 #ifdef GMX_CPUINFO_STANDALONE
1202 int main(int argc, char** argv)
1207 "Usage:\n\n%s [flags]\n\n"
1208 "Available flags:\n"
1209 "-vendor Print CPU vendor.\n"
1210 "-brand Print CPU brand string.\n"
1211 "-family Print CPU family version.\n"
1212 "-model Print CPU model version.\n"
1213 "-stepping Print CPU stepping version.\n"
1214 "-features Print CPU feature flags.\n",
1219 std::string arg(argv[1]);
1220 gmx::CpuInfo cpuInfo(gmx::CpuInfo::detect());
1222 if (arg == "-vendor")
1224 printf("%s\n", cpuInfo.vendorString().c_str());
1226 else if (arg == "-brand")
1228 printf("%s\n", cpuInfo.brandString().c_str());
1230 else if (arg == "-family")
1232 printf("%d\n", cpuInfo.family());
1234 else if (arg == "-model")
1236 printf("%d\n", cpuInfo.model());
1238 else if (arg == "-stepping")
1240 printf("%d\n", cpuInfo.stepping());
1242 else if (arg == "-features")
1244 // Separate the feature strings with spaces. Note that in the
1245 // GROMACS cmake code, surrounding whitespace is first
1246 // stripped by the CPU detection routine, and then added back
1247 // in the code for making the SIMD suggestion.
1248 for (auto& f : cpuInfo.featureSet())
1250 printf("%s ", cpuInfo.featureString(f).c_str());
1254 else if (arg == "-topology")
1256 // Undocumented debug option, usually not present in standalone version
1257 for (auto& t : cpuInfo.logicalProcessors())
1259 printf("%3u %3u %3u\n", t.socketRankInMachine, t.coreRankInSocket, t.hwThreadRankInCore);