Merge branch release-2018
[alexxy/gromacs.git] / src / gromacs / hardware / cpuinfo.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  * \brief
38  * Implements gmx::CpuInfo.
39  *
40  * We need to be able to compile this file in stand-alone mode to use basic
41  * CPU feature detection to set the SIMD acceleration and similar things in
42  * CMake, while we still want to use more features that enable topology
43  * detection when config.h is present.
44  *
45  * We solve this by skipping the advanced stuff when the preprocessor
46  * macro GMX_CPUINFO_STANDALONE is defined. In this case you likely also need to
47  * define GMX_X86_GCC_INLINE_ASM if you are on x86; without inline assembly
48  * support it is not possible to perform the actual detection on Linux/Mac.
49  * Since these macros are specific to this file, they do not use the GMX prefix.
50  *
51  * The remaining defines (GMX_NATIVE_WINDOWS,HAVE_UNISTD_H,HAVE_SCHED_H,
52  * HAVE_SYSCONF, HAVE_SCHED_AFFINITY) are only used to determine the topology on
53  * x86, and for this we rely on including config.h.
54  *
55  * \author Erik Lindahl <erik.lindahl@gmail.com>
56  * \ingroup module_hardware
57  */
58
59 #ifndef GMX_CPUINFO_STANDALONE
60 #    include "gmxpre.h"
61 #endif
62
63 #include "cpuinfo.h"
64
65 #ifndef GMX_CPUINFO_STANDALONE
66 #    include "config.h"
67 #else
68 #    define GMX_NATIVE_WINDOWS 0
69 #endif
70
71 #if defined _MSC_VER
72 #    include <intrin.h> // __cpuid()
73 #endif
74
75 #if GMX_NATIVE_WINDOWS
76 #    include <windows.h>    // sysinfo(), necessary for topology stuff
77 #endif
78
79 #ifdef HAVE_SCHED_H
80 #    include <sched.h>      // sched_getaffinity(), sched_setaffinity()
81 #endif
82 #ifdef HAVE_UNISTD_H
83 #    include <unistd.h>     // sysconf()
84 #endif
85
86 #include <cctype>
87 #include <cstdlib>
88
89 #include <algorithm>
90 #include <fstream>
91 #include <map>
92 #include <set>
93 #include <sstream>
94 #include <string>
95
96 #ifdef GMX_CPUINFO_STANDALONE
97 #    define gmx_unused
98 #else
99 #    include "gromacs/utility/basedefinitions.h"
100 #endif
101
102 #include "architecture.h"
103
104 namespace gmx
105 {
106
107 namespace
108 {
109
110 /*! \cond internal */
111
112 /******************************************************************************
113  *                                                                            *
114  *   Utility functions to make this file independent of the GROMACS library   *
115  *                                                                            *
116  ******************************************************************************/
117
/*! \brief Remove initial and trailing whitespace from string
 *
 *  Whitespace is whatever std::isspace() classifies as such. A string that
 *  consists only of whitespace (or is empty) ends up empty.
 *
 *  \param s  Pointer to string where whitespace will be removed
 */
void
trimString(std::string * s)
{
    // std::isspace() has undefined behavior for negative arguments, which a
    // plain char can hold for non-ASCII bytes, so convert via unsigned char.
    const auto isNotSpace = [](char c) -> bool
        {
            return std::isspace(static_cast<unsigned char>(c)) == 0;
        };

    // heading
    s->erase(s->begin(), std::find_if(s->begin(), s->end(), isNotSpace));
    // trailing
    s->erase(std::find_if(s->rbegin(), s->rend(), isNotSpace).base(), s->end());
}
130
131
132 /******************************************************************************
133  *                                                                            *
134  *                         x86 detection functions                            *
135  *                                                                            *
136  ******************************************************************************/
137
138 /*! \brief execute x86 cpuid instructions with custom level and extended level
139  *
140  *  \param level   The main cpuid level (input argument for eax register)
141  *  \param ecxval  Extended level (input argument for ecx register)
142  *  \param eax     Output in eax register
143  *  \param ebx     Output in ebx register
144  *  \param ecx     Output in ecx register
145  *  \param edx     Output in edx register
146  *
147  *  \return 0 on success, or non-zero if the instruction could not execute.
148  */
149 int
150 executeX86CpuID(unsigned int     gmx_unused level,
151                 unsigned int     gmx_unused ecxval,
152                 unsigned int *              eax,
153                 unsigned int *              ebx,
154                 unsigned int *              ecx,
155                 unsigned int *              edx)
156 {
157     if (c_architecture == Architecture::X86)
158     {
159 #if defined __GNUC__ || GMX_X86_GCC_INLINE_ASM
160
161         // any compiler that understands gcc inline assembly
162         *eax = level;
163         *ecx = ecxval;
164         *ebx = 0;
165         *edx = 0;
166
167 #    if GMX_IS_X86_32 && defined(__PIC__)
168         // Avoid clobbering the global offset table in 32-bit pic code (ebx register)
169         __asm__ __volatile__ ("xchgl %%ebx, %1  \n\t"
170                               "cpuid            \n\t"
171                               "xchgl %%ebx, %1  \n\t"
172                               : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
173 #    elif GMX_IS_X86_64
174         // i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want
175         __asm__ __volatile__ ("cpuid            \n\t"
176                               : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
177 #    else
178         // Not a normal x86, which could happen when a compiler
179         // targetting non-x86 pretends to be GCC.
180 #    endif
181         return 0;
182
183 #elif defined _MSC_VER
184
185         // MSVC (and icc on windows) on ia32 or x86-64
186         int cpuInfo[4];
187         __cpuidex(cpuInfo, level, ecxval);
188         *eax = static_cast<unsigned int>(cpuInfo[0]);
189         *ebx = static_cast<unsigned int>(cpuInfo[1]);
190         *ecx = static_cast<unsigned int>(cpuInfo[2]);
191         *edx = static_cast<unsigned int>(cpuInfo[3]);
192         return 0;
193
194 #else
195
196         // We are on x86, but without compiler support for cpuid if we get here
197         *eax = 0;
198         *ebx = 0;
199         *ecx = 0;
200         *edx = 0;
201         return 1;
202
203 #endif          // check for inline asm on x86
204     }
205     else
206     {
207         // We are not on x86
208         *eax = 0;
209         *ebx = 0;
210         *ecx = 0;
211         *edx = 0;
212         return 1;
213     }
214 }
215
216
217 /*! \brief Detect x86 vendors by using the cpuid assembly instructions
218  *
219  *  If support for the cpuid instruction is present, we check for Intel
220  *  or AMD vendors.
221  *
222  *  \return gmx::CpuInfo::Vendor::Intel, gmx::CpuInfo::Vendor::Amd. If neither
223  *          Intel nor Amd can be identified, or if the code fails to execute,
224  *          gmx::CpuInfo::Vendor::Unknown is returned.
225  */
226 CpuInfo::Vendor
227 detectX86Vendor()
228 {
229     unsigned int    eax, ebx, ecx, edx;
230     CpuInfo::Vendor v = CpuInfo::Vendor::Unknown;
231
232     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) == 0)
233     {
234         if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
235         {
236             v = CpuInfo::Vendor::Intel; // ebx=='uneG', ecx=='letn', edx=='Ieni'
237         }
238         else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
239         {
240             v = CpuInfo::Vendor::Amd; // ebx=='htuA', ecx=='DMAc', edx=='itne'
241         }
242     }
243     return v;
244 }
245
246 /*! \brief Simple utility function to set/clear feature in a set
247  *
248  *  \param featureSet    Pointer to the feature set to update
249  *  \param feature       The specific feature to set/clear
250  *  \param registerValue Register value (returned from cpuid)
251  *  \param bit           Bit to check in registerValue. The feature will be
252  *                       added to the featureSet if this bit is set.
253  *
254  *  \note Nothing is done if the bit is not set. In particular, this will not
255  *        erase anything if the feature already exists in the set.
256  */
257 void
258 setFeatureFromBit(std::set<CpuInfo::Feature> *   featureSet,
259                   CpuInfo::Feature               feature,
260                   unsigned int                   registerValue,
261                   unsigned char                  bit)
262 {
263     if (registerValue & (1 << bit))
264     {
265         featureSet->insert(feature);
266     }
267 }
268
269 /*! \brief Process x86 cpuinfo features that are common to Intel and AMD CPUs
270  *
271  *  \param[out] brand      String where to write the x86 brand string
272  *  \param[out] family     Major version of processor
273  *  \param[out] model      Middle version of processor
274  *  \param[out] stepping   Minor version of processor
275  *  \param[out] features   Feature set where supported features are inserted
276  */
277 void
278 detectX86Features(std::string *                  brand,
279                   int *                          family,
280                   int *                          model,
281                   int *                          stepping,
282                   std::set<CpuInfo::Feature> *   features)
283 {
284     unsigned int eax, ebx, ecx, edx;
285
286     // Return if we cannot execute any levels
287     if (executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx) != 0)
288     {
289         return;
290     }
291     unsigned int maxStdLevel = eax;
292
293     if (maxStdLevel >= 0x1)
294     {
295         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
296
297         *family   = ((eax & 0x0ff00000) >> 20) + ((eax & 0x00000f00) >> 8);
298         *model    = ((eax & 0x000f0000) >> 12) + ((eax & 0x000000f0) >> 4);
299         *stepping = (eax & 0x0000000f);
300
301         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse3,     ecx,  0 );
302         setFeatureFromBit(features, CpuInfo::Feature::X86_Pclmuldq, ecx,  1 );
303         setFeatureFromBit(features, CpuInfo::Feature::X86_Ssse3,    ecx,  9 );
304         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma,      ecx, 12 );
305         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx16,     ecx, 13 );
306         setFeatureFromBit(features, CpuInfo::Feature::X86_Pdcm,     ecx, 15 );
307         setFeatureFromBit(features, CpuInfo::Feature::X86_Pcid,     ecx, 17 );
308         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_1,   ecx, 19 );
309         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4_2,   ecx, 20 );
310         setFeatureFromBit(features, CpuInfo::Feature::X86_X2Apic,   ecx, 21 );
311         setFeatureFromBit(features, CpuInfo::Feature::X86_Popcnt,   ecx, 23 );
312         setFeatureFromBit(features, CpuInfo::Feature::X86_Tdt,      ecx, 24 );
313         setFeatureFromBit(features, CpuInfo::Feature::X86_Aes,      ecx, 25 );
314         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx,      ecx, 28 );
315         setFeatureFromBit(features, CpuInfo::Feature::X86_F16C,     ecx, 29 );
316         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdrnd,    ecx, 30 );
317
318         setFeatureFromBit(features, CpuInfo::Feature::X86_Pse,      edx,  3 );
319         setFeatureFromBit(features, CpuInfo::Feature::X86_Msr,      edx,  5 );
320         setFeatureFromBit(features, CpuInfo::Feature::X86_Cx8,      edx,  8 );
321         setFeatureFromBit(features, CpuInfo::Feature::X86_Apic,     edx,  9 );
322         setFeatureFromBit(features, CpuInfo::Feature::X86_Cmov,     edx, 15 );
323         setFeatureFromBit(features, CpuInfo::Feature::X86_Clfsh,    edx, 19 );
324         setFeatureFromBit(features, CpuInfo::Feature::X86_Mmx,      edx, 23 );
325         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse2,     edx, 26 );
326         setFeatureFromBit(features, CpuInfo::Feature::X86_Htt,      edx, 28 );
327     }
328
329     if (maxStdLevel >= 0x7)
330     {
331         executeX86CpuID(0x7, 0, &eax, &ebx, &ecx, &edx);
332
333         setFeatureFromBit(features, CpuInfo::Feature::X86_Hle,      ebx,  4 );
334         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx2,     ebx,  5 );
335         setFeatureFromBit(features, CpuInfo::Feature::X86_Rtm,      ebx, 11 );
336         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512F,  ebx, 16 );
337         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512PF, ebx, 26 );
338         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512ER, ebx, 27 );
339         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512CD, ebx, 28 );
340         setFeatureFromBit(features, CpuInfo::Feature::X86_Sha,      ebx, 29 );
341         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512BW, ebx, 30 );
342         setFeatureFromBit(features, CpuInfo::Feature::X86_Avx512VL, ebx, 31 );
343     }
344
345     // Check whether Hyper-threading is really possible to enable in the hardware,
346     // not just technically supported by this generation of processors
347     if (features->count(CpuInfo::Feature::X86_Htt) && maxStdLevel >= 0x4)
348     {
349         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
350         unsigned int maxLogicalCores  = (ebx >> 16) & 0x0ff;
351         executeX86CpuID(0x4, 0, &eax, &ebx, &ecx, &edx);
352         unsigned int maxPhysicalCores = ((eax >> 26) & 0x3f) + 1;
353         if (maxLogicalCores/maxPhysicalCores < 2)
354         {
355             features->erase(CpuInfo::Feature::X86_Htt);
356         }
357     }
358
359     if (executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx) != 0)
360     {
361         // No point in continuing if we don't support any extended levels
362         return;
363     }
364     unsigned int maxExtLevel = eax;
365
366     if (maxExtLevel >= 0x80000001)
367     {
368         executeX86CpuID(0x80000001, 0, &eax, &ebx, &ecx, &edx);
369
370         setFeatureFromBit(features, CpuInfo::Feature::X86_Lahf,        ecx,  0 );
371         setFeatureFromBit(features, CpuInfo::Feature::X86_Sse4A,       ecx,  6 );
372         setFeatureFromBit(features, CpuInfo::Feature::X86_MisalignSse, ecx,  7 );
373         setFeatureFromBit(features, CpuInfo::Feature::X86_Xop,         ecx, 11 );
374         setFeatureFromBit(features, CpuInfo::Feature::X86_Fma4,        ecx, 16 );
375         setFeatureFromBit(features, CpuInfo::Feature::X86_PDPE1GB,     edx, 26 );
376         setFeatureFromBit(features, CpuInfo::Feature::X86_Rdtscp,      edx, 27 );
377     }
378
379     if (maxExtLevel >= 0x80000005)
380     {
381         // Get the x86 CPU brand string (3 levels, 16 bytes in each)
382         brand->clear();
383         for (unsigned int level = 0x80000002; level < 0x80000005; level++)
384         {
385             executeX86CpuID(level, 0, &eax, &ebx, &ecx, &edx);
386             // Add eax, ebx, ecx, edx contents as 4 chars each to the brand string
387             brand->append(reinterpret_cast<const char *>(&eax), sizeof(eax));
388             brand->append(reinterpret_cast<const char *>(&ebx), sizeof(ebx));
389             brand->append(reinterpret_cast<const char *>(&ecx), sizeof(ecx));
390             brand->append(reinterpret_cast<const char *>(&edx), sizeof(edx));
391         }
392         trimString(brand);
393     }
394
395     if (maxExtLevel >= 0x80000007)
396     {
397         executeX86CpuID(0x80000007, 0, &eax, &ebx, &ecx, &edx);
398
399         setFeatureFromBit(features, CpuInfo::Feature::X86_NonstopTsc, edx,  8 );
400     }
401 }
402
403
404 /*! \brief Return a vector with x86 APIC IDs for all threads
405  *
406  *  \param haveX2Apic  True if the processors supports x2APIC, otherwise vanilla APIC.
407  *
408  *  \returns A new std::vector of unsigned integer APIC IDs, one for each
409  *           logical processor in the system.
410  */
411 const std::vector<unsigned int>
412 detectX86ApicIDs(bool gmx_unused haveX2Apic)
413 {
414     std::vector<unsigned int>  apicID;
415
416     // We cannot just ask for all APIC IDs, but must force execution on each
417     // hardware thread and extract the APIC id there.
418 #if HAVE_SCHED_AFFINITY && defined HAVE_SYSCONF
419     unsigned int   eax, ebx, ecx, edx;
420     unsigned int   nApic = sysconf(_SC_NPROCESSORS_ONLN);
421     cpu_set_t      saveCpuSet;
422     cpu_set_t      cpuSet;
423     sched_getaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
424     CPU_ZERO(&cpuSet);
425     for (unsigned int i = 0; i < nApic; i++)
426     {
427         CPU_SET(i, &cpuSet);
428         sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
429         if (haveX2Apic)
430         {
431             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
432             apicID.push_back(edx);
433         }
434         else
435         {
436             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
437             apicID.push_back(ebx >> 24);
438         }
439         CPU_CLR(i, &cpuSet);
440     }
441     sched_setaffinity(0, sizeof(cpu_set_t), &saveCpuSet);
442 #elif GMX_NATIVE_WINDOWS
443     unsigned int   eax, ebx, ecx, edx;
444     SYSTEM_INFO    sysinfo;
445     GetSystemInfo( &sysinfo );
446     unsigned int   nApic        = sysinfo.dwNumberOfProcessors;
447     unsigned int   saveAffinity = SetThreadAffinityMask(GetCurrentThread(), 1);
448     for (DWORD_PTR i = 0; i < nApic; i++)
449     {
450         SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
451         Sleep(0);
452         if (haveX2Apic)
453         {
454             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
455             apicID.push_back(edx);
456         }
457         else
458         {
459             executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
460             apicID.push_back(ebx >> 24);
461         }
462     }
463     SetThreadAffinityMask(GetCurrentThread(), saveAffinity);
464 #endif
465     return apicID;
466 }
467
468
/*! \brief Utility to renumber indices extracted from APIC IDs
 *
 * \param v  Vector with unsigned integer indices, rewritten in place
 *
 * Each element is replaced by the rank of its value among the distinct values
 * present in the vector, starting from 0. For example, the vector
 * {0,1,2,8,9,10,8,9,10,0,1,2} will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}.
 * Nothing is returned; the number of distinct values is one more than the
 * largest element after renumbering. An empty vector is left unchanged.
 */
void
renumberIndex(std::vector<unsigned int> * v)
{
    // Sorted list of the distinct values; the position of a value in this
    // list is its new index.
    std::vector<unsigned int> uniqueSortedV(*v);
    std::sort(uniqueSortedV.begin(), uniqueSortedV.end());
    uniqueSortedV.erase(std::unique(uniqueSortedV.begin(), uniqueSortedV.end()), uniqueSortedV.end());

    for (unsigned int &value : *v)
    {
        // Every value of v is present in uniqueSortedV, so this finds it exactly
        const auto pos = std::lower_bound(uniqueSortedV.begin(), uniqueSortedV.end(), value);
        value = static_cast<unsigned int>(pos - uniqueSortedV.begin());
    }
}
494
495
496 /*! \brief Try to detect basic CPU topology information using x86 cpuid
497  *
498  *  If x2APIC support is present, this is our first choice, otherwise we
499  *  attempt to use old vanilla APIC.
500  *
501  *  \return A new vector of entries with socket, core, hwthread information
502  *          for each logical processor.
503  */
504 std::vector<CpuInfo::LogicalProcessor>
505 detectX86LogicalProcessors()
506 {
507     unsigned int   eax;
508     unsigned int   ebx;
509     unsigned int   ecx;
510     unsigned int   edx;
511     unsigned int   maxStdLevel;
512     unsigned int   maxExtLevel;
513     bool           haveApic;
514     bool           haveX2Apic;
515
516     std::vector<CpuInfo::LogicalProcessor> logicalProcessors;
517
518     // Find largest standard & extended level input values allowed
519     executeX86CpuID(0x0, 0, &eax, &ebx, &ecx, &edx);
520     maxStdLevel = eax;
521     executeX86CpuID(0x80000000, 0, &eax, &ebx, &ecx, &edx);
522     maxExtLevel = eax;
523
524     if (maxStdLevel >= 0x1)
525     {
526         executeX86CpuID(0x1, 0, &eax, &ebx, &ecx, &edx);
527         haveX2Apic = (ecx & (1 << 21)) && maxStdLevel >= 0xb;
528         haveApic   = (edx & (1 <<  9)) && maxExtLevel >= 0x80000008;
529     }
530     else
531     {
532         haveX2Apic = false,
533         haveApic   = false;
534     }
535
536     if (haveX2Apic || haveApic)
537     {
538         unsigned int   hwThreadBits;
539         unsigned int   coreBits;
540         // Get bits for cores and hardware threads
541         if (haveX2Apic)
542         {
543             executeX86CpuID(0xb, 0, &eax, &ebx, &ecx, &edx);
544             hwThreadBits    = eax & 0x1f;
545             executeX86CpuID(0xb, 1, &eax, &ebx, &ecx, &edx);
546             coreBits        = (eax & 0x1f) - hwThreadBits;
547         }
548         else    // haveApic
549         {
550             // AMD without x2APIC does not support SMT - there are no hwthread bits in apic ID
551             hwThreadBits = 0;
552             // Get number of core bits in apic ID - try modern extended method first
553             executeX86CpuID(0x80000008, 0, &eax, &ebx, &ecx, &edx);
554             coreBits = (ecx >> 12) & 0xf;
555             if (coreBits == 0)
556             {
557                 // Legacy method for old single/dual core AMD CPUs
558                 int i = ecx & 0xf;
559                 while (i >> coreBits)
560                 {
561                     coreBits++;
562                 }
563             }
564         }
565
566         std::vector<unsigned int>  apicID = detectX86ApicIDs(haveX2Apic);
567
568         if (!apicID.empty())
569         {
570             // APIC IDs can be buggy, and it is always a mess. Typically more bits are
571             // reserved than needed, and the numbers might not increment by 1 even in
572             // a single socket or core. Extract, renumber, and check that things make sense.
573             unsigned int               hwThreadMask  = (1 << hwThreadBits) - 1;
574             unsigned int               coreMask      = (1 << coreBits) - 1;
575             std::vector<unsigned int>  hwThreadRanks;
576             std::vector<unsigned int>  coreRanks;
577             std::vector<unsigned int>  socketRanks;
578
579             for (auto a : apicID)
580             {
581                 hwThreadRanks.push_back( static_cast<int>( a & hwThreadMask ) );
582                 coreRanks.push_back( static_cast<int>( ( a >> hwThreadBits ) & coreMask ) );
583                 socketRanks.push_back( static_cast<int>( a >> ( coreBits + hwThreadBits ) ) );
584             }
585
586             renumberIndex(&hwThreadRanks);
587             renumberIndex(&coreRanks);
588             renumberIndex(&socketRanks);
589
590             unsigned int  hwThreadRankSize = 1 + *std::max_element(hwThreadRanks.begin(), hwThreadRanks.end());
591             unsigned int  coreRankSize     = 1 + *std::max_element(coreRanks.begin(), coreRanks.end());
592             unsigned int  socketRankSize   = 1 + *std::max_element(socketRanks.begin(), socketRanks.end());
593
594             if (socketRankSize * coreRankSize * hwThreadRankSize == apicID.size() )
595             {
596                 // Alright, everything looks consistent, so put it in the result
597                 for (std::size_t i = 0; i < apicID.size(); i++)
598                 {
599                     // While the internal APIC IDs are always unsigned integers, we also cast to
600                     // plain integers for the externally exposed vectors, since that will make
601                     // it possible to use '-1' for invalid entries in the future.
602                     logicalProcessors.push_back( { int(socketRanks[i]), int(coreRanks[i]), int(hwThreadRanks[i]) } );
603                 }
604             }
605         }
606     }
607     return logicalProcessors; // Will only have contents if everything worked
608 }
609
610
611 /******************************************************************************
612  *                                                                            *
613  *              Generic Linux detection by parsing /proc/cpuinfo              *
614  *                                                                            *
615  ******************************************************************************/
616
617 /*! \brief Parse /proc/cpuinfo into a simple string map
618  *
619  * This routine will read the contents of /proc/cpuinfo, and for each
620  * line that is not empty we will assign the (trimmed) string to the right of
621  * the colon as a key, and the left-hand side as the value in the map.
622  * For multi-processor systems where lines are repeated the latter lines will
623  * overwrite the first occurrence.
624  *
625  * \return New map with the contents. If the file is not available, the returned
626  *         map will be empty.
627  */
628 const std::map<std::string, std::string>
629 parseProcCpuInfo()
630 {
631     std::ifstream                       procCpuInfo("/proc/cpuinfo");
632     std::string                         line;
633     std::map<std::string, std::string>  cpuInfo;
634
635     while (std::getline(procCpuInfo, line))
636     {
637         if (!line.empty())
638         {
639             std::stringstream iss(line);
640             std::string       key;
641             std::string       val;
642             std::getline(iss, key, ':');  // part before colon
643             std::getline(iss, val);       // part after colon
644             trimString(&key);
645             trimString(&val);
646             // put it in the map. This will overwrite previous processors, but we don't care.
647             cpuInfo[key] = val;
648         }
649     }
650     return cpuInfo;
651 }
652
653
654 /*! \brief Try to detect vendor from /proc/cpuinfo
655  *
656  *  \param cpuInfo  Map returned from parseProcCpuinfo()
657  *
658  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
659  *  they begin with the name of a standard vendor. If the file cannot be read
660  *  or if no match is found, we return gmx::CpuInfo::Vendor::Unknown.
661  */
662 CpuInfo::Vendor
663 detectProcCpuInfoVendor(const std::map<std::string, std::string> &cpuInfo)
664 {
665     const std::map<std::string, CpuInfo::Vendor> testVendors =
666     {
667         { "GenuineIntel", CpuInfo::Vendor::Intel   },
668         { "Intel",        CpuInfo::Vendor::Intel   },
669         { "AuthenticAmd", CpuInfo::Vendor::Amd     },
670         { "AMD",          CpuInfo::Vendor::Amd     },
671         { "ARM",          CpuInfo::Vendor::Arm     },
672         { "AArch64",      CpuInfo::Vendor::Arm     },
673         { "Fujitsu",      CpuInfo::Vendor::Fujitsu },
674         { "IBM",          CpuInfo::Vendor::Ibm     },
675         { "POWER",        CpuInfo::Vendor::Ibm     },
676         { "Oracle",       CpuInfo::Vendor::Oracle  },
677     };
678
679     // For each label in /proc/cpuinfo, compare the value to the name in the
680     // testNames map above, and if it's a match return the vendor.
681     for (auto &l : { "vendor_id", "vendor", "manufacture", "model", "processor", "cpu" })
682     {
683         if (cpuInfo.count(l))
684         {
685             // there was a line with this left-hand side in /proc/cpuinfo
686             const std::string &s1 = cpuInfo.at(l);
687
688             for (auto &t : testVendors)
689             {
690                 const std::string &s2 = t.first;
691
692                 // If the entire name we are testing (s2) matches the first part of
693                 // the string after the colon in /proc/cpuinfo (s1) we found our vendor
694                 if (std::equal(s2.begin(), s2.end(), s1.begin(),
695                                [](const char &x, const char &y) -> bool { return tolower(x) == tolower(y); }))
696                 {
697                     return t.second;
698                 }
699             }
700         }
701     }
702     return CpuInfo::Vendor::Unknown;
703 }
704
705
706 /*! \brief Detect IBM processor name and features from /proc/cpuinfo
707  *
708  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
709  *  \param[out] brand      String where to write the brand string
710  *  \param[out] features   Feature set where supported features are inserted
711  *
712  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
713  *  we can find the processor name and features. It is likely fragile.
714  */
715 void
716 detectProcCpuInfoIbm(const std::map<std::string, std::string> &cpuInfo,
717                      std::string *                             brand,
718                      std::set<CpuInfo::Feature> *              features)
719 {
720     // Get brand string from 'cpu' label if present, otherwise 'Processor'
721     if (cpuInfo.count("cpu"))
722     {
723         *brand = cpuInfo.at("cpu");
724     }
725     else if (cpuInfo.count("Processor"))
726     {
727         *brand = cpuInfo.at("Processor");
728     }
729
730     if (brand->find("A2") != std::string::npos)
731     {
732         // If the processor identification contains "A2", this is BlueGene/Q with QPX
733         features->insert(CpuInfo::Feature::Ibm_Qpx);
734     }
735
736     for (auto &l : { "model name", "model", "Processor", "cpu" })
737     {
738         if (cpuInfo.count(l))
739         {
740             std::string s1 = cpuInfo.at(l);
741             std::transform(s1.begin(), s1.end(), s1.begin(), ::tolower);
742
743             if (s1.find("altivec") != std::string::npos)
744             {
745                 features->insert(CpuInfo::Feature::Ibm_Vmx);
746                 // If this is a power6, we only have VMX. All later processors have VSX.
747                 if (s1.find("power6") == std::string::npos)
748                 {
749                     features->insert(CpuInfo::Feature::Ibm_Vsx);
750                 }
751             }
752         }
753     }
754 }
755
756
757 /*! \brief Detect ARM processor name and features from /proc/cpuinfo
758  *
759  *  \param      cpuInfo    Map returned from parseProcCpuinfo()
760  *  \param[out] brand      String where to write the brand string
761  *  \param[out] family     Major version of processor
762  *  \param[out] model      Middle version of processor
763  *  \param[out] stepping   Minor version of processor
764  *  \param[out] features   Feature set where supported features are inserted
765  *
766  *  This routine tries to match a few common labels in /proc/cpuinfo to see if
767  *  we can find the processor name and features. It is likely fragile.
768  */
769 void
770 detectProcCpuInfoArm(const std::map<std::string, std::string>   &cpuInfo,
771                      std::string *                               brand,
772                      int *                                       family,
773                      int *                                       model,
774                      int *                                       stepping,
775                      std::set<CpuInfo::Feature> *                features)
776 {
777     if (cpuInfo.count("Processor"))
778     {
779         *brand = cpuInfo.at("Processor");
780     }
781     else if (cpuInfo.count("model name"))
782     {
783         *brand = cpuInfo.at("model name");
784     }
785
786     if (cpuInfo.count("CPU architecture"))
787     {
788         *family = std::strtol(cpuInfo.at("CPU architecture").c_str(), nullptr, 10);
789         // For some 64-bit CPUs it appears to say 'AArch64' instead
790         if (*family == 0 && cpuInfo.at("CPU architecture").find("AArch64") != std::string::npos)
791         {
792             *family = 8;  // fragile - no idea how a future ARMv9 will be represented in this case
793         }
794     }
795     if (cpuInfo.count("CPU variant"))
796     {
797         *model    = std::strtol(cpuInfo.at("CPU variant").c_str(), nullptr, 16);
798     }
799     if (cpuInfo.count("CPU revision"))
800     {
801         *stepping = std::strtol(cpuInfo.at("CPU revision").c_str(), nullptr, 10);
802     }
803
804     if (cpuInfo.count("Features"))
805     {
806         const std::string &s = cpuInfo.at("Features");
807         if (s.find("neon") != std::string::npos)
808         {
809             features->insert(CpuInfo::Feature::Arm_Neon);
810         }
811         if (s.find("asimd") != std::string::npos)
812         {
813             // At least Jetson TX1 runs a 32-bit environment by default, although
814             // the kernel is 64-bits, and reports asimd feature flags. We cannot
815             // use Neon-asimd in this case, so make sure we are on a 64-bit platform.
816             if (sizeof(void *) == 8)
817             {
818                 features->insert(CpuInfo::Feature::Arm_NeonAsimd);
819             }
820         }
821     }
822 }
823
824
825 /*! \brief Try to detect vendor, cpu and features from /proc/cpuinfo
826  *
827  *  \param[out] vendor     Detected hardware vendor
828  *  \param[out] brand      String where to write the brand string
829  *  \param[out] family     Major version of processor
830  *  \param[out] model      Middle version of processor
831  *  \param[out] stepping   Minor version of processor
832  *  \param[out] features   Feature set where supported features are inserted
833  *
834  *  This routine reads the /proc/cpuinfo file into a map and calls subroutines
835  *  that attempt to parse by matching keys and values to known strings. It is
836  *  much more fragile than our x86 detection, but it does not depend on
837  *  specific system calls, intrinsics or assembly instructions.
838  */
839 void
840 detectProcCpuInfo(CpuInfo::Vendor *              vendor,
841                   std::string   *                brand,
842                   int   *                        family,
843                   int   *                        model,
844                   int   *                        stepping,
845                   std::set<CpuInfo::Feature> *   features)
846 {
847     std::map<std::string, std::string> cpuInfo = parseProcCpuInfo();
848
849     if (*vendor == CpuInfo::Vendor::Unknown)
850     {
851         *vendor = detectProcCpuInfoVendor(cpuInfo);
852     }
853
854     // Unfortunately there is no standard for contents in /proc/cpuinfo. We cannot
855     // indiscriminately look for e.g. 'cpu' since it could be either name or an index.
856     // To handle this slightly better we use one subroutine per vendor.
857     switch (*vendor)
858     {
859         case CpuInfo::Vendor::Ibm:
860             detectProcCpuInfoIbm(cpuInfo, brand, features);
861             break;
862
863         case CpuInfo::Vendor::Arm:
864             detectProcCpuInfoArm(cpuInfo, brand, family, model, stepping, features);
865             break;
866
867         default:
868             // We only have a single check for fujitsu for now
869 #ifdef __HPC_ACE__
870             features->insert(CpuInfo::Feature::Fujitsu_HpcAce);
871 #endif
872             break;
873     }
874 }
875 /*! \endcond */
876 }   // namespace anonymous
877
878
879 // static
880 CpuInfo CpuInfo::detect()
881 {
882     CpuInfo result;
883
884     if (c_architecture == Architecture::X86)
885     {
886         result.vendor_ = detectX86Vendor();
887
888         if (result.vendor_ == CpuInfo::Vendor::Intel)
889         {
890             result.features_.insert(CpuInfo::Feature::X86_Intel);
891         }
892         else if (result.vendor_ == CpuInfo::Vendor::Amd)
893         {
894             result.features_.insert(CpuInfo::Feature::X86_Amd);
895         }
896         detectX86Features(&result.brandString_, &result.family_, &result.model_,
897                           &result.stepping_, &result.features_);
898         result.logicalProcessors_ = detectX86LogicalProcessors();
899     }
900     else
901     {
902         // Not x86
903         if (c_architecture == Architecture::Arm)
904         {
905             result.vendor_  = CpuInfo::Vendor::Arm;
906         }
907         else if (c_architecture == Architecture::PowerPC)
908         {
909             result.vendor_  = CpuInfo::Vendor::Ibm;
910         }
911
912 #if defined __aarch64__ || ( defined _M_ARM && _M_ARM >= 8 )
913         result.features_.insert(Feature::Arm_Neon);      // ARMv8 always has Neon
914         result.features_.insert(Feature::Arm_NeonAsimd); // ARMv8 always has Neon-asimd
915 #endif
916
917 #if defined sun
918         result.vendor_ = CpuInfo::Vendor::Oracle;
919 #endif
920
921         // On Linux we might be able to find information in /proc/cpuinfo. If vendor or brand
922         // is set to a known value this routine will not overwrite it.
923         detectProcCpuInfo(&result.vendor_, &result.brandString_, &result.family_,
924                           &result.model_, &result.stepping_, &result.features_);
925     }
926
927     if (!result.logicalProcessors_.empty())
928     {
929         result.supportLevel_ = CpuInfo::SupportLevel::LogicalProcessorInfo;
930     }
931     else if (!result.features_.empty())
932     {
933         result.supportLevel_ = CpuInfo::SupportLevel::Features;
934     }
935     else if (result.vendor_ != CpuInfo::Vendor::Unknown
936              || result.brandString_ != "Unknown CPU brand")
937     {
938         result.supportLevel_ = CpuInfo::SupportLevel::Name;
939     }
940     else
941     {
942         result.supportLevel_ = CpuInfo::SupportLevel::None;
943     }
944
945     return result;
946 }
947
948
949 CpuInfo::CpuInfo()
950     : vendor_(CpuInfo::Vendor::Unknown), brandString_("Unknown CPU brand"),
951       family_(0), model_(0), stepping_(0)
952 {
953 }
954
955
956 const std::map<CpuInfo::Vendor, std::string>
957 CpuInfo::s_vendorStrings_ =
958 {
959     { CpuInfo::Vendor::Unknown, "Unknown vendor"                  },
960     { CpuInfo::Vendor::Intel, "Intel"                             },
961     { CpuInfo::Vendor::Amd, "AMD"                                 },
962     { CpuInfo::Vendor::Fujitsu, "Fujitsu"                         },
963     { CpuInfo::Vendor::Ibm, "IBM"                                 },
964     { CpuInfo::Vendor::Arm, "ARM"                                 },
965     { CpuInfo::Vendor::Oracle, "Oracle"                           },
966 };
967
968
969 const std::map<CpuInfo::Feature, std::string>
970 CpuInfo::s_featureStrings_ =
971 {
972     { CpuInfo::Feature::X86_Aes, "aes"                            },
973     { CpuInfo::Feature::X86_Amd, "amd"                            },
974     { CpuInfo::Feature::X86_Apic, "apic"                          },
975     { CpuInfo::Feature::X86_Avx, "avx"                            },
976     { CpuInfo::Feature::X86_Avx2, "avx2"                          },
977     { CpuInfo::Feature::X86_Avx512F, "avx512f"                    },
978     { CpuInfo::Feature::X86_Avx512PF, "avx512pf"                  },
979     { CpuInfo::Feature::X86_Avx512ER, "avx512er"                  },
980     { CpuInfo::Feature::X86_Avx512CD, "avx512cd"                  },
981     { CpuInfo::Feature::X86_Avx512BW, "avx512bw"                  },
982     { CpuInfo::Feature::X86_Avx512VL, "avx512vl"                  },
983     { CpuInfo::Feature::X86_Clfsh, "clfsh"                        },
984     { CpuInfo::Feature::X86_Cmov, "cmov"                          },
985     { CpuInfo::Feature::X86_Cx8, "cx8"                            },
986     { CpuInfo::Feature::X86_Cx16, "cx16"                          },
987     { CpuInfo::Feature::X86_F16C, "f16c"                          },
988     { CpuInfo::Feature::X86_Fma, "fma"                            },
989     { CpuInfo::Feature::X86_Fma4, "fma4"                          },
990     { CpuInfo::Feature::X86_Hle, "hle"                            },
991     { CpuInfo::Feature::X86_Htt, "htt"                            },
992     { CpuInfo::Feature::X86_Intel, "intel"                        },
993     { CpuInfo::Feature::X86_Lahf, "lahf"                          },
994     { CpuInfo::Feature::X86_MisalignSse, "misalignsse"            },
995     { CpuInfo::Feature::X86_Mmx, "mmx"                            },
996     { CpuInfo::Feature::X86_Msr, "msr"                            },
997     { CpuInfo::Feature::X86_NonstopTsc, "nonstop_tsc"             },
998     { CpuInfo::Feature::X86_Pcid, "pcid"                          },
999     { CpuInfo::Feature::X86_Pclmuldq, "pclmuldq"                  },
1000     { CpuInfo::Feature::X86_Pdcm, "pdcm"                          },
1001     { CpuInfo::Feature::X86_PDPE1GB, "pdpe1gb"                    },
1002     { CpuInfo::Feature::X86_Popcnt, "popcnt"                      },
1003     { CpuInfo::Feature::X86_Pse, "pse"                            },
1004     { CpuInfo::Feature::X86_Rdrnd, "rdrnd"                        },
1005     { CpuInfo::Feature::X86_Rdtscp, "rdtscp"                      },
1006     { CpuInfo::Feature::X86_Rtm, "rtm"                            },
1007     { CpuInfo::Feature::X86_Sha, "sha"                            },
1008     { CpuInfo::Feature::X86_Sse2, "sse2"                          },
1009     { CpuInfo::Feature::X86_Sse3, "sse3"                          },
1010     { CpuInfo::Feature::X86_Sse4A, "sse4a"                        },
1011     { CpuInfo::Feature::X86_Sse4_1, "sse4.1"                      },
1012     { CpuInfo::Feature::X86_Sse4_2, "sse4.2"                      },
1013     { CpuInfo::Feature::X86_Ssse3, "ssse3"                        },
1014     { CpuInfo::Feature::X86_Tdt, "tdt"                            },
1015     { CpuInfo::Feature::X86_X2Apic, "x2apic"                      },
1016     { CpuInfo::Feature::X86_Xop, "xop"                            },
1017     { CpuInfo::Feature::Arm_Neon, "neon"                          },
1018     { CpuInfo::Feature::Arm_NeonAsimd, "neon_asimd"               },
1019     { CpuInfo::Feature::Ibm_Qpx, "qpx"                            },
1020     { CpuInfo::Feature::Ibm_Vmx, "vmx"                            },
1021     { CpuInfo::Feature::Ibm_Vsx, "vsx"                            },
1022     { CpuInfo::Feature::Fujitsu_HpcAce, "hpc-ace"                 }
1023 };
1024
1025
1026 bool
1027 cpuIsX86Nehalem(const CpuInfo &cpuInfo)
1028 {
1029     return (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Intel &&
1030             cpuInfo.family() == 6 &&
1031             (cpuInfo.model() == 0x2E || cpuInfo.model() == 0x1A ||
1032              cpuInfo.model() == 0x1E || cpuInfo.model() == 0x2F ||
1033              cpuInfo.model() == 0x2C || cpuInfo.model() == 0x25) );
1034 }
1035
1036 }  // namespace gmx
1037
#ifdef GMX_CPUINFO_STANDALONE
/* Stand-alone entry point: runs the CPU detection once and prints the single
 * item selected by the first command-line flag.
 *
 * Returns 0 when a recognized flag was handled. Exits with status 1 when no
 * flag is given (after printing usage) and returns 1 when the flag is not
 * recognized, so that a calling script can tell a typo from an empty result.
 */
int
main(int argc, char **argv)
{
    if (argc < 2)
    {
        fprintf(stdout,
                "Usage:\n\n%s [flags]\n\n"
                "Available flags:\n"
                "-vendor        Print CPU vendor.\n"
                "-brand         Print CPU brand string.\n"
                "-family        Print CPU family version.\n"
                "-model         Print CPU model version.\n"
                "-stepping      Print CPU stepping version.\n"
                "-features      Print CPU feature flags.\n",
                argv[0]);
        exit(1);
    }

    // Only the first argument is examined; any further arguments are ignored
    std::string   arg(argv[1]);
    gmx::CpuInfo  cpuInfo(gmx::CpuInfo::detect());

    if (arg == "-vendor")
    {
        printf("%s\n", cpuInfo.vendorString().c_str());
    }
    else if (arg == "-brand")
    {
        printf("%s\n", cpuInfo.brandString().c_str());
    }
    else if (arg == "-family")
    {
        printf("%d\n", cpuInfo.family());
    }
    else if (arg == "-model")
    {
        printf("%d\n", cpuInfo.model());
    }
    else if (arg == "-stepping")
    {
        printf("%d\n", cpuInfo.stepping());
    }
    else if (arg == "-features")
    {
        // Separate the feature strings with spaces. Note that in the
        // GROMACS cmake code, surrounding whitespace is first
        // stripped by the CPU detection routine, and then added back
        // in the code for making the SIMD suggestion.
        for (const auto &f : cpuInfo.featureSet() )
        {
            printf("%s ", cpuInfo.featureString(f).c_str());
        }
        printf("\n");
    }
    else if (arg == "-topology")
    {
        // Undocumented debug option, usually not present in standalone version
        for (const auto &t : cpuInfo.logicalProcessors() )
        {
            printf("%3u %3u %3u\n", t.socketRankInMachine, t.coreRankInSocket, t.hwThreadRankInCore);
        }
    }
    else
    {
        // Previously an unrecognized flag produced no output and exit code 0,
        // which is indistinguishable from a successful but empty answer.
        // The message goes to stderr so that stdout stays clean for callers
        // that capture it.
        fprintf(stderr, "Unknown flag '%s'. Run without arguments to see the available flags.\n",
                arg.c_str());
        return 1;
    }
    return 0;
}
#endif