/* Return pointers to cpu topology information.
 *
 * Important: CPU topology requires more OS support than most other
 * functions in this file, including support for thread pinning to hardware.
 * This means it will not work on some platforms, including e.g. Mac OS X.
 * Thus, it is IMPERATIVE that you check the return value from this routine
 * before doing anything with the information. It is only if the return
 * value is zero that the data is valid.
 *
 * Return value:
 *   0   topology was detected; every output argument has been written.
 *  -1   no topology is available; none of the output arguments are
 *       written, so they keep whatever value they had before the call.
 *
 * For the returned values we have:
 * - nprocessors         Total number of logical processors reported by OS
 * - npackages           Usually number of CPU sockets
 * - ncores_per_package  Number of cores in each package
 * - nhwthreads_per_core Number of hardware threads per core; 2 for hyperthreading.
 * - package_id          Array with the package index for each logical cpu
 * - core_id             Array with local core index for each logical cpu
 * - hwthread_id         Array with local hwthread index for each logical cpu
 * - locality_order      Array with logical cpu numbers, sorted in order
 *                       of physical and logical locality in the system.
 *
 * All arrays are of length nprocessors. The array pointers handed back
 * refer to storage kept inside the cpuid structure itself (no copy is
 * made), which is why they are returned as pointers to const int.
 */
GMX_LIBGMX_EXPORT
int
gmx_cpuid_topology(gmx_cpuid_t cpuid,
                   int * nprocessors,
                   int * npackages,
                   int * ncores_per_package,
                   int * nhwthreads_per_core,
                   const int ** package_id,
                   const int ** core_id,
                   const int ** hwthread_id,
                   const int ** locality_order);
+ */ + int have_cpu_topology; + int nproc; /* total number of logical processors from OS */ + int npackages; + int ncores_per_package; + int nhwthreads_per_core; + int * package_id; + int * core_id; /* Local core id in each package */ + int * hwthread_id; /* Local hwthread id in each core */ + int * locality_order; /* Processor indices sorted in locality order */ }; @@ -390,17 +404,108 @@ cpuid_check_common_x86(gmx_cpuid_t cpuid) execute_x86cpuid(0x80000007,0,&eax,&ebx,&ecx,&edx); cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC] = (edx & (1 << 8)) != 0; } - return 0; } +/* This routine returns the number of unique different elements found in the array, + * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2} + * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the + * number of unique elements. + */ +static int +cpuid_renumber_elements(int *data, int n) +{ + int *unique; + int i,j,nunique,found; + + unique = malloc(sizeof(int)*n); + + nunique=0; + for(i=0;i0 && unique[j-1]>data[i];j--) + { + unique[j]=unique[j-1]; + } + unique[j]=data[i]; + } + } + /* renumber */ + for(i=0;ihwthread_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->core_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->package_id = malloc(sizeof(int)*cpuid->nproc); + cpuid->locality_order = malloc(sizeof(int)*cpuid->nproc); + + hwthread_mask = (1 << hwthread_bits) - 1; + core_mask_after_shift = (1 << core_bits) - 1; + + for(i=0;inproc;i++) + { + cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask; + cpuid->core_id[i] = (apic_id[i] >> hwthread_bits) & core_mask_after_shift; + cpuid->package_id[i] = apic_id[i] >> (core_bits + hwthread_bits); + } + + cpuid->npackages = cpuid_renumber_elements(cpuid->package_id,cpuid->nproc); + cpuid->ncores_per_package = cpuid_renumber_elements(cpuid->core_id,cpuid->nproc); + cpuid->nhwthreads_per_core = cpuid_renumber_elements(cpuid->hwthread_id,cpuid->nproc); + + /* Create a locality order array, i.e. 
first all resources in package0, which in turn + * are sorted so we first have all resources in core0, where threads are sorted in order, etc. + */ + for(i=0;inproc;i++) + { + idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i]; + cpuid->locality_order[idx]=i; + } +} + + /* Detection of AMD-specific CPU features */ static int cpuid_check_amd_x86(gmx_cpuid_t cpuid) { int max_stdfn,max_extfn; unsigned int eax,ebx,ecx,edx; - + int i; + int hwthread_bits,core_bits; + int * apic_id; + cpuid_check_common_x86(cpuid); execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx); @@ -418,7 +523,64 @@ cpuid_check_amd_x86(gmx_cpuid_t cpuid) cpuid->feature[GMX_CPUID_FEATURE_X86_XOP] = (ecx & (1 << 11)) != 0; cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4] = (ecx & (1 << 16)) != 0; } - + + /* Query APIC information on AMD */ + if(max_extfn>=0x80000008) + { +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__) + /* Linux */ + cpu_set_t cpuset,save_cpuset; + cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN); + apic_id = malloc(sizeof(int)*cpuid->nproc); + sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset); + /* Get APIC id from each core */ + CPU_ZERO(&cpuset); + for(i=0;inproc;i++) + { + CPU_SET(i,&cpuset); + sched_setaffinity(0,sizeof(cpu_set_t),&cpuset); + execute_x86cpuid(0x1,0,&eax,&ebx,&ecx,&edx); + apic_id[i]=ebx >> 24; + CPU_CLR(i,&cpuset); + } + /* Reset affinity to the value it had when calling this routine */ + sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset); +#define CPUID_HAVE_APIC +#elif defined GMX_NATIVE_WINDOWS + /* Windows */ + SYSTEM_INFO sysinfo; + unsigned int save_affinity,affinity; + GetSystemInfo( &sysinfo ); + cpuid->nproc = sysinfo.dwNumberOfProcessors; + apic_id = malloc(sizeof(int)*cpuid->nproc); + /* Get previous affinity mask */ + save_affinity = SetThreadAffinityMask(GetCurrentThread(),1); + for(i=0;inproc;i++) + { + 
SetThreadAffinityMask(GetCurrentThread(),(1<> 24; + } + SetThreadAffinityMask(GetCurrentThread(),save_affinity); +#define CPUID_HAVE_APIC +#endif +#ifdef CPUID_HAVE_APIC + /* AMD does not support SMT yet - there are no hwthread bits in apic ID */ + hwthread_bits = 0; + /* Get number of core bits in apic ID - try modern extended method first */ + execute_x86cpuid(0x80000008,0,&eax,&ebx,&ecx,&edx); + core_bits = (ecx >> 12) & 0xf; + if(core_bits==0) + { + /* Legacy method for old single/dual core AMD CPUs */ + i = ecx & 0xF; + for(core_bits=0;(i>>core_bits)>0;core_bits++) ; + } + cpuid_x86_decode_apic_id(cpuid,apic_id,core_bits,hwthread_bits); + cpuid->have_cpu_topology = 1; +#endif + } return 0; } @@ -430,6 +592,8 @@ cpuid_check_intel_x86(gmx_cpuid_t cpuid) unsigned int eax,ebx,ecx,edx; unsigned int i; unsigned int max_logical_cores,max_physical_cores; + int hwthread_bits,core_bits; + int * apic_id; cpuid_check_common_x86(cpuid); @@ -468,6 +632,57 @@ cpuid_check_intel_x86(gmx_cpuid_t cpuid) cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0; } } + + if(max_stdfn>=0xB) + { + /* Query x2 APIC information from cores */ +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__) + /* Linux */ + cpu_set_t cpuset,save_cpuset; + cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN); + apic_id = malloc(sizeof(int)*cpuid->nproc); + sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset); + /* Get x2APIC ID from each hardware thread */ + CPU_ZERO(&cpuset); + for(i=0;inproc;i++) + { + CPU_SET(i,&cpuset); + sched_setaffinity(0,sizeof(cpu_set_t),&cpuset); + execute_x86cpuid(0xB,0,&eax,&ebx,&ecx,&edx); + apic_id[i]=edx; + CPU_CLR(i,&cpuset); + } + /* Reset affinity to the value it had when calling this routine */ + sched_setaffinity(0,sizeof(cpu_set_t),&save_cpuset); +#define CPUID_HAVE_APIC +#elif defined GMX_NATIVE_WINDOWS + /* Windows */ + SYSTEM_INFO sysinfo; + unsigned int save_affinity,affinity; + GetSystemInfo( &sysinfo ); + cpuid->nproc = 
sysinfo.dwNumberOfProcessors; + apic_id = malloc(sizeof(int)*cpuid->nproc); + /* Get previous affinity mask */ + save_affinity = SetThreadAffinityMask(GetCurrentThread(),1); + for(i=0;inproc;i++) + { + SetThreadAffinityMask(GetCurrentThread(),(1<have_cpu_topology = 1; +#endif + } return 0; } #endif /* GMX_CPUID_X86 */ @@ -513,6 +728,59 @@ cpuid_check_vendor(void) +int +gmx_cpuid_topology(gmx_cpuid_t cpuid, + int * nprocessors, + int * npackages, + int * ncores_per_package, + int * nhwthreads_per_core, + const int ** package_id, + const int ** core_id, + const int ** hwthread_id, + const int ** locality_order) +{ + int rc; + + if(cpuid->have_cpu_topology) + { + *nprocessors = cpuid->nproc; + *npackages = cpuid->npackages; + *ncores_per_package = cpuid->ncores_per_package; + *nhwthreads_per_core = cpuid->nhwthreads_per_core; + *package_id = cpuid->package_id; + *core_id = cpuid->core_id; + *hwthread_id = cpuid->hwthread_id; + *locality_order = cpuid->locality_order; + rc = 0; + } + else + { + rc = -1; + } + return rc; +} + + +enum gmx_cpuid_x86_smt +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid) +{ + enum gmx_cpuid_x86_smt rc; + + if(cpuid->have_cpu_topology) + { + rc = (cpuid->nhwthreads_per_core>1) ? 
GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED; + } + else if(cpuid->vendor==GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)==0) + { + rc = GMX_CPUID_X86_SMT_DISABLED; + } + else + { + rc = GMX_CPUID_X86_SMT_CANNOTDETECT; + } + return rc; +} + int gmx_cpuid_init (gmx_cpuid_t * pcpuid) @@ -528,9 +796,18 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) { cpuid->feature[i]=0; } - + cpuid->have_cpu_topology = 0; + cpuid->nproc = 0; + cpuid->npackages = 0; + cpuid->ncores_per_package = 0; + cpuid->nhwthreads_per_core = 0; + cpuid->package_id = NULL; + cpuid->core_id = NULL; + cpuid->hwthread_id = NULL; + cpuid->locality_order = NULL; + cpuid->vendor = cpuid_check_vendor(); - + switch(cpuid->vendor) { #ifdef GMX_CPUID_X86 @@ -547,7 +824,7 @@ gmx_cpuid_init (gmx_cpuid_t * pcpuid) cpuid->family = 0; cpuid->model = 0; cpuid->stepping = 0; - + for(i=0;ifeature[i]=0; @@ -717,98 +994,6 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, } -enum gmx_cpuid_x86_smt -gmx_cpuid_x86_smt(gmx_cpuid_t cpuid) -{ -#ifdef GMX_CPUID_X86 -#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__) - int i; - int nproc; - cpu_set_t cpuset,save_cpuset; - int * apic_id; - unsigned int eax,ebx,ecx,edx; - int core_shift_bits; - int smt_found; - - if( gmx_cpuid_vendor(cpuid)!=GMX_CPUID_VENDOR_INTEL || - gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)==0) - { - return GMX_CPUID_X86_SMT_DISABLED; - } - - /* Check cpuid max standard function */ - execute_x86cpuid(0x0,0,&eax,&ebx,&ecx,&edx); - - /* Early CPUs that do not support function 11 do not support SMT either */ - if(eax<0xB) - { - return GMX_CPUID_X86_SMT_DISABLED; - } - - /* If we got here, it is a modern Intel CPU that supports detection, as does our OS */ - - /* How many processors? 
*/ - nproc = sysconf(_SC_NPROCESSORS_ONLN); - - apic_id = malloc(sizeof(int)*nproc); - - sched_getaffinity(0,sizeof(cpu_set_t),&save_cpuset); - - /* Get x2APIC ID from each hardware thread */ - CPU_ZERO(&cpuset); - for(i=0;i>core_shift_bits == apic_id[0] >> core_shift_bits); - } - - free(apic_id); - - if(smt_found==1) - { - return GMX_CPUID_X86_SMT_ENABLED; - } - else - { - return GMX_CPUID_X86_SMT_DISABLED; - } -#else - /* Do the trivial stuff first. If Hyper-Threading isn't even supported it - * cannot be enabled, no matter what OS detection we use! - */ - if(0==gmx_cpuid_feature(cpuid,GMX_CPUID_FEATURE_X86_HTT)) - { - return GMX_CPUID_X86_SMT_DISABLED; - } - else - { - return GMX_CPUID_X86_SMT_CANNOTDETECT; - } -#endif -#else - /* not x86 */ - return GMX_CPUID_X86_SMT_CANNOTDETECT; -#endif -} - - - #ifdef GMX_CPUID_STANDALONE /* Stand-alone program to enable queries of CPU features from Cmake. -- 2.22.0