#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
-#ifdef __linux
+#if defined(HAVE_SCHED_H) && (defined(HAVE_SCHED_GETAFFINITY) || defined(HAVE_SCHED_SETAFFINITY))
#define _GNU_SOURCE
#include <sched.h>
#include <sys/syscall.h>
}
-static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
- int nthreads_tot,
- int ngpu)
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt,
+ int nthreads_tot,
+ int ngpu)
{
int nthreads_tmpi;
}
else if (hw_opt->nthreads_omp > 0)
{
- if (hw_opt->nthreads_omp > nthreads_tot)
- {
- gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
- }
- nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+ /* Here we could oversubscribe; if we do, we issue a warning later */
+ nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
}
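For illustration, a minimal standalone sketch (not part of the patch) of how the integer division above can oversubscribe; the max macro is defined locally here as a stand-in for the one in the GROMACS headers:

#include <stdio.h>

#define max(a, b) (((a) > (b)) ? (a) : (b))

int main(void)
{
    /* more OpenMP threads requested than total threads available */
    int nthreads_tot  = 4;
    int nthreads_omp  = 6;
    int nthreads_tmpi = max(1, nthreads_tot/nthreads_omp);

    /* 1 tMPI rank x 6 OpenMP threads on 4 hardware threads is
     * oversubscribed; mdrun now warns about this instead of aborting */
    printf("%d tMPI rank(s), %d threads in total\n",
           nthreads_tmpi, nthreads_tmpi*nthreads_omp);
    return 0;
}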
else
{
/* TODO choose nthreads_omp based on hardware topology
when we have a hardware topology detection library */
- /* Don't use OpenMP parallelization */
- nthreads_tmpi = nthreads_tot;
+ /* In general, when running up to 4 threads, OpenMP should be faster.
+ * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+ * On Intel >= Nehalem, running OpenMP on a single CPU is always faster,
+ * and even on two CPUs it is usually faster (though with many OpenMP
+ * threads it could be faster not to use HT; currently we always use HT).
+ * On Nehalem/Westmere we want to avoid running 16 threads over
+ * two CPUs with HT, so we need a limit < 16; thus we use 12.
+ * A reasonable limit for Intel Sandy Bridge and Ivy Bridge,
+ * not knowing the topology, is 16 threads.
+ */
+ const int nthreads_omp_always_faster = 4;
+ const int nthreads_omp_always_faster_Nehalem = 12;
+ const int nthreads_omp_always_faster_SandyBridge = 16;
+ const int first_model_Nehalem = 0x1A;
+ const int first_model_SandyBridge = 0x2A;
+ gmx_bool bIntel_Family6;
+
+ bIntel_Family6 =
+ (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+ gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+ if (nthreads_tot <= nthreads_omp_always_faster ||
+ (bIntel_Family6 &&
+ ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+ (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+ {
+ /* Use pure OpenMP parallelization */
+ nthreads_tmpi = 1;
+ }
+ else
+ {
+ /* Don't use OpenMP parallelization */
+ nthreads_tmpi = nthreads_tot;
+ }
}
return nthreads_tmpi;
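To make the branch above concrete, here is a self-contained sketch (not part of the patch) with the gmx_cpuid_family/gmx_cpuid_model calls replaced by hard-coded values for a hypothetical Nehalem-class machine (Intel family 6, model 0x1E, 12 hardware threads):

#include <stdio.h>

int main(void)
{
    const int nthreads_omp_always_faster         =  4;
    const int nthreads_omp_always_faster_Nehalem = 12;
    const int first_model_Nehalem                = 0x1A;

    /* hypothetical CPU: Intel family 6, model 0x1E (Nehalem) */
    int family = 6, model = 0x1E;
    int nthreads_tot = 12;
    int nthreads_tmpi;

    if (nthreads_tot <= nthreads_omp_always_faster ||
        (family == 6 &&
         model >= first_model_Nehalem &&
         nthreads_tot <= nthreads_omp_always_faster_Nehalem))
    {
        nthreads_tmpi = 1;            /* pure OpenMP parallelization */
    }
    else
    {
        nthreads_tmpi = nthreads_tot; /* pure thread-MPI parallelization */
    }

    /* prints: 1 tMPI rank(s) x 12 OpenMP thread(s) */
    printf("%d tMPI rank(s) x %d OpenMP thread(s)\n",
           nthreads_tmpi, nthreads_tot/nthreads_tmpi);
    return 0;
}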
const t_commrec *cr,
FILE *fplog)
{
- int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+ int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
int min_atoms_per_mpi_thread;
char *env;
char sbuf[STRLEN];
return hw_opt->nthreads_tmpi;
}
+ nthreads_hw = hwinfo->nthreads_hw_avail;
+
/* How many total (#tMPI*#OpenMP) threads can we start? */
if (hw_opt->nthreads_tot > 0)
{
}
else
{
- nthreads_tot_max = tMPI_Thread_get_hw_number();
+ nthreads_tot_max = nthreads_hw;
}
bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
}
nthreads_tmpi =
- get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+ get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
{
threads (too few atoms per thread) */
nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
- if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+ /* Avoid partial use of Hyper-Threading */
+ if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
{
- /* TODO replace this once we have proper HT detection
- * Use only multiples of 4 above 8 threads
- * or with an 8-core processor
- * (to avoid 6 threads on 8 core processors with 4 real cores).
- */
- nthreads_new = (nthreads_new/4)*4;
+ nthreads_new = nthreads_hw/2;
}
- else if (nthreads_new > 4)
+
+ /* Avoid large prime numbers in the thread count */
+ if (nthreads_new >= 6)
{
- /* Avoid 5 or 7 threads */
- nthreads_new = (nthreads_new/2)*2;
+ /* Use only 6, 8, or 10, with additional factors of 2 */
+ int fac;
+
+ fac = 2;
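+ /* Double fac while 6*fac still fits in nthreads_new; rounding down
+ * to a multiple of the final fac then yields 6, 8, or 10 times
+ * a power of 2, never a large prime. */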
+ while (3*fac*2 <= nthreads_new)
+ {
+ fac *= 2;
+ }
+
+ nthreads_new = (nthreads_new/fac)*fac;
+ }
+ else
+ {
+ /* Avoid 5 */
+ if (nthreads_new == 5)
+ {
+ nthreads_new = 4;
+ }
}
nthreads_tmpi = nthreads_new;
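A standalone sketch (not part of the patch) tabulating what the rounding above produces; every result is 6, 8, or 10 times a power of 2, or a small count below 6:

#include <stdio.h>

/* the same rounding as above, extracted for illustration */
static int round_nthreads(int n)
{
    if (n >= 6)
    {
        int fac = 2;

        while (3*fac*2 <= n)
        {
            fac *= 2;
        }
        return (n/fac)*fac;
    }
    return (n == 5) ? 4 : n;
}

int main(void)
{
    int n;

    /* e.g. 7 -> 6, 11 -> 10, 13 -> 12, 16 -> 16 */
    for (n = 4; n <= 16; n++)
    {
        printf("%2d -> %2d\n", n, round_nthreads(n));
    }
    return 0;
}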
gmx_mtop_remove_chargegroups(mtop);
}
+/* Check the process affinity mask and, if it is found to be non-default,
+ * honor it by disabling mdrun's internal affinity setting.
+ * This function should be called twice: first, before the OpenMP library
+ * gets initialized, with the last argument FALSE (this detects affinity
+ * set by external tools like taskset), and again after the OpenMP
+ * initialization, with the last argument TRUE (this detects affinity
+ * changes made by the OpenMP library).
+ *
+ * Note that this will only work on Linux, as we use a GNU feature. */
+static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
+ gmx_hw_opt_t *hw_opt, int ncpus,
+ gmx_bool bAfterOpenmpInit)
+{
+#ifdef HAVE_SCHED_GETAFFINITY
+ cpu_set_t mask_current;
+ int i, ret;
+ gmx_bool bAllSet;
+
+ assert(hw_opt);
+ if (!hw_opt->bThreadPinning)
+ {
+ /* internal affinity setting is off, don't bother checking process affinity */
+ return;
+ }
+
+ CPU_ZERO(&mask_current);
+ if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
+ {
+ /* failed to query affinity mask, will just return */
+ if (debug)
+ {
+ fprintf(debug, "Failed to query affinity mask (error %d)", ret);
+ }
+ return;
+ }
+
+ /* Before proceeding with the actual check, make sure that the number of
+ * detected CPUs is >= the CPUs in the current set.
+ * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
+#ifdef CPU_COUNT
+ if (ncpus < CPU_COUNT(&mask_current))
+ {
+ if (debug)
+ {
+ fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
+ ncpus, CPU_COUNT(&mask_current));
+ }
+ return;
+ }
+#endif /* CPU_COUNT */
+
+ bAllSet = TRUE;
+ for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
+ {
+ bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
+ }
+
+ if (!bAllSet)
+ {
+ if (!bAfterOpenmpInit)
+ {
+ md_print_warn(cr, fplog,
+ "Non-default process affinity set, disabling internal affinity");
+ }
+ else
+ {
+ md_print_warn(cr, fplog,
+ "Non-default process affinity set probably by the OpenMP library, "
+ "disabling internal affinity");
+ }
+ hw_opt->bThreadPinning = FALSE;
+
+ if (debug)
+ {
+ fprintf(debug, "Non-default affinity mask found\n");
+ }
+ }
+ else
+ {
+ if (debug)
+ {
+ fprintf(debug, "Default affinity mask found\n");
+ }
+ }
+#endif /* HAVE_SCHED_GETAFFINITY */
+}
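A minimal Linux-only sketch (not part of the patch) of the sched_getaffinity query performed above; it counts the CPUs in the mask with CPU_ISSET, the same loop the function falls back on where CPU_COUNT is unavailable:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    cpu_set_t mask;
    int       i, count = 0;

    CPU_ZERO(&mask);
    /* pid 0 queries the affinity mask of the calling thread */
    if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) != 0)
    {
        perror("sched_getaffinity");
        return 1;
    }
    for (i = 0; i < CPU_SETSIZE; i++)
    {
        if (CPU_ISSET(i, &mask))
        {
            count++;
        }
    }
    printf("process is allowed to run on %d logical CPU(s)\n", count);
    return 0;
}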
/* Set CPU affinity. Can be important for performance.
On some systems (e.g. Cray) CPU affinity is set by default.
#endif
#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
-#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
+#ifdef HAVE_SCHED_SETAFFINITY
if (hw_opt->bThreadPinning)
{
int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
}
}
-#endif /* __linux */
+#endif /* HAVE_SCHED_SETAFFINITY */
#endif /* GMX_OPENMP */
}
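For comparison, a Linux-only sketch (not part of the patch) of the per-thread pinning call used in set_cpu_affinity; syscall(SYS_gettid) returns the kernel thread id, so the mask applies only to the calling thread:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(0, &mask);  /* pin the calling thread to logical CPU 0 */

    if (sched_setaffinity((pid_t) syscall(SYS_gettid),
                          sizeof(cpu_set_t), &mask) != 0)
    {
        perror("sched_setaffinity");
        return 1;
    }
    printf("pinned to logical CPU 0\n");
    return 0;
}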
* before the other nodes have read the tpx file and called gmx_detect_hardware.
*/
typedef struct {
- int cutoff_scheme; /* The cutoff-scheme from inputrec_t */
+ int cutoff_scheme; /* The cutoff scheme from inputrec_t */
gmx_bool bUseGPU; /* Use GPU or GPU emulation */
} master_inf_t;
cr->npmenodes = 0;
}
+ /* Check for externally set OpenMP affinity and turn off internal
+ * pinning if any is found. We need to do this check early to tell
+ * thread-MPI whether it should do pinning when spawning threads.
+ */
+ gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+
#ifdef GMX_THREAD_MPI
/* With thread-MPI inputrec is only set here on the master thread */
if (SIMMASTER(cr))
{
check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
+#ifdef GMX_THREAD_MPI
+ /* Early check for externally set process affinity. We can't do this
+ * check over all MPI processes because hwinfo is not available
+ * everywhere, but with thread-MPI it is needed here: pinning might get
+ * turned off, which has to be known before thread-MPI is started. */
+ check_cpu_affinity_set(fplog,
+ NULL,
+ hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+#endif
+
#ifdef GMX_THREAD_MPI
if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
{
}
}
- /* Check for externally set OpenMP affinity and turn off internal
- * pinning if any is found. We need to do this check early to tell
- * thread-MPI whether it should do pinning when spawning threads.
- */
- gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
-
#ifdef GMX_THREAD_MPI
if (SIMMASTER(cr))
{
gmx_detect_hardware(fplog, hwinfo, cr,
bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
}
+
+ /* Now do the affinity check with real MPI or no MPI (with thread-MPI this was done earlier). */
+ check_cpu_affinity_set(fplog, cr,
+ hw_opt, hwinfo->nthreads_hw_avail, FALSE);
#endif
/* now make sure the state is initialized and propagated */
snew(pmedata,1);
}
+ /* Before setting affinity, check whether the affinity has changed
+ * (which would indicate that the OpenMP library has probably changed it
+ * since we first checked). */
+ check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
/* Set the CPU affinity */
set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);