"Note that using combined MPI+OpenMP parallelization is almost always",
"slower than single parallelization, except at the scaling limit, where",
"especially OpenMP parallelization of PME reduces the communication cost.",
+ "OpenMP-only parallelization is much faster than MPI-only parallelization",
+ "on a single CPU(-die). Since we currently don't have proper hardware",
+ "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
+ "automatically use OpenMP-only parallelization when you use up to 4",
+ "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
+ "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
+ "parallelization is used (except with GPUs, see below).",
"[PAR]",
"To quickly test the performance of the new Verlet cut-off scheme",
"with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
{ "-ntmpi", FALSE, etINT, {&hw_opt.nthreads_tmpi},
"Number of thread-MPI threads to start (0 is guess)" },
{ "-ntomp", FALSE, etINT, {&hw_opt.nthreads_omp},
- "Number of OpenMP threads to start (0 is guess)" },
+ "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
{ "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
- "Number of OpenMP threads to start (0 is -ntomp)" },
+ "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
{ "-pin", FALSE, etBOOL, {&hw_opt.bThreadPinning},
"Pin OpenMP threads to cores" },
{ "-pinht", FALSE, etBOOL, {&hw_opt.bPinHyperthreading},
}
-static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
- int nthreads_tot,
- int ngpu)
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt,
+ int nthreads_tot,
+ int ngpu)
{
int nthreads_tmpi;
}
else if (hw_opt->nthreads_omp > 0)
{
- if (hw_opt->nthreads_omp > nthreads_tot)
- {
- gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
- }
- nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+ /* Here we could oversubscribe; if we do, we issue a warning later */
+ nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
}
else
{
/* TODO choose nthreads_omp based on hardware topology
when we have a hardware topology detection library */
- /* Don't use OpenMP parallelization */
- nthreads_tmpi = nthreads_tot;
+ /* In general, when running up to 4 threads, OpenMP should be faster.
+ * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+ * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+ * even on two CPUs it's usually faster (but with many OpenMP threads
+ * it could be faster not to use HT, currently we always use HT).
+ * On Nehalem/Westmere we want to avoid running 16 threads over
+ * two CPUs with HT, so we need a limit<16; thus we use 12.
+ * A reasonable limit for Intel Sandy Bridge and Ivy Bridge,
+ * not knowing the topology, is 16 threads.
+ */
+ const int nthreads_omp_always_faster = 4;
+ const int nthreads_omp_always_faster_Nehalem = 12;
+ const int nthreads_omp_always_faster_SandyBridge = 16;
+ const int first_model_Nehalem = 0x1A;
+ const int first_model_SandyBridge = 0x2A;
+ gmx_bool bIntel_Family6;
+
+ bIntel_Family6 =
+ (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+ gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+ if (nthreads_tot <= nthreads_omp_always_faster ||
+ (bIntel_Family6 &&
+ ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+ (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+ {
+ /* Use pure OpenMP parallelization */
+ nthreads_tmpi = 1;
+ }
+ else
+ {
+ /* Don't use OpenMP parallelization */
+ nthreads_tmpi = nthreads_tot;
+ }
}
return nthreads_tmpi;
const t_commrec *cr,
FILE *fplog)
{
- int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+ int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
int min_atoms_per_mpi_thread;
char *env;
char sbuf[STRLEN];
return hw_opt->nthreads_tmpi;
}
+ nthreads_hw = hwinfo->nthreads_hw_avail;
+
/* How many total (#tMPI*#OpenMP) threads can we start? */
if (hw_opt->nthreads_tot > 0)
{
}
else
{
- nthreads_tot_max = tMPI_Thread_get_hw_number();
+ nthreads_tot_max = nthreads_hw;
}
bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
}
nthreads_tmpi =
- get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+ get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
{
threads (too few atoms per thread) */
nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
- if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+ /* Avoid partial use of Hyper-Threading */
+ if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
{
- /* TODO replace this once we have proper HT detection
- * Use only multiples of 4 above 8 threads
- * or with an 8-core processor
- * (to avoid 6 threads on 8 core processors with 4 real cores).
- */
- nthreads_new = (nthreads_new/4)*4;
+ nthreads_new = nthreads_hw/2;
}
- else if (nthreads_new > 4)
+
+ /* Avoid large prime numbers in the thread count */
+ if (nthreads_new >= 6)
{
- /* Avoid 5 or 7 threads */
- nthreads_new = (nthreads_new/2)*2;
+ /* Use only 6,8,10 with additional factors of 2 */
+ int fac;
+
+ fac = 2;
+ while (3*fac*2 <= nthreads_new)
+ {
+ fac *= 2;
+ }
+
+ nthreads_new = (nthreads_new/fac)*fac;
+ }
+ else
+ {
+ /* Avoid 5 */
+ if (nthreads_new == 5)
+ {
+ nthreads_new = 4;
+ }
}
nthreads_tmpi = nthreads_new;