#include "gromacs/utility/fatalerror.h"
+/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
+ * The real switching points will depend on the simulated system,
+ * the algorithms used and the hardware it's running on, as well as if there
+ * are other jobs running on the same machine. We try to take into account
+ * factors that have a large influence, such as recent Intel CPUs being
+ * much better at wide multi-threading. The remaining factors should
+ * (hopefully) have a small influence, such that the performance just before
+ * and after a switch point doesn't change too much.
+ */
+
+/* Compile-time constant: true when this build of GROMACS has OpenMP support.
+ * Using a bool instead of scattering #ifdef GMX_OPENMP keeps the run-time
+ * checks below readable.
+ */
+#ifdef GMX_OPENMP
+static const bool bOMP = true;
+#else
+static const bool bOMP = false;
+#endif
+
#ifdef GMX_THREAD_MPI
/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
* the number of threads will get lowered.
/* TODO choose nthreads_omp based on hardware topology
when we have a hardware topology detection library */
/* First we consider the case of no MPI (1 MPI rank).
- * In general, when running up to 4 threads, OpenMP should be faster.
+ * In general, when running up to 8 threads, OpenMP should be faster.
* Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
* On Intel>=Nehalem running OpenMP on a single CPU is always faster,
* even on two CPUs it's usually faster (but with many OpenMP threads
* Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
* model numbers we ensure also future Intel CPUs are covered.
*/
-const int nthreads_omp_always_faster_default = 8;
-const int nthreads_omp_always_faster_Nehalem = 12;
-const int nthreads_omp_always_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_default = 8;
+const int nthreads_omp_faster_Nehalem = 12;
+const int nthreads_omp_faster_Intel_AVX = 16;
/* For CPU only runs the fastest options are usually MPI or OpenMP only.
* With one GPU, using MPI only is almost never optimal, so we need to
* compare running pure OpenMP with combined MPI+OpenMP. This means higher
* OpenMP threads counts can still be ok. Multiplying the numbers above
* by a factor of 2 seems to be a good estimate.
*/
-const int nthreads_omp_always_faster_gpu_fac = 2;
+const int nthreads_omp_faster_gpu_fac = 2;
/* This is the case with MPI (2 or more MPI PP ranks).
* By default we will terminate with a fatal error when more than 8
const int nthreads_omp_mpi_target_max = 6;
-static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
+/* Returns the maximum OpenMP thread count for which using a single MPI rank
+ * should be faster than using multiple ranks with the same total thread count.
+ * The thresholds are the heuristic per-CPU-generation constants defined above.
+ */
+static int nthreads_omp_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
{
int nth;
if (gmx_cpuid_vendor(cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX))
{
- nth = nthreads_omp_always_faster_Intel_AVX;
+ nth = nthreads_omp_faster_Intel_AVX;
}
else if (gmx_cpuid_is_intel_nehalem(cpuid_info))
{
- nth = nthreads_omp_always_faster_Nehalem;
+ nth = nthreads_omp_faster_Nehalem;
}
else
{
- nth = nthreads_omp_always_faster_default;
+ nth = nthreads_omp_faster_default;
}
if (bUseGPU)
{
+ /* With a GPU, MPI-only is almost never optimal (see comment on
+ * nthreads_omp_faster_gpu_fac above), so higher OpenMP thread
+ * counts can still be efficient.
+ */
- nth *= nthreads_omp_always_faster_gpu_fac;
+ nth *= nthreads_omp_faster_gpu_fac;
}
+ /* Never return more threads than the OpenMP module supports per rank */
nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
return nth;
}
+/* Returns the maximum OpenMP thread count that passes the efficiency check.
+ * With multiple MPI ranks (only possible with both OpenMP and MPI compiled
+ * in) the per-rank limit nthreads_omp_mpi_ok_max applies; with a single
+ * rank the limit is the point where multiple ranks would become faster.
+ */
+static int nthreads_omp_efficient_max(int gmx_unused nrank,
+ gmx_cpuid_t cpuid_info,
+ gmx_bool bUseGPU)
+{
+#if defined GMX_OPENMP && defined GMX_MPI
+ if (nrank > 1)
+ {
+ return nthreads_omp_mpi_ok_max;
+ }
+ else
+#endif
+ {
+ return nthreads_omp_faster(cpuid_info, bUseGPU);
+ }
+}
+
+/* Returns the number of thread-MPI ranks to use.
+ * This is chosen such that we can always obey our own efficiency checks.
+ */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt,
int nthreads_tot,
nrank = nthreads_tot;
}
else if (gmx_gpu_sharing_supported() &&
- (nthreads_tot > nthreads_omp_always_faster(hwinfo->cpuid_info,
- ngpu > 0) ||
+ (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
+ ngpu > 0) ||
(ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
{
/* The high OpenMP thread count will likely result in sub-optimal
}
else
{
- if (nthreads_tot <= nthreads_omp_always_faster(hwinfo->cpuid_info,
- ngpu > 0))
+ if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
{
/* Use pure OpenMP parallelization */
nrank = 1;
}
+/* Returns the number of GPUs that can be used in this run: 0 unless the
+ * Verlet cutoff scheme is in use and compatible GPUs were detected. When
+ * the build supports only one GPU per node, the count is capped at 1 and
+ * a warning is printed if more compatible GPUs are available.
+ */
+static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
+{
+ /* This code relies on the fact that GPU are not detected when GPU
+ * acceleration was disabled at run time by the user.
+ */
+ if (cutoff_scheme == ecutsVERLET &&
+ hwinfo->gpu_info.n_dev_compatible > 0)
+ {
+ if (gmx_multiple_gpu_per_node_supported())
+ {
+ return hwinfo->gpu_info.n_dev_compatible;
+ }
+ else
+ {
+ if (hwinfo->gpu_info.n_dev_compatible > 1)
+ {
+ md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
+ }
+ return 1;
+ }
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+
#ifdef GMX_THREAD_MPI
/* Get the number of MPI ranks to use for thread-MPI based on how many
* were requested, which algorithms we're using,
* with the hardware, except that ntmpi could be larger than #GPU.
*/
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
- const gmx_hw_opt_t *hw_opt,
+ gmx_hw_opt_t *hw_opt,
const t_inputrec *inputrec,
const gmx_mtop_t *mtop,
const t_commrec *cr,
{
int nthreads_hw, nthreads_tot_max, nrank, ngpu;
int min_atoms_per_mpi_rank;
- gmx_bool bCanUseGPU;
/* Check if an algorithm does not support parallel simulation. */
if (inputrec->eI == eiLBFGS ||
nthreads_tot_max = nthreads_hw;
}
- bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
- hwinfo->gpu_info.n_dev_compatible > 0);
- if (bCanUseGPU)
- {
- if (gmx_multiple_gpu_per_node_supported())
- {
- ngpu = hwinfo->gpu_info.n_dev_compatible;
- }
- else
- {
- if (hwinfo->gpu_info.n_dev_compatible > 1)
- {
- md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
- }
- ngpu = 1;
- }
- }
- else
- {
- ngpu = 0;
- }
+ ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);
if (inputrec->cutoff_scheme == ecutsGROUP)
{
}
else
{
- if (bCanUseGPU)
+ if (ngpu >= 1)
{
min_atoms_per_mpi_rank = min_atoms_per_gpu;
}
nrank = nrank_new;
+ /* We reduced the number of tMPI ranks, which means we might violate
+ * our own efficiency checks if we simply use all hardware threads.
+ */
+ if (bOMP && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
+ {
+ /* The user set neither the total nor the OpenMP thread count,
+ * we should use all hardware threads, unless we will violate
+ * our own efficiency limitation on the thread count.
+ */
+ int nt_omp_max;
+
+ nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);
+
+ if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
+ {
+ /* Limit the number of OpenMP threads to start */
+ hw_opt->nthreads_omp = nt_omp_max;
+ }
+ }
+
fprintf(stderr, "\n");
fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
fprintf(stderr, " only starting %d thread-MPI ranks.\n", nrank);
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
const gmx_hw_opt_t *hw_opt,
- gmx_bool bNTOptSet,
+ gmx_bool bNtOmpOptionSet,
t_commrec *cr,
FILE *fplog)
{
* OpenMP have been initialized. Check that here.
*/
#ifdef GMX_THREAD_MPI
- assert(nthreads_omp_always_faster_default >= nthreads_omp_mpi_ok_max);
+ assert(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max);
assert(hw_opt->nthreads_tmpi >= 1);
#endif
assert(gmx_omp_nthreads_get(emntDefault) >= 1);
nthreads_omp_mpi_ok_min,
nthreads_omp_mpi_target_max);
- if (bNTOptSet)
+ if (bNtOmpOptionSet)
{
md_print_warn(cr, fplog, "NOTE: %s\n", buf);
}
{
/* No domain decomposition (or only one domain) */
if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
- nth_omp_max > nthreads_omp_always_faster(hwinfo->cpuid_info, ngpu > 0))
+ nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
{
/* To arrive here, the user/system set #ranks and/or #OMPthreads */
gmx_bool bEnvSet;
bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
- if (bNTOptSet || bEnvSet)
+ if (bNtOmpOptionSet || bEnvSet)
{
sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
}
* with different values per rank or node, since in that case
* the user can not set -ntomp to override the error.
*/
- if (bNTOptSet || (bEnvSet && nth_omp_min != nth_omp_max))
+ if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
{
md_print_warn(cr, fplog, "NOTE: %s\n", buf);
}
#else /* GMX_OPENMP && GMX_MPI */
/* No OpenMP and/or MPI: it doesn't make much sense to check */
GMX_UNUSED_VALUE(hw_opt);
- GMX_UNUSED_VALUE(bNTOptSet);
+ GMX_UNUSED_VALUE(bNtOmpOptionSet);
/* Check if we have more than 1 physical core, if detected,
* or more than 1 hardware thread if physical cores were not detected.
*/
}
#endif
-#ifndef GMX_OPENMP
- if (hw_opt->nthreads_omp > 1)
+ if (!bOMP)
{
- gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+ }
+ hw_opt->nthreads_omp = 1;
}
- hw_opt->nthreads_omp = 1;
-#endif
if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
{
}
}
-#ifndef GMX_OPENMP
- if (hw_opt->nthreads_omp > 1)
+ if (!bOMP && hw_opt->nthreads_omp > 1)
{
gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
}
-#endif
if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
{