Merge origin/release-4-6 into master
[alexxy/gromacs.git] / src / programs / mdrun / runner.c
index 937585fdd6866359c72be978125a1c2f8370d2b2..8e0096bb1bbed4c7697c2605ec7ced97d2d411a7 100644 (file)
@@ -36,7 +36,7 @@
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
-#ifdef __linux
+#if defined(HAVE_SCHED_H) && (defined(HAVE_SCHED_GETAFFINITY) || defined(HAVE_SCHED_SETAFFINITY))
 #define _GNU_SOURCE
 #include <sched.h>
 #include <sys/syscall.h>
@@ -278,9 +278,10 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
 }
 
 
-static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
-                                            int nthreads_tot,
-                                            int ngpu)
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t *hw_opt,
+                                        int nthreads_tot,
+                                        int ngpu)
 {
     int nthreads_tmpi;
 
@@ -299,18 +300,47 @@ static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
     }
     else if (hw_opt->nthreads_omp > 0)
     {
-        if (hw_opt->nthreads_omp > nthreads_tot)
-        {
-            gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
-        }
-        nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+        /* Here we could oversubscribe, when we do, we issue a warning later */
+        nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
     }
     else
     {
         /* TODO choose nthreads_omp based on hardware topology
            when we have a hardware topology detection library */
-        /* Don't use OpenMP parallelization */
-        nthreads_tmpi = nthreads_tot;
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT, currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy bridge,
+         * not knowing the topology, is 16 threads.
+         */
+        const int nthreads_omp_always_faster             =  4;
+        const int nthreads_omp_always_faster_Nehalem     = 12;
+        const int nthreads_omp_always_faster_SandyBridge = 16;
+        const int first_model_Nehalem     = 0x1A;
+        const int first_model_SandyBridge = 0x2A;
+        gmx_bool bIntel_Family6;
+
+        bIntel_Family6 =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            (bIntel_Family6 &&
+             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+        {
+            /* Use pure OpenMP parallelization */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
     }
 
     return nthreads_tmpi;
@@ -330,7 +360,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
                             const t_commrec *cr,
                             FILE *fplog)
 {
-    int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+    int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
     int min_atoms_per_mpi_thread;
     char *env;
     char sbuf[STRLEN];
@@ -342,6 +372,8 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
         return hw_opt->nthreads_tmpi;
     }
 
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
     /* How many total (#tMPI*#OpenMP) threads can we start? */ 
     if (hw_opt->nthreads_tot > 0)
     {
@@ -349,7 +381,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
     }
     else
     {
-        nthreads_tot_max = tMPI_Thread_get_hw_number();
+        nthreads_tot_max = nthreads_hw;
     }
 
     bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
@@ -363,7 +395,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
     }
 
     nthreads_tmpi =
-        get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+        get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
 
     if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
     {
@@ -401,19 +433,34 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
            threads (too few atoms per thread) */
         nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 
-        if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
         {
-            /* TODO replace this once we have proper HT detection
-             * Use only multiples of 4 above 8 threads
-             * or with an 8-core processor
-             * (to avoid 6 threads on 8 core processors with 4 real cores).
-             */
-            nthreads_new = (nthreads_new/4)*4;
+            nthreads_new = nthreads_hw/2;
         }
-        else if (nthreads_new > 4)
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
         {
-            /* Avoid 5 or 7 threads */
-            nthreads_new = (nthreads_new/2)*2;
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
         }
 
         nthreads_tmpi = nthreads_new;
@@ -726,6 +773,92 @@ static void convert_to_verlet_scheme(FILE *fplog,
     gmx_mtop_remove_chargegroups(mtop);
 }
 
+/* Check the process affinity mask and if it is found to be non-zero,
+ * will honor it and disable mdrun internal affinity setting.
+ * This function should be called first before the OpenMP library gets
+ * initialized with the last argument FALSE (which will detect affinity
+ * set by external tools like taskset), and later, after the OpenMP
+ * initialization, with the last argument TRUE to detect affinity changes
+ * made by the OpenMP library.
+ *
+ * Note that this will only work on Linux as we use a GNU feature. */
+static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
+                                   gmx_hw_opt_t *hw_opt, int ncpus,
+                                   gmx_bool bAfterOpenmpInit)
+{
+#ifdef HAVE_SCHED_GETAFFINITY
+    cpu_set_t mask_current;
+    int       i, ret, cpu_count, cpu_set;
+    gmx_bool  bAllSet;
+
+    assert(hw_opt);
+    if (!hw_opt->bThreadPinning)
+    {
+        /* internal affinity setting is off, don't bother checking process affinity */
+        return;
+    }
+
+    CPU_ZERO(&mask_current);
+    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
+    {
+        /* failed to query affinity mask, will just return */
+        if (debug)
+        {
+            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
+        }
+        return;
+    }
+
+    /* Before proceeding with the actual check, make sure that the number of
+     * detected CPUs is >= the CPUs in the current set.
+     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
+#ifdef CPU_COUNT
+    if (ncpus < CPU_COUNT(&mask_current))
+    {
+        if (debug)
+        {
+            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
+                    ncpus, CPU_COUNT(&mask_current));
+        }
+        return;
+    }
+#endif /* CPU_COUNT */
+
+    bAllSet = TRUE;
+    for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
+    {
+        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
+    }
+
+    if (!bAllSet)
+    {
+        if (!bAfterOpenmpInit)
+        {
+            md_print_warn(cr, fplog,
+                          "Non-default process affinity set, disabling internal affinity");
+        }
+        else
+        {
+            md_print_warn(cr, fplog,
+                          "Non-default process affinity set probably by the OpenMP library, "
+                          "disabling internal affinity");
+        }
+        hw_opt->bThreadPinning = FALSE;
+
+        if (debug)
+        {
+            fprintf(debug, "Non-default affinity mask found\n");
+        }
+    }
+    else
+    {
+        if (debug)
+        {
+            fprintf(debug, "Default affinity mask found\n");
+        }
+    }
+#endif /* HAVE_SCHED_GETAFFINITY */
+}
 
 /* Set CPU affinity. Can be important for performance.
    On some systems (e.g. Cray) CPU Affinity is set by default.
@@ -754,7 +887,7 @@ static void set_cpu_affinity(FILE *fplog,
 #endif
 
 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
-#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
+#ifdef HAVE_SCHED_SETAFFINITY
     if (hw_opt->bThreadPinning)
     {
         int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
@@ -870,7 +1003,7 @@ static void set_cpu_affinity(FILE *fplog,
             sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
         }
     }
-#endif /* __linux    */
+#endif /* HAVE_SCHED_SETAFFINITY */
 #endif /* GMX_OPENMP */
 }
 
@@ -1013,7 +1146,7 @@ static void override_nsteps_cmdline(FILE *fplog,
  * before the other nodes have read the tpx file and called gmx_detect_hardware.
  */
 typedef struct {
-    int      cutoff_scheme; /* The cutoff-scheme from inputrec_t */
+    int cutoff_scheme; /* The cutoff scheme from inputrec_t */
     gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
 } master_inf_t;
 
@@ -1128,6 +1261,12 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         cr->npmenodes = 0;
     }
 
+    /* Check for externally set OpenMP affinity and turn off internal
+     * pinning if any is found. We need to do this check early to tell
+     * thread-MPI whether it should do pinning when spawning threads.
+     */
+    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+
 #ifdef GMX_THREAD_MPI
     /* With thread-MPI inputrec is only set here on the master thread */
     if (SIMMASTER(cr))
@@ -1135,6 +1274,16 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     {
         check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
 
+#ifdef GMX_THREAD_MPI
+        /* Early check for externally set process affinity. Can't do over all
+         * MPI processes because hwinfo is not available everywhere, but with
+         * thread-MPI it's needed as pinning might get turned off which needs
+         * to be known before starting thread-MPI. */
+        check_cpu_affinity_set(fplog,
+                               NULL,
+                               hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+#endif
+
 #ifdef GMX_THREAD_MPI
         if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
         {
@@ -1149,12 +1298,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         }
     }
 
-    /* Check for externally set OpenMP affinity and turn off internal
-     * pinning if any is found. We need to do this check early to tell
-     * thread-MPI whether it should do pinning when spawning threads.
-     */
-    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
-
 #ifdef GMX_THREAD_MPI
     if (SIMMASTER(cr))
     {
@@ -1242,6 +1385,10 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         gmx_detect_hardware(fplog, hwinfo, cr,
                                  bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
     }
+
+    /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
+    check_cpu_affinity_set(fplog, cr,
+                           hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 #endif
 
     /* now make sure the state is initialized and propagated */
@@ -1601,6 +1748,11 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         snew(pmedata,1);
     }
 
+    /* Before setting affinity, check whether the affinity has changed
+     * - which indicates that probably the OpenMP library has changed it since
+     * we first checked). */
+    check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
     /* Set the CPU affinity */
     set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);