updated several things related to the OpenMP thread count

author Berk Hess <hess@kth.se>

Tue, 9 Oct 2012 15:08:42 +0000 (17:08 +0200)

committer Berk Hess <hess@kth.se>

Mon, 5 Nov 2012 13:44:56 +0000 (14:44 +0100)
author Berk Hess <hess@kth.se>
Tue, 9 Oct 2012 15:08:42 +0000 (17:08 +0200)
committer Berk Hess <hess@kth.se>
Mon, 5 Nov 2012 13:44:56 +0000 (14:44 +0100)
diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c

index 316b485143b3617e862f328ac00a05c7dc01c117..dc49b996170b2b9750d842200bd85ac70b2a5be7 100644 (file)
--- a/src/kernel/mdrun.c
+++ b/src/kernel/mdrun.c
@@ -174,6 +174,13 @@ int main(int argc,char *argv[])
      "Note that using combined MPI+OpenMP parallelization is almost always",
      "slower than single parallelization, except at the scaling limit, where",
      "especially OpenMP parallelization of PME reduces the communication cost.",
+    "OpenMP-only parallelization is much faster than MPI-only parallelization",
+    "on a single CPU(-die). Since we currently don't have proper hardware",
+    "topology detection, [TT]mdrun[tt] compiled with thread-MPI will only",
+    "automatically use OpenMP-only parallelization when you use up to 4",
+    "threads, up to 12 threads with Intel Nehalem/Westmere, or up to 16",
+    "threads with Intel Sandy Bridge or newer CPUs. Otherwise MPI-only",
+    "parallelization is used (except with GPUs, see below).",
      "[PAR]",
      "To quickly test the performance of the new Verlet cut-off scheme",
      "with old [TT].tpr[tt] files, either on CPUs or CPUs+GPUs, you can use",
@@ -522,9 +529,9 @@ int main(int argc,char *argv[])
      { "-ntmpi",   FALSE, etINT, {&hw_opt.nthreads_tmpi},
        "Number of thread-MPI threads to start (0 is guess)" },
      { "-ntomp",   FALSE, etINT, {&hw_opt.nthreads_omp},
-      "Number of OpenMP threads to start (0 is guess)" },
+      "Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
      { "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
-      "Number of OpenMP threads to start (0 is -ntomp)" },
+      "Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
      { "-pin",     FALSE, etBOOL, {&hw_opt.bThreadPinning},
        "Pin OpenMP threads to cores" },
      { "-pinht",   FALSE, etBOOL, {&hw_opt.bPinHyperthreading},
diff --git a/src/kernel/runner.c b/src/kernel/runner.c

index 7599238dc5fbcaa285ec7a1f08b51cf952a8193c..b7549d9a442e62ac11ea82718a89a4ab8744b2ec 100644 (file)
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -275,9 +275,10 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
  }
  
  
-static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
-                                            int nthreads_tot,
-                                            int ngpu)
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t *hw_opt,
+                                        int nthreads_tot,
+                                        int ngpu)
  {
      int nthreads_tmpi;
  
@@ -296,18 +297,47 @@ static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
      }
      else if (hw_opt->nthreads_omp > 0)
      {
-        if (hw_opt->nthreads_omp > nthreads_tot)
-        {
-            gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
-        }
-        nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+        /* Here we could oversubscribe, when we do, we issue a warning later */
+        nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
      }
      else
      {
          /* TODO choose nthreads_omp based on hardware topology
             when we have a hardware topology detection library */
-        /* Don't use OpenMP parallelization */
-        nthreads_tmpi = nthreads_tot;
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT, currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy bridge,
+         * not knowing the topology, is 16 threads.
+         */
+        const int nthreads_omp_always_faster             =  4;
+        const int nthreads_omp_always_faster_Nehalem     = 12;
+        const int nthreads_omp_always_faster_SandyBridge = 16;
+        const int first_model_Nehalem     = 0x1A;
+        const int first_model_SandyBridge = 0x2A;
+        gmx_bool bIntel_Family6;
+
+        bIntel_Family6 =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            (bIntel_Family6 &&
+             ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+              (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+        {
+            /* Within the limit for this CPU generation (selected by CPUID model): use pure OpenMP parallelization */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
      }
  
      return nthreads_tmpi;
@@ -327,7 +357,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
                              const t_commrec *cr,
                              FILE *fplog)
  {
-    int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+    int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
      int min_atoms_per_mpi_thread;
      char *env;
      char sbuf[STRLEN];
@@ -339,6 +369,8 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
          return hw_opt->nthreads_tmpi;
      }
  
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
      /* How many total (#tMPI*#OpenMP) threads can we start? */ 
      if (hw_opt->nthreads_tot > 0)
      {
@@ -346,7 +378,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
      }
      else
      {
-        nthreads_tot_max = tMPI_Thread_get_hw_number();
+        nthreads_tot_max = nthreads_hw;
      }
  
      bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
@@ -360,7 +392,7 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
      }
  
      nthreads_tmpi =
-        get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+        get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
  
      if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
      {
@@ -398,19 +430,34 @@ static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
             threads (too few atoms per thread) */
          nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
  
-        if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
          {
-            /* TODO replace this once we have proper HT detection
-             * Use only multiples of 4 above 8 threads
-             * or with an 8-core processor
-             * (to avoid 6 threads on 8 core processors with 4 real cores).
-             */
-            nthreads_new = (nthreads_new/4)*4;
+            nthreads_new = nthreads_hw/2;
          }
-        else if (nthreads_new > 4)
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
          {
-            /* Avoid 5 or 7 threads */
-            nthreads_new = (nthreads_new/2)*2;
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
          }
  
          nthreads_tmpi = nthreads_new;
author	Berk Hess <hess@kth.se>
	Tue, 9 Oct 2012 15:08:42 +0000 (17:08 +0200)
committer	Berk Hess <hess@kth.se>
	Mon, 5 Nov 2012 13:44:56 +0000 (14:44 +0100)
src/kernel/mdrun.c		patch \| blob \| history
src/kernel/runner.c		patch \| blob \| history