Obey OpenMP thread count limit with tMPI

author Berk Hess <hess@kth.se>

Thu, 2 Jul 2015 08:09:43 +0000 (10:09 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Sat, 4 Jul 2015 09:06:42 +0000 (11:06 +0200)
author Berk Hess <hess@kth.se>
Thu, 2 Jul 2015 08:09:43 +0000 (10:09 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Sat, 4 Jul 2015 09:06:42 +0000 (11:06 +0200)
diff --git a/src/programs/mdrun/resource-division.cpp b/src/programs/mdrun/resource-division.cpp

index f352cc1a839b524fb4a1ab190dcdcb6197e858c3..9042aee8a9c860ac8a4399d3abf55c3357de6c8e 100644 (file)
--- a/src/programs/mdrun/resource-division.cpp
+++ b/src/programs/mdrun/resource-division.cpp
@@ -53,6 +53,22 @@
  #include "gromacs/utility/fatalerror.h"
  
  
+/* DISCLAIMER: All the atom count and thread numbers below are heuristic.
+ * The real switching points will depend on the system simulation,
+ * the algorithms used and the hardware it's running on, as well as if there
+ * are other jobs running on the same machine. We try to take into account
+ * factors that have a large influence, such as recent Intel CPUs being
+ * much better at wide multi-threading. The remaining factors should
+ * (hopefully) have a small influence, such that the performance just before
+ * and after a switch point doesn't change too much.
+ */
+
+#ifdef GMX_OPENMP
+static const bool bOMP = true;
+#else
+static const bool bOMP = false;
+#endif
+
  #ifdef GMX_THREAD_MPI
  /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
   * the number of threads will get lowered.
@@ -64,7 +80,7 @@ static const int min_atoms_per_gpu        = 900;
  /* TODO choose nthreads_omp based on hardware topology
     when we have a hardware topology detection library */
  /* First we consider the case of no MPI (1 MPI rank).
- * In general, when running up to 4 threads, OpenMP should be faster.
+ * In general, when running up to 8 threads, OpenMP should be faster.
   * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
   * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
   * even on two CPUs it's usually faster (but with many OpenMP threads
@@ -77,16 +93,16 @@ static const int min_atoms_per_gpu        = 900;
   * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
   * model numbers we ensure also future Intel CPUs are covered.
   */
-const int nthreads_omp_always_faster_default   =  8;
-const int nthreads_omp_always_faster_Nehalem   = 12;
-const int nthreads_omp_always_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_default   =  8;
+const int nthreads_omp_faster_Nehalem   = 12;
+const int nthreads_omp_faster_Intel_AVX = 16;
  /* For CPU only runs the fastest options are usually MPI or OpenMP only.
   * With one GPU, using MPI only is almost never optimal, so we need to
   * compare running pure OpenMP with combined MPI+OpenMP. This means higher
   * OpenMP threads counts can still be ok. Multiplying the numbers above
   * by a factor of 2 seems to be a good estimate.
   */
-const int nthreads_omp_always_faster_gpu_fac   =  2;
+const int nthreads_omp_faster_gpu_fac   =  2;
  
  /* This is the case with MPI (2 or more MPI PP ranks).
   * By default we will terminate with a fatal error when more than 8
@@ -104,27 +120,30 @@ const int nthreads_omp_mpi_ok_min_gpu          =  2;
  const int nthreads_omp_mpi_target_max          =  6;
  
  
-static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
+/* Returns the maximum OpenMP thread count for which using a single MPI rank
+ * should be faster than using multiple ranks with the same total thread count.
+ */
+static int nthreads_omp_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
  {
      int nth;
  
      if (gmx_cpuid_vendor(cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
          gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX))
      {
-        nth = nthreads_omp_always_faster_Intel_AVX;
+        nth = nthreads_omp_faster_Intel_AVX;
      }
      else if (gmx_cpuid_is_intel_nehalem(cpuid_info))
      {
-        nth = nthreads_omp_always_faster_Nehalem;
+        nth = nthreads_omp_faster_Nehalem;
      }
      else
      {
-        nth = nthreads_omp_always_faster_default;
+        nth = nthreads_omp_faster_default;
      }
  
      if (bUseGPU)
      {
-        nth *= nthreads_omp_always_faster_gpu_fac;
+        nth *= nthreads_omp_faster_gpu_fac;
      }
  
      nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
@@ -132,6 +151,26 @@ static int nthreads_omp_always_faster(gmx_cpuid_t cpuid_info, gmx_bool bUseGPU)
      return nth;
  }
  
+/* Returns that maximum OpenMP thread count that passes the efficiency check */
+static int nthreads_omp_efficient_max(int gmx_unused nrank,
+                                      gmx_cpuid_t    cpuid_info,
+                                      gmx_bool       bUseGPU)
+{
+#if defined GMX_OPENMP && defined GMX_MPI
+    if (nrank > 1)
+    {
+        return nthreads_omp_mpi_ok_max;
+    }
+    else
+#endif
+    {
+        return nthreads_omp_faster(cpuid_info, bUseGPU);
+    }
+}
+
+/* Return the number of thread-MPI ranks to use.
+ * This is chosen such that we can always obey our own efficiency checks.
+ */
  static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                          const gmx_hw_opt_t  *hw_opt,
                                          int                  nthreads_tot,
@@ -155,8 +194,8 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
              nrank = nthreads_tot;
          }
          else if (gmx_gpu_sharing_supported() &&
-                 (nthreads_tot > nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                            ngpu > 0) ||
+                 (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
+                                                     ngpu > 0) ||
                    (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
          {
              /* The high OpenMP thread count will likely result in sub-optimal
@@ -186,8 +225,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
      }
      else
      {
-        if (nthreads_tot <= nthreads_omp_always_faster(hwinfo->cpuid_info,
-                                                       ngpu > 0))
+        if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
          {
              /* Use pure OpenMP parallelization */
              nrank = 1;
@@ -203,6 +241,34 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
  }
  
  
+static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
+{
+    /* This code relies on the fact that GPU are not detected when GPU
+     * acceleration was disabled at run time by the user.
+     */
+    if (cutoff_scheme == ecutsVERLET &&
+        hwinfo->gpu_info.n_dev_compatible > 0)
+    {
+        if (gmx_multiple_gpu_per_node_supported())
+        {
+            return hwinfo->gpu_info.n_dev_compatible;
+        }
+        else
+        {
+            if (hwinfo->gpu_info.n_dev_compatible > 1)
+            {
+                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
+            }
+            return 1;
+        }
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+
  #ifdef GMX_THREAD_MPI
  /* Get the number of MPI ranks to use for thread-MPI based on how many
   * were requested, which algorithms we're using,
@@ -212,7 +278,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
   * with the hardware, except that ntmpi could be larger than #GPU.
   */
  int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                       const t_inputrec    *inputrec,
                       const gmx_mtop_t    *mtop,
                       const t_commrec     *cr,
@@ -220,7 +286,6 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  {
      int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
      int      min_atoms_per_mpi_rank;
-    gmx_bool bCanUseGPU;
  
      /* Check if an algorithm does not support parallel simulation.  */
      if (inputrec->eI == eiLBFGS ||
@@ -259,27 +324,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
          nthreads_tot_max = nthreads_hw;
      }
  
-    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
-                  hwinfo->gpu_info.n_dev_compatible > 0);
-    if (bCanUseGPU)
-    {
-        if (gmx_multiple_gpu_per_node_supported())
-        {
-            ngpu = hwinfo->gpu_info.n_dev_compatible;
-        }
-        else
-        {
-            if (hwinfo->gpu_info.n_dev_compatible > 1)
-            {
-                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
-            }
-            ngpu = 1;
-        }
-    }
-    else
-    {
-        ngpu = 0;
-    }
+    ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);
  
      if (inputrec->cutoff_scheme == ecutsGROUP)
      {
@@ -301,7 +346,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
      }
      else
      {
-        if (bCanUseGPU)
+        if (ngpu >= 1)
          {
              min_atoms_per_mpi_rank = min_atoms_per_gpu;
          }
@@ -366,6 +411,26 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  
          nrank = nrank_new;
  
+        /* We reduced the number of tMPI ranks, which means we might violate
+         * our own efficiency checks if we simply use all hardware threads.
+         */
+        if (bOMP && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
+        {
+            /* The user set neither the total nor the OpenMP thread count,
+             * we should use all hardware threads, unless we will violate
+             * our own efficiency limitation on the thread count.
+             */
+            int  nt_omp_max;
+
+            nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);
+
+            if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
+            {
+                /* Limit the number of OpenMP threads to start */
+                hw_opt->nthreads_omp = nt_omp_max;
+            }
+        }
+
          fprintf(stderr, "\n");
          fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
          fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
@@ -379,7 +444,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  
  void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                          const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                          t_commrec           *cr,
                                          FILE                *fplog)
  {
@@ -396,7 +461,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
       * OpenMP have been initialized. Check that here.
       */
  #ifdef GMX_THREAD_MPI
-    assert(nthreads_omp_always_faster_default >= nthreads_omp_mpi_ok_max);
+    assert(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max);
      assert(hw_opt->nthreads_tmpi >= 1);
  #endif
      assert(gmx_omp_nthreads_get(emntDefault) >= 1);
@@ -448,7 +513,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                      nthreads_omp_mpi_ok_min,
                      nthreads_omp_mpi_target_max);
  
-            if (bNTOptSet)
+            if (bNtOmpOptionSet)
              {
                  md_print_warn(cr, fplog, "NOTE: %s\n", buf);
              }
@@ -466,7 +531,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
      {
          /* No domain decomposition (or only one domain) */
          if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
-            nth_omp_max > nthreads_omp_always_faster(hwinfo->cpuid_info, ngpu > 0))
+            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
          {
              /* To arrive here, the user/system set #ranks and/or #OMPthreads */
              gmx_bool bEnvSet;
@@ -474,7 +539,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
  
              bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
  
-            if (bNTOptSet || bEnvSet)
+            if (bNtOmpOptionSet || bEnvSet)
              {
                  sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
              }
@@ -494,7 +559,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
               * with different values per rank or node, since in that case
               * the user can not set -ntomp to override the error.
               */
-            if (bNTOptSet || (bEnvSet && nth_omp_min != nth_omp_max))
+            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
              {
                  md_print_warn(cr, fplog, "NOTE: %s\n", buf);
              }
@@ -507,7 +572,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
  #else /* GMX_OPENMP && GMX_MPI */
        /* No OpenMP and/or MPI: it doesn't make much sense to check */
      GMX_UNUSED_VALUE(hw_opt);
-    GMX_UNUSED_VALUE(bNTOptSet);
+    GMX_UNUSED_VALUE(bNtOmpOptionSet);
      /* Check if we have more than 1 physical core, if detected,
       * or more than 1 hardware thread if physical cores were not detected.
       */
@@ -547,13 +612,14 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
      }
  #endif
  
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP)
      {
-        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+        }
+        hw_opt->nthreads_omp = 1;
      }
-    hw_opt->nthreads_omp = 1;
-#endif
  
      if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
      {
@@ -589,12 +655,10 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
          }
      }
  
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
+    if (!bOMP && hw_opt->nthreads_omp > 1)
      {
          gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
      }
-#endif
  
      if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
      {
diff --git a/src/programs/mdrun/resource-division.h b/src/programs/mdrun/resource-division.h

index ca982377414f2c575358fdf0e5018513f01d9173..42f4f05182472753a829148947068115549e7129 100644 (file)
--- a/src/programs/mdrun/resource-division.h
+++ b/src/programs/mdrun/resource-division.h
@@ -45,9 +45,10 @@
   * At the point we have already called check_and_update_hw_opt.
   * Thus all options should be internally consistent and consistent
   * with the hardware, except that ntmpi could be larger than #GPU.
+ * If necessary, this function will modify hw_opt->nthreads_omp.
   */
  int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     const gmx_hw_opt_t  *hw_opt,
+                     gmx_hw_opt_t        *hw_opt,
                       const t_inputrec    *inputrec,
                       const gmx_mtop_t    *mtop,
                       const t_commrec     *cr,
@@ -58,12 +59,12 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
   * intended to catch cases where the user starts 1 MPI rank per hardware
   * thread or 1 rank per physical node.
   * With a sub-optimal setup a note is printed to fplog and stderr when
- * bNtOptSet==TRUE; with bNtOptSet==FALSE a fatal error is issued.
+ * bNtOmpSet==TRUE; with bNtOptOptionSet==FALSE a fatal error is issued.
   * This function should be called after thread-MPI and OpenMP are set up.
   */
  void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                          const gmx_hw_opt_t  *hw_opt,
-                                        gmx_bool             bNTOptSet,
+                                        gmx_bool             bNtOmpOptionSet,
                                          t_commrec           *cr,
                                          FILE                *fplog);
  
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp

index 8a294358090c47285208b15a10224b5c39e55f24..c878c17316b477b060cb42ab747ae4d2daa6ddd8 100644 (file)
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -780,10 +780,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                                                   hw_opt,
                                                   inputrec, mtop,
                                                   cr, fplog);
-        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
-        {
-            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
-        }
  
          if (hw_opt->nthreads_tmpi > 1)
          {
author	Berk Hess <hess@kth.se>
	Thu, 2 Jul 2015 08:09:43 +0000 (10:09 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Sat, 4 Jul 2015 09:06:42 +0000 (11:06 +0200)
src/programs/mdrun/resource-division.cpp		patch \| blob \| history
src/programs/mdrun/resource-division.h		patch \| blob \| history
src/programs/mdrun/runner.cpp		patch \| blob \| history