Merge branch release-2016 into release-2018

[alexxy/gromacs.git] / src / gromacs / taskassignment / resourcedivision.cpp
diff --git a/src/programs/mdrun/resource-division.cpp b/src/gromacs/taskassignment/resourcedivision.cpp

similarity index 70%

rename from src/programs/mdrun/resource-division.cpp

rename to src/gromacs/taskassignment/resourcedivision.cpp

index 89fe730f805616d81190d486fb4082de073c0cc4..015c8b68575771e7f078ccbcaa897aef394b21c3 100644 (file)
--- a/src/programs/mdrun/resource-division.cpp
+++ b/src/gromacs/taskassignment/resourcedivision.cpp
@@ -32,10 +32,17 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \internal \file
+ * \brief Defines utility functionality for dividing resources and
+ * checking for consistency and usefulness.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_taskassignment
+ */
  
  #include "gmxpre.h"
  
-#include "resource-division.h"
+#include "resourcedivision.h"
  
  #include "config.h"
  
@@ -44,20 +51,22 @@
  
  #include <algorithm>
  
-#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/ewald/pme.h"
  #include "gromacs/hardware/cpuinfo.h"
  #include "gromacs/hardware/detecthardware.h"
-#include "gromacs/hardware/gpu_hw_info.h"
  #include "gromacs/hardware/hardwaretopology.h"
  #include "gromacs/hardware/hw_info.h"
+#include "gromacs/math/functions.h"
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/topology/mtop_util.h"
  #include "gromacs/topology/topology.h"
+#include "gromacs/utility/baseversion.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
  #include "gromacs/utility/stringutil.h"
  
  
@@ -71,15 +80,22 @@
   * and after a switch point doesn't change too much.
   */
  
+//! Constant used to help minimize preprocessed code
  static const bool bHasOmpSupport = GMX_OPENMP;
  
-#if GMX_THREAD_MPI
-/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
- * the number of threads will get lowered.
+/*! \brief The minimum number of atoms per thread-MPI thread when GPUs
+ * are present. With fewer atoms than this, the number of thread-MPI
+ * ranks will get lowered.
   */
  static const int min_atoms_per_mpi_thread =  90;
+/*! \brief The minimum number of atoms per GPU with thread-MPI
+ * active. With fewer atoms than this, the number of thread-MPI ranks
+ * will get lowered.
+ */
  static const int min_atoms_per_gpu        = 900;
-#endif /* GMX_THREAD_MPI */
+
+/**@{*/
+/*! \brief Constants for implementing default divisions of threads */
  
  /* TODO choose nthreads_omp based on hardware topology
     when we have a hardware topology detection library */
@@ -100,6 +116,7 @@ static const int min_atoms_per_gpu        = 900;
  const int nthreads_omp_faster_default   =  8;
  const int nthreads_omp_faster_Nehalem   = 12;
  const int nthreads_omp_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_AMD_Ryzen = 16;
  /* For CPU only runs the fastest options are usually MPI or OpenMP only.
   * With one GPU, using MPI only is almost never optimal, so we need to
   * compare running pure OpenMP with combined MPI+OpenMP. This means higher
@@ -123,8 +140,9 @@ const int nthreads_omp_mpi_ok_min_cpu          =  1;
  const int nthreads_omp_mpi_ok_min_gpu          =  2;
  const int nthreads_omp_mpi_target_max          =  6;
  
+/**@}*/
  
-/* Returns the maximum OpenMP thread count for which using a single MPI rank
+/*! \brief Returns the maximum OpenMP thread count for which using a single MPI rank
   * should be faster than using multiple ranks with the same total thread count.
   */
  static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
@@ -141,6 +159,11 @@ static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
          // Intel Nehalem
          nth = nthreads_omp_faster_Nehalem;
      }
+    else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
+    {
+        // AMD Ryzen
+        nth = nthreads_omp_faster_AMD_Ryzen;
+    }
      else
      {
          nth = nthreads_omp_faster_default;
@@ -156,10 +179,10 @@ static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
      return nth;
  }
  
-/* Returns that maximum OpenMP thread count that passes the efficiency check */
-static int nthreads_omp_efficient_max(int gmx_unused       nrank,
-                                      const gmx::CpuInfo  &cpuInfo,
-                                      gmx_bool             bUseGPU)
+/*! \brief Returns that maximum OpenMP thread count that passes the efficiency check */
+gmx_unused static int nthreads_omp_efficient_max(int gmx_unused       nrank,
+                                                 const gmx::CpuInfo  &cpuInfo,
+                                                 gmx_bool             bUseGPU)
  {
  #if GMX_OPENMP && GMX_MPI
      if (nrank > 1)
@@ -173,13 +196,13 @@ static int nthreads_omp_efficient_max(int gmx_unused       nrank,
      }
  }
  
-/* Return the number of thread-MPI ranks to use.
+/*! \brief Return the number of thread-MPI ranks to use.
   * This is chosen such that we can always obey our own efficiency checks.
   */
-static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
-                                        const gmx_hw_opt_t  *hw_opt,
-                                        int                  nthreads_tot,
-                                        int                  ngpu)
+gmx_unused static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                                   const gmx_hw_opt_t  &hw_opt,
+                                                   int                  nthreads_tot,
+                                                   int                  ngpu)
  {
      int                 nrank;
      const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
@@ -193,7 +216,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
       */
      if (ngpu > 0)
      {
-        if (hw_opt->nthreads_omp > 0)
+        if (hw_opt.nthreads_omp > 0)
          {
              /* In this case it is unclear if we should use 1 rank per GPU
               * or more or less, so we require also setting the number of ranks.
@@ -209,8 +232,8 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
           * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
           * this code has no effect.
           */
-        GMX_RELEASE_ASSERT(hw_opt->nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
-        while (nrank*hw_opt->nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
+        GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
+        while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
          {
              nrank--;
          }
@@ -220,9 +243,8 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
              /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
              nrank = nthreads_tot;
          }
-        else if (gmx_gpu_sharing_supported() &&
-                 (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
-                  (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
+        else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
+                 (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
          {
              /* The high OpenMP thread count will likely result in sub-optimal
               * performance. Increase the rank count to reduce the thread count
@@ -244,10 +266,10 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                     (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
          }
      }
-    else if (hw_opt->nthreads_omp > 0)
+    else if (hw_opt.nthreads_omp > 0)
      {
          /* Here we could oversubscribe, when we do, we issue a warning later */
-        nrank = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
+        nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
      }
      else
      {
@@ -266,39 +288,7 @@ static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
      return nrank;
  }
  
-
-static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo,
-                           int cutoff_scheme, gmx_bool bUseGpu)
-{
-    /* This code relies on the fact that GPU are not detected when GPU
-     * acceleration was disabled at run time by the user.
-     */
-    if (cutoff_scheme == ecutsVERLET &&
-        bUseGpu &&
-        hwinfo->gpu_info.n_dev_compatible > 0)
-    {
-        if (gmx_multiple_gpu_per_node_supported())
-        {
-            return hwinfo->gpu_info.n_dev_compatible;
-        }
-        else
-        {
-            if (hwinfo->gpu_info.n_dev_compatible > 1)
-            {
-                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
-            }
-            return 1;
-        }
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-
-#if GMX_THREAD_MPI
-
+//! Return whether hyper threading is enabled.
  static bool
  gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
  {
@@ -308,6 +298,7 @@ gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
  namespace
  {
  
+//! Handles checks for algorithms that must use a single rank.
  class SingleRankChecker
  {
      public:
@@ -350,14 +341,15 @@ class SingleRankChecker
   * Thus all options should be internally consistent and consistent
   * with the hardware, except that ntmpi could be larger than #GPU.
   */
-int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                     gmx_hw_opt_t        *hw_opt,
-                     const t_inputrec    *inputrec,
-                     const gmx_mtop_t    *mtop,
-                     const t_commrec     *cr,
-                     FILE                *fplog,
-                     gmx_bool             bUseGpu,
-                     bool                 doMembed)
+int get_nthreads_mpi(const gmx_hw_info_t    *hwinfo,
+                     gmx_hw_opt_t           *hw_opt,
+                     const std::vector<int> &gpuIdsToUse,
+                     bool                    nonbondedOnGpu,
+                     bool                    pmeOnGpu,
+                     const t_inputrec       *inputrec,
+                     const gmx_mtop_t       *mtop,
+                     const gmx::MDLogger    &mdlog,
+                     bool                    doMembed)
  {
      int                          nthreads_hw, nthreads_tot_max, nrank, ngpu;
      int                          min_atoms_per_mpi_rank;
@@ -365,6 +357,20 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
      const gmx::CpuInfo          &cpuInfo = *hwinfo->cpuInfo;
      const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;
  
+    if (pmeOnGpu)
+    {
+        GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) &&
+                           pme_gpu_supports_input(inputrec, nullptr),
+                           "PME can't be on GPUs unless we are using PME");
+
+        // PME on GPUs supports a single PME rank with PP running on the same or few other ranks.
+        // For now, let's treat separate PME GPU rank as opt-in.
+        if (hw_opt->nthreads_tmpi < 1)
+        {
+            return 1;
+        }
+    }
+
      {
          /* Check if an algorithm does not support parallel simulation.  */
          // TODO This might work better if e.g. implemented algorithms
@@ -383,17 +389,18 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
              {
                  gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
              }
-            md_print_warn(cr, fplog, "%s Choosing to use only a single thread-MPI rank.", message.c_str());
+            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());
              return 1;
          }
      }
  
      if (hw_opt->nthreads_tmpi > 0)
      {
-        /* Trivial, return right away */
+        /* Trivial, return the user's choice right away */
          return hw_opt->nthreads_tmpi;
      }
  
+    // Now implement automatic selection of number of thread-MPI ranks
      nthreads_hw = hwinfo->nthreads_hw_avail;
  
      if (nthreads_hw <= 0)
@@ -412,7 +419,9 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
          nthreads_tot_max = nthreads_hw;
      }
  
-    ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme, bUseGpu);
+    /* nonbondedOnGpu might be false e.g. because this simulation uses
+     * the group scheme, or is a rerun with energy groups. */
+    ngpu = (nonbondedOnGpu ? static_cast<int>(gpuIdsToUse.size()) : 0);
  
      if (inputrec->cutoff_scheme == ecutsGROUP)
      {
@@ -421,7 +430,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
      }
  
      nrank =
-        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+        get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);
  
      if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
      {
@@ -527,17 +536,17 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
  
      return nrank;
  }
-#endif /* GMX_THREAD_MPI */
  
  
  void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
-                                        const gmx_hw_opt_t  *hw_opt,
+                                        bool                 willUsePhysicalGpu,
                                          gmx_bool             bNtOmpOptionSet,
                                          t_commrec           *cr,
-                                        FILE                *fplog)
+                                        const gmx::MDLogger &mdlog)
  {
+    GMX_UNUSED_VALUE(hwinfo);
  #if GMX_OPENMP && GMX_MPI
-    int         nth_omp_min, nth_omp_max, ngpu;
+    int         nth_omp_min, nth_omp_max;
      char        buf[1000];
  #if GMX_THREAD_MPI
      const char *mpi_option = " (option -ntmpi)";
@@ -550,14 +559,13 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
       */
  #if GMX_THREAD_MPI
      GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
-    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
  #endif
      GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");
  
      nth_omp_min = gmx_omp_nthreads_get(emntDefault);
      nth_omp_max = gmx_omp_nthreads_get(emntDefault);
-    ngpu        = hw_opt->gpu_opt.n_dev_use;
  
+    bool anyRankIsUsingGpus = willUsePhysicalGpu;
      /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
      if (cr->nnodes + cr->npmenodes > 1)
      {
@@ -565,19 +573,19 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
  
          count[0] = -nth_omp_min;
          count[1] =  nth_omp_max;
-        count[2] =  ngpu;
+        count[2] =  willUsePhysicalGpu;
  
          MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);
  
          /* In case of an inhomogeneous run setup we use the maximum counts */
-        nth_omp_min = -count_max[0];
-        nth_omp_max =  count_max[1];
-        ngpu        =  count_max[2];
+        nth_omp_min        = -count_max[0];
+        nth_omp_max        =  count_max[1];
+        anyRankIsUsingGpus = count_max[2] > 0;
      }
  
      int nthreads_omp_mpi_ok_min;
  
-    if (ngpu == 0)
+    if (!anyRankIsUsingGpus)
      {
          nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
      }
@@ -592,8 +600,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
      if (DOMAINDECOMP(cr) && cr->nnodes > 1)
      {
          if (nth_omp_max < nthreads_omp_mpi_ok_min ||
-            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
-             nth_omp_max > nthreads_omp_mpi_ok_max))
+            nth_omp_max > nthreads_omp_mpi_ok_max)
          {
              /* Note that we print target_max here, not ok_max */
              sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
@@ -603,7 +610,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
  
              if (bNtOmpOptionSet)
              {
-                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
+                GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
              }
              else
              {
@@ -615,83 +622,34 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
              }
          }
      }
-    else
-    {
-        const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
-
-        /* No domain decomposition (or only one domain) */
-        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
-            nth_omp_max > nthreads_omp_faster(cpuInfo, ngpu > 0))
-        {
-            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
-            gmx_bool bEnvSet;
-            char     buf2[256];
-
-            bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
-
-            if (bNtOmpOptionSet || bEnvSet)
-            {
-                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
-            }
-            else
-            {
-                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
-                        cr->nnodes + cr->npmenodes,
-                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
-                        hw_opt->nthreads_tot > 0 ? hw_opt->nthreads_tot : hwinfo->nthreads_hw_avail,
-                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
-                        nth_omp_max);
-            }
-            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
-                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);
-
-            /* We can not quit with a fatal error when OMP_NUM_THREADS is set
-             * with different values per rank or node, since in that case
-             * the user can not set -ntomp to override the error.
-             */
-            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
-            {
-                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
-            }
-            else
-            {
-                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
-            }
-        }
-    }
  #else /* GMX_OPENMP && GMX_MPI */
        /* No OpenMP and/or MPI: it doesn't make much sense to check */
-    GMX_UNUSED_VALUE(hw_opt);
      GMX_UNUSED_VALUE(bNtOmpOptionSet);
+    GMX_UNUSED_VALUE(willUsePhysicalGpu);
+    GMX_UNUSED_VALUE(cr);
      /* Check if we have more than 1 physical core, if detected,
       * or more than 1 hardware thread if physical cores were not detected.
       */
-#if !GMX_OPENMP && !GMX_MPI
-    if (hwinfo->hardwareTopology->numberOfCores() > 1)
+    if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
      {
-        md_print_warn(cr, fplog, "NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core\n");
+        GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
      }
-#else
-    GMX_UNUSED_VALUE(hwinfo);
-    GMX_UNUSED_VALUE(cr);
-    GMX_UNUSED_VALUE(fplog);
-#endif
-
  #endif /* GMX_OPENMP && GMX_MPI */
  }
  
  
+//! Dump a \c hw_opt to \c fp.
  static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
  {
-    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s' gputasks '%s'\n",
              hw_opt->nthreads_tot,
              hw_opt->nthreads_tmpi,
              hw_opt->nthreads_omp,
              hw_opt->nthreads_omp_pme,
-            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
+            hw_opt->gpuIdsAvailable.c_str(),
+            hw_opt->userGpuTaskAssignment.c_str());
  }
  
-/* Checks we can do when we don't (yet) know the cut-off scheme */
  void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
                                 const t_commrec *cr,
                                 int              nPmeRanks)
@@ -723,6 +681,16 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
      }
  #endif
  
+    /* With thread-MPI the master thread sets hw_opt->totNumThreadsIsAuto.
+     * The other threads receive a partially processed hw_opt from the master
+     * thread and should not set hw_opt->totNumThreadsIsAuto again.
+     */
+    if (!GMX_THREAD_MPI || SIMMASTER(cr))
+    {
+        /* Check if mdrun is free to choose the total number of threads */
+        hw_opt->totNumThreadsIsAuto = (hw_opt->nthreads_omp == 0 && hw_opt->nthreads_omp_pme == 0 && hw_opt->nthreads_tot == 0);
+    }
+
      if (bHasOmpSupport)
      {
          /* Check restrictions on PME thread related options set by the user */
@@ -783,41 +751,20 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
          }
      }
  
-    if (hw_opt->nthreads_tot == 1)
+    if (hw_opt->nthreads_tot > 0)
      {
-        hw_opt->nthreads_tmpi = 1;
-
-        if (hw_opt->nthreads_omp > 1)
+        if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
          {
-            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
-                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
+            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
+                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
          }
-        hw_opt->nthreads_omp     = 1;
-        hw_opt->nthreads_omp_pme = 1;
-    }
  
-    /* Parse GPU IDs, if provided.
-     * We check consistency with the tMPI thread count later.
-     */
-    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
-
-#if GMX_THREAD_MPI
-    if (hw_opt->gpu_opt.n_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
-    {
-        /* Set the number of MPI threads equal to the number of GPUs */
-        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;
-
-        if (hw_opt->nthreads_tot > 0 &&
-            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
+        if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
          {
-            /* We have more GPUs than total threads requested.
-             * We choose to (later) generate a mismatch error,
-             * instead of launching more threads than requested.
-             */
-            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
+            gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
+                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
          }
      }
-#endif
  
      if (debug)
      {
@@ -830,7 +777,6 @@ void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
                         "PME thread count should only be set when the normal thread count is also set");
  }
  
-/* Checks we can do when we know the cut-off scheme */
  void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
                                 int           cutoff_scheme)
  {
@@ -847,8 +793,11 @@ void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
      }
  }
  
-/* Checks we can do when we know the thread-MPI rank count */
-void check_and_update_hw_opt_3(gmx_hw_opt_t gmx_unused *hw_opt)
+void checkAndUpdateRequestedNumOpenmpThreads(gmx_hw_opt_t        *hw_opt,
+                                             const gmx_hw_info_t &hwinfo,
+                                             const t_commrec     *cr,
+                                             PmeRunMode           pmeRunMode,
+                                             const gmx_mtop_t    &mtop)
  {
  #if GMX_THREAD_MPI
      GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
@@ -867,6 +816,51 @@ void check_and_update_hw_opt_3(gmx_hw_opt_t gmx_unused *hw_opt)
      }
  #endif
  
+    /* With both non-bonded and PME on GPU, the work left on the CPU is often
+     * (much) slower with SMT than without SMT. This is mostly the case with
+     * few atoms per core. Thus, if the number of threads is set to auto,
+     * we turn off SMT in that case. Note that PME on GPU implies that also
+     * the non-bonded are computed on the GPU.
+     * We only need to do this when the number of hardware theads is larger
+     * than the number of cores. Note that a queuing system could limit
+     * the number of hardware threads available, but we are not trying to be
+     * too smart here in that case.
+     */
+    /* The thread reduction and synchronization costs go up roughy quadratically
+     * with the threads count, so we apply a threshold quadratic in #cores.
+     * Also more cores per GPU usually means the CPU gets faster than the GPU.
+     * The number 1000 atoms per core^2 is a reasonable threshold
+     * for Intel x86 and AMD Threadripper.
+     */
+    constexpr int c_numAtomsPerCoreSquaredSmtThreshold = 1000;
+
+    /* Prepare conditions for deciding if we should disable SMT.
+     * We currently only limit SMT for simulations using a single rank.
+     * TODO: Consider limiting also for multi-rank simulations.
+     */
+    bool canChooseNumOpenmpThreads      = (bHasOmpSupport && hw_opt->nthreads_omp <= 0);
+    bool haveSmtSupport                 = (hwinfo.hardwareTopology->supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic &&
+                                           hwinfo.hardwareTopology->machine().logicalProcessorCount > hwinfo.hardwareTopology->numberOfCores());
+    bool simRunsSingleRankNBAndPmeOnGpu = (cr->nnodes == 1 && pmeRunMode == PmeRunMode::GPU);
+
+    if (canChooseNumOpenmpThreads && haveSmtSupport &&
+        simRunsSingleRankNBAndPmeOnGpu)
+    {
+        /* Note that the queing system might have limited us from using
+         * all detected ncore_tot physical cores. We are currently not
+         * checking for that here.
+         */
+        int numRanksTot     = cr->nnodes*(MULTISIM(cr) ? cr->ms->nsim : 1);
+        int numAtomsPerRank = mtop.natoms/cr->nnodes;
+        int numCoresPerRank = hwinfo.ncore_tot/numRanksTot;
+        if (numAtomsPerRank < c_numAtomsPerCoreSquaredSmtThreshold*gmx::square(numCoresPerRank))
+        {
+            int numRanksInThisNode = (cr ? cr->nrank_intranode : 1);
+            /* Choose one OpenMP thread per physical core */
+            hw_opt->nthreads_omp = std::max(1, hwinfo.hardwareTopology->numberOfCores()/numRanksInThisNode);
+        }
+    }
+
      GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");
  
      /* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
@@ -880,3 +874,67 @@ void check_and_update_hw_opt_3(gmx_hw_opt_t gmx_unused *hw_opt)
          print_hw_opt(debug, hw_opt);
      }
  }
+
+void checkHardwareOversubscription(int                          numThreadsOnThisRank,
+                                   const gmx::HardwareTopology &hwTop,
+                                   const t_commrec             *cr,
+                                   const gmx::MDLogger         &mdlog)
+{
+    if (hwTop.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
+    {
+        /* There is nothing we can check */
+        return;
+    }
+
+    int numRanksOnThisNode   = 1;
+    int numThreadsOnThisNode = numThreadsOnThisRank;
+#if GMX_MPI
+    if (PAR(cr) || MULTISIM(cr))
+    {
+        /* Count the threads within this physical node */
+        MPI_Comm_size(cr->mpi_comm_physicalnode, &numRanksOnThisNode);
+        MPI_Allreduce(&numThreadsOnThisRank, &numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, cr->mpi_comm_physicalnode);
+    }
+#endif
+
+    if (numThreadsOnThisNode > hwTop.machine().logicalProcessorCount)
+    {
+        std::string mesg = "WARNING: ";
+        if (GMX_LIB_MPI)
+        {
+            mesg += gmx::formatString("On rank %d: o", cr->sim_nodeid);
+        }
+        else
+        {
+            mesg += "O";
+        }
+        mesg     += gmx::formatString("versubscribing the available %d logical CPU cores", hwTop.machine().logicalProcessorCount);
+        if (GMX_LIB_MPI)
+        {
+            mesg += " per node";
+        }
+        mesg     += gmx::formatString(" with %d ", numThreadsOnThisNode);
+        if (numRanksOnThisNode == numThreadsOnThisNode)
+        {
+            if (GMX_THREAD_MPI)
+            {
+                mesg += "thread-MPI threads.";
+            }
+            else
+            {
+                mesg += "MPI processes.";
+            }
+        }
+        else
+        {
+            mesg += "threads.";
+        }
+        mesg     += "\n         This will cause considerable performance loss.";
+        /* Note that only the master rank logs to stderr and only ranks
+         * with an open log file write to log.
+         * TODO: When we have a proper parallel logging framework,
+         *       the framework should add the rank and node numbers.
+         */
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(mesg.c_str());
+    }
+}