Merge branch release-4-6 into master
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.c b/src/gromacs/gmxlib/gmx_detect_hardware.c
index 56f020014789cfe1dd9d87b39cd9ec0963ccc590..2ce47d2ea252e17f157598b6acd606e93d4e63fd 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.c
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.c
 #include "main.h"
 #include "md_logging.h"
 
+#include "thread_mpi/threads.h"
+
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
-
 #if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 #include "windows.h"
 #endif
@@ -56,8 +57,16 @@ static unsigned int max_gpu_ids_user = 64;
 static const char * invalid_gpuid_hint =
     "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
 
+/* The globally shared hwinfo structure. */
+static gmx_hw_info_t      *hwinfo_g;
+/* A reference counter for the hwinfo structure */
+static int                 n_hwinfo = 0;
+/* A lock to protect the hwinfo structure */
+static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
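+/* The shared hwinfo structure is reference counted: gmx_detect_hardware()
+ * allocates and fills hwinfo_g when no reference exists yet and increments
+ * n_hwinfo on every call, while gmx_hardware_info_free() decrements the
+ * counter and releases the resources once it drops to zero. Both routines
+ * serialize access through hw_info_lock. */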
+
+
 /* FW decl. */
-void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 
 static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
 {
@@ -177,226 +186,262 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                                       const t_commrec *cr, int ntmpi_requested,
                                       gmx_bool bUseGPU)
 {
-    int      npppn, ntmpi_pp, ngpu;
-    char     sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
-    char     gpu_plural[2];
-    gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+    int                        npppn, ntmpi_pp, ngpu;
+    char                       sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
+    char                       gpu_plural[2];
+    gmx_bool                   bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+    int                        ret;
+    static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER;
+
 
     assert(hwinfo);
     assert(cr);
 
-    btMPI         = bMPI = FALSE;
-    bNthreadsAuto = FALSE;
+    /* Below we only do consistency checks for PP and GPUs; these are
+     * irrelevant for PME-only nodes, so in that case we return here.
+     */
+    if (!(cr->duty & DUTY_PP))
+    {
+        return;
+    }
+
+    /* The consistency checks need to run only once, but all threads that are
+       alive must call this function so that they all see consistent data. We
+       achieve this by mutual exclusion and by returning early if the structure
+       has already been properly checked & set. */
+    ret = tMPI_Thread_mutex_lock(&cons_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
+    }
+
+    if (!hwinfo->bConsistencyChecked)
+    {
+        btMPI         = bMPI = FALSE;
+        bNthreadsAuto = FALSE;
 #if defined(GMX_THREAD_MPI)
-    btMPI         = TRUE;
-    bNthreadsAuto = (ntmpi_requested < 1);
+        btMPI         = TRUE;
+        bNthreadsAuto = (ntmpi_requested < 1);
 #elif defined(GMX_LIB_MPI)
-    bMPI  = TRUE;
+        bMPI  = TRUE;
 #endif
 
 #ifdef GMX_GPU
-    bGPUBin      = TRUE;
+        bGPUBin      = TRUE;
 #else
-    bGPUBin      = FALSE;
+        bGPUBin      = FALSE;
 #endif
 
-    /* GPU emulation detection is done later, but we need here as well
-     * -- uncool, but there's no elegant workaround */
-    bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
-    bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+        /* GPU emulation detection is done later, but we need it here as well
+         * -- uncool, but there's no elegant workaround */
+        bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
+        bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
 
-    if (SIMMASTER(cr))
-    {
-        /* check the acceleration mdrun is compiled with against hardware capabilities */
-        /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
-         *       Might not hurt to add an extra check over MPI. */
+        /* check the acceleration mdrun is compiled with against hardware
+           capabilities */
+        /* TODO: Here we assume homogeneous hardware which is not necessarily
+                 the case! Might not hurt to add an extra check over MPI. */
         gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
-    }
-
-    /* Below we only do consistency checks for PP and GPUs,
-     * this is irrelevant for PME only nodes, so in that case we return here.
-     */
-    if (!(cr->duty & DUTY_PP))
-    {
-        return;
-    }
 
-    /* Need to ensure that we have enough GPUs:
-     * - need one GPU per PP node
-     * - no GPU oversubscription with tMPI
-     * => keep on the GPU support, otherwise turn off (or bail if forced)
-     * */
-    /* number of PP processes per node */
-    npppn = cr->nrank_pp_intranode;
-
-    pernode[0]           = '\0';
-    th_or_proc_plural[0] = '\0';
-    if (btMPI)
-    {
-        sprintf(th_or_proc, "thread-MPI thread");
-        if (npppn > 1)
+        /* Need to ensure that we have enough GPUs:
+         * - need one GPU per PP node
+         * - no GPU oversubscription with tMPI
+         * => if so, keep GPU support on; otherwise turn it off (or bail if forced)
+         */
+        /* number of PP processes per node */
+        npppn = cr->nrank_pp_intranode;
+
+        pernode[0]           = '\0';
+        th_or_proc_plural[0] = '\0';
+        if (btMPI)
         {
-            sprintf(th_or_proc_plural, "s");
+            sprintf(th_or_proc, "thread-MPI thread");
+            if (npppn > 1)
+            {
+                sprintf(th_or_proc_plural, "s");
+            }
         }
-    }
-    else if (bMPI)
-    {
-        sprintf(th_or_proc, "MPI process");
-        if (npppn > 1)
+        else if (bMPI)
         {
-            sprintf(th_or_proc_plural, "es");
+            sprintf(th_or_proc, "MPI process");
+            if (npppn > 1)
+            {
+                sprintf(th_or_proc_plural, "es");
+            }
+            sprintf(pernode, " per node");
+        }
+        else
+        {
+            /* neither MPI nor tMPI */
+            sprintf(th_or_proc, "process");
         }
-        sprintf(pernode, " per node");
-    }
-    else
-    {
-        /* neither MPI nor tMPI */
-        sprintf(th_or_proc, "process");
-    }
-
-    if (bGPUBin)
-    {
-        print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
-    }
 
-    if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
-    {
-        ngpu = hwinfo->gpu_info.ncuda_dev_use;
-        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+        if (bGPUBin)
+        {
+            print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+        }
 
-        /* number of tMPI threads atuo-adjusted */
-        if (btMPI && bNthreadsAuto && SIMMASTER(cr))
+        if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
         {
-            if (npppn < ngpu)
+            ngpu = hwinfo->gpu_info.ncuda_dev_use;
+            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+
+            /* number of tMPI threads auto-adjusted */
+            if (btMPI && bNthreadsAuto)
             {
-                if (hwinfo->gpu_info.bUserSet)
+                if (npppn < ngpu)
                 {
-                    /* The user manually provided more GPUs than threads we could
-                     * automatically start. */
-                    gmx_fatal(FARGS,
-                              "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
-                              "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
-                              ngpu, gpu_plural, npppn, th_or_proc_plural,
-                              ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
-                }
-                else
-                {
-                    /* There are more GPUs than tMPI threads; we have to limit the number GPUs used. */
-                    md_print_warn(cr, fplog,
-                                  "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
-                                  "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
+                    if (hwinfo->gpu_info.bUserSet)
+                    {
+                        /* The user manually provided more GPUs than threads we
+                           could automatically start. */
+                        gmx_fatal(FARGS,
+                                  "%d GPU%s provided, but only %d PP thread-MPI thread%s could be started.\n"
+                                  "%s requires one PP thread-MPI thread per GPU; use fewer GPUs%s.",
                                   ngpu, gpu_plural, npppn, th_or_proc_plural,
-                                  ShortProgram(), npppn, npppn > 1 ? "s" : "",
-                                  bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
-
-                    if (cr->rank_pp_intranode == 0)
+                                  ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
+                    }
+                    else
                     {
-                        limit_num_gpus_used(hwinfo, npppn);
-                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        /* There are more GPUs than tMPI threads; we have to
+                           limit the number of GPUs used. */
+                        md_print_warn(cr, fplog,
+                                      "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
+                                      "      %s can use one GPU per PP thread-MPI thread, so only %d GPU%s will be used.%s\n",
+                                      ngpu, gpu_plural, npppn,
+                                      th_or_proc_plural,
+                                      ShortProgram(), npppn,
+                                      npppn > 1 ? "s" : "",
+                                      bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
+
+                        if (cr->rank_pp_intranode == 0)
+                        {
+                            limit_num_gpus_used(hwinfo, npppn);
+                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
+                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        }
                     }
                 }
             }
-        }
 
-        if (ngpu != npppn)
-        {
-            if (hwinfo->gpu_info.bUserSet)
+            if (ngpu != npppn)
             {
-                gmx_fatal(FARGS,
-                          "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                          "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
-                          th_or_proc, btMPI ? "s" : "es", pernode,
-                          ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
-            }
-            else
-            {
-                if (ngpu > npppn)
+                if (hwinfo->gpu_info.bUserSet)
                 {
-                    md_print_warn(cr, fplog,
-                                  "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
-                                  "      PP %s%s%s than GPU%s available.\n"
-                                  "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
-                                  ShortProgram(),
-                                  th_or_proc, th_or_proc_plural, pernode, gpu_plural,
-                                  th_or_proc, npppn, gpu_plural, pernode);
-
-                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
-                    {
-                        limit_num_gpus_used(hwinfo, npppn);
-                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
-                    }
+                    gmx_fatal(FARGS,
+                              "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+                              "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+                              th_or_proc, btMPI ? "s" : "es", pernode,
+                              ShortProgram(), npppn, th_or_proc,
+                              th_or_proc_plural, pernode, ngpu, gpu_plural);
                 }
                 else
                 {
-                    /* Avoid duplicate error messages.
-                     * Unfortunately we can only do this at the physical node
-                     * level, since the hardware setup and MPI process count
-                     * might be differ over physical nodes.
-                     */
-                    if (cr->rank_pp_intranode == 0)
+                    if (ngpu > npppn)
                     {
-                        gmx_fatal(FARGS,
-                                  "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                                  "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
-                                  th_or_proc, btMPI ? "s" : "es", pernode,
-                                  ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
+                        md_print_warn(cr, fplog,
+                                      "NOTE: potentially sub-optimal launch configuration: %s was started with fewer\n"
+                                      "      PP %s%s%s than GPU%s available.\n"
+                                      "      Each PP %s can use only one GPU, so %d GPU%s%s will be used.\n",
+                                      ShortProgram(), th_or_proc,
+                                      th_or_proc_plural, pernode, gpu_plural,
+                                      th_or_proc, npppn, gpu_plural, pernode);
+
+                        if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
+                        {
+                            limit_num_gpus_used(hwinfo, npppn);
+                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
+                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        }
                     }
-#ifdef GMX_MPI
                     else
                     {
-                        /* Avoid other ranks to continue after inconsistency */
-                        MPI_Barrier(cr->mpi_comm_mygroup);
+                        /* Avoid duplicate error messages.
+                         * Unfortunately we can only do this at the physical node
+                         * level, since the hardware setup and MPI process count
+                         * might differ between physical nodes.
+                         */
+                        if (cr->rank_pp_intranode == 0)
+                        {
+                            gmx_fatal(FARGS,
+                                      "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+                                      "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+                                      th_or_proc, btMPI ? "s" : "es", pernode,
+                                      ShortProgram(), npppn, th_or_proc,
+                                      th_or_proc_plural, pernode, ngpu,
+                                      gpu_plural);
+                        }
                     }
-#endif
                 }
             }
-        }
 
-        hwinfo->gpu_info.bDevShare = FALSE;
-        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
-        {
-            int      i, j, same_count;
-            gmx_bool bSomeSame, bAllDifferent;
+            {
+                int      same_count;
 
-            same_count    = 0; /* number of GPUs shared among ranks */
-            bSomeSame     = FALSE;
-            bAllDifferent = TRUE;
+                same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info));
 
-            for (i = 0; i < ngpu - 1; i++)
-            {
-                for (j = i + 1; j < ngpu; j++)
+                if (btMPI && same_count > 0)
                 {
-                    bSomeSame       |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
-                    bAllDifferent   &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
-                    same_count      += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
+                    gmx_fatal(FARGS,
+                              "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
+                              "Use MPI if you are sure that you want to assign a GPU to multiple threads.");
+                }
+
+                if (same_count > 0)
+                {
+                    md_print_warn(cr, fplog,
+                                  "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
+                                  "      multiple %s%s; this should be avoided as it can cause\n"
+                                  "      performance loss.\n",
+                                  same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
                 }
             }
+            print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
+        }
+        hwinfo->bConsistencyChecked = TRUE;
+    }
 
-            /* store the number of shared/oversubscribed GPUs */
-            hwinfo->gpu_info.bDevShare = bSomeSame;
+    ret = tMPI_Thread_mutex_unlock(&cons_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno));
+    }
 
-            if (btMPI && !bAllDifferent)
-            {
-                gmx_fatal(FARGS,
-                          "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
-                          "Use MPI if you are sure that you want to assign GPU to multiple threads.");
-            }
+#ifdef GMX_MPI
+    if (PAR(cr))
+    {
+        /* Prevent other ranks from continuing after an inconsistency */
+        MPI_Barrier(cr->mpi_comm_mygroup);
+    }
+#endif
+
+}
+
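+/* Count how many GPU device IDs in cuda_dev_use occur more than once, i.e.
+ * how many pairs of ranks were assigned the same GPU. The count is only
+ * taken for user-set GPU IDs (bUserSet); otherwise 0 is returned. */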
+int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info)
+{
+    int      same_count    = 0;
+    int      ngpu          = gpu_info->ncuda_dev_use;
 
-            if (bSomeSame)
+    if (gpu_info->bUserSet)
+    {
+        int      i, j;
+
+        for (i = 0; i < ngpu - 1; i++)
+        {
+            for (j = i + 1; j < ngpu; j++)
             {
-                md_print_warn(cr, fplog,
-                              "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                              "      multiple %s%s; this should be avoided as it can cause\n"
-                              "      performance loss.\n",
-                              same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
+                same_count      += (gpu_info->cuda_dev_use[i] ==
+                                    gpu_info->cuda_dev_use[j]);
             }
         }
-        print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
     }
+
+    return same_count;
 }
 
+
 /* Return the number of hardware threads supported by the current CPU.
  * We assume that this is equal with the number of CPUs reported to be
  * online by the OS at the time of the call.
@@ -448,10 +493,9 @@ static int get_nthreads_hw_avail(FILE gmx_unused *fplog, const t_commrec gmx_unu
     return ret;
 }
 
-void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
-                         const t_commrec *cr,
-                         gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
-                         const char *gpu_id)
+gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
+                                   gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
+                                   const char *gpu_id)
 {
     int              i;
     const char      *env;
@@ -459,132 +503,157 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
     gmx_hw_info_t   *hw;
     gmx_gpu_info_t   gpuinfo_auto, gpuinfo_user;
     gmx_bool         bGPUBin;
+    int              ret;
 
-    assert(hwinfo);
-
-    /* detect CPUID info; no fuss, we don't detect system-wide
-     * -- sloppy, but that's it for now */
-    if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
+    /* make sure no one else is doing the same thing */
+    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
+    if (ret != 0)
     {
-        gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
     }
 
-    /* detect number of hardware threads */
-    hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+    /* only initialize the hwinfo structure if it is not already initialized */
+    if (n_hwinfo == 0)
+    {
+        snew(hwinfo_g, 1);
+        hwinfo_g->bConsistencyChecked = FALSE;
 
-    /* detect GPUs */
-    hwinfo->gpu_info.ncuda_dev_use  = 0;
-    hwinfo->gpu_info.cuda_dev_use   = NULL;
-    hwinfo->gpu_info.ncuda_dev      = 0;
-    hwinfo->gpu_info.cuda_dev       = NULL;
+        /* detect CPUID info; no fuss, we don't detect system-wide
+         * -- sloppy, but that's it for now */
+        if (gmx_cpuid_init(&hwinfo_g->cpuid_info) != 0)
+        {
+            gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+        }
+
+        /* detect number of hardware threads */
+        hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+
+        /* detect GPUs */
+        hwinfo_g->gpu_info.ncuda_dev_use  = 0;
+        hwinfo_g->gpu_info.cuda_dev_use   = NULL;
+        hwinfo_g->gpu_info.ncuda_dev      = 0;
+        hwinfo_g->gpu_info.cuda_dev       = NULL;
 
 #ifdef GMX_GPU
-    bGPUBin      = TRUE;
+        bGPUBin      = TRUE;
 #else
-    bGPUBin      = FALSE;
+        bGPUBin      = FALSE;
 #endif
 
-    /* Bail if binary is not compiled with GPU acceleration, but this is either
-     * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
-    if (bForceUseGPU && !bGPUBin)
-    {
-        gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
-    }
-    if (gpu_id != NULL && !bGPUBin)
-    {
-        gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
-    }
-
-    /* run the detection if the binary was compiled with GPU support */
-    if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
-    {
-        char detection_error[STRLEN];
-
-        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
+        /* Bail if the binary was not compiled with GPU acceleration, but GPU use
+         * was requested either explicitly (-nb gpu) or implicitly (a GPU ID was passed). */
+        if (bForceUseGPU && !bGPUBin)
         {
-            if (detection_error != NULL && detection_error[0] != '\0')
-            {
-                sprintf(sbuf, ":\n      %s\n", detection_error);
-            }
-            else
-            {
-                sprintf(sbuf, ".");
-            }
-            md_print_warn(cr, fplog,
-                          "NOTE: Error occurred during GPU detection%s"
-                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
-                          sbuf);
+            gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
         }
-    }
-
-    if (bForceUseGPU || bTryUseGPU)
-    {
-        env = getenv("GMX_GPU_ID");
-        if (env != NULL && gpu_id != NULL)
+        if (gpu_id != NULL && !bGPUBin)
         {
-            gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+            gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
         }
-        if (env == NULL)
+
+        /* run the detection if the binary was compiled with GPU support */
+        if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
         {
-            env = gpu_id;
+            char detection_error[STRLEN];
+
+            if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+            {
+                if (detection_error != NULL && detection_error[0] != '\0')
+                {
+                    sprintf(sbuf, ":\n      %s\n", detection_error);
+                }
+                else
+                {
+                    sprintf(sbuf, ".\n");
+                }
+                md_print_warn(cr, fplog,
+                              "NOTE: Error occurred during GPU detection%s"
+                              "      Cannot use GPU acceleration; will fall back to CPU kernels.\n",
+                              sbuf);
+            }
         }
 
-        /* parse GPU IDs if the user passed any */
-        if (env != NULL)
+        if (bForceUseGPU || bTryUseGPU)
         {
-            int *gpuid, *checkres;
-            int  nid, res;
+            env = getenv("GMX_GPU_ID");
+            if (env != NULL && gpu_id != NULL)
+            {
+                gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+            }
+            if (env == NULL)
+            {
+                env = gpu_id;
+            }
 
-            snew(gpuid, max_gpu_ids_user);
-            snew(checkres, max_gpu_ids_user);
+            /* parse GPU IDs if the user passed any */
+            if (env != NULL)
+            {
+                int *gpuid, *checkres;
+                int  nid, res;
 
-            parse_gpu_id_plain_string(env, &nid, gpuid);
+                snew(gpuid, max_gpu_ids_user);
+                snew(checkres, max_gpu_ids_user);
 
-            if (nid == 0)
-            {
-                gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
-            }
+                parse_gpu_id_plain_string(env, &nid, gpuid);
 
-            res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
+                if (nid == 0)
+                {
+                    gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
+                              invalid_gpuid_hint);
+                }
 
-            if (!res)
-            {
-                print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+                res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info,
+                                             gpuid, nid);
 
-                sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
-                for (i = 0; i < nid; i++)
+                if (!res)
                 {
-                    if (checkres[i] != egpuCompatible)
+                    print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr);
+
+                    sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
+                    for (i = 0; i < nid; i++)
                     {
-                        sprintf(stmp, "    GPU #%d: %s\n",
-                                gpuid[i], gpu_detect_res_str[checkres[i]]);
-                        strcat(sbuf, stmp);
+                        if (checkres[i] != egpuCompatible)
+                        {
+                            sprintf(stmp, "    GPU #%d: %s\n",
+                                    gpuid[i], gpu_detect_res_str[checkres[i]]);
+                            strcat(sbuf, stmp);
+                        }
                     }
+                    gmx_fatal(FARGS, "%s", sbuf);
                 }
-                gmx_fatal(FARGS, "%s", sbuf);
-            }
 
-            hwinfo->gpu_info.bUserSet = TRUE;
+                hwinfo_g->gpu_info.bUserSet = TRUE;
 
-            sfree(gpuid);
-            sfree(checkres);
-        }
-        else
-        {
-            pick_compatible_gpus(&hwinfo->gpu_info);
-            hwinfo->gpu_info.bUserSet = FALSE;
-        }
+                sfree(gpuid);
+                sfree(checkres);
+            }
+            else
+            {
+                pick_compatible_gpus(&hwinfo_g->gpu_info);
+                hwinfo_g->gpu_info.bUserSet = FALSE;
+            }
 
-        /* decide whether we can use GPU */
-        hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
-        if (!hwinfo->bCanUseGPU && bForceUseGPU)
-        {
-            gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
+            /* decide whether we can use GPU */
+            hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0);
+            if (!hwinfo_g->bCanUseGPU && bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
+            }
         }
     }
+    /* increase the reference counter */
+    n_hwinfo++;
+
+    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+    }
+
+    return hwinfo_g;
 }
 
-void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
+static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 {
     int ndev_use;
 
@@ -612,10 +681,38 @@ void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 
 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
 {
-    if (hwinfo)
+    int ret;
+
+    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
+    }
+
+    /* decrease the reference counter */
+    n_hwinfo--;
+
+
+    if (hwinfo != hwinfo_g)
+    {
+        gmx_incons("hwinfo != hwinfo_g");
+    }
+
+    if (n_hwinfo < 0)
+    {
+        gmx_incons("n_hwinfo < 0");
+    }
+
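+    /* release the hardware info only when the last reference is gone */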
+    if (n_hwinfo == 0)
+    {
+        gmx_cpuid_done(hwinfo_g->cpuid_info);
+        free_gpu_info(&hwinfo_g->gpu_info);
+        sfree(hwinfo_g);
+    }
+
+    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+    if (ret != 0)
     {
-        gmx_cpuid_done(hwinfo->cpuid_info);
-        free_gpu_info(&hwinfo->gpu_info);
-        sfree(hwinfo);
+        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
     }
 }