Merge branch release-4-6 into master
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.c b/src/gromacs/gmxlib/gmx_detect_hardware.c
index 56f020014789cfe1dd9d87b39cd9ec0963ccc590..2ce47d2ea252e17f157598b6acd606e93d4e63fd 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.c
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.c
 #include "main.h"
 #include "md_logging.h"
 
+#include "thread_mpi/threads.h"
+
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
-
 #if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 #include "windows.h"
 #endif
@@ -56,8 +57,16 @@ static unsigned int max_gpu_ids_user = 64;
 static const char * invalid_gpuid_hint =
     "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
 
+/* The globally shared hwinfo structure. */
+static gmx_hw_info_t      *hwinfo_g;
+/* A reference counter for the hwinfo structure */
+static int                 n_hwinfo = 0;
+/* A lock to protect the hwinfo structure */
+static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
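+/* The shared hwinfo structure is reference counted: gmx_detect_hardware()
+ * allocates and fills hwinfo_g when no reference exists yet and increments
+ * n_hwinfo on every call, while gmx_hardware_info_free() decrements the
+ * counter and releases the resources once it drops to zero. Both routines
+ * serialize access through hw_info_lock. */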
+
+
 /* FW decl. */
-void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 
 static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
 {
@@ -177,226 +186,262 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                                       const t_commrec *cr, int ntmpi_requested,
                                       gmx_bool bUseGPU)
 {
-    int      npppn, ntmpi_pp, ngpu;
-    char     sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
-    char     gpu_plural[2];
-    gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+    int                        npppn, ntmpi_pp, ngpu;
+    char                       sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
+    char                       gpu_plural[2];
+    gmx_bool                   bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
+    int                        ret;
+    static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER;
+
 
     assert(hwinfo);
     assert(cr);
 
-    btMPI         = bMPI = FALSE;
-    bNthreadsAuto = FALSE;
+    /* Below we only do consistency checks for PP and GPUs; these are
+     * irrelevant for PME-only nodes, so in that case we return here.
+     */
+    if (!(cr->duty & DUTY_PP))
+    {
+        return;
+    }
+
+    /* The consistency checks need to run only once, but all threads that are
+       alive must call this function so that they all see consistent data. We
+       achieve this by mutual exclusion and by returning early if the structure
+       has already been properly checked & set. */
+    ret = tMPI_Thread_mutex_lock(&cons_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
+    }
+
+    if (!hwinfo->bConsistencyChecked)
+    {
+        btMPI         = bMPI = FALSE;
+        bNthreadsAuto = FALSE;
 #if defined(GMX_THREAD_MPI)
-    btMPI         = TRUE;
-    bNthreadsAuto = (ntmpi_requested < 1);
+        btMPI         = TRUE;
+        bNthreadsAuto = (ntmpi_requested < 1);
 #elif defined(GMX_LIB_MPI)
-    bMPI  = TRUE;
+        bMPI  = TRUE;
 #endif
 
 #ifdef GMX_GPU
-    bGPUBin      = TRUE;
+        bGPUBin      = TRUE;
 #else
-    bGPUBin      = FALSE;
+        bGPUBin      = FALSE;
 #endif
 
-    /* GPU emulation detection is done later, but we need here as well
-     * -- uncool, but there's no elegant workaround */
-    bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
-    bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+        /* GPU emulation detection is done later, but we need it here as well
+         * -- uncool, but there's no elegant workaround */
+        bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
+        bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
 
-    if (SIMMASTER(cr))
-    {
-        /* check the acceleration mdrun is compiled with against hardware capabilities */
-        /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
-         *       Might not hurt to add an extra check over MPI. */
+        /* check the acceleration mdrun is compiled with against hardware
+           capabilities */
+        /* TODO: Here we assume homogeneous hardware which is not necessarily
+                 the case! Might not hurt to add an extra check over MPI. */
         gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
-    }
-
-    /* Below we only do consistency checks for PP and GPUs,
-     * this is irrelevant for PME only nodes, so in that case we return here.
-     */
-    if (!(cr->duty & DUTY_PP))
-    {
-        return;
-    }
 
-    /* Need to ensure that we have enough GPUs:
-     * - need one GPU per PP node
-     * - no GPU oversubscription with tMPI
-     * => keep on the GPU support, otherwise turn off (or bail if forced)
-     * */
-    /* number of PP processes per node */
-    npppn = cr->nrank_pp_intranode;
-
-    pernode[0]           = '\0';
-    th_or_proc_plural[0] = '\0';
-    if (btMPI)
-    {
-        sprintf(th_or_proc, "thread-MPI thread");
-        if (npppn > 1)
+        /* Need to ensure that we have enough GPUs:
+         * - need one GPU per PP node
+         * - no GPU oversubscription with tMPI
+         * => if so, keep GPU support on; otherwise turn it off (or bail if forced)
+         */
+        /* number of PP processes per node */
+        npppn = cr->nrank_pp_intranode;
+
+        pernode[0]           = '\0';
+        th_or_proc_plural[0] = '\0';
+        if (btMPI)
         {
-            sprintf(th_or_proc_plural, "s");
+            sprintf(th_or_proc, "thread-MPI thread");
+            if (npppn > 1)
+            {
+                sprintf(th_or_proc_plural, "s");
+            }
         }
-    }
-    else if (bMPI)
-    {
-        sprintf(th_or_proc, "MPI process");
-        if (npppn > 1)
+        else if (bMPI)
         {
-            sprintf(th_or_proc_plural, "es");
+            sprintf(th_or_proc, "MPI process");
+            if (npppn > 1)
+            {
+                sprintf(th_or_proc_plural, "es");
+            }
+            sprintf(pernode, " per node");
+        }
+        else
+        {
+            /* neither MPI nor tMPI */
+            sprintf(th_or_proc, "process");
         }
-        sprintf(pernode, " per node");
-    }
-    else
-    {
-        /* neither MPI nor tMPI */
-        sprintf(th_or_proc, "process");
-    }
-
-    if (bGPUBin)
-    {
-        print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
-    }
 
-    if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
-    {
-        ngpu = hwinfo->gpu_info.ncuda_dev_use;
-        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+        if (bGPUBin)
+        {
+            print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+        }
 
-        /* number of tMPI threads atuo-adjusted */
-        if (btMPI && bNthreadsAuto && SIMMASTER(cr))
+        if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
         {
-            if (npppn < ngpu)
+            ngpu = hwinfo->gpu_info.ncuda_dev_use;
+            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+
+            /* number of tMPI threads auto-adjusted */
+            if (btMPI && bNthreadsAuto)
             {
-                if (hwinfo->gpu_info.bUserSet)
+                if (npppn < ngpu)
                 {
-                    /* The user manually provided more GPUs than threads we could
-                     * automatically start. */
-                    gmx_fatal(FARGS,
-                              "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
-                              "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
-                              ngpu, gpu_plural, npppn, th_or_proc_plural,
-                              ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
-                }
-                else
-                {
-                    /* There are more GPUs than tMPI threads; we have to limit the number GPUs used. */
-                    md_print_warn(cr, fplog,
-                                  "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
-                                  "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
+                    if (hwinfo->gpu_info.bUserSet)
+                    {
+                        /* The user manually provided more GPUs than threads we
+                           could automatically start. */
+                        gmx_fatal(FARGS,
+                                  "%d GPU%s provided, but only %d PP thread-MPI thread%s could be started.\n"
+                                  "%s requires one PP thread-MPI thread per GPU; use fewer GPUs%s.",
                                   ngpu, gpu_plural, npppn, th_or_proc_plural,
-                                  ShortProgram(), npppn, npppn > 1 ? "s" : "",
-                                  bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
-
-                    if (cr->rank_pp_intranode == 0)
+                                  ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
+                    }
+                    else
                     {
-                        limit_num_gpus_used(hwinfo, npppn);
-                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        /* There are more GPUs than tMPI threads; we have to
+                           limit the number of GPUs used. */
+                        md_print_warn(cr, fplog,
+                                      "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
+                                      "      %s can use one GPU per PP thread-MPI thread, so only %d GPU%s will be used.%s\n",
+                                      ngpu, gpu_plural, npppn,
+                                      th_or_proc_plural,
+                                      ShortProgram(), npppn,
+                                      npppn > 1 ? "s" : "",
+                                      bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
+
+                        if (cr->rank_pp_intranode == 0)
+                        {
+                            limit_num_gpus_used(hwinfo, npppn);
+                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
+                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        }
                     }
                 }
             }
-        }
 
-        if (ngpu != npppn)
-        {
-            if (hwinfo->gpu_info.bUserSet)
+            if (ngpu != npppn)
             {
-                gmx_fatal(FARGS,
-                          "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                          "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
-                          th_or_proc, btMPI ? "s" : "es", pernode,
-                          ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
-            }
-            else
-            {
-                if (ngpu > npppn)
+                if (hwinfo->gpu_info.bUserSet)
                 {
-                    md_print_warn(cr, fplog,
-                                  "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
-                                  "      PP %s%s%s than GPU%s available.\n"
-                                  "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
-                                  ShortProgram(),
-                                  th_or_proc, th_or_proc_plural, pernode, gpu_plural,
-                                  th_or_proc, npppn, gpu_plural, pernode);
-
-                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
-                    {
-                        limit_num_gpus_used(hwinfo, npppn);
-                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
-                    }
+                    gmx_fatal(FARGS,
+                              "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+                              "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+                              th_or_proc, btMPI ? "s" : "es", pernode,
+                              ShortProgram(), npppn, th_or_proc,
+                              th_or_proc_plural, pernode, ngpu, gpu_plural);
                 }
                 else
                 {
-                    /* Avoid duplicate error messages.
-                     * Unfortunately we can only do this at the physical node
-                     * level, since the hardware setup and MPI process count
-                     * might be differ over physical nodes.
-                     */
-                    if (cr->rank_pp_intranode == 0)
+                    if (ngpu > npppn)
                     {
-                        gmx_fatal(FARGS,
-                                  "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                                  "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
-                                  th_or_proc, btMPI ? "s" : "es", pernode,
-                                  ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
+                        md_print_warn(cr, fplog,
+                                      "NOTE: potentially sub-optimal launch configuration: %s was started with fewer\n"
+                                      "      PP %s%s%s than GPU%s available.\n"
+                                      "      Each PP %s can use only one GPU, so %d GPU%s%s will be used.\n",
+                                      ShortProgram(), th_or_proc,
+                                      th_or_proc_plural, pernode, gpu_plural,
+                                      th_or_proc, npppn, gpu_plural, pernode);
+
+                        if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
+                        {
+                            limit_num_gpus_used(hwinfo, npppn);
+                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
+                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+                        }
                     }
-#ifdef GMX_MPI
                     else
                     {
-                        /* Avoid other ranks to continue after inconsistency */
-                        MPI_Barrier(cr->mpi_comm_mygroup);
+                        /* Avoid duplicate error messages.
+                         * Unfortunately we can only do this at the physical node
+                         * level, since the hardware setup and MPI process count
+                         * might differ between physical nodes.
+                         */
+                        if (cr->rank_pp_intranode == 0)
+                        {
+                            gmx_fatal(FARGS,
+                                      "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+                                      "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
+                                      th_or_proc, btMPI ? "s" : "es", pernode,
+                                      ShortProgram(), npppn, th_or_proc,
+                                      th_or_proc_plural, pernode, ngpu,
+                                      gpu_plural);
+                        }
                     }
-#endif
                 }
             }
-        }
 
-        hwinfo->gpu_info.bDevShare = FALSE;
-        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
-        {
-            int      i, j, same_count;
-            gmx_bool bSomeSame, bAllDifferent;
+            {
+                int      same_count;
 
-            same_count    = 0; /* number of GPUs shared among ranks */
-            bSomeSame     = FALSE;
-            bAllDifferent = TRUE;
+                same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info));
 
-            for (i = 0; i < ngpu - 1; i++)
-            {
-                for (j = i + 1; j < ngpu; j++)
+                if (btMPI && same_count > 0)
                 {
-                    bSomeSame       |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
-                    bAllDifferent   &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
-                    same_count      += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
+                    gmx_fatal(FARGS,
+                              "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
+                              "Use MPI if you are sure that you want to assign a GPU to multiple threads.");
+                }
+
+                if (same_count > 0)
+                {
+                    md_print_warn(cr, fplog,
+                                  "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
+                                  "      multiple %s%s; this should be avoided as it can cause\n"
+                                  "      performance loss.\n",
+                                  same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
                 }
             }
+            print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
+        }
+        hwinfo->bConsistencyChecked = TRUE;
+    }
 
-            /* store the number of shared/oversubscribed GPUs */
-            hwinfo->gpu_info.bDevShare = bSomeSame;
+    ret = tMPI_Thread_mutex_unlock(&cons_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno));
+    }
 
-            if (btMPI && !bAllDifferent)
-            {
-                gmx_fatal(FARGS,
-                          "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
-                          "Use MPI if you are sure that you want to assign GPU to multiple threads.");
-            }
+#ifdef GMX_MPI
+    if (PAR(cr))
+    {
+        /* Prevent other ranks from continuing after an inconsistency */
+        MPI_Barrier(cr->mpi_comm_mygroup);
+    }
+#endif
+
+}
+
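+/* Count how many GPU device IDs in cuda_dev_use occur more than once, i.e.
+ * how many pairs of ranks were assigned the same GPU. The count is only
+ * taken for user-set GPU IDs (bUserSet); otherwise 0 is returned. */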
+int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info)
+{
+    int      same_count    = 0;
+    int      ngpu          = gpu_info->ncuda_dev_use;
 
-            if (bSomeSame)
+    if (gpu_info->bUserSet)
+    {
+        int      i, j;
+
+        for (i = 0; i < ngpu - 1; i++)
+        {
+            for (j = i + 1; j < ngpu; j++)
             {
-                md_print_warn(cr, fplog,
-                              "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                              "      multiple %s%s; this should be avoided as it can cause\n"
-                              "      performance loss.\n",
-                              same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
+                same_count      += (gpu_info->cuda_dev_use[i] ==
+                                    gpu_info->cuda_dev_use[j]);
             }
         }
-        print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
     }
+
+    return same_count;
 }
 
+
 /* Return the number of hardware threads supported by the current CPU.
  * We assume that this is equal with the number of CPUs reported to be
  * online by the OS at the time of the call.
@@ -448,10 +493,9 @@ static int get_nthreads_hw_avail(FILE gmx_unused *fplog, const t_commrec gmx_unu
     return ret;
 }
 
-void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
-                         const t_commrec *cr,
-                         gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
-                         const char *gpu_id)
+gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
+                                   gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
+                                   const char *gpu_id)
 {
     int              i;
     const char      *env;
@@ -459,132 +503,157 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
     gmx_hw_info_t   *hw;
     gmx_gpu_info_t   gpuinfo_auto, gpuinfo_user;
     gmx_bool         bGPUBin;
+    int              ret;
 
-    assert(hwinfo);
-
-    /* detect CPUID info; no fuss, we don't detect system-wide
-     * -- sloppy, but that's it for now */
-    if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
+    /* make sure no one else is doing the same thing */
+    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
+    if (ret != 0)
     {
-        gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
     }
 
-    /* detect number of hardware threads */
-    hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+    /* only initialize the hwinfo structure if it is not already initialized */
+    if (n_hwinfo == 0)
+    {
+        snew(hwinfo_g, 1);
+        hwinfo_g->bConsistencyChecked = FALSE;
 
-    /* detect GPUs */
-    hwinfo->gpu_info.ncuda_dev_use  = 0;
-    hwinfo->gpu_info.cuda_dev_use   = NULL;
-    hwinfo->gpu_info.ncuda_dev      = 0;
-    hwinfo->gpu_info.cuda_dev       = NULL;
+        /* detect CPUID info; no fuss, we don't detect system-wide
+         * -- sloppy, but that's it for now */
+        if (gmx_cpuid_init(&hwinfo_g->cpuid_info) != 0)
+        {
+            gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
+        }
+
+        /* detect number of hardware threads */
+        hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
+
+        /* detect GPUs */
+        hwinfo_g->gpu_info.ncuda_dev_use  = 0;
+        hwinfo_g->gpu_info.cuda_dev_use   = NULL;
+        hwinfo_g->gpu_info.ncuda_dev      = 0;
+        hwinfo_g->gpu_info.cuda_dev       = NULL;
 
 #ifdef GMX_GPU
-    bGPUBin      = TRUE;
+        bGPUBin      = TRUE;
 #else
-    bGPUBin      = FALSE;
+        bGPUBin      = FALSE;
 #endif
 
-    /* Bail if binary is not compiled with GPU acceleration, but this is either
-     * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
-    if (bForceUseGPU && !bGPUBin)
-    {
-        gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
-    }
-    if (gpu_id != NULL && !bGPUBin)
-    {
-        gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
-    }
-
-    /* run the detection if the binary was compiled with GPU support */
-    if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
-    {
-        char detection_error[STRLEN];
-
-        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
+        /* Bail if the binary was not compiled with GPU acceleration, but GPU use
+         * was requested either explicitly (-nb gpu) or implicitly (a GPU ID was passed). */
+        if (bForceUseGPU && !bGPUBin)
         {
-            if (detection_error != NULL && detection_error[0] != '\0')
-            {
-                sprintf(sbuf, ":\n      %s\n", detection_error);
-            }
-            else
-            {
-                sprintf(sbuf, ".");
-            }
-            md_print_warn(cr, fplog,
-                          "NOTE: Error occurred during GPU detection%s"
-                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
-                          sbuf);
+            gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
         }
-    }
-
-    if (bForceUseGPU || bTryUseGPU)
-    {
-        env = getenv("GMX_GPU_ID");
-        if (env != NULL && gpu_id != NULL)
+        if (gpu_id != NULL && !bGPUBin)
         {
-            gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+            gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
         }
-        if (env == NULL)
+
+        /* run the detection if the binary was compiled with GPU support */
+        if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
         {
-            env = gpu_id;
+            char detection_error[STRLEN];
+
+            if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+            {
+                if (detection_error != NULL && detection_error[0] != '\0')
+                {
+                    sprintf(sbuf, ":\n      %s\n", detection_error);
+                }
+                else
+                {
+                    sprintf(sbuf, ".\n");
+                }
+                md_print_warn(cr, fplog,
+                              "NOTE: Error occurred during GPU detection%s"
+                              "      Cannot use GPU acceleration; will fall back to CPU kernels.\n",
+                              sbuf);
+            }
         }
 
-        /* parse GPU IDs if the user passed any */
-        if (env != NULL)
+        if (bForceUseGPU || bTryUseGPU)
         {
-            int *gpuid, *checkres;
-            int  nid, res;
+            env = getenv("GMX_GPU_ID");
+            if (env != NULL && gpu_id != NULL)
+            {
+                gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
+            }
+            if (env == NULL)
+            {
+                env = gpu_id;
+            }
 
-            snew(gpuid, max_gpu_ids_user);
-            snew(checkres, max_gpu_ids_user);
+            /* parse GPU IDs if the user passed any */
+            if (env != NULL)
+            {
+                int *gpuid, *checkres;
+                int  nid, res;
 
-            parse_gpu_id_plain_string(env, &nid, gpuid);
+                snew(gpuid, max_gpu_ids_user);
+                snew(checkres, max_gpu_ids_user);
 
-            if (nid == 0)
-            {
-                gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
-            }
+                parse_gpu_id_plain_string(env, &nid, gpuid);
 
-            res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
+                if (nid == 0)
+                {
+                    gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
+                              invalid_gpuid_hint);
+                }
 
-            if (!res)
-            {
-                print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+                res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info,
+                                             gpuid, nid);
 
-                sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
-                for (i = 0; i < nid; i++)
+                if (!res)
                 {
-                    if (checkres[i] != egpuCompatible)
+                    print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr);
+
+                    sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
+                    for (i = 0; i < nid; i++)
                     {
-                        sprintf(stmp, "    GPU #%d: %s\n",
-                                gpuid[i], gpu_detect_res_str[checkres[i]]);
-                        strcat(sbuf, stmp);
+                        if (checkres[i] != egpuCompatible)
+                        {
+                            sprintf(stmp, "    GPU #%d: %s\n",
+                                    gpuid[i], gpu_detect_res_str[checkres[i]]);
+                            strcat(sbuf, stmp);
+                        }
                     }
+                    gmx_fatal(FARGS, "%s", sbuf);
                 }
-                gmx_fatal(FARGS, "%s", sbuf);
-            }
 
-            hwinfo->gpu_info.bUserSet = TRUE;
+                hwinfo_g->gpu_info.bUserSet = TRUE;
 
-            sfree(gpuid);
-            sfree(checkres);
-        }
-        else
-        {
-            pick_compatible_gpus(&hwinfo->gpu_info);
-            hwinfo->gpu_info.bUserSet = FALSE;
-        }
+                sfree(gpuid);
+                sfree(checkres);
+            }
+            else
+            {
+                pick_compatible_gpus(&hwinfo_g->gpu_info);
+                hwinfo_g->gpu_info.bUserSet = FALSE;
+            }
 
-        /* decide whether we can use GPU */
-        hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
-        if (!hwinfo->bCanUseGPU && bForceUseGPU)
-        {
-            gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
+            /* decide whether we can use GPU */
+            hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0);
+            if (!hwinfo_g->bCanUseGPU && bForceUseGPU)
+            {
+                gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
+            }
         }
     }
+    /* increase the reference counter */
+    n_hwinfo++;
+
+    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+    }
+
+    return hwinfo_g;
 }
 
-void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
+static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 {
     int ndev_use;
 
@@ -612,10 +681,38 @@ void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 
 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
 {
-    if (hwinfo)
+    int ret;
+
+    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
+    if (ret != 0)
+    {
+        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
+    }
+
+    /* decrease the reference counter */
+    n_hwinfo--;
+
+
+    if (hwinfo != hwinfo_g)
+    {
+        gmx_incons("hwinfo != hwinfo_g");
+    }
+
+    if (n_hwinfo < 0)
+    {
+        gmx_incons("n_hwinfo < 0");
+    }
+
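+    /* release the hardware info only when the last reference is gone */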
+    if (n_hwinfo == 0)
+    {
+        gmx_cpuid_done(hwinfo_g->cpuid_info);
+        free_gpu_info(&hwinfo_g->gpu_info);
+        sfree(hwinfo_g);
+    }
+
+    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
+    if (ret != 0)
     {
-        gmx_cpuid_done(hwinfo->cpuid_info);
-        free_gpu_info(&hwinfo->gpu_info);
-        sfree(hwinfo);
+        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
     }
 }