From: Berk Hess
Date: Thu, 10 Oct 2013 15:39:25 +0000 (+0200)
Subject: reorganized GPU detection and selection
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=95d10d3903aed9c700009f6effece9c6f1d60517;p=alexxy%2Fgromacs.git

reorganized GPU detection and selection

GPU selection has been separated from GPU detection and now happens
after the thread-MPI threads are started. The user-/auto-selected GPU
options have been removed from gmx_hw_info_t, so that it only contains
hardware information and can be passed around as const. As both the
CPU and GPU options structs are now tMPI-rank local, tMPI thread
concurrency issues are avoided.

Fixes #1334 #1359

GPU detection is now skipped with mdrun -nb cpu.

A CPU acceleration binary/hardware mismatch is now printed only once
to stderr (instead of once per MPI rank to stdout).

Removed the master_inf_t struct.

Change-Id: If497f611b911808f6d01ca83f41ae288061dd361
---

diff --git a/include/gmx_cpuid.h b/include/gmx_cpuid.h
index e623351f38..0ddbe358be 100644
--- a/include/gmx_cpuid.h
+++ b/include/gmx_cpuid.h
@@ -297,11 +297,13 @@ gmx_cpuid_acceleration_suggest  (gmx_cpuid_t cpuid);

 /* Check if this binary was compiled with the same acceleration as we
  * would suggest for the current hardware. Always print stats to the log file
- * if it is non-NULL, and print a warning in stdout if we don't have a match.
+ * if it is non-NULL; if we don't have a match, print a warning to the log
+ * (if non-NULL) and, if print_to_stderr!=0, also to stderr.
 */
int
gmx_cpuid_acceleration_check    (gmx_cpuid_t cpuid,
-                                FILE * log);
+                                FILE * log,
+                                int print_to_stderr);


/* Release resources used by data structure. Note that the pointer to the
diff --git a/include/gmx_detect_hardware.h b/include/gmx_detect_hardware.h
index 787fc561f6..a8cdc7fb0e 100644
--- a/include/gmx_detect_hardware.h
+++ b/include/gmx_detect_hardware.h
@@ -52,18 +52,27 @@ extern "C" {

 /* return a pointer to a global hwinfo structure. */
 GMX_LIBGMX_EXPORT
 gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
-                                   gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
-                                   const char *gpu_id);
+                                   gmx_bool bDetectGPUs);

 GMX_LIBGMX_EXPORT
 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);

-/* Check the thread count + GPU assignment. This function must
-   either be run by all threads that persist (i.e. all tmpi threads),
-   or be run before they are created. */
 GMX_LIBGMX_EXPORT
-void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
-                                      const t_commrec *cr, int ntmpi_requsted,
+void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt);
+
+GMX_LIBGMX_EXPORT
+void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
+                        const gmx_gpu_info_t *gpu_info,
+                        gmx_bool bForceUseGPU,
+                        gmx_gpu_opt_t *gpu_opt);
+
+/* Check the consistency of hw_opt with hwinfo.
+   This function should be called once on each MPI rank.
*/ +GMX_LIBGMX_EXPORT +void gmx_check_hw_runconf_consistency(FILE *fplog, + const gmx_hw_info_t *hwinfo, + const t_commrec *cr, + const gmx_hw_opt_t *hw_opt, gmx_bool bUseGPU); #endif @@ -71,11 +80,11 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo, /* Check whether a GPU is shared among ranks, and return the number of shared gpus - hwinfo = the hwinfo struct + gpu_opt = the gpu options struct returns: The number of GPUs shared among ranks, or 0 */ GMX_LIBGMX_EXPORT -int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info); +int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt); #ifdef __cplusplus diff --git a/include/gpu_utils.h b/include/gpu_utils.h index 6c678c7d5a..68f8f7a2c7 100644 --- a/include/gpu_utils.h +++ b/include/gpu_utils.h @@ -69,17 +69,21 @@ FUNC_QUALIFIER int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) FUNC_TERM_INT FUNC_QUALIFIER -void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID +void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) FUNC_TERM_VOID FUNC_QUALIFIER -gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info, - const int *requested_devs, int count) FUNC_TERM_INT +gmx_bool check_selected_cuda_gpus(int *checkres, + const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) FUNC_TERM_INT FUNC_QUALIFIER void free_gpu_info(const gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID FUNC_QUALIFIER -gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info) FUNC_TERM_INT +gmx_bool init_gpu(int mygpu, char *result_str, + const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt) FUNC_TERM_INT FUNC_QUALIFIER gmx_bool free_gpu(char *result_str) FUNC_TERM_INT @@ -89,7 +93,9 @@ FUNC_QUALIFIER int get_current_gpu_device_id(void) FUNC_TERM_INT FUNC_QUALIFIER -int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int index) FUNC_TERM_INT +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt, + int index) FUNC_TERM_INT FUNC_QUALIFIER void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index) FUNC_TERM_VOID diff --git a/include/mdrun.h b/include/mdrun.h index 98ff760bc6..1e2b9b5189 100644 --- a/include/mdrun.h +++ b/include/mdrun.h @@ -93,22 +93,6 @@ enum { ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR }; -/* The options for the thread affinity setting, default: auto */ -enum { - threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR -}; - -typedef struct { - int nthreads_tot; /* Total number of threads requested (TMPI) */ - int nthreads_tmpi; /* Number of TMPI threads requested */ - int nthreads_omp; /* Number of OpenMP threads requested */ - int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */ - int thread_affinity; /* Thread affinity switch, see enum above */ - int core_pinning_stride; /* Logical core pinning stride */ - int core_pinning_offset; /* Logical core pinning offset */ - char *gpu_id; /* GPU id's to use, each specified as chars */ -} gmx_hw_opt_t; - /* Variables for temporary use with the deform option, * used in runner.c and md.c. * (These variables should be stored in the tpx file.) 
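
The declarations above split the old one-shot consistency check into
separate parsing, detection, and selection steps. Below is a minimal
caller sketch of the intended call order; setup_gpus() is a hypothetical
helper, not code from this commit (the real sequence lives in
src/kernel/runner.c further down). A plain ID string such as "0011"
assigns four PP ranks to GPUs 0, 0, 1 and 1, in rank order.

    #include "gmx_detect_hardware.h"

    /* Hypothetical driver sketch; setup_gpus() is not part of this commit. */
    static void setup_gpus(FILE *fplog, t_commrec *cr, gmx_hw_opt_t *hw_opt,
                           gmx_bool bForceUseGPU, gmx_bool bTryUseGPU)
    {
        gmx_hw_info_t *hwinfo;
        gmx_bool       bUseGPU;

        /* Parse -gpu_id/GMX_GPU_ID; sets gpu_opt->bUserSet when IDs are given */
        gmx_parse_gpu_ids(&hw_opt->gpu_opt);

        /* Detection is per-process and const; skipped for mdrun -nb cpu */
        hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);

        /* Simplified: in runner.c the master rank decides and broadcasts */
        bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0);

        if (bUseGPU)
        {
            /* Rank-local GPU selection, done after thread-MPI ranks exist */
            gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
                               &hw_opt->gpu_opt);
        }

        /* One GPU per PP rank, no GPU sharing under thread-MPI, etc. */
        gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
    }

Because gmx_gpu_opt_t is rank-local, this sequence needs none of the
mutex protection the old combined check required.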
diff --git a/include/nbnxn_cuda_data_mgmt.h b/include/nbnxn_cuda_data_mgmt.h index f215978a0e..02af68fa4e 100644 --- a/include/nbnxn_cuda_data_mgmt.h +++ b/include/nbnxn_cuda_data_mgmt.h @@ -60,7 +60,9 @@ extern "C" { FUNC_QUALIFIER void nbnxn_cuda_init(FILE *fplog, nbnxn_cuda_ptr_t *p_cu_nb, - const gmx_gpu_info_t *gpu_info, int my_gpu_index, + const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt, + int my_gpu_index, /* true of both local and non-local are don on GPU */ gmx_bool bLocalAndNonlocal) FUNC_TERM diff --git a/include/types/forcerec.h b/include/types/forcerec.h index 2e78634a43..920a81db54 100644 --- a/include/types/forcerec.h +++ b/include/types/forcerec.h @@ -200,6 +200,7 @@ typedef struct { rvec posres_comB; const gmx_hw_info_t *hwinfo; + const gmx_gpu_opt_t *gpu_opt; gmx_bool use_cpu_acceleration; /* Interaction for calculated in kernels. In many cases this is similar to diff --git a/include/types/hw_info.h b/include/types/hw_info.h index f3c9c9284b..db9ce3b7fc 100644 --- a/include/types/hw_info.h +++ b/include/types/hw_info.h @@ -67,12 +67,10 @@ static const char * const gpu_detect_res_str[] = * The gmx_hardware_detect module initializes it. */ typedef struct { - gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */ - - int ncuda_dev_use; /* number of devices selected to be used */ - int *cuda_dev_use; /* index of the devices selected to be used */ + gmx_bool bDetectGPUs; /* Did we try to detect GPUs? */ int ncuda_dev; /* total number of devices detected */ cuda_dev_info_ptr_t cuda_dev; /* devices detected in the system (per node) */ + int ncuda_dev_compatible; /* number of compatible GPUs */ } gmx_gpu_info_t; /* Hardware information structure with CPU and GPU information. @@ -81,7 +79,6 @@ typedef struct * (i.e. must be able to be shared among all threads) */ typedef struct { - gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */ gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */ gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected; @@ -90,11 +87,37 @@ typedef struct int nthreads_hw_avail; /* Number of hardware threads available; this number is based on the number of CPUs reported as available by the OS at the time of detection. 
*/ - gmx_bool bConsistencyChecked; /* whether - gmx_check_hw_runconf_consistency() - has been run with this hw_info */ } gmx_hw_info_t; + +/* The options for the thread affinity setting, default: auto */ +enum { + threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR +}; + +/* GPU device selection information -- for now with only CUDA devices */ +typedef struct +{ + char *gpu_id; /* GPU id's to use, each specified as chars */ + gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */ + + int ncuda_dev_use; /* number of devices selected to be used */ + int *cuda_dev_use; /* index of the devices selected to be used */ +} gmx_gpu_opt_t; + +/* Threading and GPU options, can be set automatically or by the user */ +typedef struct { + int nthreads_tot; /* Total number of threads requested (TMPI) */ + int nthreads_tmpi; /* Number of TMPI threads requested */ + int nthreads_omp; /* Number of OpenMP threads requested */ + int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */ + int thread_affinity; /* Thread affinity switch, see enum above */ + int core_pinning_stride; /* Logical core pinning stride */ + int core_pinning_offset; /* Logical core pinning offset */ + + gmx_gpu_opt_t gpu_opt; /* The GPU options */ +} gmx_hw_opt_t; + #ifdef __cplusplus } #endif diff --git a/src/gmxlib/gmx_cpuid.c b/src/gmxlib/gmx_cpuid.c index 0eb36fa584..7b9356451b 100644 --- a/src/gmxlib/gmx_cpuid.c +++ b/src/gmxlib/gmx_cpuid.c @@ -1105,7 +1105,8 @@ gmx_cpuid_acceleration_suggest (gmx_cpuid_t cpuid) int gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, - FILE * log) + FILE * log, + int print_to_stderr) { int rc; char str[1024]; @@ -1140,9 +1141,12 @@ gmx_cpuid_acceleration_check(gmx_cpuid_t cpuid, gmx_cpuid_acceleration_string[acc], gmx_cpuid_acceleration_string[compiled_acc]); } - printf("Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n", - gmx_cpuid_acceleration_string[compiled_acc], - gmx_cpuid_acceleration_string[acc]); + if (print_to_stderr) + { + fprintf(stderr, "Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n", + gmx_cpuid_acceleration_string[compiled_acc], + gmx_cpuid_acceleration_string[acc]); + } } return rc; } diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c index 640d74e85e..c407f40b5d 100644 --- a/src/gmxlib/gmx_detect_hardware.c +++ b/src/gmxlib/gmx_detect_hardware.c @@ -58,12 +58,11 @@ #include "windows.h" #endif -/* Although we can't have more than 10 GPU different ID-s passed by the user as - * the id-s are assumed to be represented by single digits, as multiple - * processes can share a GPU, we can end up with more than 10 IDs. - * To account for potential extreme cases we'll set the limit to a pretty - * ridiculous number. */ -static unsigned int max_gpu_ids_user = 64; +#ifdef GMX_GPU +const gmx_bool bGPUBinary = TRUE; +#else +const gmx_bool bGPUBinary = FALSE; +#endif static const char * invalid_gpuid_hint = "A delimiter-free sequence of valid numeric IDs of available GPUs is expected."; @@ -77,7 +76,7 @@ static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER; /* FW decl. 
*/ -static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count); +static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int count); static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info, gmx_bool bPrintAll) { @@ -106,6 +105,12 @@ static void print_gpu_detection_stats(FILE *fplog, char onhost[266], stmp[STRLEN]; int ngpu; + if (!gpu_info->bDetectGPUs) + { + /* We skipped the detection, so don't print detection stats */ + return; + } + ngpu = gpu_info->ncuda_dev; #if defined GMX_MPI && !defined GMX_THREAD_MPI @@ -131,31 +136,32 @@ static void print_gpu_detection_stats(FILE *fplog, static void print_gpu_use_stats(FILE *fplog, const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt, const t_commrec *cr) { char sbuf[STRLEN], stmp[STRLEN]; - int i, ngpu, ngpu_all; + int i, ngpu_comp, ngpu_use; - ngpu = gpu_info->ncuda_dev_use; - ngpu_all = gpu_info->ncuda_dev; + ngpu_comp = gpu_info->ncuda_dev_compatible; + ngpu_use = gpu_opt->ncuda_dev_use; - /* Issue note if GPUs are available but not used */ - if (ngpu_all > 0 && ngpu < 1) + /* Issue a note if GPUs are available but not used */ + if (ngpu_comp > 0 && ngpu_use < 1) { sprintf(sbuf, "%d compatible GPU%s detected in the system, but none will be used.\n" "Consider trying GPU acceleration with the Verlet scheme!", - ngpu_all, (ngpu_all > 1) ? "s" : ""); + ngpu_comp, (ngpu_comp > 1) ? "s" : ""); } else { sprintf(sbuf, "%d GPU%s %sselected for this run: ", - ngpu, (ngpu > 1) ? "s" : "", - gpu_info->bUserSet ? "user-" : "auto-"); - for (i = 0; i < ngpu; i++) + ngpu_use, (ngpu_use > 1) ? "s" : "", + gpu_opt->bUserSet ? "user-" : "auto-"); + for (i = 0; i < ngpu_use; i++) { - sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, i)); - if (i < ngpu - 1) + sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, gpu_opt, i)); + if (i < ngpu_use - 1) { strcat(stmp, ", "); } @@ -167,20 +173,13 @@ static void print_gpu_use_stats(FILE *fplog, /* Parse a "plain" GPU ID string which contains a sequence of digits corresponding * to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment. 
*/ -static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist) +static void parse_gpu_id_plain_string(const char *idstr, int *nid, int **idlist) { - int i; - size_t len_idstr; - - len_idstr = strlen(idstr); + int i; - if (len_idstr > max_gpu_ids_user) - { - gmx_fatal(FARGS, "%d GPU IDs provided, but only at most %d are supported", - len_idstr, max_gpu_ids_user); - } + *nid = strlen(idstr); - *nid = len_idstr; + snew(*idlist, *nid); for (i = 0; i < *nid; i++) { @@ -189,7 +188,7 @@ static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist) gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n", idstr[i], invalid_gpuid_hint); } - idlist[i] = idstr[i] - '0'; + (*idlist)[i] = idstr[i] - '0'; } } @@ -199,17 +198,15 @@ static void parse_gpu_id_csv_string(const char *idstr, int *nid, int *idlist) gmx_incons("Not implemented yet"); } -void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo, - const t_commrec *cr, int ntmpi_requested, +void gmx_check_hw_runconf_consistency(FILE *fplog, + const gmx_hw_info_t *hwinfo, + const t_commrec *cr, + const gmx_hw_opt_t *hw_opt, gmx_bool bUseGPU) { - int npppn, ntmpi_pp, ngpu; - char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN]; - char gpu_plural[2]; - gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU; - int ret; - static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER; - + int npppn, ntmpi_pp; + char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN]; + gmx_bool btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU; assert(hwinfo); assert(cr); @@ -223,206 +220,176 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo, return; } - /* We run this function only once, but must make sure that all threads - that are alive run this function, so they get consistent data. We - achieve this by mutual exclusion and returning if the structure is - already properly checked & set */ - ret = tMPI_Thread_mutex_lock(&cons_lock); - if (ret != 0) + btMPI = bMPI = FALSE; + bNthreadsAuto = FALSE; +#if defined(GMX_THREAD_MPI) + btMPI = TRUE; + bNthreadsAuto = (hw_opt->nthreads_tmpi < 1); +#elif defined(GMX_LIB_MPI) + bMPI = TRUE; +#endif + + /* GPU emulation detection is done later, but we need here as well + * -- uncool, but there's no elegant workaround */ + bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL); + bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL); + + /* check the acceleration mdrun is compiled with against hardware + capabilities */ + /* TODO: Here we assume homogeneous hardware which is not necessarily + the case! Might not hurt to add an extra check over MPI. 
 */
+    gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr));
+
+    /* NOTE: this print is only for and on one physical node */
+    print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+
+    if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
     {
-        gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
+        /* NOTE: this print is only for and on one physical node */
+        print_gpu_use_stats(fplog, &hwinfo->gpu_info, &hw_opt->gpu_opt, cr);
     }

-    if (!hwinfo->bConsistencyChecked)
+    /* Need to ensure that we have enough GPUs:
+     * - need one GPU per PP node
+     * - no GPU oversubscription with tMPI
+     * */
+    /* number of PP processes per node */
+    npppn = cr->nrank_pp_intranode;
+
+    pernode[0]           = '\0';
+    th_or_proc_plural[0] = '\0';
+    if (btMPI)
     {
-        btMPI = bMPI = FALSE;
-        bNthreadsAuto = FALSE;
-#if defined(GMX_THREAD_MPI)
-        btMPI = TRUE;
-        bNthreadsAuto = (ntmpi_requested < 1);
-#elif defined(GMX_LIB_MPI)
-        bMPI  = TRUE;
-#endif
+        sprintf(th_or_proc, "thread-MPI thread");
+        if (npppn > 1)
+        {
+            sprintf(th_or_proc_plural, "s");
+        }
+    }
+    else if (bMPI)
+    {
+        sprintf(th_or_proc, "MPI process");
+        if (npppn > 1)
+        {
+            sprintf(th_or_proc_plural, "es");
+        }
+        sprintf(pernode, " per node");
+    }
+    else
+    {
+        /* neither MPI nor tMPI */
+        sprintf(th_or_proc, "process");
+    }

-#ifdef GMX_GPU
-        bGPUBin      = TRUE;
-#else
-        bGPUBin      = FALSE;
-#endif
+    if (bUseGPU && hwinfo->gpu_info.ncuda_dev_compatible > 0 &&
+        !bEmulateGPU)
+    {
+        int  ngpu_comp, ngpu_use;
+        char gpu_comp_plural[2], gpu_use_plural[2];
+
+        ngpu_comp = hwinfo->gpu_info.ncuda_dev_compatible;
+        ngpu_use  = hw_opt->gpu_opt.ncuda_dev_use;

-        /* GPU emulation detection is done later, but we need here as well
-         * -- uncool, but there's no elegant workaround */
-        bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
-        bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
-
-        /* check the acceleration mdrun is compiled with against hardware
-           capabilities */
-        /* TODO: Here we assume homogeneous hardware which is not necessarily
-                 the case! Might not hurt to add an extra check over MPI. */
-        gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
-
-        /* Need to ensure that we have enough GPUs:
-         * - need one GPU per PP node
-         * - no GPU oversubscription with tMPI
-         * => keep on the GPU support, otherwise turn off (or bail if forced)
-         * */
-        /* number of PP processes per node */
-        npppn = cr->nrank_pp_intranode;
-
-        pernode[0]           = '\0';
-        th_or_proc_plural[0] = '\0';
-        if (btMPI)
+        sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
+        sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
+
+        /* number of tMPI threads auto-adjusted */
+        if (btMPI && bNthreadsAuto)
         {
-            sprintf(th_or_proc, "thread-MPI thread");
-            if (npppn > 1)
+            if (hw_opt->gpu_opt.bUserSet && npppn < ngpu_use)
             {
-                sprintf(th_or_proc_plural, "s");
+                /* The user manually provided more GPUs than threads we
+                   could automatically start. */
+                gmx_fatal(FARGS,
+                          "%d GPU%s provided, but only %d PP thread-MPI thread%s could be started.\n"
+                          "%s requires one PP thread-MPI thread per GPU; use fewer GPUs%s.",
+                          ngpu_use, gpu_use_plural,
+                          npppn, th_or_proc_plural,
+                          ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
             }
-        }
-        else if (bMPI)
-        {
-            sprintf(th_or_proc, "MPI process");
-            if (npppn > 1)
+
+            if (!hw_opt->gpu_opt.bUserSet && npppn < ngpu_comp)
             {
-                sprintf(th_or_proc_plural, "es");
+                /* There are more GPUs than tMPI threads; we have
+                   limited the number of GPUs used.
 */
+                md_print_warn(cr, fplog,
+                              "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
+                              "      %s can use one GPU per PP thread-MPI thread, so only %d GPU%s will be used.%s\n",
+                              ngpu_comp, gpu_comp_plural,
+                              npppn, th_or_proc_plural,
+                              ShortProgram(), npppn,
+                              npppn > 1 ? "s" : "",
+                              bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
             }
-            sprintf(pernode, " per node");
-        }
-        else
-        {
-            /* neither MPI nor tMPI */
-            sprintf(th_or_proc, "process");
         }

-        if (bGPUBin)
+        if (hw_opt->gpu_opt.bUserSet)
         {
-            print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
+            if (ngpu_use != npppn)
+            {
+                gmx_fatal(FARGS,
+                          "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
+                          "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+                          th_or_proc, btMPI ? "s" : "es", pernode,
+                          ShortProgram(), npppn, th_or_proc,
+                          th_or_proc_plural, pernode,
+                          ngpu_use, gpu_use_plural);
+            }
         }
-
-        if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
+        else
         {
-            ngpu = hwinfo->gpu_info.ncuda_dev_use;
-            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
-
-            /* number of tMPI threads atuo-adjusted */
-            if (btMPI && bNthreadsAuto)
+            if (ngpu_comp > npppn)
             {
-                if (npppn < ngpu)
-                {
-                    if (hwinfo->gpu_info.bUserSet)
-                    {
-                        /* The user manually provided more GPUs than threads we
-                           could automatically start. */
-                        gmx_fatal(FARGS,
-                                  "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
-                                  "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
-                                  ngpu, gpu_plural, npppn, th_or_proc_plural,
-                                  ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
-                    }
-                    else
-                    {
-                        /* There are more GPUs than tMPI threads; we have to
-                           limit the number GPUs used. */
-                        md_print_warn(cr, fplog,
-                                      "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
-                                      "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
-                                      ngpu, gpu_plural, npppn,
-                                      th_or_proc_plural,
-                                      ShortProgram(), npppn,
-                                      npppn > 1 ? "s" : "",
-                                      bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
-
-                        if (cr->rank_pp_intranode == 0)
-                        {
-                            limit_num_gpus_used(hwinfo, npppn);
-                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
-                        }
-                    }
-                }
-            }
+                md_print_warn(cr, fplog,
+                              "NOTE: potentially sub-optimal launch configuration, %s started with fewer\n"
+                              "      PP %s%s%s than GPU%s available.\n"
+                              "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
+                              ShortProgram(), th_or_proc,
+                              th_or_proc_plural, pernode, gpu_comp_plural,
+                              th_or_proc, npppn, gpu_use_plural, pernode);
+            }

-            if (ngpu != npppn)
+            if (ngpu_use != npppn)
             {
-                if (hwinfo->gpu_info.bUserSet)
+                /* Avoid duplicate error messages.
+                 * Unfortunately we can only do this at the physical node
+                 * level, since the hardware setup and MPI process count
+                 * might differ between physical nodes.
+                 */
+                if (cr->rank_pp_intranode == 0)
                 {
                     gmx_fatal(FARGS,
                               "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                              "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
+                              "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
                               th_or_proc, btMPI ?
"s" : "es", pernode, ShortProgram(), npppn, th_or_proc, - th_or_proc_plural, pernode, ngpu, gpu_plural); - } - else - { - if (ngpu > npppn) - { - md_print_warn(cr, fplog, - "NOTE: potentially sub-optimal launch configuration, %s started with less\n" - " PP %s%s%s than GPU%s available.\n" - " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n", - ShortProgram(), th_or_proc, - th_or_proc_plural, pernode, gpu_plural, - th_or_proc, npppn, gpu_plural, pernode); - - if (bMPI || (btMPI && cr->rank_pp_intranode == 0)) - { - limit_num_gpus_used(hwinfo, npppn); - ngpu = hwinfo->gpu_info.ncuda_dev_use; - sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : ""); - } - } - else - { - /* Avoid duplicate error messages. - * Unfortunately we can only do this at the physical node - * level, since the hardware setup and MPI process count - * might be differ over physical nodes. - */ - if (cr->rank_pp_intranode == 0) - { - gmx_fatal(FARGS, - "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n" - "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.", - th_or_proc, btMPI ? "s" : "es", pernode, - ShortProgram(), npppn, th_or_proc, - th_or_proc_plural, pernode, ngpu, - gpu_plural); - } - } + th_or_proc_plural, pernode, + ngpu_use, gpu_use_plural); } } + } - { - int same_count; + { + int same_count; - same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info)); + same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt); - if (btMPI && same_count > 0) - { - gmx_fatal(FARGS, - "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n" - "Use MPI if you are sure that you want to assign GPU to multiple threads."); - } + if (btMPI && same_count > 0) + { + gmx_fatal(FARGS, + "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n" + "Use MPI if you are sure that you want to assign a GPU to multiple threads."); + } - if (same_count > 0) - { - md_print_warn(cr, fplog, - "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n" - " multiple %s%s; this should be avoided as it can cause\n" - " performance loss.\n", - same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es"); - } + if (same_count > 0) + { + md_print_warn(cr, fplog, + "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n" + " multiple %s%s; this should be avoided as it can cause\n" + " performance loss.\n", + same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? 
"s" : "es"); } - print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr); } - hwinfo->bConsistencyChecked = TRUE; - } - - ret = tMPI_Thread_mutex_unlock(&cons_lock); - if (ret != 0) - { - gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno)); } #ifdef GMX_MPI @@ -436,12 +403,12 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo, } -int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info) +int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt) { int same_count = 0; - int ngpu = gpu_info->ncuda_dev_use; + int ngpu = gpu_opt->ncuda_dev_use; - if (gpu_info->bUserSet) + if (gpu_opt->bUserSet) { int i, j; @@ -449,8 +416,8 @@ int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info) { for (j = i + 1; j < ngpu; j++) { - same_count += (gpu_info->cuda_dev_use[i] == - gpu_info->cuda_dev_use[j]); + same_count += (gpu_opt->cuda_dev_use[i] == + gpu_opt->cuda_dev_use[j]); } } } @@ -511,15 +478,11 @@ static int get_nthreads_hw_avail(FILE *fplog, const t_commrec *cr) } gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr, - gmx_bool bForceUseGPU, gmx_bool bTryUseGPU, - const char *gpu_id) + gmx_bool bDetectGPUs) { - int i; - const char *env; - char sbuf[STRLEN], stmp[STRLEN]; + char sbuf[STRLEN]; gmx_hw_info_t *hw; gmx_gpu_info_t gpuinfo_auto, gpuinfo_user; - gmx_bool bGPUBin; int ret; /* make sure no one else is doing the same thing */ @@ -533,7 +496,6 @@ gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr, if (n_hwinfo == 0) { snew(hwinfo_g, 1); - hwinfo_g->bConsistencyChecked = FALSE; /* detect CPUID info; no fuss, we don't detect system-wide * -- sloppy, but that's it for now */ @@ -546,30 +508,17 @@ gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr, hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr); /* detect GPUs */ - hwinfo_g->gpu_info.ncuda_dev_use = 0; - hwinfo_g->gpu_info.cuda_dev_use = NULL; - hwinfo_g->gpu_info.ncuda_dev = 0; - hwinfo_g->gpu_info.cuda_dev = NULL; - -#ifdef GMX_GPU - bGPUBin = TRUE; -#else - bGPUBin = FALSE; -#endif - - /* Bail if binary is not compiled with GPU acceleration, but this is either - * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */ - if (bForceUseGPU && !bGPUBin) - { - gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram()); - } - if (gpu_id != NULL && !bGPUBin) - { - gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram()); - } - - /* run the detection if the binary was compiled with GPU support */ - if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL) + hwinfo_g->gpu_info.ncuda_dev = 0; + hwinfo_g->gpu_info.cuda_dev = NULL; + hwinfo_g->gpu_info.ncuda_dev_compatible = 0; + + /* Run the detection if the binary was compiled with GPU support + * and we requested detection. 
+ */ + hwinfo_g->gpu_info.bDetectGPUs = + (bGPUBinary && bDetectGPUs && + getenv("GMX_DISABLE_GPU_DETECTION") == NULL); + if (hwinfo_g->gpu_info.bDetectGPUs) { char detection_error[STRLEN]; @@ -589,94 +538,133 @@ gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr, sbuf); } } + } + /* increase the reference counter */ + n_hwinfo++; + + ret = tMPI_Thread_mutex_unlock(&hw_info_lock); + if (ret != 0) + { + gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno)); + } + + return hwinfo_g; +} + +void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt) +{ + char *env; - if (bForceUseGPU || bTryUseGPU) + if (gpu_opt->gpu_id != NULL && !bGPUBinary) + { + gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram()); + } + + env = getenv("GMX_GPU_ID"); + if (env != NULL && gpu_opt->gpu_id != NULL) + { + gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); + } + if (env == NULL) + { + env = gpu_opt->gpu_id; + } + + /* parse GPU IDs if the user passed any */ + if (env != NULL) + { + parse_gpu_id_plain_string(env, + &gpu_opt->ncuda_dev_use, + &gpu_opt->cuda_dev_use); + + if (gpu_opt->ncuda_dev_use == 0) { - env = getenv("GMX_GPU_ID"); - if (env != NULL && gpu_id != NULL) - { - gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); - } - if (env == NULL) - { - env = gpu_id; - } + gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", + invalid_gpuid_hint); + } - /* parse GPU IDs if the user passed any */ - if (env != NULL) - { - int *gpuid, *checkres; - int nid, res; + gpu_opt->bUserSet = TRUE; + } +} - snew(gpuid, max_gpu_ids_user); - snew(checkres, max_gpu_ids_user); +void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr, + const gmx_gpu_info_t *gpu_info, + gmx_bool bForceUseGPU, + gmx_gpu_opt_t *gpu_opt) +{ + int i; + const char *env; + char sbuf[STRLEN], stmp[STRLEN]; - parse_gpu_id_plain_string(env, &nid, gpuid); + /* Bail if binary is not compiled with GPU acceleration, but this is either + * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */ + if (bForceUseGPU && !bGPUBinary) + { + gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram()); + } - if (nid == 0) - { - gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", - invalid_gpuid_hint); - } + if (gpu_opt->bUserSet) + { + /* Check the GPU IDs passed by the user. 
+ * (GPU IDs have been parsed by gmx_parse_gpu_ids before) + */ + int *checkres; + int res; - res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info, - gpuid, nid); + snew(checkres, gpu_opt->ncuda_dev_use); - if (!res) - { - print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr); - - sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n"); - for (i = 0; i < nid; i++) - { - if (checkres[i] != egpuCompatible) - { - sprintf(stmp, " GPU #%d: %s\n", - gpuid[i], gpu_detect_res_str[checkres[i]]); - strcat(sbuf, stmp); - } - } - gmx_fatal(FARGS, "%s", sbuf); - } + res = check_selected_cuda_gpus(checkres, gpu_info, gpu_opt); - hwinfo_g->gpu_info.bUserSet = TRUE; + if (!res) + { + print_gpu_detection_stats(fplog, gpu_info, cr); - sfree(gpuid); - sfree(checkres); - } - else + sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n"); + for (i = 0; i < gpu_opt->ncuda_dev_use; i++) { - pick_compatible_gpus(&hwinfo_g->gpu_info); - hwinfo_g->gpu_info.bUserSet = FALSE; + if (checkres[i] != egpuCompatible) + { + sprintf(stmp, " GPU #%d: %s\n", + gpu_opt->cuda_dev_use[i], + gpu_detect_res_str[checkres[i]]); + strcat(sbuf, stmp); + } } + gmx_fatal(FARGS, "%s", sbuf); + } - /* decide whether we can use GPU */ - hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0); - if (!hwinfo_g->bCanUseGPU && bForceUseGPU) - { - gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected."); - } + sfree(checkres); + } + else + { + pick_compatible_gpus(&hwinfo_g->gpu_info, gpu_opt); + + if (gpu_opt->ncuda_dev_use > cr->nrank_pp_intranode) + { + /* We picked more GPUs than we can use: limit the number. + * We print detailed messages about this later in + * gmx_check_hw_runconf_consistency. 
+ */ + limit_num_gpus_used(gpu_opt, cr->nrank_pp_intranode); } + + gpu_opt->bUserSet = FALSE; } - /* increase the reference counter */ - n_hwinfo++; - ret = tMPI_Thread_mutex_unlock(&hw_info_lock); - if (ret != 0) + /* If the user asked for a GPU, check whether we have a GPU */ + if (bForceUseGPU && gpu_info->ncuda_dev_compatible == 0) { - gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno)); + gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected."); } - - return hwinfo_g; } -static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count) +static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int count) { int ndev_use; - assert(hwinfo); + assert(gpu_opt); - ndev_use = hwinfo->gpu_info.ncuda_dev_use; + ndev_use = gpu_opt->ncuda_dev_use; if (count > ndev_use) { @@ -693,7 +681,7 @@ static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count) } /* TODO: improve this implementation: either sort GPUs or remove the weakest here */ - hwinfo->gpu_info.ncuda_dev_use = count; + gpu_opt->ncuda_dev_use = count; } void gmx_hardware_info_free(gmx_hw_info_t *hwinfo) diff --git a/src/gmxlib/gmx_thread_affinity.c b/src/gmxlib/gmx_thread_affinity.c index e3870412e9..badb753428 100644 --- a/src/gmxlib/gmx_thread_affinity.c +++ b/src/gmxlib/gmx_thread_affinity.c @@ -50,7 +50,6 @@ #include "gmx_cpuid.h" #include "gmx_omp.h" #include "gmx_omp_nthreads.h" -#include "mdrun.h" #include "md_logging.h" #include "statutil.h" #include "gmx_thread_affinity.h" diff --git a/src/gmxlib/gpu_utils/gpu_utils.cu b/src/gmxlib/gpu_utils/gpu_utils.cu index e11367dc7e..8640ed3bc8 100644 --- a/src/gmxlib/gpu_utils/gpu_utils.cu +++ b/src/gmxlib/gpu_utils/gpu_utils.cu @@ -492,9 +492,12 @@ int do_timed_memtest(int dev_id, int time_constr) * \param[out] result_str the message related to the error that occurred * during the initialization (if there was any). * \param[in] gpu_info GPU info of all detected devices in the system. + * \param[in] gpu_opt options for using the GPUs in gpu_info * \returns true if no error occurs during initialization. */ -gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info) +gmx_bool init_gpu(int mygpu, char *result_str, + const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt) { cudaError_t stat; char sbuf[STRLEN]; @@ -503,15 +506,15 @@ gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info) assert(gpu_info); assert(result_str); - if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use) + if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use) { sprintf(sbuf, "Trying to initialize an inexistent GPU: " "there are %d %s-selected GPU(s), but #%d was requested.", - gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu); + gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? 
"user" : "auto", mygpu); gmx_incons(sbuf); } - gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id; + gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id; stat = cudaSetDevice(gpuid); strncpy(result_str, cudaGetErrorString(stat), STRLEN); @@ -651,6 +654,8 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) assert(gpu_info); assert(err_str); + gpu_info->ncuda_dev_compatible = 0; + ndev = 0; devs = NULL; @@ -677,6 +682,11 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) devs[i].id = i; devs[i].prop = prop; devs[i].stat = checkres; + + if (checkres == egpuCompatible) + { + gpu_info->ncuda_dev_compatible++; + } } retval = 0; } @@ -692,15 +702,16 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) * This function selects the compatible gpus and initializes * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use. * - * Given the list of GPUs available in the system the it checks each gpu in - * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of - * the compatible ones into cuda_dev_use with this marking the respective - * GPUs as "available for use." + * Given the list of GPUs available in the system check each device in + * gpu_info->cuda_dev and place the indices of the compatible GPUs into + * cuda_dev_use with this marking the respective GPUs as "available for use." * Note that \detect_cuda_gpus must have been called before. * - * \param[in] gpu_info pointer to structure holding GPU information + * \param[in] gpu_info pointer to structure holding GPU information + * \param[in,out] gpu_opt pointer to structure holding GPU options */ -void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) +void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info, + gmx_gpu_opt_t *gpu_opt) { int i, ncompat; int *compat; @@ -720,53 +731,52 @@ void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) } } - gpu_info->ncuda_dev_use = ncompat; - snew(gpu_info->cuda_dev_use, ncompat); - memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat)); + gpu_opt->ncuda_dev_use = ncompat; + snew(gpu_opt->cuda_dev_use, ncompat); + memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat)); sfree(compat); } /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs. * - * Given the a list of GPU devide IDs in \requested_devs, check for the - * existence and compatibility of the respective GPUs and fill in \gpu_info - * with the collected information. Also provide the caller with an array with + * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in + * gpu_opt->cuda_dev_use check the existence and compatibility + * of the respective GPUs. Also provide the caller with an array containing * the result of checks in \checkres. 
 *
 * \param[out]  checkres    check result for each ID passed in \requested_devs
 * \param[in]   gpu_info    pointer to structure holding GPU information
- * \param[in]   requested_devs array of requested device IDs
- * \param[in]   count       number of IDs in \requested_devs
- * \returns                 TRUE if every requested GPU is compatible
+ * \param[out]  gpu_opt     pointer to structure holding GPU options
+ * \returns                 TRUE if all the requested GPUs are compatible
 */
-gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
-                                const int *requested_devs, int count)
+gmx_bool check_selected_cuda_gpus(int                  *checkres,
+                                  const gmx_gpu_info_t *gpu_info,
+                                  gmx_gpu_opt_t        *gpu_opt)
 {
     int  i, id;
     bool bAllOk;

     assert(checkres);
     assert(gpu_info);
-    assert(requested_devs);
-    assert(count >= 0);
+    assert(gpu_opt->ncuda_dev_use >= 0);

-    if (count == 0)
+    if (gpu_opt->ncuda_dev_use == 0)
     {
         return TRUE;
     }

+    assert(gpu_opt->cuda_dev_use);
+
     /* we will assume that all GPUs requested are valid IDs,
        otherwise we'll bail anyways */
-    gpu_info->ncuda_dev_use = count;
-    snew(gpu_info->cuda_dev_use, count);

     bAllOk = true;
-    for (i = 0; i < count; i++)
+    for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
     {
-        id = requested_devs[i];
+        id = gpu_opt->cuda_dev_use[i];

         /* devices are stored in increasing order of IDs in cuda_dev */
-        gpu_info->cuda_dev_use[i] = id;
+        gpu_opt->cuda_dev_use[i] = id;

         checkres[i] = (id >= gpu_info->ncuda_dev) ?
             egpuNonexistent : gpu_info->cuda_dev[id].stat;
@@ -788,7 +798,6 @@ void free_gpu_info(const gmx_gpu_info_t *gpu_info)
         return;
     }

-    sfree(gpu_info->cuda_dev_use);
     sfree(gpu_info->cuda_dev);
 }

@@ -841,18 +850,22 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
 * respective CUDA GPU.
 *
 * \param[in]    gpu_info   pointer to structure holding GPU information
+ * \param[in]    gpu_opt    pointer to structure holding GPU options
 * \param[in]    idx        index into the array of used GPUs
 * \returns                 device ID of the requested GPU
 */
-int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
 {
     assert(gpu_info);
-    if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
+    assert(gpu_opt);
+    if (idx < 0 || idx >= gpu_opt->ncuda_dev_use)
     {
         return -1;
     }

-    return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
+    return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 }

 /*! \brief Returns the device ID of the GPU currently in use.
diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c
index a5c0e6a625..29cb7e3e43 100644
--- a/src/kernel/mdrun.c
+++ b/src/kernel/mdrun.c
@@ -451,7 +451,12 @@ int cmain(int argc, char *argv[])
     output_env_t    oenv                  = NULL;
     const char     *deviceOptions         = "";

-    gmx_hw_opt_t    hw_opt = {0, 0, 0, 0, threadaffSEL, 0, 0, NULL};
+    /* Non-transparent initialization of a complex gmx_hw_opt_t struct.
+     * But unfortunately we are not allowed to call a function here,
+     * since declarations follow below.
+ */ + gmx_hw_opt_t hw_opt = { 0, 0, 0, 0, threadaffSEL, 0, 0, + { NULL, FALSE, 0, NULL } }; t_pargs pa[] = { @@ -477,7 +482,7 @@ int cmain(int argc, char *argv[]) "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" }, { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, - { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_id}, + { "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_opt.gpu_id}, "List of GPU id's to use" }, { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck}, "Check for all bonded interactions with DD" }, diff --git a/src/kernel/runner.c b/src/kernel/runner.c index 70f80d13d9..3665c47406 100644 --- a/src/kernel/runner.c +++ b/src/kernel/runner.c @@ -112,7 +112,7 @@ tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER; #ifdef GMX_THREAD_MPI struct mdrunner_arglist { - gmx_hw_opt_t *hw_opt; + gmx_hw_opt_t hw_opt; FILE *fplog; t_commrec *cr; int nfile; @@ -170,7 +170,7 @@ static void mdrunner_start_fn(void *arg) fplog = mc.fplog; } - mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, + mda->ret = mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, mc.bVerbose, mc.bCompact, mc.nstglobalcomm, mc.ddxyz, mc.dd_node_order, mc.rdd, mc.rconstr, mc.dddlb_opt, mc.dlb_scale, @@ -215,7 +215,8 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, fnmn = dup_tfn(nfile, fnm); /* fill the data structure to pass as void pointer to thread start fn */ - mda->hw_opt = hw_opt; + /* hw_opt contains pointers, which should all be NULL at this stage */ + mda->hw_opt = *hw_opt; mda->fplog = fplog; mda->cr = cr; mda->nfile = nfile; @@ -370,16 +371,23 @@ static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, nthreads_tot_max = nthreads_hw; } - bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU); + bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && + hwinfo->gpu_info.ncuda_dev_compatible > 0); if (bCanUseGPU) { - ngpu = hwinfo->gpu_info.ncuda_dev_use; + ngpu = hwinfo->gpu_info.ncuda_dev_compatible; } else { ngpu = 0; } + if (inputrec->cutoff_scheme == ecutsGROUP) + { + /* We checked this before, but it doesn't hurt to do it once more */ + assert(hw_opt->nthreads_omp == 1); + } + nthreads_tmpi = get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu); @@ -641,20 +649,13 @@ static void increase_nstlist(FILE *fp, t_commrec *cr, } static void prepare_verlet_scheme(FILE *fplog, - const gmx_hw_info_t *hwinfo, t_commrec *cr, const char *nbpu_opt, t_inputrec *ir, const gmx_mtop_t *mtop, matrix box, - gmx_bool *bUseGPU) + gmx_bool bUseGPU) { - /* Here we only check for GPU usage on the MPI master process, - * as here we don't know how many GPUs we will use yet. - * We check for a GPU on all processes later. - */ - *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL); - if (ir->verletbuf_drift > 0) { /* Update the Verlet buffer size for the current run setup */ @@ -665,7 +666,7 @@ static void prepare_verlet_scheme(FILE *fplog, * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 * and 4x2 gives a larger buffer than 4x4, this is ok. 
*/ - verletbuf_get_list_setup(*bUseGPU, &ls); + verletbuf_get_list_setup(bUseGPU, &ls); calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls, @@ -685,7 +686,7 @@ static void prepare_verlet_scheme(FILE *fplog, /* With GPU or emulation we should check nstlist for performance */ if ((EI_DYNAMICS(ir->eI) && - *bUseGPU && + bUseGPU && ir->nstlist < NSTLIST_GPU_ENOUGH) || getenv(NSTLIST_ENVVAR) != NULL) { @@ -771,9 +772,19 @@ static void convert_to_verlet_scheme(FILE *fplog, gmx_mtop_remove_chargegroups(mtop); } -static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt, - int cutoff_scheme, - gmx_bool bIsSimMaster) +static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt) +{ + fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", + hw_opt->nthreads_tot, + hw_opt->nthreads_tmpi, + hw_opt->nthreads_omp, + hw_opt->nthreads_omp_pme, + hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : ""); +} + +/* Checks we can do when we don't (yet) know the cut-off scheme */ +static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt, + gmx_bool bIsSimMaster) { gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster); @@ -837,18 +848,6 @@ static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt, } #endif - if (cutoff_scheme == ecutsGROUP) - { - /* We only have OpenMP support for PME only nodes */ - if (hw_opt->nthreads_omp > 1) - { - gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s", - ecutscheme_names[cutoff_scheme], - ecutscheme_names[ecutsVERLET]); - } - hw_opt->nthreads_omp = 1; - } - if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0) { gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme"); @@ -871,15 +870,59 @@ static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt, hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; } + /* Parse GPU IDs, if provided. + * We check consistency with the tMPI thread count later. + */ + gmx_parse_gpu_ids(&hw_opt->gpu_opt); + +#ifdef GMX_THREAD_MPI + if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0) + { + /* Set the number of MPI threads equal to the number of GPUs */ + hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use; + + if (hw_opt->nthreads_tot > 0 && + hw_opt->nthreads_tmpi > hw_opt->nthreads_tot) + { + /* We have more GPUs than total threads requested. + * We choose to (later) generate a mismatch error, + * instead of launching more threads than requested. + */ + hw_opt->nthreads_tmpi = hw_opt->nthreads_tot; + } + } +#endif + if (debug) { - fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", - hw_opt->nthreads_tot, - hw_opt->nthreads_tmpi, - hw_opt->nthreads_omp, - hw_opt->nthreads_omp_pme, - hw_opt->gpu_id != NULL ? 
hw_opt->gpu_id : ""); + print_hw_opt(debug, hw_opt); + } +} +/* Checks we can do when we know the cut-off scheme */ +static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt, + int cutoff_scheme) +{ + if (cutoff_scheme == ecutsGROUP) + { + /* We only have OpenMP support for PME only nodes */ + if (hw_opt->nthreads_omp > 1) + { + gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s", + ecutscheme_names[cutoff_scheme], + ecutscheme_names[ecutsVERLET]); + } + hw_opt->nthreads_omp = 1; + } + + if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0) + { + hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp; + } + + if (debug) + { + print_hw_opt(debug, hw_opt); } } @@ -917,14 +960,6 @@ static void override_nsteps_cmdline(FILE *fplog, } } -/* Data structure set by SIMMASTER which needs to be passed to all nodes - * before the other nodes have read the tpx file and called gmx_detect_hardware. - */ -typedef struct { - int cutoff_scheme; /* The cutoff scheme from inputrec_t */ - gmx_bool bUseGPU; /* Use GPU or GPU emulation */ -} master_inf_t; - int mdrunner(gmx_hw_opt_t *hw_opt, FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose, @@ -969,7 +1004,8 @@ int mdrunner(gmx_hw_opt_t *hw_opt, int nthreads_pp = 1; gmx_membed_t membed = NULL; gmx_hw_info_t *hwinfo = NULL; - master_inf_t minf = {-1, FALSE}; + /* The master rank decides early on bUseGPU and broadcasts this later */ + gmx_bool bUseGPU = FALSE; /* CAUTION: threads may be started later on in this function, so cr doesn't reflect the final parallel state right now */ @@ -986,8 +1022,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt, /* Detect hardware, gather information. This is an operation that is * global for this process (MPI rank). */ - hwinfo = gmx_detect_hardware(fplog, cr, - bForceUseGPU, bTryUseGPU, hw_opt->gpu_id); + hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU); snew(state, 1); @@ -1002,17 +1037,17 @@ int mdrunner(gmx_hw_opt_t *hw_opt, convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box)); } - - minf.cutoff_scheme = inputrec->cutoff_scheme; - minf.bUseGPU = FALSE; - if (inputrec->cutoff_scheme == ecutsVERLET) { - prepare_verlet_scheme(fplog, hwinfo, cr, nbpu_opt, - inputrec, mtop, state->box, - &minf.bUseGPU); + /* Here the master rank decides if all ranks will use GPUs */ + bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 || + getenv("GMX_EMULATE_GPU") != NULL); + + prepare_verlet_scheme(fplog, cr, + nbpu_opt, inputrec, mtop, state->box, + bUseGPU); } - else if (hwinfo->bCanUseGPU) + else if (hwinfo->gpu_info.ncuda_dev_compatible > 0) { md_print_warn(cr, fplog, "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" @@ -1034,17 +1069,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt, } #endif } -#ifndef GMX_THREAD_MPI - if (PAR(cr)) - { - gmx_bcast_sim(sizeof(minf), &minf, cr); - } -#endif - if (minf.bUseGPU && cr->npmenodes == -1) - { - /* Don't automatically use PME-only nodes with GPUs */ - cr->npmenodes = 0; - } /* Check for externally set OpenMP affinity and turn off internal * pinning if any is found. 
We need to do this check early to tell @@ -1053,18 +1077,18 @@ int mdrunner(gmx_hw_opt_t *hw_opt, */ gmx_omp_check_thread_affinity(fplog, cr, hw_opt); -#ifdef GMX_THREAD_MPI - /* With thread-MPI inputrec is only set here on the master thread */ + /* Check and update the hardware options for internal consistency */ + check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr)); + if (SIMMASTER(cr)) -#endif { - check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr)); - #ifdef GMX_THREAD_MPI - /* Early check for externally set process affinity. Can't do over all - * MPI processes because hwinfo is not available everywhere, but with - * thread-MPI it's needed as pinning might get turned off which needs - * to be known before starting thread-MPI. */ + /* Early check for externally set process affinity. + * With thread-MPI this is needed as pinning might get turned off, + * which needs to be known before starting thread-MPI. + * With thread-MPI hw_opt is processed here on the master rank + * and passed to the other ranks later, so we only do this on master. + */ gmx_check_thread_affinity_set(fplog, NULL, hw_opt, hwinfo->nthreads_hw_avail, FALSE); @@ -1087,6 +1111,12 @@ int mdrunner(gmx_hw_opt_t *hw_opt, #ifdef GMX_THREAD_MPI if (SIMMASTER(cr)) { + /* Since the master knows the cut-off scheme, update hw_opt for this. + * This is done later for normal MPI and also once more with tMPI + * for all tMPI ranks. + */ + check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme); + /* NOW the threads will be started: */ hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo, hw_opt, @@ -1379,6 +1409,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt, fflush(stderr); #endif + /* Check and update hw_opt for the cut-off scheme */ + check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme); + gmx_omp_nthreads_init(fplog, cr, hwinfo->nthreads_hw_avail, hw_opt->nthreads_omp, @@ -1386,9 +1419,29 @@ int mdrunner(gmx_hw_opt_t *hw_opt, (cr->duty & DUTY_PP) == 0, inputrec->cutoff_scheme == ecutsVERLET); - /* check consistency and decide on the number of gpus to use. */ - gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, - minf.bUseGPU); + if (PAR(cr)) + { + /* The master rank decided on the use of GPUs, + * broadcast this information to all ranks. 
+ */ + gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr); + } + + if (bUseGPU) + { + if (cr->npmenodes == -1) + { + /* Don't automatically use PME-only nodes with GPUs */ + cr->npmenodes = 0; + } + + /* Select GPU id's to use */ + gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU, + &hw_opt->gpu_opt); + } + + /* check consistency of CPU acceleration and number of GPUs selected */ + gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU); /* getting number of PP/PME threads PME: env variable should be read only on one node to make sure it is @@ -1430,8 +1483,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt, } /* Initiate forcerecord */ - fr = mk_forcerec(); - fr->hwinfo = hwinfo; + fr = mk_forcerec(); + fr->hwinfo = hwinfo; + fr->gpu_opt = &hw_opt->gpu_opt; init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box, FALSE, opt2fn("-table", nfile, fnm), opt2fn("-tabletf", nfile, fnm), diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c index 2b79f8e447..f7da120aca 100644 --- a/src/mdlib/forcerec.c +++ b/src/mdlib/forcerec.c @@ -1662,7 +1662,8 @@ static void pick_nbnxn_resources(FILE *fp, const gmx_hw_info_t *hwinfo, gmx_bool bDoNonbonded, gmx_bool *bUseGPU, - gmx_bool *bEmulateGPU) + gmx_bool *bEmulateGPU, + const gmx_gpu_opt_t *gpu_opt) { gmx_bool bEmulateGPUEnvVarSet; char gpu_err_str[STRLEN]; @@ -1683,21 +1684,24 @@ static void pick_nbnxn_resources(FILE *fp, * Note that you should freezing the system as otherwise it will explode. */ *bEmulateGPU = (bEmulateGPUEnvVarSet || - (!bDoNonbonded && hwinfo->bCanUseGPU)); + (!bDoNonbonded && + gpu_opt->ncuda_dev_use > 0)); /* Enable GPU mode when GPUs are available or no GPU emulation is requested. */ - if (hwinfo->bCanUseGPU && !(*bEmulateGPU)) + if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU)) { /* Each PP node will use the intra-node id-th device from the * list of detected/selected GPUs. */ - if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info)) + if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, + &hwinfo->gpu_info, gpu_opt)) { /* At this point the init should never fail as we made sure that * we have all the GPUs we need. If it still does, we'll bail. */ gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s", cr->nodeid, - get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode), + get_gpu_device_id(&hwinfo->gpu_info, gpu_opt, + cr->rank_pp_intranode), gpu_err_str); } @@ -1898,7 +1902,8 @@ static void init_nb_verlet(FILE *fp, pick_nbnxn_resources(fp, cr, fr->hwinfo, fr->bNonbonded, &nbv->bUseGPU, - &bEmulateGPU); + &bEmulateGPU, + fr->gpu_opt); nbv->nbs = NULL; @@ -1946,7 +1951,8 @@ static void init_nb_verlet(FILE *fp, /* init the NxN GPU data; the last argument tells whether we'll have * both local and non-local NB calculation on GPU */ nbnxn_cuda_init(fp, &nbv->cu_nbv, - &fr->hwinfo->gpu_info, cr->rank_pp_intranode, + &fr->hwinfo->gpu_info, fr->gpu_opt, + cr->rank_pp_intranode, (nbv->ngrp > 1) && !bHybridGPURun); if ((env = getenv("GMX_NB_MIN_CI")) != NULL) @@ -2059,7 +2065,7 @@ void init_forcerec(FILE *fp, * In mdrun, hwinfo has already been set before calling init_forcerec. * Here we ignore GPUs, as tools will not use them anyhow. */ - fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL); + fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE); } /* By default we turn acceleration on, but it might be turned off further down... 
*/ diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 4daa238bec..5b84773943 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -505,7 +505,9 @@ static int pick_nbnxn_kernel_version(FILE *fplog, void nbnxn_cuda_init(FILE *fplog, nbnxn_cuda_ptr_t *p_cu_nb, - const gmx_gpu_info_t *gpu_info, int my_gpu_index, + const gmx_gpu_info_t *gpu_info, + const gmx_gpu_opt_t *gpu_opt, + int my_gpu_index, gmx_bool bLocalAndNonlocal) { cudaError_t stat; @@ -540,7 +542,7 @@ void nbnxn_cuda_init(FILE *fplog, init_plist(nb->plist[eintLocal]); /* set device info, just point it to the right GPU among the detected ones */ - nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)]; + nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)]; /* local/non-local GPU streams */ stat = cudaStreamCreate(&nb->stream[eintLocal]); @@ -626,7 +628,7 @@ void nbnxn_cuda_init(FILE *fplog, * - GPUs are not being shared. */ bool bShouldUsePollSync = (bX86 && bTMPIAtomics && - (gmx_count_gpu_dev_shared(gpu_info) < 1)); + (gmx_count_gpu_dev_shared(gpu_opt) < 1)); if (bStreamSync) {
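
After this change, device lookup always goes through two structures: the
rank-local gmx_gpu_opt_t selection and the node-wide, read-only
gmx_gpu_info_t. The helper below is an illustrative restatement of the
lookup that get_gpu_device_id() in gpu_utils.cu performs after this
commit, not new functionality:

    /* Illustrative only -- mirrors get_gpu_device_id() above. */
    static int resolve_cuda_device_id(const gmx_gpu_info_t *gpu_info,
                                      const gmx_gpu_opt_t  *gpu_opt,
                                      int                   idx)
    {
        /* idx counts this rank's selected GPUs, not raw CUDA device ids */
        if (idx < 0 || idx >= gpu_opt->ncuda_dev_use)
        {
            return -1;
        }
        /* selection slot -> detected-device slot -> CUDA device id */
        return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
    }

This split is what lets the force record keep a const pointer to the
shared hwinfo while each rank owns its own options (fr->hwinfo and
fr->gpu_opt in the forcerec.h hunk above).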