Sort all includes in src/gromacs

[alexxy/gromacs.git] / src / gromacs / gmxlib / gpu_utils / gpu_utils.cu
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

index 64d1e39e523495726c9f587465278c4c4aa9d531..f8b741923eb6827ed853639308f7e4e29bee18c1 100644 (file)
--- a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
@@ -1,151 +1,78 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
- *
- * 
- *                This source code is part of
- * 
- *                 G   R   O   M   A   C   S
- * 
- *          GROningen MAchine for Chemical Simulations
- * 
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2010, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2010,2011,2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
- * 
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
- * 
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
   * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
- * 
- * For more info, check our website at http://www.gromacs.org
- * 
- * And Hey:
- * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ * the research papers on the package. Check out http://www.gromacs.org.
   */
  
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/gpu_utils.h"
+
+#include <assert.h>
  #include <stdio.h>
  #include <stdlib.h>
-#include <assert.h>
  
-#include "smalloc.h"
-#include "string2.h"
-#include "types/hw_info.h"
-
-#include "gpu_utils.h"
-#include "../cuda_tools/cudautils.cuh"
-#include "memtestG80_core.h"
-
-
-#define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
-#define QUICK_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests
-                                                                            to run in quick memtest. */
-#define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
-
-#define FULL_TESTS      0x3FFF /*!<  Bitflag with all test set on for full memetest. */
-#define FULL_ITER       25 /*!< Number of iterations in full memtest. */
-
-#define TIMED_TESTS     MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS /*!< Bit flag with type of tests to
-                                                                            run in time constrained memtest. */
-
-/*! Number of supported GPUs */
-#define NB_GPUS (sizeof(SupportedGPUs)/sizeof(SupportedGPUs[0]))
-
-static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
-                                           In reality it 16 with CUDA <=v5.0, but let's stay on the safe side. */
-
-/*! Dummy kernel used for sanity checking. */
-__global__ void k_dummy_test(){}
-
-
-/*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
-enum memtest_G80_test_types {
-    MOVING_INVERSIONS_10 =      0x1,
-    MOVING_INVERSIONS_RAND =    0x2,
-    WALKING_8BIT_M86 =          0x4,
-    WALKING_0_8BIT =            0x8,
-    WALKING_1_8BIT =            0x10,
-    WALKING_0_32BIT =           0x20,
-    WALKING_1_32BIT =           0x40,
-    RANDOM_BLOCKS =             0x80,
-    MOD_20_32BIT =              0x100,
-    LOGIC_1_ITER =              0x200,
-    LOGIC_4_ITER =              0x400,
-    LOGIC_1_ITER_SHMEM =        0x800,
-    LOGIC_4_ITER_SHMEM =        0x1000
-};
-
-// TODO put this list into an external file and include it so that the list is easily accessible
-/*! List of supported GPUs. */
-static const char * const SupportedGPUs[] = {
-    /* GT400 */
-    "Geforce GTX 480",
-    "Geforce GTX 470",
-    "Geforce GTX 465",
-    "Geforce GTX 460",
-
-    "Tesla C2070",
-    "Tesla C2050",
-    "Tesla S2070",
-    "Tesla S2050",
-    "Tesla M2070",
-    "Tesla M2050",
-
-    "Quadro 5000",
-    "Quadro 6000",
-
-    /* GT200 */
-    "Geforce GTX 295",
-    "Geforce GTX 285",
-    "Geforce GTX 280",
-    "Geforce GTX 275",
-    "Geforce GTX 260",
-    "GeForce GTS 250",
-    "GeForce GTS 150",
-
-    "GeForce GTX 285M",
-    "GeForce GTX 280M",
-
-    "Tesla S1070",
-    "Tesla C1060",
-    "Tesla M1060",
-
-    "Quadro FX 5800",
-    "Quadro FX 4800",
-    "Quadro CX",
-    "Quadro Plex 2200 D2",
-    "Quadro Plex 2200 S4",
-
-    /* G90 */
-    "GeForce 9800 G", /* GX2, GTX, GTX+, GT */
-    "GeForce 9800M GTX",
-
-    "Quadro FX 4700",
-    "Quadro Plex 2100 D4"
-};
-
-
-/*! 
-  * \brief Runs GPU sanity checks.
-  *
-  * Runs a series of checks to determine that the given GPU and underlying CUDA
-  * driver/runtime functions properly.
-  * Returns properties of a device with given ID or the one that has
-  * already been initialized earlier in the case if of \dev_id == -1.
-  *
-  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
-  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
-  * \returns                0 if the device looks OK
-  */
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief
+ * Max number of devices supported by CUDA (for consistency checking).
+ *
+ * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
+ */
+static int cuda_max_device_count = 32;
+
+/** Dummy kernel used for sanity checking. */
+__global__ void k_dummy_test()
+{
+}
+
+
+/*!
+ * \brief Runs GPU sanity checks.
+ *
+ * Runs a series of checks to determine that the given GPU and underlying CUDA
+ * driver/runtime functions properly.
+ * Returns properties of a device with given ID or the one that has
+ * already been initialized earlier in the case if of \dev_id == -1.
+ *
+ * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
+ * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
+ * \returns                0 if the device looks OK
+ *
+ * TODO: introduce errors codes and handle errors more smoothly.
+ */
  static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  {
      cudaError_t cu_err;
@@ -154,18 +81,22 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
      cu_err = cudaGetDeviceCount(&dev_count);
      if (cu_err != cudaSuccess)
      {
-       fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
-               cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
+                cudaGetErrorString(cu_err));
          return -1;
      }
  
      /* no CUDA compatible device at all */
      if (dev_count == 0)
+    {
          return -1;
+    }
  
      /* things might go horribly wrong if cudart is not compatible with the driver */
      if (dev_count < 0 || dev_count > cuda_max_device_count)
+    {
          return -1;
+    }
  
      if (dev_id == -1) /* device already selected let's not destroy the context */
      {
@@ -199,10 +130,14 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  
      /* both major & minor is 9999 if no CUDA capable devices are present */
      if (dev_prop->major == 9999 && dev_prop->minor == 9999)
+    {
          return -1;
+    }
      /* we don't care about emulation mode */
      if (dev_prop->major == 0)
+    {
          return -1;
+    }
  
      if (id != -1)
      {
@@ -216,8 +151,11 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
      }
  
      /* try to execute a dummy kernel */
-    k_dummy_test<<<1, 512>>>();
-    CU_LAUNCH_ERR_SYNC("dummy test kernel");
+    k_dummy_test<<< 1, 512>>> ();
+    if (cudaThreadSynchronize() != cudaSuccess)
+    {
+        return -1;
+    }
  
      /* destroy context if we created one */
      if (id != -1)
@@ -234,341 +172,6 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
      return 0;
  }
  
-
-/*! 
- * \brief Checks whether the GPU with the given name is supported in Gromacs-OpenMM.
- * 
- * \param[in] gpu_name  the name of the CUDA device
- * \returns             TRUE if the device is supported, otherwise FALSE
- */
-static bool is_gmx_openmm_supported_gpu_name(char *gpuName)
-{
-    size_t i;
-    for (i = 0; i < NB_GPUS; i++)
-    {
-        trim(gpuName);
-        if (gmx_strncasecmp(gpuName, SupportedGPUs[i], strlen(SupportedGPUs[i])) == 0)
-            return 1;
-    }
-    return 0;
-}
-
-/*! \brief Checks whether the GPU with the given device id is supported in Gromacs-OpenMM.
- *
- * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
- * \param[out] gpu_name Set to contain the name of the CUDA device, if NULL passed, no device name is set. 
- * \returns             TRUE if the device is supported, otherwise FALSE
- * 
- */
-gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name)
-{
-    cudaDeviceProp dev_prop;
-
-    if (debug) fprintf(debug, "Checking compatibility with device #%d, %s\n", dev_id, gpu_name);
-
-    if (do_sanity_checks(dev_id, &dev_prop) != 0)
-        return -1;
-
-    if (gpu_name != NULL)
-    { 
-        strcpy(gpu_name, dev_prop.name);
-    }
-    return is_gmx_openmm_supported_gpu_name(dev_prop.name);
-}
-
-
-/*!
- * \brief Runs a set of memory tests specified by the given bit-flags.
- * Tries to allocate and do the test on \p megs Mb memory or
- * the greatest amount that can be allocated (>10Mb).
- * In case if an error is detected it stops without finishing the remaining
- * steps/iterations and returns greater then zero value.
- * In case of other errors (e.g. kernel launch errors, device querying errors)
- * -1 is returned.
- *
- * \param[in] which_tests   variable with bit-flags of the requested tests
- * \param[in] megs          amount of memory that will be tested in MB
- * \param[in] iter          number of iterations
- * \returns                 0 if no error was detected, otherwise >0
- */
-static int do_memtest(unsigned int which_tests, int megs, int iter)
-{
-    memtestState    tester;
-    int             i;
-    uint            err_count; //, err_iter;
-
-    // no parameter check as this fn won't be called externally
-
-    // let's try to allocate the mem
-    while (!tester.allocate(megs) && (megs - 10 > 0))
-        { megs -= 10; tester.deallocate(); }
-
-    if (megs <= 10)
-    {
-        fprintf(stderr, "Unable to allocate GPU memory!\n");
-        return -1;
-    }
-
-    // clear the first 18 bits
-    which_tests &= 0x3FFF;
-    for (i = 0; i < iter; i++)
-    {
-        // Moving Inversions (ones and zeros)
-        if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
-        {
-            tester.gpuMovingInversionsOnesZeros(err_count);
-            if (err_count > 0)
-                return MOVING_INVERSIONS_10;
-        }
-        // Moving Inversions (random)
-        if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
-        {
-            tester.gpuMovingInversionsRandom(err_count);
-            if (err_count > 0)
-                return MOVING_INVERSIONS_RAND;
-        }
-       // Memtest86 Walking 8-bit
-        if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
-        {
-            for (uint shift = 0; shift < 8; shift++)
-            {
-                tester.gpuWalking8BitM86(err_count, shift);
-                if (err_count > 0)
-                    return WALKING_8BIT_M86;
-            }
-      }
-        // True Walking zeros (8-bit)
-        if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
-        {
-            for (uint shift = 0; shift < 8; shift++)
-            {
-                tester.gpuWalking8Bit(err_count, false, shift);
-                if (err_count > 0)
-                    return WALKING_0_8BIT;
-            }
-        }
-        // True Walking ones (8-bit)
-        if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
-        {
-            for (uint shift = 0; shift < 8; shift++)
-            {
-                tester.gpuWalking8Bit(err_count, true, shift);
-                if (err_count > 0)
-                    return WALKING_1_8BIT;
-            }
-        }
-        // Memtest86 Walking zeros (32-bit)
-        if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
-        {
-            for (uint shift = 0; shift < 32; shift++)
-            {
-                tester.gpuWalking32Bit(err_count, false, shift);
-                if (err_count > 0)
-                    return WALKING_0_32BIT;
-            }
-        }
-       // Memtest86 Walking ones (32-bit)
-        if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
-        {
-            for (uint shift = 0; shift < 32; shift++)
-            {
-                tester.gpuWalking32Bit(err_count, true, shift);
-                if (err_count > 0)
-                    return WALKING_1_32BIT;
-            }
-       }
-        // Random blocks
-        if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
-        {
-            tester.gpuRandomBlocks(err_count,rand());
-            if (err_count > 0)
-                return RANDOM_BLOCKS;
-
-        }
-
-        // Memtest86 Modulo-20
-        if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
-        {
-            for (uint shift = 0; shift < 20; shift++)
-            {
-                tester.gpuModuloX(err_count, shift, rand(), 20, 2);
-                if (err_count > 0)
-                    return MOD_20_32BIT;
-            }
-        }
-        // Logic (one iteration)
-        if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
-        {
-            tester.gpuShortLCG0(err_count,1);
-            if (err_count > 0)
-                return LOGIC_1_ITER;
-        }
-        // Logic (4 iterations)
-        if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
-        {
-            tester.gpuShortLCG0(err_count,4);
-            if (err_count > 0)
-                return LOGIC_4_ITER;
-
-        }
-        // Logic (shared memory, one iteration)
-        if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
-        {
-            tester.gpuShortLCG0Shmem(err_count,1);
-            if (err_count > 0)
-                return LOGIC_1_ITER_SHMEM;
-        }
-        // Logic (shared-memory, 4 iterations)
-        if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
-        {
-            tester.gpuShortLCG0Shmem(err_count,4);
-            if (err_count > 0)
-                return LOGIC_4_ITER_SHMEM;
-        }
-    }
-
-    tester.deallocate();
-    return err_count;
-}
-
-/*! \brief Runs a quick memory test and returns 0 in case if no error is detected.
- * If an error is detected it stops before completing the test and returns a
- * value greater then 0. In case of other errors (e.g. kernel launch errors,
- * device querying errors) -1 is returned.
- *
- * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
- * \returns             0 if no error was detected, otherwise >0
- */
-int do_quick_memtest(int dev_id)
-{
-    cudaDeviceProp  dev_prop;
-    int             devmem, res, time=0;
-
-    if (debug) { time = getTimeMilliseconds(); }
-
-    if (do_sanity_checks(dev_id, &dev_prop) != 0)
-    {
-        // something went wrong
-        return -1;
-    }
-
-    if (debug)
-    {
-        devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
-        fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
-            QUICK_MEM, devmem, QUICK_ITER);
-    }
-
-    res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
-
-    if (debug)
-    {
-        fprintf(debug, "Q-RES = %d\n", res);
-        fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
-    }
-
-    /* destroy context only if we created it */
-    if (dev_id !=-1) cudaThreadExit();
-    return res;
-}
-
-/*! \brief Runs a full memory test and returns 0 in case if no error is detected.
- * If an error is detected  it stops before completing the test and returns a
- * value greater then 0. In case of other errors (e.g. kernel launch errors,
- * device querying errors) -1 is returned.
- *
- * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
- * \returns             0 if no error was detected, otherwise >0
- */
-
-int do_full_memtest(int dev_id)
-{
-    cudaDeviceProp  dev_prop;
-    int             devmem, res, time=0;
-
-    if (debug) { time = getTimeMilliseconds(); }
-
-    if (do_sanity_checks(dev_id, &dev_prop) != 0)
-    {
-        // something went wrong
-        return -1;
-    }
-
-    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
-
-    if (debug) 
-    { 
-        fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
-            devmem, devmem, FULL_ITER); 
-    }
-
-    /* do all test on the entire memory */
-    res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
-
-    if (debug)
-    {
-        fprintf(debug, "F-RES = %d\n", res);
-        fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
-    }
-
-    /* destroy context only if we created it */
-    if (dev_id != -1) cudaThreadExit();
-    return res;
-}
-
-/*! \brief Runs a time constrained memory test and returns 0 in case if no error is detected.
- * If an error is detected it stops before completing the test and returns a value greater
- * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
- * is returned. Note, that test iterations are not interrupted therefor the total runtime of
- * the test will always be multipple of one iteration's runtime.
- *
- * \param[in] dev_id        the device id of the GPU or -1 if the device has laredy been selected
- * \param[in] time_constr   the time limit of the testing
- * \returns                 0 if no error was detected, otherwise >0
- */
-int do_timed_memtest(int dev_id, int time_constr)
-{
-    cudaDeviceProp  dev_prop;
-    int             devmem, res=0, time=0, startt;
-
-    if (debug) { time = getTimeMilliseconds(); }
-
-    time_constr *= 1000;  /* convert to ms for convenience */
-    startt = getTimeMilliseconds();
-
-    if (do_sanity_checks(dev_id, &dev_prop) != 0)
-    {
-        // something went wrong
-        return -1;
-    }
-
-    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
-
-    if (debug) 
-    { 
-        fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d s \n",
-        devmem, devmem, time_constr); 
-    }
-
-    /* do the TIMED_TESTS set, one step at a time on the entire memory 
-       that can be allocated, and stop when the given time is exceeded */
-    while ( ((int)getTimeMilliseconds() - startt) < time_constr)
-    {        
-        res = do_memtest(TIMED_TESTS, devmem, 1);
-        if (res != 0) break;
-    }
-
-    if (debug)
-    {
-        fprintf(debug, "T-RES = %d\n", res);
-        fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
-    }
-
-    /* destroy context only if we created it */
-    if (dev_id != -1) cudaThreadExit();
-    return res;
-}
-
  /*! \brief Initializes the GPU with the given index.
   *
   * The varible \mygpu is the index of the GPU to initialize in the
@@ -578,26 +181,29 @@ int do_timed_memtest(int dev_id, int time_constr)
   * \param[out] result_str   the message related to the error that occurred
   *                          during the initialization (if there was any).
   * \param[in] gpu_info      GPU info of all detected devices in the system.
+ * \param[in] gpu_opt       options for using the GPUs in gpu_info
   * \returns                 true if no error occurs during initialization.
   */
-gmx_bool init_gpu(int mygpu, char *result_str, const gmx_gpu_info_t *gpu_info)
+gmx_bool init_gpu(int mygpu, char *result_str,
+                  const gmx_gpu_info_t *gpu_info,
+                  const gmx_gpu_opt_t *gpu_opt)
  {
      cudaError_t stat;
-    char sbuf[STRLEN];
-    int gpuid;
+    char        sbuf[STRLEN];
+    int         gpuid;
  
      assert(gpu_info);
      assert(result_str);
  
-    if (mygpu < 0 || mygpu >= gpu_info->ncuda_dev_use)
+    if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
      {
          sprintf(sbuf, "Trying to initialize an inexistent GPU: "
                  "there are %d %s-selected GPU(s), but #%d was requested.",
-                 gpu_info->ncuda_dev_use, gpu_info->bUserSet ? "user" : "auto", mygpu);
+                gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
          gmx_incons(sbuf);
      }
  
-    gpuid = gpu_info->cuda_dev[gpu_info->cuda_dev_use[mygpu]].id;
+    gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
  
      stat = cudaSetDevice(gpuid);
      strncpy(result_str, cudaGetErrorString(stat), STRLEN);
@@ -681,13 +287,21 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
      int         ndev;
  
      stat = cudaGetDeviceCount(&ndev);
-    CU_RET_ERR(stat, "cudaGetDeviceCount failed");
+    if (stat != cudaSuccess)
+    {
+        return egpuInsane;
+    }
  
      if (dev_id > ndev - 1)
      {
          return egpuNonexistent;
      }
  
+    /* TODO: currently we do not make a distinction between the type of errors
+     * that can appear during sanity checks. This needs to be improved, e.g if
+     * the dummy test kernel fails to execute with a "device busy message" we
+     * should appropriately report that the device is busy instead of insane.
+     */
      if (do_sanity_checks(dev_id, dev_prop) == 0)
      {
          if (is_gmx_supported_gpu(dev_prop))
@@ -714,31 +328,62 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
   *  status.
   *
   *  \param[in] gpu_info    pointer to structure holding GPU information.
+ *  \param[out] err_str    The error message of any CUDA API error that caused
+ *                         the detection to fail (if there was any). The memory
+ *                         the pointer points to should be managed externally.
+ *  \returns               non-zero if the detection encountered a failure, zero otherwise.
   */
-void detect_cuda_gpus(gmx_gpu_info_t *gpu_info)
+int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
-    int             i, ndev, checkres;
-    cudaError_t     stat;
-    cudaDeviceProp  prop;
+    int              i, ndev, checkres, retval;
+    cudaError_t      stat;
+    cudaDeviceProp   prop;
      cuda_dev_info_t *devs;
  
      assert(gpu_info);
+    assert(err_str);
+
+    gpu_info->ncuda_dev_compatible = 0;
+
+    ndev    = 0;
+    devs    = NULL;
  
      stat = cudaGetDeviceCount(&ndev);
-    CU_RET_ERR(stat, "cudaGetDeviceCount failed");
+    if (stat != cudaSuccess)
+    {
+        const char *s;
  
-    snew(devs, ndev);
-    for (i = 0; i < ndev; i++)
+        /* cudaGetDeviceCount failed which means that there is something
+         * wrong with the machine: driver-runtime mismatch, all GPUs being
+         * busy in exclusive mode, or some other condition which should
+         * result in us issuing a warning a falling back to CPUs. */
+        retval = -1;
+        s      = cudaGetErrorString(stat);
+        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
+    }
+    else
      {
-        checkres = is_gmx_supported_gpu_id(i, &prop);
+        snew(devs, ndev);
+        for (i = 0; i < ndev; i++)
+        {
+            checkres = is_gmx_supported_gpu_id(i, &prop);
+
+            devs[i].id   = i;
+            devs[i].prop = prop;
+            devs[i].stat = checkres;
  
-        devs[i].id   = i;
-        devs[i].prop = prop;
-        devs[i].stat = checkres;
+            if (checkres == egpuCompatible)
+            {
+                gpu_info->ncuda_dev_compatible++;
+            }
+        }
+        retval = 0;
      }
  
      gpu_info->ncuda_dev = ndev;
      gpu_info->cuda_dev  = devs;
+
+    return retval;
  }
  
  /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
@@ -746,17 +391,18 @@ void detect_cuda_gpus(gmx_gpu_info_t *gpu_info)
   * This function selects the compatible gpus and initializes
   * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
   *
- * Given the list of GPUs available in the system the it checks each gpu in
- * gpu_info->cuda_dev and puts the the indices (into gpu_info->cuda_dev) of
- * the compatible ones into cuda_dev_use with this marking the respective
- * GPUs as "available for use."
+ * Given the list of GPUs available in the system check each device in
+ * gpu_info->cuda_dev and place the indices of the compatible GPUs into
+ * cuda_dev_use with this marking the respective GPUs as "available for use."
   * Note that \detect_cuda_gpus must have been called before.
   *
- * \param[in]    gpu_info    pointer to structure holding GPU information
+ * \param[in]     gpu_info    pointer to structure holding GPU information
+ * \param[in,out] gpu_opt     pointer to structure holding GPU options
   */
-void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
+void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+                          gmx_gpu_opt_t        *gpu_opt)
  {
-    int i, ncompat;
+    int  i, ncompat;
      int *compat;
  
      assert(gpu_info);
@@ -774,53 +420,52 @@ void pick_compatible_gpus(gmx_gpu_info_t *gpu_info)
          }
      }
  
-    gpu_info->ncuda_dev_use = ncompat;
-    snew(gpu_info->cuda_dev_use, ncompat);
-    memcpy(gpu_info->cuda_dev_use, compat, ncompat*sizeof(*compat));
+    gpu_opt->ncuda_dev_use = ncompat;
+    snew(gpu_opt->cuda_dev_use, ncompat);
+    memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
      sfree(compat);
  }
  
  /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
   *
- * Given the a list of GPU devide IDs in \requested_devs, check for the
- * existence and compatibility of the respective GPUs and fill in \gpu_info
- * with the collected information. Also provide the caller with an array with
+ * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
+ * gpu_opt->cuda_dev_use check the existence and compatibility
+ * of the respective GPUs. Also provide the caller with an array containing
   * the result of checks in \checkres.
   *
   * \param[out]  checkres    check result for each ID passed in \requested_devs
   * \param[in]   gpu_info    pointer to structure holding GPU information
- * \param[in]   requested_devs array of requested device IDs
- * \param[in]   count       number of IDs in \requested_devs
- * \returns                 TRUE if every requested GPU is compatible
+ * \param[out]  gpu_opt     pointer to structure holding GPU options
+ * \returns                 TRUE if every the requested GPUs are compatible
   */
-gmx_bool check_select_cuda_gpus(int *checkres, gmx_gpu_info_t *gpu_info,
-                                const int *requested_devs, int count)
+gmx_bool check_selected_cuda_gpus(int                  *checkres,
+                                  const gmx_gpu_info_t *gpu_info,
+                                  gmx_gpu_opt_t        *gpu_opt)
  {
-    int i, id;
+    int  i, id;
      bool bAllOk;
  
      assert(checkres);
      assert(gpu_info);
-    assert(requested_devs);
-    assert(count >= 0);
+    assert(gpu_opt->ncuda_dev_use >= 0);
  
-    if (count == 0)
+    if (gpu_opt->ncuda_dev_use == 0)
      {
          return TRUE;
      }
  
+    assert(gpu_opt->cuda_dev_use);
+
      /* we will assume that all GPUs requested are valid IDs,
         otherwise we'll bail anyways */
-    gpu_info->ncuda_dev_use = count;
-    snew(gpu_info->cuda_dev_use, count);
  
      bAllOk = true;
-    for (i = 0; i < count; i++)
+    for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
      {
-        id = requested_devs[i];
+        id = gpu_opt->cuda_dev_use[i];
  
          /* devices are stored in increasing order of IDs in cuda_dev */
-        gpu_info->cuda_dev_use[i] = id;
+        gpu_opt->cuda_dev_use[i] = id;
  
          checkres[i] = (id >= gpu_info->ncuda_dev) ?
              egpuNonexistent : gpu_info->cuda_dev[id].stat;
@@ -842,7 +487,6 @@ void free_gpu_info(const gmx_gpu_info_t *gpu_info)
          return;
      }
  
-    sfree(gpu_info->cuda_dev_use);
      sfree(gpu_info->cuda_dev);
  }
  
@@ -868,7 +512,7 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
  
      cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
  
-    bool bGpuExists =
+    bool             bGpuExists =
          dinfo->stat == egpuCompatible ||
          dinfo->stat == egpuIncompatible;
  
@@ -895,18 +539,19 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
   * respective CUDA GPU.
   *
   * \param[in]    gpu_info   pointer to structure holding GPU information
+ * \param[in]    gpu_opt    pointer to structure holding GPU options
   * \param[in]    idx        index into the array of used GPUs
   * \returns                 device ID of the requested GPU
   */
-int get_gpu_device_id(const gmx_gpu_info_t *gpu_info, int idx)
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
  {
      assert(gpu_info);
-    if (idx < 0 && idx >= gpu_info->ncuda_dev_use)
-    {
-        return -1;
-    }
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
  
-    return gpu_info->cuda_dev[gpu_info->cuda_dev_use[idx]].id;
+    return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
  }
  
  /*! \brief Returns the device ID of the GPU currently in use.
@@ -923,3 +568,14 @@ int get_current_gpu_device_id(void)
  
      return gpuid;
  }
+
+/*! \brief Returns the size of the cuda_dev_info struct.
+ *
+ * The size of cuda_dev_info can be used for allocation and communication.
+ *
+ * \returns                 size in bytes of cuda_dev_info
+ */
+size_t sizeof_cuda_dev_info(void)
+{
+    return sizeof(cuda_dev_info);
+}