made errors during GPU detection non-fatal

author Szilard Pall <pszilard@cbr.su.se>

Thu, 29 Nov 2012 21:12:13 +0000 (22:12 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Thu, 29 Nov 2012 22:28:16 +0000 (23:28 +0100)
author Szilard Pall <pszilard@cbr.su.se>
Thu, 29 Nov 2012 21:12:13 +0000 (22:12 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 29 Nov 2012 22:28:16 +0000 (23:28 +0100)
diff --git a/include/gpu_utils.h b/include/gpu_utils.h

index 751936a4fb19e9ccc43042096089801b928aec57..068e34960f7707ccdefe05822e59001f8315afa2 100644 (file)
--- a/include/gpu_utils.h
+++ b/include/gpu_utils.h
@@ -66,7 +66,7 @@ FUNC_QUALIFIER
  gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name) FUNC_TERM_INT
  
  FUNC_QUALIFIER
-void detect_cuda_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID
+int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) FUNC_TERM_INT
  
  FUNC_QUALIFIER
  void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID
diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c

index 6fe77d759ef9f8e3a9f5fd847b4c53172daafac3..655169cfa5a644bcae76f9003ce35be96d3702c3 100644 (file)
--- a/src/gmxlib/gmx_detect_hardware.c
+++ b/src/gmxlib/gmx_detect_hardware.c
@@ -487,7 +487,23 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
      /* run the detection if the binary was compiled with GPU support */
      if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL)
      {
-        detect_cuda_gpus(&hwinfo->gpu_info);
+        char detection_error[STRLEN];
+
+        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
+        {
+            if (detection_error != NULL && detection_error[0] != '\0')
+            {
+                sprintf(sbuf, ":\n      %s\n", detection_error);
+            }
+            else
+            {
+                sprintf(sbuf, ".");
+            }
+            md_print_warn(cr, fplog,
+                          "NOTE: Error occurred during GPU detection%s"
+                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
+                          sbuf);
+        }
      }
  
      if (bForceUseGPU || bTryUseGPU)
diff --git a/src/gmxlib/gpu_utils/gpu_utils.cu b/src/gmxlib/gpu_utils/gpu_utils.cu

index 64d1e39e523495726c9f587465278c4c4aa9d531..c29b74dff18465f448d0084bae4d465e5745645d 100644 (file)
--- a/src/gmxlib/gpu_utils/gpu_utils.cu
+++ b/src/gmxlib/gpu_utils/gpu_utils.cu
@@ -145,6 +145,8 @@ static const char * const SupportedGPUs[] = {
    * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
    * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
    * \returns                0 if the device looks OK
+  *
+  * TODO: introduce errors codes and handle errors more smoothly.
    */
  static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  {
@@ -217,7 +219,10 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
  
      /* try to execute a dummy kernel */
      k_dummy_test<<<1, 512>>>();
-    CU_LAUNCH_ERR_SYNC("dummy test kernel");
+    if (cudaThreadSynchronize() != cudaSuccess)
+    {
+        return -1;
+    }
  
      /* destroy context if we created one */
      if (id != -1)
@@ -681,13 +686,21 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
      int         ndev;
  
      stat = cudaGetDeviceCount(&ndev);
-    CU_RET_ERR(stat, "cudaGetDeviceCount failed");
+    if (stat != cudaSuccess)
+    {
+        return egpuInsane;
+    }
  
      if (dev_id > ndev - 1)
      {
          return egpuNonexistent;
      }
  
+    /* TODO: currently we do not make a distinction between the type of errors
+     * that can appear during sanity checks. This needs to be improved, e.g if
+     * the dummy test kernel fails to execute with a "device busy message" we
+     * should appropriately report that the device is busy instead of insane.
+     */
      if (do_sanity_checks(dev_id, dev_prop) == 0)
      {
          if (is_gmx_supported_gpu(dev_prop))
@@ -714,31 +727,55 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
   *  status.
   *
   *  \param[in] gpu_info    pointer to structure holding GPU information.
+ *  \param[out] err_str    The error message of any CUDA API error that caused
+ *                         the detection to fail (if there was any). The memory
+ *                         the pointer points to should be managed externally.
+ *  \returns               non-zero if the detection encountered a failure, zero otherwise.
   */
-void detect_cuda_gpus(gmx_gpu_info_t *gpu_info)
+int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
-    int             i, ndev, checkres;
+    int             i, ndev, checkres, retval;
      cudaError_t     stat;
      cudaDeviceProp  prop;
      cuda_dev_info_t *devs;
  
      assert(gpu_info);
+    assert(err_str);
+
+    ndev    = 0;
+    devs    = NULL;
  
      stat = cudaGetDeviceCount(&ndev);
-    CU_RET_ERR(stat, "cudaGetDeviceCount failed");
+    if (stat != cudaSuccess)
+    {
+        const char *s;
  
-    snew(devs, ndev);
-    for (i = 0; i < ndev; i++)
+        /* cudaGetDeviceCount failed which means that there is something
+         * wrong with the machine: driver-runtime mismatch, all GPUs being
+         * busy in exclusive mode, or some other condition which should
+         * result in us issuing a warning a falling back to CPUs. */
+        retval = -1;
+        s = cudaGetErrorString(stat);
+        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
+    }
+    else
      {
-        checkres = is_gmx_supported_gpu_id(i, &prop);
+        snew(devs, ndev);
+        for (i = 0; i < ndev; i++)
+        {
+            checkres = is_gmx_supported_gpu_id(i, &prop);
  
-        devs[i].id   = i;
-        devs[i].prop = prop;
-        devs[i].stat = checkres;
+            devs[i].id   = i;
+            devs[i].prop = prop;
+            devs[i].stat = checkres;
+        }
+        retval = 0;
      }
  
      gpu_info->ncuda_dev = ndev;
      gpu_info->cuda_dev  = devs;
+
+    return retval;
  }
  
  /*! \brief Select the GPUs compatible with the native GROMACS acceleration.
author	Szilard Pall <pszilard@cbr.su.se>
	Thu, 29 Nov 2012 21:12:13 +0000 (22:12 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Thu, 29 Nov 2012 22:28:16 +0000 (23:28 +0100)
include/gpu_utils.h		patch \| blob \| history
src/gmxlib/gmx_detect_hardware.c		patch \| blob \| history
src/gmxlib/gpu_utils/gpu_utils.cu		patch \| blob \| history