made errors during GPU detection non-fatal
authorSzilard Pall <pszilard@cbr.su.se>
Thu, 29 Nov 2012 21:12:13 +0000 (22:12 +0100)
committerGerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 29 Nov 2012 22:28:16 +0000 (23:28 +0100)
Errors during the GPU detection phase were incorrectly treated as fatal
which meant that in cases like GPU set to thread/process exclusive mode
or runtime/driver incompatibilities resulted in fatal errors when using
binaries compiled with GPU acceleration. These errors are now reported
as a note to the user and allow mdrun to fall back to CPU kernels.

Change-Id: Ie8ebb5a1eeb6533ad451adfd3377870ec859a31d

include/gpu_utils.h
src/gmxlib/gmx_detect_hardware.c
src/gmxlib/gpu_utils/gpu_utils.cu

index 751936a4fb19e9ccc43042096089801b928aec57..068e34960f7707ccdefe05822e59001f8315afa2 100644 (file)
@@ -66,7 +66,7 @@ FUNC_QUALIFIER
 gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name) FUNC_TERM_INT
 
 FUNC_QUALIFIER
-void detect_cuda_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID
+int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) FUNC_TERM_INT
 
 FUNC_QUALIFIER
 void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID
index 6fe77d759ef9f8e3a9f5fd847b4c53172daafac3..655169cfa5a644bcae76f9003ce35be96d3702c3 100644 (file)
@@ -487,7 +487,23 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
     /* run the detection if the binary was compiled with GPU support */
     if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL)
     {
-        detect_cuda_gpus(&hwinfo->gpu_info);
+        char detection_error[STRLEN];
+
+        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
+        {
+            if (detection_error != NULL && detection_error[0] != '\0')
+            {
+                sprintf(sbuf, ":\n      %s\n", detection_error);
+            }
+            else
+            {
+                sprintf(sbuf, ".");
+            }
+            md_print_warn(cr, fplog,
+                          "NOTE: Error occurred during GPU detection%s"
+                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
+                          sbuf);
+        }
     }
 
     if (bForceUseGPU || bTryUseGPU)
index 64d1e39e523495726c9f587465278c4c4aa9d531..c29b74dff18465f448d0084bae4d465e5745645d 100644 (file)
@@ -145,6 +145,8 @@ static const char * const SupportedGPUs[] = {
   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
   * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
   * \returns                0 if the device looks OK
+  *
+  * TODO: introduce errors codes and handle errors more smoothly.
   */
 static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 {
@@ -217,7 +219,10 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 
     /* try to execute a dummy kernel */
     k_dummy_test<<<1, 512>>>();
-    CU_LAUNCH_ERR_SYNC("dummy test kernel");
+    if (cudaThreadSynchronize() != cudaSuccess)
+    {
+        return -1;
+    }
 
     /* destroy context if we created one */
     if (id != -1)
@@ -681,13 +686,21 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
     int         ndev;
 
     stat = cudaGetDeviceCount(&ndev);
-    CU_RET_ERR(stat, "cudaGetDeviceCount failed");
+    if (stat != cudaSuccess)
+    {
+        return egpuInsane;
+    }
 
     if (dev_id > ndev - 1)
     {
         return egpuNonexistent;
     }
 
+    /* TODO: currently we do not make a distinction between the type of errors
+     * that can appear during sanity checks. This needs to be improved, e.g if
+     * the dummy test kernel fails to execute with a "device busy message" we
+     * should appropriately report that the device is busy instead of insane.
+     */
     if (do_sanity_checks(dev_id, dev_prop) == 0)
     {
         if (is_gmx_supported_gpu(dev_prop))
@@ -714,31 +727,55 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
  *  status.
  *
  *  \param[in] gpu_info    pointer to structure holding GPU information.
+ *  \param[out] err_str    The error message of any CUDA API error that caused
+ *                         the detection to fail (if there was any). The memory
+ *                         the pointer points to should be managed externally.
+ *  \returns               non-zero if the detection encountered a failure, zero otherwise.
  */
-void detect_cuda_gpus(gmx_gpu_info_t *gpu_info)
+int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 {
-    int             i, ndev, checkres;
+    int             i, ndev, checkres, retval;
     cudaError_t     stat;
     cudaDeviceProp  prop;
     cuda_dev_info_t *devs;
 
     assert(gpu_info);
+    assert(err_str);
+
+    ndev    = 0;
+    devs    = NULL;
 
     stat = cudaGetDeviceCount(&ndev);
-    CU_RET_ERR(stat, "cudaGetDeviceCount failed");
+    if (stat != cudaSuccess)
+    {
+        const char *s;
 
-    snew(devs, ndev);
-    for (i = 0; i < ndev; i++)
+        /* cudaGetDeviceCount failed which means that there is something
+         * wrong with the machine: driver-runtime mismatch, all GPUs being
+         * busy in exclusive mode, or some other condition which should
+         * result in us issuing a warning a falling back to CPUs. */
+        retval = -1;
+        s = cudaGetErrorString(stat);
+        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
+    }
+    else
     {
-        checkres = is_gmx_supported_gpu_id(i, &prop);
+        snew(devs, ndev);
+        for (i = 0; i < ndev; i++)
+        {
+            checkres = is_gmx_supported_gpu_id(i, &prop);
 
-        devs[i].id   = i;
-        devs[i].prop = prop;
-        devs[i].stat = checkres;
+            devs[i].id   = i;
+            devs[i].prop = prop;
+            devs[i].stat = checkres;
+        }
+        retval = 0;
     }
 
     gpu_info->ncuda_dev = ndev;
     gpu_info->cuda_dev  = devs;
+
+    return retval;
 }
 
 /*! \brief Select the GPUs compatible with the native GROMACS acceleration.