From: Szilard Pall Date: Thu, 29 Nov 2012 21:12:13 +0000 (+0100) Subject: made errors during GPU detection non-fatal X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=47e22c51403b087b5afa52dad4669e01d8ca1374;p=alexxy%2Fgromacs.git made errors during GPU detection non-fatal Errors during the GPU detection phase were incorrectly treated as fatal which meant that in cases like GPU set to thread/process exclusive mode or runtime/driver incompatibilities resulted in fatal errors when using binaries compiled with GPU acceleration. These errors are now reported as a note to the user and allow mdrun to fall back to CPU kernels. Change-Id: Ie8ebb5a1eeb6533ad451adfd3377870ec859a31d --- diff --git a/include/gpu_utils.h b/include/gpu_utils.h index 751936a4fb..068e34960f 100644 --- a/include/gpu_utils.h +++ b/include/gpu_utils.h @@ -66,7 +66,7 @@ FUNC_QUALIFIER gmx_bool is_gmx_openmm_supported_gpu(int dev_id, char *gpu_name) FUNC_TERM_INT FUNC_QUALIFIER -void detect_cuda_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID +int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) FUNC_TERM_INT FUNC_QUALIFIER void pick_compatible_gpus(gmx_gpu_info_t *gpu_info) FUNC_TERM_VOID diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c index 6fe77d759e..655169cfa5 100644 --- a/src/gmxlib/gmx_detect_hardware.c +++ b/src/gmxlib/gmx_detect_hardware.c @@ -487,7 +487,23 @@ void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo, /* run the detection if the binary was compiled with GPU support */ if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL) { - detect_cuda_gpus(&hwinfo->gpu_info); + char detection_error[STRLEN]; + + if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0) + { + if (detection_error != NULL && detection_error[0] != '\0') + { + sprintf(sbuf, ":\n %s\n", detection_error); + } + else + { + sprintf(sbuf, "."); + } + md_print_warn(cr, fplog, + "NOTE: Error occurred during GPU detection%s" + " Can not use GPU acceleration, will fall back to CPU kernels.\n", + sbuf); + } } if (bForceUseGPU || bTryUseGPU) diff --git a/src/gmxlib/gpu_utils/gpu_utils.cu b/src/gmxlib/gpu_utils/gpu_utils.cu index 64d1e39e52..c29b74dff1 100644 --- a/src/gmxlib/gpu_utils/gpu_utils.cu +++ b/src/gmxlib/gpu_utils/gpu_utils.cu @@ -145,6 +145,8 @@ static const char * const SupportedGPUs[] = { * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized * \param[out] dev_prop pointer to the structure in which the device properties will be returned * \returns 0 if the device looks OK + * + * TODO: introduce errors codes and handle errors more smoothly. */ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) { @@ -217,7 +219,10 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) /* try to execute a dummy kernel */ k_dummy_test<<<1, 512>>>(); - CU_LAUNCH_ERR_SYNC("dummy test kernel"); + if (cudaThreadSynchronize() != cudaSuccess) + { + return -1; + } /* destroy context if we created one */ if (id != -1) @@ -681,13 +686,21 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop) int ndev; stat = cudaGetDeviceCount(&ndev); - CU_RET_ERR(stat, "cudaGetDeviceCount failed"); + if (stat != cudaSuccess) + { + return egpuInsane; + } if (dev_id > ndev - 1) { return egpuNonexistent; } + /* TODO: currently we do not make a distinction between the type of errors + * that can appear during sanity checks. This needs to be improved, e.g if + * the dummy test kernel fails to execute with a "device busy message" we + * should appropriately report that the device is busy instead of insane. + */ if (do_sanity_checks(dev_id, dev_prop) == 0) { if (is_gmx_supported_gpu(dev_prop)) @@ -714,31 +727,55 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop) * status. * * \param[in] gpu_info pointer to structure holding GPU information. + * \param[out] err_str The error message of any CUDA API error that caused + * the detection to fail (if there was any). The memory + * the pointer points to should be managed externally. + * \returns non-zero if the detection encountered a failure, zero otherwise. */ -void detect_cuda_gpus(gmx_gpu_info_t *gpu_info) +int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str) { - int i, ndev, checkres; + int i, ndev, checkres, retval; cudaError_t stat; cudaDeviceProp prop; cuda_dev_info_t *devs; assert(gpu_info); + assert(err_str); + + ndev = 0; + devs = NULL; stat = cudaGetDeviceCount(&ndev); - CU_RET_ERR(stat, "cudaGetDeviceCount failed"); + if (stat != cudaSuccess) + { + const char *s; - snew(devs, ndev); - for (i = 0; i < ndev; i++) + /* cudaGetDeviceCount failed which means that there is something + * wrong with the machine: driver-runtime mismatch, all GPUs being + * busy in exclusive mode, or some other condition which should + * result in us issuing a warning a falling back to CPUs. */ + retval = -1; + s = cudaGetErrorString(stat); + strncpy(err_str, s, STRLEN*sizeof(err_str[0])); + } + else { - checkres = is_gmx_supported_gpu_id(i, &prop); + snew(devs, ndev); + for (i = 0; i < ndev; i++) + { + checkres = is_gmx_supported_gpu_id(i, &prop); - devs[i].id = i; - devs[i].prop = prop; - devs[i].stat = checkres; + devs[i].id = i; + devs[i].prop = prop; + devs[i].stat = checkres; + } + retval = 0; } gpu_info->ncuda_dev = ndev; gpu_info->cuda_dev = devs; + + return retval; } /*! \brief Select the GPUs compatible with the native GROMACS acceleration.