From: Mark Abraham Date: Sun, 10 Dec 2017 07:35:52 +0000 (+1100) Subject: Check for GPU detection support before detecting X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=9ee128dc84f7e2c1e9af52f897718b39788fe9cf;p=alexxy%2Fgromacs.git Check for GPU detection support before detecting When a CUDA-enabled binary was run on a node with no CUDA driver available, a note was issued that the version of the CUDA driver is insufficient, which was wrong. Fixed this by separating detection of a valid CUDA driver (or OpenCL platform) from enumerating the compatible devices. This permits a GPU-enabled build configuration to gracefully degrade to the same behaviour as a CPU-only build configuration. Also suppressed more warnings about use of OpenCL API elements that have been deprecated but which we intend to contiune to use regardless. Also fixed confusing name of rank_local, and replaced it with a boolean that cleanly describes the required functionality. Also fixed and simplified logic of printing the GPU report. The implementation only prints details about the node of the master rank, so there is no value in checking a variable that reflects the number of GPUs detected across all nodes. Fixes #2322 Change-Id: I831d3c0017dafc00f7bb82e3f71be5b122657d1e --- diff --git a/src/gromacs/gpu_utils/gmxopencl.h b/src/gromacs/gpu_utils/gmxopencl.h index b9ccfa361d..5119d8985a 100644 --- a/src/gromacs/gpu_utils/gmxopencl.h +++ b/src/gromacs/gpu_utils/gmxopencl.h @@ -50,9 +50,13 @@ #define GMX_GPU_UTILS_GMXOPENCL_H /*! \brief Declare to OpenCL SDKs that we intend to use OpenCL API - features that were deprecated in 2.0, so that they don't warn about - it. */ + features that were deprecated in 1.2 or 2.0, so that they don't + warn about it. */ +///@{ +# define CL_USE_DEPRECATED_OPENCL_1_1_APIS +# define CL_USE_DEPRECATED_OPENCL_1_2_APIS # define CL_USE_DEPRECATED_OPENCL_2_0_APIS +///@} # ifdef __APPLE__ # include # else diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index e479f3e743..5364ff64c0 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -636,6 +636,18 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop) } } +bool canDetectGpus() +{ + cudaError_t stat; + int driverVersion = -1; + stat = cudaDriverGetVersion(&driverVersion); + GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion"); + GMX_RELEASE_ASSERT(stat == cudaSuccess, + gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s", + cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); + bool foundDriver = (driverVersion > 0); + return foundDriver; +} int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str) { diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 88323f8762..d7e724aedf 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -74,12 +74,26 @@ enum class GpuTaskCompletion Check /*<< Only check whether the task has completed */ }; +/*! \brief Return whether GPUs can be detected + * + * Returns true when this is a build of \Gromacs configured to support + * GPU usage, and a valid device driver or ICD was detected by the GPU + * runtime. + * + * Does not throw. */ +GPU_FUNC_QUALIFIER +bool canDetectGpus() GPU_FUNC_TERM_WITH_RETURN(false); + /*! \brief Detect all GPUs in the system. * - * Will detect every GPU supported by the device driver in use. Also - * check for the compatibility of each and fill the gpu_info->gpu_dev array - * with the required information on each the device: ID, device properties, - * status. + * Will detect every GPU supported by the device driver in use. If + * the device driver is missing or unsuitable, returns the same error + * as for "no valid devices detected," so generally calling code + * should have checked the return value from canDetectGpus() first, + * in order to understand the behaviour of this routine. This routine + * also checks for the compatibility of each and fill the + * gpu_info->gpu_dev array with the required information on each the + * device: ID, device properties, status. * * \param[in] gpu_info pointer to structure holding GPU information. * \param[out] err_str The error message of any GPU API error that caused diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp index c121f813b3..88edc975a1 100644 --- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp +++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp @@ -60,6 +60,7 @@ #include "gromacs/utility/cstringutil.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" /*! \brief Helper macro for error handling */ #define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \ @@ -161,6 +162,24 @@ static ocl_vendor_id_t get_vendor_id(char *vendor_name) } +//! This function is documented in the header file +bool canDetectGpus() +{ + cl_uint numPlatforms = -1; + cl_int status = clGetPlatformIDs(0, nullptr, &numPlatforms); + GMX_ASSERT(status != CL_INVALID_VALUE, "Incorrect call of clGetPlatformIDs detected"); + if (status == CL_PLATFORM_NOT_FOUND_KHR) + { + // No valid ICDs found + return false; + } + GMX_RELEASE_ASSERT(status == CL_SUCCESS, + gmx::formatString("An unexpected value was returned from clGetPlatformIDs %u: %s", + status, ocl_get_error_string(status).c_str()).c_str()); + bool foundPlatform = (numPlatforms > 0); + return foundPlatform; +} + //! This function is documented in the header file int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str) { diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp index a8e9304730..015f3003ad 100644 --- a/src/gromacs/hardware/detecthardware.cpp +++ b/src/gromacs/hardware/detecthardware.cpp @@ -114,7 +114,7 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr) int rank_world; MPI_Comm physicalnode_comm; #endif - int rank_local; + bool isMasterRankOfNode; hwinfo_g->gpu_info.bDetectGPUs = (bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr); @@ -142,20 +142,32 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr) MPI_Comm_rank(MPI_COMM_WORLD, &rank_world); MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(), rank_world, &physicalnode_comm); - MPI_Comm_rank(physicalnode_comm, &rank_local); + { + int rankOnNode = -1; + MPI_Comm_rank(physicalnode_comm, &rankOnNode); + isMasterRankOfNode = (rankOnNode == 0); + } GMX_UNUSED_VALUE(cr); #else - /* Here there should be only one process, check this */ + // Here there should be only one process, because if we are using + // thread-MPI, only one thread is active so far. So we check this. GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here"); - - rank_local = 0; + isMasterRankOfNode = true; #endif /* With CUDA detect only on one rank per host, with OpenCL need do * the detection on all PP ranks */ bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasDuty(cr, DUTY_PP)); - if (rank_local == 0 || isOpenclPpRank) + bool gpusCanBeDetected = false; + if (isMasterRankOfNode || isOpenclPpRank) + { + gpusCanBeDetected = canDetectGpus(); + // No need to tell the user anything at this point, they get a + // hardware report later. + } + + if (gpusCanBeDetected) { char detection_error[STRLEN] = "", sbuf[STRLEN]; @@ -188,7 +200,7 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr) dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info(); - if (rank_local > 0) + if (!isMasterRankOfNode) { hwinfo_g->gpu_info.gpu_dev = (struct gmx_device_info_t *)malloc(dev_size); diff --git a/src/gromacs/hardware/printhardware.cpp b/src/gromacs/hardware/printhardware.cpp index 17b120765b..76c9830c1e 100644 --- a/src/gromacs/hardware/printhardware.cpp +++ b/src/gromacs/hardware/printhardware.cpp @@ -344,16 +344,12 @@ static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo, } } - if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 || - hwinfo->gpu_info.n_dev > 0)) + if (bGPUBinary && hwinfo->gpu_info.n_dev > 0) { s += gmx::formatString(" GPU info:\n"); s += gmx::formatString(" Number of GPUs detected: %d\n", hwinfo->gpu_info.n_dev); - if (hwinfo->gpu_info.n_dev > 0) - { - s += sprint_gpus(hwinfo->gpu_info) + "\n"; - } + s += sprint_gpus(hwinfo->gpu_info) + "\n"; } return s; }