From: Szilárd Páll Date: Fri, 25 Oct 2019 23:24:23 +0000 (+0200) Subject: Avoid mdrun terminate due to GPU sanity check errors X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=0a9b0ba74db6c70afc066eba21c767225b5feab3;p=alexxy%2Fgromacs.git Avoid mdrun terminate due to GPU sanity check errors When a GPU is a exclusive or prohibited mode, early detection calls can fail and as a result an mdrun run abort with an error, even if all GPU offload is explicitly disabled by the user. This change adds a status code to handle the case of devices being unavailable. Additionally, other errors may be encountered during the dummy kernel sanity check (e.g. out of memory), but since the change that switches to using launchGpuKernel() wrapper did not handle the exception in the sanity checking, this can also abort a run even if the GPU in question is not selected to be used. This change adds code to catch the exception this and report the error and avoid abort the run. Fixes #3178 #3399 Change-Id: I0cdedbc02769084c172e4a42fe5c1af192007cec --- diff --git a/docs/release-notes/2019/2019.6.rst b/docs/release-notes/2019/2019.6.rst index 8eb56ec75e..ce1e9146dc 100644 --- a/docs/release-notes/2019/2019.6.rst +++ b/docs/release-notes/2019/2019.6.rst @@ -62,3 +62,9 @@ Compilation was failing with ``mcpcom: core dumped`` for the file :file:`pulluti Miscellaneous ^^^^^^^^^^^^^ +Avoid cryptic GPU detection errors when devices are unavailable or out of memory +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +:issue:`3178` +:issue:`3399` + diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index 56047d32c2..41aedf9f13 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -75,29 +75,27 @@ static __global__ void k_dummy_test(void) { } -static void checkCompiledTargetCompatibility(int deviceId, - const cudaDeviceProp &deviceProp) +static cudaError_t checkCompiledTargetCompatibility(int deviceId, + const cudaDeviceProp &deviceProp) { cudaFuncAttributes attributes; cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test); if (cudaErrorInvalidDeviceFunction == stat) { - gmx_fatal(FARGS, - "The %s binary does not include support for the CUDA architecture of " - "the GPU ID #%d (compute capability %d.%d) detected during detection. " - "By default, GROMACS supports all architectures of compute " - "capability >= 3.0, so your GPU " - "might be rare, or some architectures were disabled in the build. " - "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and " - "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. " - "To work around this error, use the CUDA_VISIBLE_DEVICES environment" - "variable to pass a list of GPUs that excludes the ID %d.", - gmx::getProgramContext().displayName(), deviceId, - deviceProp.major, deviceProp.minor, deviceId); - } - - CU_RET_ERR(stat, "cudaFuncGetAttributes failed"); + fprintf(stderr, + "\nWARNING: The %s binary does not include support for the CUDA architecture of " + "the GPU ID #%d (compute capability %d.%d) detected during detection. " + "By default, GROMACS supports all architectures of compute " + "capability >= 3.0, so your GPU " + "might be rare, or some architectures were disabled in the build. \n" + "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and " + "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n", + gmx::getProgramContext().displayName(), deviceId, + deviceProp.major, deviceProp.minor); + } + + return stat; } bool isHostMemoryPinned(const void *h_ptr) @@ -133,7 +131,7 @@ bool isHostMemoryPinned(const void *h_ptr) * * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized * \param[in] dev_prop The device properties structure - * \returns 0 if the device looks OK + * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy * * TODO: introduce errors codes and handle errors more smoothly. */ @@ -205,13 +203,35 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop) } } + cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop); + // Avoid triggering an error if GPU devices are in exclusive or prohibited mode; + // it is enough to check for cudaErrorDevicesUnavailable only here because + // if we encounter it that will happen in cudaFuncGetAttributes in the above function. + if (cu_err == cudaErrorDevicesUnavailable) + { + return -2; + } + else if (cu_err != cudaSuccess) + { + return -1; + } + /* try to execute a dummy kernel */ - checkCompiledTargetCompatibility(dev_id, dev_prop); + try + { + KernelLaunchConfig config; + config.blockSize[0] = 512; + const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config); + launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments); + } + catch (gmx::GromacsException &ex) + { + // launchGpuKernel error is not fatal and should continue with marking the device bad + fprintf(stderr, "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", + id, formatExceptionMessageToString(ex).c_str()); + return -1; + } - KernelLaunchConfig config; - config.blockSize[0] = 512; - const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config); - launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments); if (cudaDeviceSynchronize() != cudaSuccess) { return -1; @@ -322,12 +342,15 @@ static int is_gmx_supported_gpu_id(int deviceId, * the dummy test kernel fails to execute with a "device busy message" we * should appropriately report that the device is busy instead of insane. */ - if (do_sanity_checks(deviceId, deviceProp) != 0) + const int checkResult = do_sanity_checks(deviceId, deviceProp); + switch (checkResult) { - return egpuInsane; + case 0: return egpuCompatible; + case -1: return egpuInsane; + case -2: return egpuUnavailable; + default: GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value"); + return egpuCompatible; } - - return egpuCompatible; } bool canDetectGpus(std::string *errorMessage) diff --git a/src/gromacs/hardware/gpu_hw_info.cpp b/src/gromacs/hardware/gpu_hw_info.cpp index a65c5cea0b..7afade8405 100644 --- a/src/gromacs/hardware/gpu_hw_info.cpp +++ b/src/gromacs/hardware/gpu_hw_info.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,5 +47,5 @@ /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */ const char * const gpu_detect_res_str[egpuNR] = { - "compatible", "nonexistent", "incompatible", "incompatible (please recompile with GMX_OPENCL_NB_CLUSTER_SIZE=4)", "insane" + "compatible", "nonexistent", "incompatible", "incompatible (please recompile with GMX_OPENCL_NB_CLUSTER_SIZE=4)", "insane", "unavailable" }; diff --git a/src/gromacs/hardware/gpu_hw_info.h b/src/gromacs/hardware/gpu_hw_info.h index 51e1d74c4e..b13934a02d 100644 --- a/src/gromacs/hardware/gpu_hw_info.h +++ b/src/gromacs/hardware/gpu_hw_info.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,10 +47,13 @@ struct gmx_device_info_t; * * The egpuInsane value means that during the sanity checks an error * occurred that indicates malfunctioning of the device, driver, or - * incompatible driver/runtime. */ + * incompatible driver/runtime. + * eGpuUnavailable indicates that CUDA devices are busy or unavailable + * typically due to use of cudaComputeModeExclusive, cudaComputeModeProhibited modes. + */ typedef enum { - egpuCompatible = 0, egpuNonexistent, egpuIncompatible, egpuIncompatibleClusterSize, egpuInsane, egpuNR + egpuCompatible = 0, egpuNonexistent, egpuIncompatible, egpuIncompatibleClusterSize, egpuInsane, egpuUnavailable, egpuNR } e_gpu_detect_res_t; /* Names of the GPU detection/check results */