From 2dc4ea2f362b6338008895f5525525c8bff7a33b Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 10 Sep 2020 10:27:03 +0000 Subject: [PATCH] Use device information object instead of id when performing device checks Initializing plain object is dangerous, since it then can be used to set active device, with default id. This change ensures that proper device will be used, even if the high level device management is used in the checks. --- src/gromacs/hardware/device_management.cu | 103 +++++------------- .../hardware/device_management_ocl.cpp | 6 +- 2 files changed, 31 insertions(+), 78 deletions(-) diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu index 32708873ec..b59b51e59a 100644 --- a/src/gromacs/hardware/device_management.cu +++ b/src/gromacs/hardware/device_management.cu @@ -106,78 +106,33 @@ static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDevi * \todo Introduce errors codes and handle errors more smoothly. * * - * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized - * \param[in] dev_prop The device properties structure - * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy + * \param[in] deviceInfo Device information on the device to check. + * \returns The status enumeration value for the checked device: */ -static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop) +static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo) { cudaError_t cu_err; - int dev_count, id; - - cu_err = cudaGetDeviceCount(&dev_count); - if (cu_err != cudaSuccess) - { - fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err)); - return DeviceStatus::NonFunctional; - } - - /* no CUDA compatible device at all */ - if (dev_count == 0) - { - return DeviceStatus::NonFunctional; - } - - /* things might go horribly wrong if cudart is not compatible with the driver */ - if (dev_count < 0 || dev_count > c_cudaMaxDeviceCount) - { - return DeviceStatus::NonFunctional; - } - - if (dev_id == -1) /* device already selected let's not destroy the context */ - { - cu_err = cudaGetDevice(&id); - if (cu_err != cudaSuccess) - { - fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err)); - return DeviceStatus::NonFunctional; - } - } - else - { - id = dev_id; - if (id > dev_count - 1) /* pfff there's no such device */ - { - fprintf(stderr, - "The requested device with id %d does not seem to exist (device count=%d)\n", - dev_id, dev_count); - return DeviceStatus::NonFunctional; - } - } /* both major & minor is 9999 if no CUDA capable devices are present */ - if (dev_prop.major == 9999 && dev_prop.minor == 9999) + if (deviceInfo.prop.major == 9999 && deviceInfo.prop.minor == 9999) { return DeviceStatus::NonFunctional; } /* we don't care about emulation mode */ - if (dev_prop.major == 0) + if (deviceInfo.prop.major == 0) { return DeviceStatus::NonFunctional; } - if (id != -1) + cu_err = cudaSetDevice(deviceInfo.id); + if (cu_err != cudaSuccess) { - cu_err = cudaSetDevice(id); - if (cu_err != cudaSuccess) - { - fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id, - cudaGetErrorString(cu_err)); - return DeviceStatus::NonFunctional; - } + fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, deviceInfo.id, + cudaGetErrorString(cu_err)); + return DeviceStatus::NonFunctional; } - cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop); + cu_err = checkCompiledTargetCompatibility(deviceInfo.id, deviceInfo.prop); // Avoid triggering an error if GPU devices are in exclusive or prohibited mode; // it is enough to check for cudaErrorDevicesUnavailable only here because // if we encounter it that will happen in cudaFuncGetAttributes in the above function. @@ -196,7 +151,6 @@ static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_pro KernelLaunchConfig config; config.blockSize[0] = 512; const auto dummyArguments = prepareGpuKernelArguments(dummy_kernel, config); - DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false); launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments); @@ -205,8 +159,8 @@ static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_pro { // launchGpuKernel error is not fatal and should continue with marking the device bad fprintf(stderr, - "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id, - formatExceptionMessageToString(ex).c_str()); + "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", + deviceInfo.id, formatExceptionMessageToString(ex).c_str()); return DeviceStatus::NonFunctional; } @@ -215,12 +169,8 @@ static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_pro return DeviceStatus::NonFunctional; } - /* destroy context if we created one */ - if (id != -1) - { - cu_err = cudaDeviceReset(); - CU_RET_ERR(cu_err, "cudaDeviceReset failed"); - } + cu_err = cudaDeviceReset(); + CU_RET_ERR(cu_err, "cudaDeviceReset failed"); return DeviceStatus::Compatible; } @@ -247,17 +197,16 @@ static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties) * upon return. Note that this also means it is the caller's responsibility to * reset the CUDA runtime state. * - * \param[in] deviceId the ID of the GPU to check. - * \param[in] deviceProp the CUDA device properties of the device checked. + * \param[in] deviceInfo The device information on the device to check. * \returns the status of the requested device */ -static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp) +static DeviceStatus checkDeviceStatus(const DeviceInformation& deviceInfo) { - if (!isDeviceGenerationSupported(deviceProp)) + if (!isDeviceGenerationSupported(deviceInfo.prop)) { return DeviceStatus::Incompatible; } - return isDeviceFunctional(deviceId, deviceProp); + return isDeviceFunctional(deviceInfo); } bool isDeviceDetectionFunctional(std::string* errorMessage) @@ -326,6 +275,9 @@ std::vector> findDevices() "canPerformDeviceDetection() was not called appropriately beforehand.")); } + /* things might go horribly wrong if cudart is not compatible with the driver */ + numDevices = std::min(numDevices, c_cudaMaxDeviceCount); + // We expect to start device support/sanity checks with a clean runtime error state gmx::ensureNoPendingCudaError(""); @@ -335,13 +287,14 @@ std::vector> findDevices() cudaDeviceProp prop; memset(&prop, 0, sizeof(cudaDeviceProp)); stat = cudaGetDeviceProperties(&prop, i); - const DeviceStatus checkResult = - (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop); - deviceInfoList[i] = std::make_unique(); + deviceInfoList[i] = std::make_unique(); + deviceInfoList[i]->id = i; + deviceInfoList[i]->prop = prop; + + const DeviceStatus checkResult = (stat != cudaSuccess) ? DeviceStatus::NonFunctional + : checkDeviceStatus(*deviceInfoList[i]); - deviceInfoList[i]->id = i; - deviceInfoList[i]->prop = prop; deviceInfoList[i]->status = checkResult; if (checkResult != DeviceStatus::Compatible) diff --git a/src/gromacs/hardware/device_management_ocl.cpp b/src/gromacs/hardware/device_management_ocl.cpp index 4404efed3c..c46a04ee01 100644 --- a/src/gromacs/hardware/device_management_ocl.cpp +++ b/src/gromacs/hardware/device_management_ocl.cpp @@ -119,9 +119,9 @@ static bool runningOnCompatibleOSForAmd() * of these returned. * * \param[in] deviceInfo The device info pointer. - * \returns The result of the compatibility checks. + * \returns The status enumeration value for the checked device: */ -static DeviceStatus isDeviceSupported(const DeviceInformation& deviceInfo) +static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo) { if (getenv("GMX_OCL_DISABLE_COMPATIBILITY_CHECK") != nullptr) { @@ -272,7 +272,7 @@ static bool isDeviceFunctional(const DeviceInformation& deviceInfo, std::string* static DeviceStatus checkGpu(size_t deviceId, const DeviceInformation& deviceInfo) { - DeviceStatus supportStatus = isDeviceSupported(deviceInfo); + DeviceStatus supportStatus = isDeviceFunctional(deviceInfo); if (supportStatus != DeviceStatus::Compatible) { return supportStatus; -- 2.22.0