Improve handling of CUDA API errors
[alexxy/gromacs.git] / src / gromacs / hardware / device_management.cu
index a09d6bf99a2d729c6d37ee4739106da95cf2b58c..0e77621a5bdf61ba82aadcea9cd882eae77b8759 100644 (file)
@@ -127,8 +127,8 @@ static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
     cu_err = cudaSetDevice(deviceInfo.id);
     if (cu_err != cudaSuccess)
     {
-        fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, deviceInfo.id,
-                cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error while switching to device #%d. %s\n", deviceInfo.id,
+                gmx::getDeviceErrorString(cu_err).c_str());
         return DeviceStatus::NonFunctional;
     }
 
@@ -216,11 +216,10 @@ bool isDeviceDetectionFunctional(std::string* errorMessage)
     stat                      = cudaDriverGetVersion(&driverVersion);
     GMX_ASSERT(stat != cudaErrorInvalidValue,
                "An impossible null pointer was passed to cudaDriverGetVersion");
-    GMX_RELEASE_ASSERT(
-            stat == cudaSuccess,
-            gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
-                              cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("An unexpected value was returned from cudaDriverGetVersion. "
+                        + gmx::getDeviceErrorString(stat))
+                               .c_str());
     bool foundDriver = (driverVersion > 0);
     if (!foundDriver)
     {
@@ -268,18 +267,15 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
 {
     int         numDevices;
     cudaError_t stat = cudaGetDeviceCount(&numDevices);
-    if (stat != cudaSuccess)
-    {
-        GMX_THROW(gmx::InternalError(
-                "Invalid call of findDevices() when CUDA API returned an error, perhaps "
-                "canPerformDeviceDetection() was not called appropriately beforehand."));
-    }
+    gmx::checkDeviceError(stat,
+                          "Invalid call of findDevices() when CUDA API returned an error, perhaps "
+                          "canPerformDeviceDetection() was not called appropriately beforehand.");
 
     /* things might go horribly wrong if cudart is not compatible with the driver */
     numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
 
     // We expect to start device support/sanity checks with a clean runtime error state
-    gmx::ensureNoPendingCudaError("");
+    gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
 
     std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
     for (int i = 0; i < numDevices; i++)
@@ -311,20 +307,18 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
             //
             // Here we also clear the CUDA API error state so potential
             // errors during sanity checks don't propagate.
-            if ((stat = cudaGetLastError()) != cudaSuccess)
-            {
-                gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
-                            deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
-            }
+            const std::string errorMessage = gmx::formatString(
+                    "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
+            gmx::ensureNoPendingDeviceError(errorMessage);
         }
     }
 
     stat = cudaPeekAtLastError();
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("We promise to return with clean CUDA state, but "
-                                         "non-success state encountered: %s: %s",
-                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
-                               .c_str());
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            ("We promise to return with clean CUDA state, but non-success state encountered. "
+             + gmx::getDeviceErrorString(stat))
+                    .c_str());
 
     return deviceInfoList;
 }
@@ -360,13 +354,13 @@ void releaseDevice(DeviceInformation* deviceInfo)
         {
             if (debug)
             {
-                fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+                fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
             }
 
             stat = cudaDeviceReset();
             if (stat != cudaSuccess)
             {
-                gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
+                gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
             }
         }
     }