Update mdrun message to reflect that Cuda CC>=3.5 is supported.

[alexxy/gromacs.git] / src / gromacs / hardware / device_management.cu
diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu

index b59b51e59a10af1a1226ed879123207bed8ac310..ab64545ad9971f5439e64787f15ec098348f87c6 100644 (file)
--- a/src/gromacs/hardware/device_management.cu
+++ b/src/gromacs/hardware/device_management.cu
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -66,10 +66,10 @@
   *
   * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
   */
-static int c_cudaMaxDeviceCount = 32;
+static const int c_cudaMaxDeviceCount = 32;
  
  /** Dummy kernel used for sanity checking. */
-static __global__ void dummy_kernel(void) {}
+static __global__ void dummy_kernel() {}
  
  static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
  {
@@ -82,11 +82,14 @@ static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDevi
                  "\nWARNING: The %s binary does not include support for the CUDA architecture of "
                  "the GPU ID #%d (compute capability %d.%d) detected during detection. "
                  "By default, GROMACS supports all architectures of compute "
-                "capability >= 3.0, so your GPU "
+                "capability >= 3.5, so your GPU "
                  "might be rare, or some architectures were disabled in the build. \n"
                  "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
                  "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
-                gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
+                gmx::getProgramContext().displayName(),
+                deviceId,
+                deviceProp.major,
+                deviceProp.minor);
      }
  
      return stat;
@@ -127,8 +130,10 @@ static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
      cu_err = cudaSetDevice(deviceInfo.id);
      if (cu_err != cudaSuccess)
      {
-        fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, deviceInfo.id,
-                cudaGetErrorString(cu_err));
+        fprintf(stderr,
+                "Error while switching to device #%d. %s\n",
+                deviceInfo.id,
+                gmx::getDeviceErrorString(cu_err).c_str());
          return DeviceStatus::NonFunctional;
      }
  
@@ -160,7 +165,8 @@ static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
          // launchGpuKernel error is not fatal and should continue with marking the device bad
          fprintf(stderr,
                  "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
-                deviceInfo.id, formatExceptionMessageToString(ex).c_str());
+                deviceInfo.id,
+                formatExceptionMessageToString(ex).c_str());
          return DeviceStatus::NonFunctional;
      }
  
@@ -216,11 +222,10 @@ bool isDeviceDetectionFunctional(std::string* errorMessage)
      stat                      = cudaDriverGetVersion(&driverVersion);
      GMX_ASSERT(stat != cudaErrorInvalidValue,
                 "An impossible null pointer was passed to cudaDriverGetVersion");
-    GMX_RELEASE_ASSERT(
-            stat == cudaSuccess,
-            gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
-                              cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("An unexpected value was returned from cudaDriverGetVersion. "
+                        + gmx::getDeviceErrorString(stat))
+                               .c_str());
      bool foundDriver = (driverVersion > 0);
      if (!foundDriver)
      {
@@ -268,18 +273,15 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
  {
      int         numDevices;
      cudaError_t stat = cudaGetDeviceCount(&numDevices);
-    if (stat != cudaSuccess)
-    {
-        GMX_THROW(gmx::InternalError(
-                "Invalid call of findDevices() when CUDA API returned an error, perhaps "
-                "canPerformDeviceDetection() was not called appropriately beforehand."));
-    }
+    gmx::checkDeviceError(stat,
+                          "Invalid call of findDevices() when CUDA API returned an error, perhaps "
+                          "canPerformDeviceDetection() was not called appropriately beforehand.");
  
      /* things might go horribly wrong if cudart is not compatible with the driver */
      numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
  
      // We expect to start device support/sanity checks with a clean runtime error state
-    gmx::ensureNoPendingCudaError("");
+    gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
  
      std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
      for (int i = 0; i < numDevices; i++)
@@ -288,9 +290,10 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
          memset(&prop, 0, sizeof(cudaDeviceProp));
          stat = cudaGetDeviceProperties(&prop, i);
  
-        deviceInfoList[i]       = std::make_unique<DeviceInformation>();
-        deviceInfoList[i]->id   = i;
-        deviceInfoList[i]->prop = prop;
+        deviceInfoList[i]               = std::make_unique<DeviceInformation>();
+        deviceInfoList[i]->id           = i;
+        deviceInfoList[i]->prop         = prop;
+        deviceInfoList[i]->deviceVendor = DeviceVendor::Nvidia;
  
          const DeviceStatus checkResult = (stat != cudaSuccess) ? DeviceStatus::NonFunctional
                                                                 : checkDeviceStatus(*deviceInfoList[i]);
@@ -310,20 +313,18 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
              //
              // Here we also clear the CUDA API error state so potential
              // errors during sanity checks don't propagate.
-            if ((stat = cudaGetLastError()) != cudaSuccess)
-            {
-                gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
-                            deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
-            }
+            const std::string errorMessage = gmx::formatString(
+                    "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
+            gmx::ensureNoPendingDeviceError(errorMessage);
          }
      }
  
      stat = cudaPeekAtLastError();
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("We promise to return with clean CUDA state, but "
-                                         "non-success state encountered: %s: %s",
-                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
-                               .c_str());
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            ("We promise to return with clean CUDA state, but non-success state encountered. "
+             + gmx::getDeviceErrorString(stat))
+                    .c_str());
  
      return deviceInfoList;
  }
@@ -337,7 +338,7 @@ void setActiveDevice(const DeviceInformation& deviceInfo)
      if (stat != cudaSuccess)
      {
          auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
-        CU_RET_ERR(stat, message.c_str());
+        CU_RET_ERR(stat, message);
      }
  
      if (debug)
@@ -359,13 +360,13 @@ void releaseDevice(DeviceInformation* deviceInfo)
          {
              if (debug)
              {
-                fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+                fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
              }
  
              stat = cudaDeviceReset();
              if (stat != cudaSuccess)
              {
-                gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
+                gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
              }
          }
      }
@@ -378,14 +379,17 @@ std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
  
      if (!gpuExists)
      {
-        return gmx::formatString("#%d: %s, stat: %s", deviceInfo.id, "N/A",
-                                 c_deviceStateString[deviceInfo.status]);
+        return gmx::formatString(
+                "#%d: %s, stat: %s", deviceInfo.id, "N/A", c_deviceStateString[deviceInfo.status]);
      }
      else
      {
          return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
-                                 deviceInfo.id, deviceInfo.prop.name, deviceInfo.prop.major,
-                                 deviceInfo.prop.minor, deviceInfo.prop.ECCEnabled ? "yes" : " no",
+                                 deviceInfo.id,
+                                 deviceInfo.prop.name,
+                                 deviceInfo.prop.major,
+                                 deviceInfo.prop.minor,
+                                 deviceInfo.prop.ECCEnabled ? "yes" : " no",
                                   c_deviceStateString[deviceInfo.status]);
      }
  }