Avoid mdrun terminate due to GPU sanity check errors

author Szilárd Páll <pall.szilard@gmail.com>

Fri, 25 Oct 2019 23:24:23 +0000 (01:24 +0200)

committer Szilárd Páll <pall.szilard@gmail.com>

Wed, 26 Feb 2020 15:18:29 +0000 (16:18 +0100)
author Szilárd Páll <pall.szilard@gmail.com>
Fri, 25 Oct 2019 23:24:23 +0000 (01:24 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Wed, 26 Feb 2020 15:18:29 +0000 (16:18 +0100)
diff --git a/docs/release-notes/2019/2019.6.rst b/docs/release-notes/2019/2019.6.rst

index 8eb56ec75ea3cda2c9dced969bdf7b8f00819ced..ce1e9146dc3c562711e94464f226a0ee2fc9f380 100644 (file)
--- a/docs/release-notes/2019/2019.6.rst
+++ b/docs/release-notes/2019/2019.6.rst
@@ -62,3 +62,9 @@ Compilation was failing with ``mcpcom: core dumped`` for the file :file:`pulluti
  Miscellaneous
  ^^^^^^^^^^^^^
  
+Avoid cryptic GPU detection errors when devices are unavailable or out of memory
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+:issue:`3178`
+:issue:`3399`
+
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index 56047d32c2ac78a04c91b9bd20368d8bc250fc81..41aedf9f13e9dc6e8a1153e9fa532785077a8747 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -75,29 +75,27 @@ static __global__ void k_dummy_test(void)
  {
  }
  
-static void checkCompiledTargetCompatibility(int                   deviceId,
-                                             const cudaDeviceProp &deviceProp)
+static cudaError_t checkCompiledTargetCompatibility(int                   deviceId,
+                                                    const cudaDeviceProp &deviceProp)
  {
      cudaFuncAttributes attributes;
      cudaError_t        stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
  
      if (cudaErrorInvalidDeviceFunction == stat)
      {
-        gmx_fatal(FARGS,
-                  "The %s binary does not include support for the CUDA architecture of "
-                  "the GPU ID #%d (compute capability %d.%d) detected during detection. "
-                  "By default, GROMACS supports all architectures of compute "
-                  "capability >= 3.0, so your GPU "
-                  "might be rare, or some architectures were disabled in the build. "
-                  "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
-                  "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. "
-                  "To work around this error, use the CUDA_VISIBLE_DEVICES environment"
-                  "variable to pass a list of GPUs that excludes the ID %d.",
-                  gmx::getProgramContext().displayName(), deviceId,
-                  deviceProp.major, deviceProp.minor, deviceId);
-    }
-
-    CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
+        fprintf(stderr,
+                "\nWARNING: The %s binary does not include support for the CUDA architecture of "
+                "the GPU ID #%d (compute capability %d.%d) detected during detection. "
+                "By default, GROMACS supports all architectures of compute "
+                "capability >= 3.0, so your GPU "
+                "might be rare, or some architectures were disabled in the build. \n"
+                "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
+                "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
+                gmx::getProgramContext().displayName(), deviceId,
+                deviceProp.major, deviceProp.minor);
+    }
+
+    return stat;
  }
  
  bool isHostMemoryPinned(const void *h_ptr)
@@ -133,7 +131,7 @@ bool isHostMemoryPinned(const void *h_ptr)
   *
   * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
   * \param[in]  dev_prop    The device properties structure
- * \returns                0 if the device looks OK
+ * \returns                0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
   *
   * TODO: introduce errors codes and handle errors more smoothly.
   */
@@ -205,13 +203,35 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp &dev_prop)
          }
      }
  
+    cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
+    // Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
+    // it is enough to check for cudaErrorDevicesUnavailable only here because
+    // if we encounter it that will happen in cudaFuncGetAttributes in the above function.
+    if (cu_err == cudaErrorDevicesUnavailable)
+    {
+        return -2;
+    }
+    else if (cu_err != cudaSuccess)
+    {
+        return -1;
+    }
+
      /* try to execute a dummy kernel */
-    checkCompiledTargetCompatibility(dev_id, dev_prop);
+    try
+    {
+        KernelLaunchConfig config;
+        config.blockSize[0] = 512;
+        const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
+        launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
+    }
+    catch (gmx::GromacsException &ex)
+    {
+        // launchGpuKernel error is not fatal and should continue with marking the device bad
+        fprintf(stderr, "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
+                id, formatExceptionMessageToString(ex).c_str());
+        return -1;
+    }
  
-    KernelLaunchConfig config;
-    config.blockSize[0] = 512;
-    const auto         dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
-    launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
      if (cudaDeviceSynchronize() != cudaSuccess)
      {
          return -1;
@@ -322,12 +342,15 @@ static int is_gmx_supported_gpu_id(int                   deviceId,
       * the dummy test kernel fails to execute with a "device busy message" we
       * should appropriately report that the device is busy instead of insane.
       */
-    if (do_sanity_checks(deviceId, deviceProp) != 0)
+    const int checkResult = do_sanity_checks(deviceId, deviceProp);
+    switch (checkResult)
      {
-        return egpuInsane;
+        case  0: return egpuCompatible;
+        case -1: return egpuInsane;
+        case -2: return egpuUnavailable;
+        default: GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
+            return egpuCompatible;
      }
-
-    return egpuCompatible;
  }
  
  bool canDetectGpus(std::string *errorMessage)
diff --git a/src/gromacs/hardware/gpu_hw_info.cpp b/src/gromacs/hardware/gpu_hw_info.cpp

index a65c5cea0bd4e75a044672c055e804c887f46dab..7afade840532d4683bade75d2bd0245447e3fd89 100644 (file)
--- a/src/gromacs/hardware/gpu_hw_info.cpp
+++ b/src/gromacs/hardware/gpu_hw_info.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -47,5 +47,5 @@
  /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
  const char * const gpu_detect_res_str[egpuNR] =
  {
-    "compatible", "nonexistent", "incompatible", "incompatible (please recompile with GMX_OPENCL_NB_CLUSTER_SIZE=4)", "insane"
+    "compatible", "nonexistent", "incompatible", "incompatible (please recompile with GMX_OPENCL_NB_CLUSTER_SIZE=4)", "insane", "unavailable"
  };
diff --git a/src/gromacs/hardware/gpu_hw_info.h b/src/gromacs/hardware/gpu_hw_info.h

index 51e1d74c4e0b2f614e483f69c296c7ac950d6eeb..b13934a02da54d5619583cdf8c684144256dde22 100644 (file)
--- a/src/gromacs/hardware/gpu_hw_info.h
+++ b/src/gromacs/hardware/gpu_hw_info.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -47,10 +47,13 @@ struct gmx_device_info_t;
   *
   * The egpuInsane value means that during the sanity checks an error
   * occurred that indicates malfunctioning of the device, driver, or
- * incompatible driver/runtime. */
+ * incompatible driver/runtime.
+ * eGpuUnavailable indicates that CUDA devices are busy or unavailable
+ * typically due to use of cudaComputeModeExclusive, cudaComputeModeProhibited modes.
+ */
  typedef enum
  {
-    egpuCompatible = 0,  egpuNonexistent,  egpuIncompatible, egpuIncompatibleClusterSize, egpuInsane, egpuNR
+    egpuCompatible = 0,  egpuNonexistent,  egpuIncompatible, egpuIncompatibleClusterSize, egpuInsane, egpuUnavailable, egpuNR
  } e_gpu_detect_res_t;
  
  /* Names of the GPU detection/check results */
author	Szilárd Páll <pall.szilard@gmail.com>
	Fri, 25 Oct 2019 23:24:23 +0000 (01:24 +0200)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Wed, 26 Feb 2020 15:18:29 +0000 (16:18 +0100)
docs/release-notes/2019/2019.6.rst		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/hardware/gpu_hw_info.cpp		patch \| blob \| history
src/gromacs/hardware/gpu_hw_info.h		patch \| blob \| history