Separate canDetectGpus and findGpus futher, and fix tests

author Mark Abraham <mark.j.abraham@gmail.com>

Thu, 21 Dec 2017 10:56:59 +0000 (21:56 +1100)

committer Erik Lindahl <erik.lindahl@gmail.com>

Fri, 29 Dec 2017 11:08:51 +0000 (12:08 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Thu, 21 Dec 2017 10:56:59 +0000 (21:56 +1100)
committer Erik Lindahl <erik.lindahl@gmail.com>
Fri, 29 Dec 2017 11:08:51 +0000 (12:08 +0100)
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index 8aa59a45e59ea2a2eacaec7b9d88ec7e3b7b7769..9ca379ee96366cf64016c78b09fdafd5891fcad5 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -55,6 +55,7 @@
  #include "gromacs/hardware/gpu_hw_info.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
@@ -644,7 +645,7 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
      }
  }
  
-bool canDetectGpus()
+bool canDetectGpus(std::string *errorMessage)
  {
      cudaError_t        stat;
      int                driverVersion = -1;
@@ -654,18 +655,56 @@ bool canDetectGpus()
                         gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
                                           cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
      bool foundDriver = (driverVersion > 0);
-    return foundDriver;
+    if (!foundDriver)
+    {
+        // Can't detect GPUs if there is no driver
+        if (errorMessage != nullptr)
+        {
+            errorMessage->assign("No valid CUDA driver found");
+        }
+        return false;
+    }
+
+    int numDevices;
+    stat = cudaGetDeviceCount(&numDevices);
+    if (stat != cudaSuccess)
+    {
+        if (errorMessage != nullptr)
+        {
+            /* cudaGetDeviceCount failed which means that there is
+             * something wrong with the machine: driver-runtime
+             * mismatch, all GPUs being busy in exclusive mode,
+             * invalid CUDA_VISIBLE_DEVICES, or some other condition
+             * which should result in GROMACS issuing a warning a
+             * falling back to CPUs. */
+            errorMessage->assign(cudaGetErrorString(stat));
+        }
+
+        // Consume the error now that we have prepared to handle
+        // it. This stops it reappearing next time we check for
+        // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
+        // valid devices, then cudaGetLastError returns the
+        // (undocumented) cudaErrorNoDevice, but this should not be a
+        // problem as there should be no future CUDA API calls.
+        // NVIDIA bug report #2038718 has been filed.
+        cudaGetLastError();
+        // Can't detect GPUs
+        return false;
+    }
+
+    // We don't actually use numDevices here, that's not the job of
+    // this function.
+    return true;
  }
  
-int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+void findGpus(gmx_gpu_info_t *gpu_info)
  {
-    int                i, ndev, checkres, retval;
+    int                i, ndev, checkres;
      cudaError_t        stat;
      cudaDeviceProp     prop;
      gmx_device_info_t *devs;
  
      assert(gpu_info);
-    assert(err_str);
  
      gpu_info->n_dev_compatible = 0;
  
@@ -675,43 +714,28 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
      stat = cudaGetDeviceCount(&ndev);
      if (stat != cudaSuccess)
      {
-        const char *s;
-
-        /* cudaGetDeviceCount failed which means that there is something
-         * wrong with the machine: driver-runtime mismatch, all GPUs being
-         * busy in exclusive mode, or some other condition which should
-         * result in us issuing a warning a falling back to CPUs. */
-        retval = -1;
-        s      = cudaGetErrorString(stat);
-        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
-
-        // Consume the error now that we have prepared to handle
-        // it. This stops it reappearing next time we check for errors.
-        cudaGetLastError();
+        GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
+                                     "canDetectGpus() was not called appropriately beforehand."));
      }
-    else
+
+    snew(devs, ndev);
+    for (i = 0; i < ndev; i++)
      {
-        snew(devs, ndev);
-        for (i = 0; i < ndev; i++)
-        {
-            checkres = is_gmx_supported_gpu_id(i, &prop);
+        checkres = is_gmx_supported_gpu_id(i, &prop);
  
-            devs[i].id   = i;
-            devs[i].prop = prop;
-            devs[i].stat = checkres;
+        devs[i].id   = i;
+        devs[i].prop = prop;
+        devs[i].stat = checkres;
  
-            if (checkres == egpuCompatible)
-            {
-                gpu_info->n_dev_compatible++;
-            }
+        if (checkres == egpuCompatible)
+        {
+            gpu_info->n_dev_compatible++;
          }
-        retval = 0;
      }
+    GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess");
  
      gpu_info->n_dev   = ndev;
      gpu_info->gpu_dev = devs;
-
-    return retval;
  }
  
  std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h

index d7e724aedf022b7e10d80b414201028bf3b19247..1a5ced2c0ba7640bebb8d1bccdd8f948edfb5367 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -47,6 +47,7 @@
  
  #include <cstdio>
  
+#include <string>
  #include <vector>
  
  #include "gromacs/gpu_utils/gpu_macros.h"
@@ -77,32 +78,31 @@ enum class GpuTaskCompletion
  /*! \brief Return whether GPUs can be detected
   *
   * Returns true when this is a build of \Gromacs configured to support
- * GPU usage, and a valid device driver or ICD was detected by the GPU
- * runtime.
+ * GPU usage, and a valid device driver, ICD, and/or runtime was detected.
+ *
+ * \param[out] errorMessage  When returning false and non-nullptr was passed,
+ *                           the string contains a descriptive message about
+ *                           why GPUs cannot be detected.
   *
   * Does not throw. */
  GPU_FUNC_QUALIFIER
-bool canDetectGpus() GPU_FUNC_TERM_WITH_RETURN(false);
+bool canDetectGpus(std::string *GPU_FUNC_ARGUMENT(errorMessage)) GPU_FUNC_TERM_WITH_RETURN(false);
  
-/*! \brief Detect all GPUs in the system.
+/*! \brief Find all GPUs in the system.
   *
- *  Will detect every GPU supported by the device driver in use. If
- *  the device driver is missing or unsuitable, returns the same error
- *  as for "no valid devices detected," so generally calling code
- *  should have checked the return value from canDetectGpus() first,
- *  in order to understand the behaviour of this routine. This routine
+ *  Will detect every GPU supported by the device driver in use. Must
+ *  only be called if canDetectGpus() has returned true. This routine
   *  also checks for the compatibility of each and fill the
   *  gpu_info->gpu_dev array with the required information on each the
   *  device: ID, device properties, status.
   *
   *  \param[in] gpu_info    pointer to structure holding GPU information.
- *  \param[out] err_str    The error message of any GPU API error that caused
- *                         the detection to fail (if there was any). The memory
- *                         the pointer points to should be managed externally.
- *  \returns               non-zero if the detection encountered a failure, zero otherwise.
+ *
+ *  \throws                InternalError if a GPU API returns an unexpected failure (because
+ *                         the call to canDetectGpus() should always prevent this occuring)
   */
  GPU_FUNC_QUALIFIER
-int detect_gpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), char *GPU_FUNC_ARGUMENT(err_str)) GPU_FUNC_TERM_WITH_RETURN(-1)
+void findGpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
  
  /*! \brief Return a container of the detected GPUs that are compatible.
   *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp

index dc1293d32fc2ba7b7fe20a968698646fbe7fe75d..ee7683db84c7c7e8a3a9e945176525cf4c9f07a9 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -58,23 +58,11 @@
  #include "gromacs/hardware/hw_info.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/stringutil.h"
  
-/*! \brief Helper macro for error handling */
-#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
-        cl_int opencl_ret = func; \
-        if (CL_SUCCESS != opencl_ret) \
-        { \
-            sprintf(err_str, "OpenCL error %d", opencl_ret); \
-            retval = -1; \
-        } \
-        else{ \
-            retval = 0; } \
-}
-
-
  /*! \brief Return true if executing on compatible OS for AMD OpenCL.
   *
   * This is assumed to be true for OS X version of at least 10.10.4 and
@@ -163,7 +151,7 @@ static ocl_vendor_id_t get_vendor_id(char *vendor_name)
  
  
  //! This function is documented in the header file
-bool canDetectGpus()
+bool canDetectGpus(std::string *errorMessage)
  {
      cl_uint numPlatforms;
      cl_int  status       = clGetPlatformIDs(0, nullptr, &numPlatforms);
@@ -171,24 +159,30 @@ bool canDetectGpus()
      if (status == CL_PLATFORM_NOT_FOUND_KHR)
      {
          // No valid ICDs found
+        if (errorMessage != nullptr)
+        {
+            errorMessage->assign("No valid OpenCL driver found");
+        }
          return false;
      }
      GMX_RELEASE_ASSERT(status == CL_SUCCESS,
                         gmx::formatString("An unexpected value was returned from clGetPlatformIDs %u: %s",
                                           status, ocl_get_error_string(status).c_str()).c_str());
      bool foundPlatform = (numPlatforms > 0);
+    if (!foundPlatform && errorMessage != nullptr)
+    {
+        errorMessage->assign("No OpenCL platforms found even though the driver was valid");
+    }
      return foundPlatform;
  }
  
  //! This function is documented in the header file
-int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+void findGpus(gmx_gpu_info_t *gpu_info)
  {
-    int             retval;
      cl_uint         ocl_platform_count;
      cl_platform_id *ocl_platform_ids;
      cl_device_type  req_dev_type = CL_DEVICE_TYPE_GPU;
  
-    retval           = 0;
      ocl_platform_ids = NULL;
  
      if (getenv("GMX_OCL_FORCE_CPU") != NULL)
@@ -198,23 +192,26 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  
      while (1)
      {
-        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
-        if (0 != retval)
+        cl_int status = clGetPlatformIDs(0, NULL, &ocl_platform_count);
+        if (CL_SUCCESS != status)
          {
-            break;
+            GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %u was returned from clGetPlatformIDs: ",
+                                                           status) + ocl_get_error_string(status)));
          }
  
          if (1 > ocl_platform_count)
          {
+            // TODO this should have a descriptive error message that we only support one OpenCL platform
              break;
          }
  
          snew(ocl_platform_ids, ocl_platform_count);
  
-        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL), err_str, retval)
-        if (0 != retval)
+        status = clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL);
+        if (CL_SUCCESS != status)
          {
-            break;
+            GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %u was returned from clGetPlatformIDs: ",
+                                                           status) + ocl_get_error_string(status)));
          }
  
          for (unsigned int i = 0; i < ocl_platform_count; i++)
@@ -346,8 +343,6 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
      }
  
      sfree(ocl_platform_ids);
-
-    return retval;
  }
  
  //! This function is documented in the header file
diff --git a/src/gromacs/gpu_utils/tests/gputest.cpp b/src/gromacs/gpu_utils/tests/gputest.cpp

index 2cd80e1609a5391c8adc87c2a9d0851e173bb02a..03abbf8784524e8d40ff89ca87a1e6831429568f 100644 (file)
--- a/src/gromacs/gpu_utils/tests/gputest.cpp
+++ b/src/gromacs/gpu_utils/tests/gputest.cpp
@@ -46,7 +46,6 @@
  
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/hardware/gpu_hw_info.h"
-#include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/smalloc.h"
  
  namespace gmx
@@ -57,8 +56,11 @@ namespace test
  GpuTest::GpuTest()
  {
      snew(gpuInfo_, 1);
-    char errorString[STRLEN];
-    detect_gpus(gpuInfo_, errorString);
+    if (canDetectGpus(nullptr))
+    {
+        findGpus(gpuInfo_);
+    }
+    // Failing to find valid GPUs does not require further action
  }
  
  GpuTest::~GpuTest()
diff --git a/src/gromacs/gpu_utils/tests/hostallocator.cpp b/src/gromacs/gpu_utils/tests/hostallocator.cpp

index 809cce2b5a593292c82126e9493dc83f9943de32..53daa9cb980a2edc47b358416296ebfaf0877574 100644 (file)
--- a/src/gromacs/gpu_utils/tests/hostallocator.cpp
+++ b/src/gromacs/gpu_utils/tests/hostallocator.cpp
@@ -226,6 +226,11 @@ TYPED_TEST(HostAllocatorTest, FillInputAlsoWorksAfterCallingReserve)
  
  TYPED_TEST(HostAllocatorTest, TransfersWithPinningWorkWithCuda)
  {
+    if (!this->haveValidGpus())
+    {
+        return;
+    }
+
      typename TestFixture::VectorType input;
      changePinningPolicy(&input, PinningPolicy::CanBePinned);
      this->fillInput(&input);
@@ -246,6 +251,11 @@ bool isPinned(const VectorType &v)
  
  TYPED_TEST(HostAllocatorTest, ManualPinningOperationsWorkWithCuda)
  {
+    if (!this->haveValidGpus())
+    {
+        return;
+    }
+
      typename TestFixture::VectorType input;
      changePinningPolicy(&input, PinningPolicy::CanBePinned);
      EXPECT_FALSE(isPinned(input));
diff --git a/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp b/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp

index 0f34dab15fda23ba522351ee303e2feeaf8c298a..f6d1643a9e27b86a22cbe8099a5f66d1d5dbdf35 100644 (file)
--- a/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp
+++ b/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp
@@ -66,12 +66,22 @@ using PinnedMemoryCheckerTest = GpuTest;
  
  TEST_F(PinnedMemoryCheckerTest, DefaultContainerIsRecognized)
  {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
      std::vector<real> dummy(3, 1.5);
      EXPECT_FALSE(isHostMemoryPinned(dummy.data()));
  }
  
  TEST_F(PinnedMemoryCheckerTest, NonpinnedContainerIsRecognized)
  {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
      HostVector<real> dummy(3, 1.5);
      changePinningPolicy(&dummy, PinningPolicy::CannotBePinned);
      EXPECT_FALSE(isHostMemoryPinned(dummy.data()));
@@ -79,6 +89,11 @@ TEST_F(PinnedMemoryCheckerTest, NonpinnedContainerIsRecognized)
  
  TEST_F(PinnedMemoryCheckerTest, PinnedContainerIsRecognized)
  {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
      HostVector<real> dummy(3, 1.5);
      changePinningPolicy(&dummy, PinningPolicy::CanBePinned);
      EXPECT_TRUE(isHostMemoryPinned(dummy.data()));
@@ -86,6 +101,11 @@ TEST_F(PinnedMemoryCheckerTest, PinnedContainerIsRecognized)
  
  TEST_F(PinnedMemoryCheckerTest, DefaultCBufferIsRecognized)
  {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
      real *dummy;
      snew(dummy, 3);
      EXPECT_FALSE(isHostMemoryPinned(dummy));
@@ -94,6 +114,11 @@ TEST_F(PinnedMemoryCheckerTest, DefaultCBufferIsRecognized)
  
  TEST_F(PinnedMemoryCheckerTest, PinnedCBufferIsRecognized)
  {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
      real *dummy = nullptr;
      pmalloc((void **)&dummy, 3 * sizeof(real));
      EXPECT_TRUE(isHostMemoryPinned(dummy));
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp

index 015f3003ad2165cb9bf4797dab51ac6844da4f0e..f72237eab67ebfe9328168d1a4505161528b5bb7 100644 (file)
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -162,32 +162,25 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
      bool gpusCanBeDetected = false;
      if (isMasterRankOfNode || isOpenclPpRank)
      {
-        gpusCanBeDetected = canDetectGpus();
-        // No need to tell the user anything at this point, they get a
-        // hardware report later.
-    }
-
-    if (gpusCanBeDetected)
-    {
-        char detection_error[STRLEN] = "", sbuf[STRLEN];
-
-        if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+        std::string errorMessage;
+        gpusCanBeDetected = canDetectGpus(&errorMessage);
+        if (!gpusCanBeDetected)
          {
-            if (detection_error[0] != '\0')
-            {
-                sprintf(sbuf, ":\n      %s\n", detection_error);
-            }
-            else
-            {
-                sprintf(sbuf, ".");
-            }
              GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "NOTE: Error occurred during GPU detection%s"
+                    "NOTE: GPUs cannot be detected:\n"
+                    "      %s\n"
                      "      Can not use GPU acceleration, will fall back to CPU kernels.",
-                    sbuf);
+                    errorMessage.c_str());
          }
      }
  
+    if (gpusCanBeDetected)
+    {
+        findGpus(&hwinfo_g->gpu_info);
+        // No need to tell the user anything at this point, they get a
+        // hardware report later.
+    }
+
  #if GMX_LIB_MPI
      if (!isOpenclPpRank)
      {
diff --git a/src/programs/mdrun/tests/pmetest.cpp b/src/programs/mdrun/tests/pmetest.cpp

index 57ae23e152f422767b151c97c2ca13806a624c37..0c11b336b6df80b25ba03c11f83ee0dd5686de46 100644 (file)
--- a/src/programs/mdrun/tests/pmetest.cpp
+++ b/src/programs/mdrun/tests/pmetest.cpp
@@ -75,7 +75,9 @@ namespace test
  namespace
  {
  
-//! A basic PME runner
+/*! \brief A basic PME runner
+ *
+ * \todo Consider also using GpuTest class. */
  class PmeTest : public MdrunTestFixture
  {
      public:
@@ -90,16 +92,16 @@ bool PmeTest::s_hasCompatibleCudaGpus = false;
  void PmeTest::SetUpTestCase()
  {
      gmx_gpu_info_t gpuInfo {};
-    char           detection_error[STRLEN];
-    GMX_UNUSED_VALUE(detection_error); //TODO
      // It would be nicer to do this detection once and have mdrun
      // re-use it, but this is OK. Note that this also caters for when
      // there is no GPU support in the build.
+    //
+    // TODO report any error messages gracefully.
      if (GMX_GPU == GMX_GPU_CUDA &&
-        (detect_gpus(&gpuInfo, detection_error) >= 0) &&
-        gpuInfo.n_dev_compatible > 0)
+        canDetectGpus(nullptr))
      {
-        s_hasCompatibleCudaGpus = true;
+        findGpus(&gpuInfo);
+        s_hasCompatibleCudaGpus = (gpuInfo.n_dev_compatible > 0);
      }
      free_gpu_info(&gpuInfo);
  }
author	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 21 Dec 2017 10:56:59 +0000 (21:56 +1100)
committer	Erik Lindahl <erik.lindahl@gmail.com>
	Fri, 29 Dec 2017 11:08:51 +0000 (12:08 +0100)
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils_ocl.cpp		patch \| blob \| history
src/gromacs/gpu_utils/tests/gputest.cpp		patch \| blob \| history
src/gromacs/gpu_utils/tests/hostallocator.cpp		patch \| blob \| history
src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp		patch \| blob \| history
src/gromacs/hardware/detecthardware.cpp		patch \| blob \| history
src/programs/mdrun/tests/pmetest.cpp		patch \| blob \| history