From 6803c6583682616e7e9018e38dac834bbc0f2c6f Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Thu, 21 Dec 2017 21:56:59 +1100
Subject: [PATCH] Separate canDetectGpus and findGpus futher, and fix tests

Renamed detect_gpus to findGpus so that no code can silently call
detect_gpus while forgetting to call the required canDetectGpus first.
Some test code is updated accordingly, which should have happened
earlier. The function with the new name now needs no return value, so
the formerly confusing return value of zero for success is no longer
present.

Shifted some more responsibilities from findGpus to canDetectGpus, so
that the latter now has responsibility for ensuring that when it
returns true, the former will always succeed.

Fixed tests that compile with CUDA, but cannot run unless there
are visible comatible devices and a valid context.

Refs #2347, #2322, #2321

Change-Id: I34acf8be4c0f0dcc29e931d83c970ba945865ca7
---
 src/gromacs/gpu_utils/gpu_utils.cu            | 90 ++++++++++++-------
 src/gromacs/gpu_utils/gpu_utils.h             | 28 +++---
 src/gromacs/gpu_utils/gpu_utils_ocl.cpp       | 45 +++++-----
 src/gromacs/gpu_utils/tests/gputest.cpp       |  8 +-
 src/gromacs/gpu_utils/tests/hostallocator.cpp | 10 +++
 .../gpu_utils/tests/pinnedmemorychecker.cpp   | 25 ++++++
 src/gromacs/hardware/detecthardware.cpp       | 33 +++----
 src/programs/mdrun/tests/pmetest.cpp          | 14 +--
 8 files changed, 152 insertions(+), 101 deletions(-)

diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 8aa59a45e5..9ca379ee96 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -55,6 +55,7 @@
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/logger.h"
@@ -644,7 +645,7 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
     }
 }
 
-bool canDetectGpus()
+bool canDetectGpus(std::string *errorMessage)
 {
     cudaError_t        stat;
     int                driverVersion = -1;
@@ -654,18 +655,56 @@ bool canDetectGpus()
                        gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
                                          cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
     bool foundDriver = (driverVersion > 0);
-    return foundDriver;
+    if (!foundDriver)
+    {
+        // Can't detect GPUs if there is no driver
+        if (errorMessage != nullptr)
+        {
+            errorMessage->assign("No valid CUDA driver found");
+        }
+        return false;
+    }
+
+    int numDevices;
+    stat = cudaGetDeviceCount(&numDevices);
+    if (stat != cudaSuccess)
+    {
+        if (errorMessage != nullptr)
+        {
+            /* cudaGetDeviceCount failed which means that there is
+             * something wrong with the machine: driver-runtime
+             * mismatch, all GPUs being busy in exclusive mode,
+             * invalid CUDA_VISIBLE_DEVICES, or some other condition
+             * which should result in GROMACS issuing a warning a
+             * falling back to CPUs. */
+            errorMessage->assign(cudaGetErrorString(stat));
+        }
+
+        // Consume the error now that we have prepared to handle
+        // it. This stops it reappearing next time we check for
+        // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
+        // valid devices, then cudaGetLastError returns the
+        // (undocumented) cudaErrorNoDevice, but this should not be a
+        // problem as there should be no future CUDA API calls.
+        // NVIDIA bug report #2038718 has been filed.
+        cudaGetLastError();
+        // Can't detect GPUs
+        return false;
+    }
+
+    // We don't actually use numDevices here, that's not the job of
+    // this function.
+    return true;
 }
 
-int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+void findGpus(gmx_gpu_info_t *gpu_info)
 {
-    int                i, ndev, checkres, retval;
+    int                i, ndev, checkres;
     cudaError_t        stat;
     cudaDeviceProp     prop;
     gmx_device_info_t *devs;
 
     assert(gpu_info);
-    assert(err_str);
 
     gpu_info->n_dev_compatible = 0;
 
@@ -675,43 +714,28 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
     stat = cudaGetDeviceCount(&ndev);
     if (stat != cudaSuccess)
     {
-        const char *s;
-
-        /* cudaGetDeviceCount failed which means that there is something
-         * wrong with the machine: driver-runtime mismatch, all GPUs being
-         * busy in exclusive mode, or some other condition which should
-         * result in us issuing a warning a falling back to CPUs. */
-        retval = -1;
-        s      = cudaGetErrorString(stat);
-        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
-
-        // Consume the error now that we have prepared to handle
-        // it. This stops it reappearing next time we check for errors.
-        cudaGetLastError();
+        GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
+                                     "canDetectGpus() was not called appropriately beforehand."));
     }
-    else
+
+    snew(devs, ndev);
+    for (i = 0; i < ndev; i++)
     {
-        snew(devs, ndev);
-        for (i = 0; i < ndev; i++)
-        {
-            checkres = is_gmx_supported_gpu_id(i, &prop);
+        checkres = is_gmx_supported_gpu_id(i, &prop);
 
-            devs[i].id   = i;
-            devs[i].prop = prop;
-            devs[i].stat = checkres;
+        devs[i].id   = i;
+        devs[i].prop = prop;
+        devs[i].stat = checkres;
 
-            if (checkres == egpuCompatible)
-            {
-                gpu_info->n_dev_compatible++;
-            }
+        if (checkres == egpuCompatible)
+        {
+            gpu_info->n_dev_compatible++;
         }
-        retval = 0;
     }
+    GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess");
 
     gpu_info->n_dev   = ndev;
     gpu_info->gpu_dev = devs;
-
-    return retval;
 }
 
 std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index d7e724aedf..1a5ced2c0b 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -47,6 +47,7 @@
 
 #include <cstdio>
 
+#include <string>
 #include <vector>
 
 #include "gromacs/gpu_utils/gpu_macros.h"
@@ -77,32 +78,31 @@ enum class GpuTaskCompletion
 /*! \brief Return whether GPUs can be detected
  *
  * Returns true when this is a build of \Gromacs configured to support
- * GPU usage, and a valid device driver or ICD was detected by the GPU
- * runtime.
+ * GPU usage, and a valid device driver, ICD, and/or runtime was detected.
+ *
+ * \param[out] errorMessage  When returning false and non-nullptr was passed,
+ *                           the string contains a descriptive message about
+ *                           why GPUs cannot be detected.
  *
  * Does not throw. */
 GPU_FUNC_QUALIFIER
-bool canDetectGpus() GPU_FUNC_TERM_WITH_RETURN(false);
+bool canDetectGpus(std::string *GPU_FUNC_ARGUMENT(errorMessage)) GPU_FUNC_TERM_WITH_RETURN(false);
 
-/*! \brief Detect all GPUs in the system.
+/*! \brief Find all GPUs in the system.
  *
- *  Will detect every GPU supported by the device driver in use. If
- *  the device driver is missing or unsuitable, returns the same error
- *  as for "no valid devices detected," so generally calling code
- *  should have checked the return value from canDetectGpus() first,
- *  in order to understand the behaviour of this routine. This routine
+ *  Will detect every GPU supported by the device driver in use. Must
+ *  only be called if canDetectGpus() has returned true. This routine
  *  also checks for the compatibility of each and fill the
  *  gpu_info->gpu_dev array with the required information on each the
  *  device: ID, device properties, status.
  *
  *  \param[in] gpu_info    pointer to structure holding GPU information.
- *  \param[out] err_str    The error message of any GPU API error that caused
- *                         the detection to fail (if there was any). The memory
- *                         the pointer points to should be managed externally.
- *  \returns               non-zero if the detection encountered a failure, zero otherwise.
+ *
+ *  \throws                InternalError if a GPU API returns an unexpected failure (because
+ *                         the call to canDetectGpus() should always prevent this occuring)
  */
 GPU_FUNC_QUALIFIER
-int detect_gpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), char *GPU_FUNC_ARGUMENT(err_str)) GPU_FUNC_TERM_WITH_RETURN(-1)
+void findGpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
 
 /*! \brief Return a container of the detected GPUs that are compatible.
  *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index dc1293d32f..ee7683db84 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -58,23 +58,11 @@
 #include "gromacs/hardware/hw_info.h"
 #include "gromacs/mdtypes/md_enums.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/stringutil.h"
 
-/*! \brief Helper macro for error handling */
-#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
-        cl_int opencl_ret = func; \
-        if (CL_SUCCESS != opencl_ret) \
-        { \
-            sprintf(err_str, "OpenCL error %d", opencl_ret); \
-            retval = -1; \
-        } \
-        else{ \
-            retval = 0; } \
-}
-
-
 /*! \brief Return true if executing on compatible OS for AMD OpenCL.
  *
  * This is assumed to be true for OS X version of at least 10.10.4 and
@@ -163,7 +151,7 @@ static ocl_vendor_id_t get_vendor_id(char *vendor_name)
 
 
 //! This function is documented in the header file
-bool canDetectGpus()
+bool canDetectGpus(std::string *errorMessage)
 {
     cl_uint numPlatforms;
     cl_int  status       = clGetPlatformIDs(0, nullptr, &numPlatforms);
@@ -171,24 +159,30 @@ bool canDetectGpus()
     if (status == CL_PLATFORM_NOT_FOUND_KHR)
     {
         // No valid ICDs found
+        if (errorMessage != nullptr)
+        {
+            errorMessage->assign("No valid OpenCL driver found");
+        }
         return false;
     }
     GMX_RELEASE_ASSERT(status == CL_SUCCESS,
                        gmx::formatString("An unexpected value was returned from clGetPlatformIDs %u: %s",
                                          status, ocl_get_error_string(status).c_str()).c_str());
     bool foundPlatform = (numPlatforms > 0);
+    if (!foundPlatform && errorMessage != nullptr)
+    {
+        errorMessage->assign("No OpenCL platforms found even though the driver was valid");
+    }
     return foundPlatform;
 }
 
 //! This function is documented in the header file
-int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+void findGpus(gmx_gpu_info_t *gpu_info)
 {
-    int             retval;
     cl_uint         ocl_platform_count;
     cl_platform_id *ocl_platform_ids;
     cl_device_type  req_dev_type = CL_DEVICE_TYPE_GPU;
 
-    retval           = 0;
     ocl_platform_ids = NULL;
 
     if (getenv("GMX_OCL_FORCE_CPU") != NULL)
@@ -198,23 +192,26 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 
     while (1)
     {
-        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
-        if (0 != retval)
+        cl_int status = clGetPlatformIDs(0, NULL, &ocl_platform_count);
+        if (CL_SUCCESS != status)
         {
-            break;
+            GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %u was returned from clGetPlatformIDs: ",
+                                                           status) + ocl_get_error_string(status)));
         }
 
         if (1 > ocl_platform_count)
         {
+            // TODO this should have a descriptive error message that we only support one OpenCL platform
             break;
         }
 
         snew(ocl_platform_ids, ocl_platform_count);
 
-        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL), err_str, retval)
-        if (0 != retval)
+        status = clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL);
+        if (CL_SUCCESS != status)
         {
-            break;
+            GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %u was returned from clGetPlatformIDs: ",
+                                                           status) + ocl_get_error_string(status)));
         }
 
         for (unsigned int i = 0; i < ocl_platform_count; i++)
@@ -346,8 +343,6 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
     }
 
     sfree(ocl_platform_ids);
-
-    return retval;
 }
 
 //! This function is documented in the header file
diff --git a/src/gromacs/gpu_utils/tests/gputest.cpp b/src/gromacs/gpu_utils/tests/gputest.cpp
index 2cd80e1609..03abbf8784 100644
--- a/src/gromacs/gpu_utils/tests/gputest.cpp
+++ b/src/gromacs/gpu_utils/tests/gputest.cpp
@@ -46,7 +46,6 @@
 
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/gpu_hw_info.h"
-#include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/smalloc.h"
 
 namespace gmx
@@ -57,8 +56,11 @@ namespace test
 GpuTest::GpuTest()
 {
     snew(gpuInfo_, 1);
-    char errorString[STRLEN];
-    detect_gpus(gpuInfo_, errorString);
+    if (canDetectGpus(nullptr))
+    {
+        findGpus(gpuInfo_);
+    }
+    // Failing to find valid GPUs does not require further action
 }
 
 GpuTest::~GpuTest()
diff --git a/src/gromacs/gpu_utils/tests/hostallocator.cpp b/src/gromacs/gpu_utils/tests/hostallocator.cpp
index 809cce2b5a..53daa9cb98 100644
--- a/src/gromacs/gpu_utils/tests/hostallocator.cpp
+++ b/src/gromacs/gpu_utils/tests/hostallocator.cpp
@@ -226,6 +226,11 @@ TYPED_TEST(HostAllocatorTest, FillInputAlsoWorksAfterCallingReserve)
 
 TYPED_TEST(HostAllocatorTest, TransfersWithPinningWorkWithCuda)
 {
+    if (!this->haveValidGpus())
+    {
+        return;
+    }
+
     typename TestFixture::VectorType input;
     changePinningPolicy(&input, PinningPolicy::CanBePinned);
     this->fillInput(&input);
@@ -246,6 +251,11 @@ bool isPinned(const VectorType &v)
 
 TYPED_TEST(HostAllocatorTest, ManualPinningOperationsWorkWithCuda)
 {
+    if (!this->haveValidGpus())
+    {
+        return;
+    }
+
     typename TestFixture::VectorType input;
     changePinningPolicy(&input, PinningPolicy::CanBePinned);
     EXPECT_FALSE(isPinned(input));
diff --git a/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp b/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp
index 0f34dab15f..f6d1643a9e 100644
--- a/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp
+++ b/src/gromacs/gpu_utils/tests/pinnedmemorychecker.cpp
@@ -66,12 +66,22 @@ using PinnedMemoryCheckerTest = GpuTest;
 
 TEST_F(PinnedMemoryCheckerTest, DefaultContainerIsRecognized)
 {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
     std::vector<real> dummy(3, 1.5);
     EXPECT_FALSE(isHostMemoryPinned(dummy.data()));
 }
 
 TEST_F(PinnedMemoryCheckerTest, NonpinnedContainerIsRecognized)
 {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
     HostVector<real> dummy(3, 1.5);
     changePinningPolicy(&dummy, PinningPolicy::CannotBePinned);
     EXPECT_FALSE(isHostMemoryPinned(dummy.data()));
@@ -79,6 +89,11 @@ TEST_F(PinnedMemoryCheckerTest, NonpinnedContainerIsRecognized)
 
 TEST_F(PinnedMemoryCheckerTest, PinnedContainerIsRecognized)
 {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
     HostVector<real> dummy(3, 1.5);
     changePinningPolicy(&dummy, PinningPolicy::CanBePinned);
     EXPECT_TRUE(isHostMemoryPinned(dummy.data()));
@@ -86,6 +101,11 @@ TEST_F(PinnedMemoryCheckerTest, PinnedContainerIsRecognized)
 
 TEST_F(PinnedMemoryCheckerTest, DefaultCBufferIsRecognized)
 {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
     real *dummy;
     snew(dummy, 3);
     EXPECT_FALSE(isHostMemoryPinned(dummy));
@@ -94,6 +114,11 @@ TEST_F(PinnedMemoryCheckerTest, DefaultCBufferIsRecognized)
 
 TEST_F(PinnedMemoryCheckerTest, PinnedCBufferIsRecognized)
 {
+    if (!haveValidGpus())
+    {
+        return;
+    }
+
     real *dummy = nullptr;
     pmalloc((void **)&dummy, 3 * sizeof(real));
     EXPECT_TRUE(isHostMemoryPinned(dummy));
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp
index 015f3003ad..f72237eab6 100644
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -162,32 +162,25 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
     bool gpusCanBeDetected = false;
     if (isMasterRankOfNode || isOpenclPpRank)
     {
-        gpusCanBeDetected = canDetectGpus();
-        // No need to tell the user anything at this point, they get a
-        // hardware report later.
-    }
-
-    if (gpusCanBeDetected)
-    {
-        char detection_error[STRLEN] = "", sbuf[STRLEN];
-
-        if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+        std::string errorMessage;
+        gpusCanBeDetected = canDetectGpus(&errorMessage);
+        if (!gpusCanBeDetected)
         {
-            if (detection_error[0] != '\0')
-            {
-                sprintf(sbuf, ":\n      %s\n", detection_error);
-            }
-            else
-            {
-                sprintf(sbuf, ".");
-            }
             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "NOTE: Error occurred during GPU detection%s"
+                    "NOTE: GPUs cannot be detected:\n"
+                    "      %s\n"
                     "      Can not use GPU acceleration, will fall back to CPU kernels.",
-                    sbuf);
+                    errorMessage.c_str());
         }
     }
 
+    if (gpusCanBeDetected)
+    {
+        findGpus(&hwinfo_g->gpu_info);
+        // No need to tell the user anything at this point, they get a
+        // hardware report later.
+    }
+
 #if GMX_LIB_MPI
     if (!isOpenclPpRank)
     {
diff --git a/src/programs/mdrun/tests/pmetest.cpp b/src/programs/mdrun/tests/pmetest.cpp
index 57ae23e152..0c11b336b6 100644
--- a/src/programs/mdrun/tests/pmetest.cpp
+++ b/src/programs/mdrun/tests/pmetest.cpp
@@ -75,7 +75,9 @@ namespace test
 namespace
 {
 
-//! A basic PME runner
+/*! \brief A basic PME runner
+ *
+ * \todo Consider also using GpuTest class. */
 class PmeTest : public MdrunTestFixture
 {
     public:
@@ -90,16 +92,16 @@ bool PmeTest::s_hasCompatibleCudaGpus = false;
 void PmeTest::SetUpTestCase()
 {
     gmx_gpu_info_t gpuInfo {};
-    char           detection_error[STRLEN];
-    GMX_UNUSED_VALUE(detection_error); //TODO
     // It would be nicer to do this detection once and have mdrun
     // re-use it, but this is OK. Note that this also caters for when
     // there is no GPU support in the build.
+    //
+    // TODO report any error messages gracefully.
     if (GMX_GPU == GMX_GPU_CUDA &&
-        (detect_gpus(&gpuInfo, detection_error) >= 0) &&
-        gpuInfo.n_dev_compatible > 0)
+        canDetectGpus(nullptr))
     {
-        s_hasCompatibleCudaGpus = true;
+        findGpus(&gpuInfo);
+        s_hasCompatibleCudaGpus = (gpuInfo.n_dev_compatible > 0);
     }
     free_gpu_info(&gpuInfo);
 }
-- 
2.22.0