Check for GPU detection support before detecting

author Mark Abraham <mark.j.abraham@gmail.com>

Sun, 10 Dec 2017 07:35:52 +0000 (18:35 +1100)

committer Erik Lindahl <erik.lindahl@gmail.com>

Mon, 11 Dec 2017 10:08:32 +0000 (11:08 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Sun, 10 Dec 2017 07:35:52 +0000 (18:35 +1100)
committer Erik Lindahl <erik.lindahl@gmail.com>
Mon, 11 Dec 2017 10:08:32 +0000 (11:08 +0100)
diff --git a/src/gromacs/gpu_utils/gmxopencl.h b/src/gromacs/gpu_utils/gmxopencl.h

index b9ccfa361d3973ea56d7b6bdf96950575675dc3f..5119d8985afbcbd791ffd9c2dbec4ceff1aa4b74 100644 (file)
--- a/src/gromacs/gpu_utils/gmxopencl.h
+++ b/src/gromacs/gpu_utils/gmxopencl.h
@@ -50,9 +50,13 @@
  #define GMX_GPU_UTILS_GMXOPENCL_H
  
  /*! \brief Declare to OpenCL SDKs that we intend to use OpenCL API
  #define GMX_GPU_UTILS_GMXOPENCL_H
  
  /*! \brief Declare to OpenCL SDKs that we intend to use OpenCL API
-   features that were deprecated in 2.0, so that they don't warn about
-   it. */
+   features that were deprecated in 1.2 or 2.0, so that they don't
+   warn about it. */
+///@{
+#  define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#  define CL_USE_DEPRECATED_OPENCL_1_2_APIS
  #  define CL_USE_DEPRECATED_OPENCL_2_0_APIS
  #  define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+///@}
  #  ifdef __APPLE__
  #    include <OpenCL/opencl.h>
  #  else
  #  ifdef __APPLE__
  #    include <OpenCL/opencl.h>
  #  else
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index e479f3e7435f95d2a95d5af7df87a37c14cd2a7e..5364ff64c0f7d60c782611981399da10cb1f560e 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -636,6 +636,18 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
      }
  }
  
      }
  }
  
+bool canDetectGpus()
+{
+    cudaError_t        stat;
+    int                driverVersion = -1;
+    stat = cudaDriverGetVersion(&driverVersion);
+    GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
+                                         cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
+    bool foundDriver = (driverVersion > 0);
+    return foundDriver;
+}
  
  int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
  
  int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h

index 88323f87629083e02e13e43b88ce6b2b1098bfe5..d7e724aedf022b7e10d80b414201028bf3b19247 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -74,12 +74,26 @@ enum class GpuTaskCompletion
      Check /*<< Only check whether the task has completed */
  };
  
      Check /*<< Only check whether the task has completed */
  };
  
+/*! \brief Return whether GPUs can be detected
+ *
+ * Returns true when this is a build of \Gromacs configured to support
+ * GPU usage, and a valid device driver or ICD was detected by the GPU
+ * runtime.
+ *
+ * Does not throw. */
+GPU_FUNC_QUALIFIER
+bool canDetectGpus() GPU_FUNC_TERM_WITH_RETURN(false);
+
  /*! \brief Detect all GPUs in the system.
   *
  /*! \brief Detect all GPUs in the system.
   *
- *  Will detect every GPU supported by the device driver in use. Also
- *  check for the compatibility of each and fill the gpu_info->gpu_dev array
- *  with the required information on each the device: ID, device properties,
- *  status.
+ *  Will detect every GPU supported by the device driver in use. If
+ *  the device driver is missing or unsuitable, returns the same error
+ *  as for "no valid devices detected," so generally calling code
+ *  should have checked the return value from canDetectGpus() first,
+ *  in order to understand the behaviour of this routine. This routine
+ *  also checks for the compatibility of each and fill the
+ *  gpu_info->gpu_dev array with the required information on each the
+ *  device: ID, device properties, status.
   *
   *  \param[in] gpu_info    pointer to structure holding GPU information.
   *  \param[out] err_str    The error message of any GPU API error that caused
   *
   *  \param[in] gpu_info    pointer to structure holding GPU information.
   *  \param[out] err_str    The error message of any GPU API error that caused
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp

index c121f813b3904ed3870f1307c4acbd9682e92fc3..88edc975a198e6ebdcd7079915966d69b150e7ea 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -60,6 +60,7 @@
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/stringutil.h"
  
  /*! \brief Helper macro for error handling */
  #define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
  
  /*! \brief Helper macro for error handling */
  #define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
@@ -161,6 +162,24 @@ static ocl_vendor_id_t get_vendor_id(char *vendor_name)
  }
  
  
  }
  
  
+//! This function is documented in the header file
+bool canDetectGpus()
+{
+    cl_uint numPlatforms = -1;
+    cl_int  status       = clGetPlatformIDs(0, nullptr, &numPlatforms);
+    GMX_ASSERT(status != CL_INVALID_VALUE, "Incorrect call of clGetPlatformIDs detected");
+    if (status == CL_PLATFORM_NOT_FOUND_KHR)
+    {
+        // No valid ICDs found
+        return false;
+    }
+    GMX_RELEASE_ASSERT(status == CL_SUCCESS,
+                       gmx::formatString("An unexpected value was returned from clGetPlatformIDs %u: %s",
+                                         status, ocl_get_error_string(status).c_str()).c_str());
+    bool foundPlatform = (numPlatforms > 0);
+    return foundPlatform;
+}
+
  //! This function is documented in the header file
  int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
  //! This function is documented in the header file
  int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp

index a8e9304730e628f4726ee28290bca917c6aca6ed..015f3003ad2165cb9bf4797dab51ac6844da4f0e 100644 (file)
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -114,7 +114,7 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
      int              rank_world;
      MPI_Comm         physicalnode_comm;
  #endif
      int              rank_world;
      MPI_Comm         physicalnode_comm;
  #endif
-    int              rank_local;
+    bool             isMasterRankOfNode;
  
      hwinfo_g->gpu_info.bDetectGPUs =
          (bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
  
      hwinfo_g->gpu_info.bDetectGPUs =
          (bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
@@ -142,20 +142,32 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
      MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
      MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
                     rank_world, &physicalnode_comm);
      MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
      MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
                     rank_world, &physicalnode_comm);
-    MPI_Comm_rank(physicalnode_comm, &rank_local);
+    {
+        int rankOnNode = -1;
+        MPI_Comm_rank(physicalnode_comm, &rankOnNode);
+        isMasterRankOfNode = (rankOnNode == 0);
+    }
      GMX_UNUSED_VALUE(cr);
  #else
      GMX_UNUSED_VALUE(cr);
  #else
-    /* Here there should be only one process, check this */
+    // Here there should be only one process, because if we are using
+    // thread-MPI, only one thread is active so far. So we check this.
      GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
      GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
-
-    rank_local = 0;
+    isMasterRankOfNode = true;
  #endif
  
      /*  With CUDA detect only on one rank per host, with OpenCL need do
       *  the detection on all PP ranks */
      bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasDuty(cr, DUTY_PP));
  
  #endif
  
      /*  With CUDA detect only on one rank per host, with OpenCL need do
       *  the detection on all PP ranks */
      bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasDuty(cr, DUTY_PP));
  
-    if (rank_local == 0 || isOpenclPpRank)
+    bool gpusCanBeDetected = false;
+    if (isMasterRankOfNode || isOpenclPpRank)
+    {
+        gpusCanBeDetected = canDetectGpus();
+        // No need to tell the user anything at this point, they get a
+        // hardware report later.
+    }
+
+    if (gpusCanBeDetected)
      {
          char detection_error[STRLEN] = "", sbuf[STRLEN];
  
      {
          char detection_error[STRLEN] = "", sbuf[STRLEN];
  
@@ -188,7 +200,7 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog, const t_commrec *cr)
  
              dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
  
  
              dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
  
-            if (rank_local > 0)
+            if (!isMasterRankOfNode)
              {
                  hwinfo_g->gpu_info.gpu_dev =
                      (struct gmx_device_info_t *)malloc(dev_size);
              {
                  hwinfo_g->gpu_info.gpu_dev =
                      (struct gmx_device_info_t *)malloc(dev_size);
diff --git a/src/gromacs/hardware/printhardware.cpp b/src/gromacs/hardware/printhardware.cpp

index 17b120765bfb7795b35564dfb83a0801ae7f1b7b..76c9830c1eb1191a51b5b4f0f5deb2cc2959710b 100644 (file)
--- a/src/gromacs/hardware/printhardware.cpp
+++ b/src/gromacs/hardware/printhardware.cpp
@@ -344,16 +344,12 @@ static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
          }
      }
  
          }
      }
  
-    if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
-                       hwinfo->gpu_info.n_dev > 0))
+    if (bGPUBinary && hwinfo->gpu_info.n_dev > 0)
      {
          s += gmx::formatString("  GPU info:\n");
          s += gmx::formatString("    Number of GPUs detected: %d\n",
                                 hwinfo->gpu_info.n_dev);
      {
          s += gmx::formatString("  GPU info:\n");
          s += gmx::formatString("    Number of GPUs detected: %d\n",
                                 hwinfo->gpu_info.n_dev);
-        if (hwinfo->gpu_info.n_dev > 0)
-        {
-            s += sprint_gpus(hwinfo->gpu_info) + "\n";
-        }
+        s += sprint_gpus(hwinfo->gpu_info) + "\n";
      }
      return s;
  }
      }
      return s;
  }
author	Mark Abraham <mark.j.abraham@gmail.com>
	Sun, 10 Dec 2017 07:35:52 +0000 (18:35 +1100)
committer	Erik Lindahl <erik.lindahl@gmail.com>
	Mon, 11 Dec 2017 10:08:32 +0000 (11:08 +0100)
src/gromacs/gpu_utils/gmxopencl.h		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils_ocl.cpp		patch \| blob \| history
src/gromacs/hardware/detecthardware.cpp		patch \| blob \| history
src/gromacs/hardware/printhardware.cpp		patch \| blob \| history