When a CUDA-enabled binary was run on a node with no CUDA driver
available, a note was issued that the version of the CUDA driver is
insufficient, which was wrong.
Fixed this by separating detection of a valid CUDA driver (or OpenCL
platform) from enumerating the compatible devices. This permits a
GPU-enabled build configuration to gracefully degrade to the same
behaviour as a CPU-only build configuration.
Also suppressed more warnings about use of OpenCL API elements that
have been deprecated but which we intend to contiune to use
regardless.
Also fixed confusing name of rank_local, and replaced it with a
boolean that cleanly describes the required functionality.
Also fixed and simplified logic of printing the GPU report. The
implementation only prints details about the node of the master rank,
so there is no value in checking a variable that reflects the number
of GPUs detected across all nodes.
Fixes #2322
Change-Id: I831d3c0017dafc00f7bb82e3f71be5b122657d1e
#define GMX_GPU_UTILS_GMXOPENCL_H
/*! \brief Declare to OpenCL SDKs that we intend to use OpenCL API
#define GMX_GPU_UTILS_GMXOPENCL_H
/*! \brief Declare to OpenCL SDKs that we intend to use OpenCL API
- features that were deprecated in 2.0, so that they don't warn about
- it. */
+ features that were deprecated in 1.2 or 2.0, so that they don't
+ warn about it. */
+///@{
+# define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+# define CL_USE_DEPRECATED_OPENCL_1_2_APIS
# define CL_USE_DEPRECATED_OPENCL_2_0_APIS
# define CL_USE_DEPRECATED_OPENCL_2_0_APIS
# ifdef __APPLE__
# include <OpenCL/opencl.h>
# else
# ifdef __APPLE__
# include <OpenCL/opencl.h>
# else
+bool canDetectGpus()
+{
+ cudaError_t stat;
+ int driverVersion = -1;
+ stat = cudaDriverGetVersion(&driverVersion);
+ GMX_ASSERT(stat != cudaErrorInvalidValue, "An impossible null pointer was passed to cudaDriverGetVersion");
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
+ cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
+ bool foundDriver = (driverVersion > 0);
+ return foundDriver;
+}
int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
{
int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
{
Check /*<< Only check whether the task has completed */
};
Check /*<< Only check whether the task has completed */
};
+/*! \brief Return whether GPUs can be detected
+ *
+ * Returns true when this is a build of \Gromacs configured to support
+ * GPU usage, and a valid device driver or ICD was detected by the GPU
+ * runtime.
+ *
+ * Does not throw. */
+GPU_FUNC_QUALIFIER
+bool canDetectGpus() GPU_FUNC_TERM_WITH_RETURN(false);
+
/*! \brief Detect all GPUs in the system.
*
/*! \brief Detect all GPUs in the system.
*
- * Will detect every GPU supported by the device driver in use. Also
- * check for the compatibility of each and fill the gpu_info->gpu_dev array
- * with the required information on each the device: ID, device properties,
- * status.
+ * Will detect every GPU supported by the device driver in use. If
+ * the device driver is missing or unsuitable, returns the same error
+ * as for "no valid devices detected," so generally calling code
+ * should have checked the return value from canDetectGpus() first,
+ * in order to understand the behaviour of this routine. This routine
+ * also checks for the compatibility of each and fill the
+ * gpu_info->gpu_dev array with the required information on each the
+ * device: ID, device properties, status.
*
* \param[in] gpu_info pointer to structure holding GPU information.
* \param[out] err_str The error message of any GPU API error that caused
*
* \param[in] gpu_info pointer to structure holding GPU information.
* \param[out] err_str The error message of any GPU API error that caused
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/stringutil.h"
/*! \brief Helper macro for error handling */
#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
/*! \brief Helper macro for error handling */
#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
+//! This function is documented in the header file
+bool canDetectGpus()
+{
+ cl_uint numPlatforms = -1;
+ cl_int status = clGetPlatformIDs(0, nullptr, &numPlatforms);
+ GMX_ASSERT(status != CL_INVALID_VALUE, "Incorrect call of clGetPlatformIDs detected");
+ if (status == CL_PLATFORM_NOT_FOUND_KHR)
+ {
+ // No valid ICDs found
+ return false;
+ }
+ GMX_RELEASE_ASSERT(status == CL_SUCCESS,
+ gmx::formatString("An unexpected value was returned from clGetPlatformIDs %u: %s",
+ status, ocl_get_error_string(status).c_str()).c_str());
+ bool foundPlatform = (numPlatforms > 0);
+ return foundPlatform;
+}
+
//! This function is documented in the header file
int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
{
//! This function is documented in the header file
int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
{
int rank_world;
MPI_Comm physicalnode_comm;
#endif
int rank_world;
MPI_Comm physicalnode_comm;
#endif
+ bool isMasterRankOfNode;
hwinfo_g->gpu_info.bDetectGPUs =
(bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
hwinfo_g->gpu_info.bDetectGPUs =
(bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
rank_world, &physicalnode_comm);
MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
rank_world, &physicalnode_comm);
- MPI_Comm_rank(physicalnode_comm, &rank_local);
+ {
+ int rankOnNode = -1;
+ MPI_Comm_rank(physicalnode_comm, &rankOnNode);
+ isMasterRankOfNode = (rankOnNode == 0);
+ }
GMX_UNUSED_VALUE(cr);
#else
GMX_UNUSED_VALUE(cr);
#else
- /* Here there should be only one process, check this */
+ // Here there should be only one process, because if we are using
+ // thread-MPI, only one thread is active so far. So we check this.
GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
GMX_RELEASE_ASSERT(cr->nnodes == 1 && cr->sim_nodeid == 0, "Only a single (master) process should execute here");
+ isMasterRankOfNode = true;
#endif
/* With CUDA detect only on one rank per host, with OpenCL need do
* the detection on all PP ranks */
bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasDuty(cr, DUTY_PP));
#endif
/* With CUDA detect only on one rank per host, with OpenCL need do
* the detection on all PP ranks */
bool isOpenclPpRank = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasDuty(cr, DUTY_PP));
- if (rank_local == 0 || isOpenclPpRank)
+ bool gpusCanBeDetected = false;
+ if (isMasterRankOfNode || isOpenclPpRank)
+ {
+ gpusCanBeDetected = canDetectGpus();
+ // No need to tell the user anything at this point, they get a
+ // hardware report later.
+ }
+
+ if (gpusCanBeDetected)
{
char detection_error[STRLEN] = "", sbuf[STRLEN];
{
char detection_error[STRLEN] = "", sbuf[STRLEN];
dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
+ if (!isMasterRankOfNode)
{
hwinfo_g->gpu_info.gpu_dev =
(struct gmx_device_info_t *)malloc(dev_size);
{
hwinfo_g->gpu_info.gpu_dev =
(struct gmx_device_info_t *)malloc(dev_size);
- if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
- hwinfo->gpu_info.n_dev > 0))
+ if (bGPUBinary && hwinfo->gpu_info.n_dev > 0)
{
s += gmx::formatString(" GPU info:\n");
s += gmx::formatString(" Number of GPUs detected: %d\n",
hwinfo->gpu_info.n_dev);
{
s += gmx::formatString(" GPU info:\n");
s += gmx::formatString(" Number of GPUs detected: %d\n",
hwinfo->gpu_info.n_dev);
- if (hwinfo->gpu_info.n_dev > 0)
- {
- s += sprint_gpus(hwinfo->gpu_info) + "\n";
- }
+ s += sprint_gpus(hwinfo->gpu_info) + "\n";