1. Make the array with device status names static.
2. Expilcitely list integers in enumeration.
}
#if GMX_GPU == GMX_GPU_NONE
-int gpu_info_get_stat(const gmx_gpu_info_t& /*unused*/, int /*unused*/)
+DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& /*unused*/, int /*unused*/)
{
- return egpuNonexistent;
+ return DeviceStatus::Nonexistent;
}
#endif
for (int i = 0; i < gpu_info.n_dev; i++)
{
assert(gpu_info.deviceInfo);
- if (gpu_info_get_stat(gpu_info, i) == egpuCompatible)
+ if (gpu_info_get_stat(gpu_info, i) == DeviceStatus::Compatible)
{
compatibleGpus.push_back(i);
}
const char* getGpuCompatibilityDescription(const gmx_gpu_info_t& gpu_info, int index)
{
- return (index >= gpu_info.n_dev ? gpu_detect_res_str[egpuNonexistent]
- : gpu_detect_res_str[gpu_info_get_stat(gpu_info, index)]);
+ return (index >= gpu_info.n_dev ? c_deviceStateString[DeviceStatus::Nonexistent]
+ : c_deviceStateString[gpu_info_get_stat(gpu_info, index)]);
}
/*! \brief Help build a descriptive message in \c error if there are
* \c errorReasons why nonbondeds on a GPU are not supported.
* Runs a series of checks to determine that the given GPU and underlying CUDA
* driver/runtime functions properly.
*
+ * \todo Currently we do not make a distinction between the type of errors
+ * that can appear during functionality checks. This needs to be improved,
+ * e.g if the dummy test kernel fails to execute with a "device busy message"
+ * we should appropriately report that the device is busy instead of NonFunctional.
+ *
+ * \todo Introduce errors codes and handle errors more smoothly.
+ *
+ *
* \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
* \param[in] dev_prop The device properties structure
* \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
- *
- * TODO: introduce errors codes and handle errors more smoothly.
*/
-static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
+static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
{
cudaError_t cu_err;
int dev_count, id;
if (cu_err != cudaSuccess)
{
fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
- return -1;
+ return DeviceStatus::NonFunctional;
}
/* no CUDA compatible device at all */
if (dev_count == 0)
{
- return -1;
+ return DeviceStatus::NonFunctional;
}
/* things might go horribly wrong if cudart is not compatible with the driver */
if (dev_count < 0 || dev_count > cuda_max_device_count)
{
- return -1;
+ return DeviceStatus::NonFunctional;
}
if (dev_id == -1) /* device already selected let's not destroy the context */
if (cu_err != cudaSuccess)
{
fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
- return -1;
+ return DeviceStatus::NonFunctional;
}
}
else
fprintf(stderr,
"The requested device with id %d does not seem to exist (device count=%d)\n",
dev_id, dev_count);
- return -1;
+ return DeviceStatus::NonFunctional;
}
}
/* both major & minor is 9999 if no CUDA capable devices are present */
if (dev_prop.major == 9999 && dev_prop.minor == 9999)
{
- return -1;
+ return DeviceStatus::NonFunctional;
}
/* we don't care about emulation mode */
if (dev_prop.major == 0)
{
- return -1;
+ return DeviceStatus::NonFunctional;
}
if (id != -1)
{
fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
cudaGetErrorString(cu_err));
- return -1;
+ return DeviceStatus::NonFunctional;
}
}
// if we encounter it that will happen in cudaFuncGetAttributes in the above function.
if (cu_err == cudaErrorDevicesUnavailable)
{
- return -2;
+ return DeviceStatus::Unavailable;
}
else if (cu_err != cudaSuccess)
{
- return -1;
+ return DeviceStatus::NonFunctional;
}
/* try to execute a dummy kernel */
fprintf(stderr,
"Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
formatExceptionMessageToString(ex).c_str());
- return -1;
+ return DeviceStatus::NonFunctional;
}
if (cudaDeviceSynchronize() != cudaSuccess)
{
- return -1;
+ return DeviceStatus::NonFunctional;
}
/* destroy context if we created one */
CU_RET_ERR(cu_err, "cudaDeviceReset failed");
}
- return 0;
+ return DeviceStatus::Compatible;
}
void init_gpu(const DeviceInformation* deviceInfo)
* \param[in] deviceProp the CUDA device properties of the device checked.
* \returns the status of the requested device
*/
-static int is_gmx_supported_gpu_id(int deviceId, const cudaDeviceProp& deviceProp)
+static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
{
if (!is_gmx_supported_gpu(deviceProp))
{
- return egpuIncompatible;
- }
-
- /* TODO: currently we do not make a distinction between the type of errors
- * that can appear during sanity checks. This needs to be improved, e.g if
- * the dummy test kernel fails to execute with a "device busy message" we
- * should appropriately report that the device is busy instead of insane.
- */
- const int checkResult = do_sanity_checks(deviceId, deviceProp);
- switch (checkResult)
- {
- case 0: return egpuCompatible;
- case -1: return egpuInsane;
- case -2: return egpuUnavailable;
- default:
- GMX_RELEASE_ASSERT(false, "Invalid do_sanity_checks() return value");
- return egpuCompatible;
+ return DeviceStatus::Incompatible;
}
+ return isDeviceFunctional(deviceId, deviceProp);
}
bool isGpuDetectionFunctional(std::string* errorMessage)
cudaDeviceProp prop;
memset(&prop, 0, sizeof(cudaDeviceProp));
stat = cudaGetDeviceProperties(&prop, i);
- int checkResult;
- if (stat != cudaSuccess)
- {
- // Will handle the error reporting below
- checkResult = egpuInsane;
- }
- else
- {
- checkResult = is_gmx_supported_gpu_id(i, prop);
- }
+ const DeviceStatus checkResult =
+ (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
devs[i].id = i;
devs[i].prop = prop;
devs[i].stat = checkResult;
- if (checkResult == egpuCompatible)
+ if (checkResult == DeviceStatus::Compatible)
{
gpu_info->n_dev_compatible++;
}
DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
- bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
+ bool bGpuExists =
+ (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
if (!bGpuExists)
{
- sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", gpu_detect_res_str[dinfo->stat]);
+ sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]);
}
else
{
sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
- dinfo->prop.ECCEnabled ? "yes" : " no", gpu_detect_res_str[dinfo->stat]);
+ dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]);
}
}
}
}
-int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
+DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
{
return info.deviceInfo[index].stat;
}
#include "gromacs/utility/basedefinitions.h"
struct DeviceInformation;
+enum class DeviceStatus : int;
struct gmx_gpu_info_t;
namespace gmx
size_t sizeof_gpu_dev_info() GPU_FUNC_TERM_WITH_RETURN(0);
//! Get status of device with specified index
-int gpu_info_get_stat(const gmx_gpu_info_t& info, int index);
+DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index);
/*! \brief Check if GROMACS has been built with GPU support.
*
* \throws std::bad_alloc When out of memory.
* \returns Whether the device passed sanity checks
*/
-static bool isDeviceSane(const DeviceInformation* deviceInfo, std::string* errorMessage)
+static bool isDeviceFunctional(const DeviceInformation* deviceInfo, std::string* errorMessage)
{
cl_context_properties properties[] = {
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(deviceInfo->oclPlatformId), 0
* \param[in] deviceInfo The device info pointer.
* \returns The result of the compatibility checks.
*/
-static int isDeviceSupported(const DeviceInformation* deviceInfo)
+static DeviceStatus isDeviceSupported(const DeviceInformation* deviceInfo)
{
if (getenv("GMX_OCL_DISABLE_COMPATIBILITY_CHECK") != nullptr)
{
// Assume the device is compatible because checking has been disabled.
- return egpuCompatible;
+ return DeviceStatus::Compatible;
}
// OpenCL device version check, ensure >= REQUIRED_OPENCL_MIN_VERSION
|| (deviceVersionMajor == minVersionMajor && deviceVersionMinor >= minVersionMinor)));
if (!versionLargeEnough)
{
- return egpuIncompatible;
+ return DeviceStatus::Incompatible;
}
/* Only AMD, Intel, and NVIDIA GPUs are supported for now */
switch (deviceInfo->deviceVendor)
{
- case DeviceVendor::Nvidia: return egpuCompatible;
+ case DeviceVendor::Nvidia: return DeviceStatus::Compatible;
case DeviceVendor::Amd:
- return runningOnCompatibleOSForAmd() ? egpuCompatible : egpuIncompatible;
+ return runningOnCompatibleOSForAmd() ? DeviceStatus::Compatible : DeviceStatus::Incompatible;
case DeviceVendor::Intel:
- return GMX_OPENCL_NB_CLUSTER_SIZE == 4 ? egpuCompatible : egpuIncompatibleClusterSize;
- default: return egpuIncompatible;
+ return GMX_OPENCL_NB_CLUSTER_SIZE == 4 ? DeviceStatus::Compatible
+ : DeviceStatus::IncompatibleClusterSize;
+ default: return DeviceStatus::Incompatible;
}
}
*
* \param[in] deviceId The runtime-reported numeric ID of the device.
* \param[in] deviceInfo The device info pointer.
- * \returns An e_gpu_detect_res_t to indicate how the GPU coped with
- * the sanity and compatibility check.
+ * \returns A DeviceStatus to indicate if the GPU device is supported and if it was able to run
+ * basic functionality checks.
*/
-static int checkGpu(size_t deviceId, const DeviceInformation* deviceInfo)
+static DeviceStatus checkGpu(size_t deviceId, const DeviceInformation* deviceInfo)
{
- int supportStatus = isDeviceSupported(deviceInfo);
- if (supportStatus != egpuCompatible)
+ DeviceStatus supportStatus = isDeviceSupported(deviceInfo);
+ if (supportStatus != DeviceStatus::Compatible)
{
return supportStatus;
}
std::string errorMessage;
- if (!isDeviceSane(deviceInfo, &errorMessage))
+ if (!isDeviceFunctional(deviceInfo, &errorMessage))
{
gmx_warning("While sanity checking device #%zu, %s", deviceId, errorMessage.c_str());
- return egpuInsane;
+ return DeviceStatus::NonFunctional;
}
- return egpuCompatible;
+ return DeviceStatus::Compatible;
}
} // namespace gmx
gpu_info->deviceInfo[device_index].stat =
gmx::checkGpu(device_index, gpu_info->deviceInfo + device_index);
- if (egpuCompatible == gpu_info->deviceInfo[device_index].stat)
+ if (DeviceStatus::Compatible == gpu_info->deviceInfo[device_index].stat)
{
gpu_info->n_dev_compatible++;
}
DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
- bool bGpuExists = (dinfo->stat != egpuNonexistent && dinfo->stat != egpuInsane);
+ bool bGpuExists =
+ (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
if (!bGpuExists)
{
- sprintf(s, "#%d: %s, stat: %s", index, "N/A", gpu_detect_res_str[dinfo->stat]);
+ sprintf(s, "#%d: %s, stat: %s", index, "N/A", c_deviceStateString[dinfo->stat]);
}
else
{
sprintf(s, "#%d: name: %s, vendor: %s, device version: %s, stat: %s", index, dinfo->device_name,
- dinfo->vendorName, dinfo->device_version, gpu_detect_res_str[dinfo->stat]);
+ dinfo->vendorName, dinfo->device_version, c_deviceStateString[dinfo->stat]);
}
}
return sizeof(DeviceInformation);
}
-int gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
+DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
{
return info.deviceInfo[index].stat;
}
* \inlibraryapi
* \ingroup module_gpu_utils
*/
-
#include <cuda_runtime.h>
+#include "gromacs/hardware/gpu_hw_info.h"
+
/*! \brief CUDA device information.
*
* The CUDA device information is queried and set at detection and contains
int id;
//! CUDA device properties.
cudaDeviceProp prop;
- //! Result of the device check.
- int stat;
+ //! Device status.
+ DeviceStatus stat;
};
//! \brief Single GPU call timing event - meaningless in CUDA
*/
#include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/hardware/gpu_hw_info.h"
//! OpenCL device vendors
enum class DeviceVendor : int
char vendorName[256]; //!< Device vendor name.
int compute_units; //!< Number of compute units.
int adress_bits; //!< Number of address bits the device is capable of.
- int stat; //!< Device status takes values of e_gpu_detect_res_t.
+ DeviceStatus stat; //!< Device status.
DeviceVendor deviceVendor; //!< Device vendor.
size_t maxWorkItemSizes[3]; //!< Workgroup size limits (CL_DEVICE_MAX_WORK_ITEM_SIZES).
size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE).
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
+# Copyright (c) 2015,2016,2017,2020, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
gmx_add_libgromacs_sources(
cpuinfo.cpp
detecthardware.cpp
- gpu_hw_info.cpp
hardwaretopology.cpp
printhardware.cpp
identifyavx512fmaunits.cpp
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-#include "gmxpre.h"
-
-#include "gpu_hw_info.h"
-
-#include "config.h"
-
-/* Note that some of the following arrays must match the "GPU support
- * enumeration" in src/config.h.cmakein, so that GMX_GPU looks up an
- * array entry. */
-
-// TODO If/when we unify CUDA and OpenCL support code, this should
-// move to a single place in gpu_utils.
-/* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
-const char* const gpu_detect_res_str[egpuNR] = {
- "compatible", "nonexistent",
- "incompatible", "incompatible (please recompile with GMX_OPENCL_NB_CLUSTER_SIZE=4)",
- "insane", "unavailable"
-};
#define GMX_HARDWARE_GPU_HW_INFO_H
#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/enumerationhelpers.h"
struct DeviceInformation;
-/*! \brief Possible results of the GPU detection/check.
- *
- * The egpuInsane value means that during the sanity checks an error
- * occurred that indicates malfunctioning of the device, driver, or
- * incompatible driver/runtime.
- * eGpuUnavailable indicates that CUDA devices are busy or unavailable
- * typically due to use of cudaComputeModeExclusive, cudaComputeModeProhibited modes.
- */
-typedef enum
+//! Possible results of the GPU detection/check.
+enum class DeviceStatus : int
{
- egpuCompatible = 0,
- egpuNonexistent,
- egpuIncompatible,
- egpuIncompatibleClusterSize,
- egpuInsane,
- egpuUnavailable,
- egpuNR
-} e_gpu_detect_res_t;
+ //! The device is compatible
+ Compatible = 0,
+ //! Device does not exist
+ Nonexistent = 1,
+ //! Device is not compatible
+ Incompatible = 2,
+ //! OpenCL device has incompatible cluster size for non-bonded kernels.
+ IncompatibleClusterSize = 3,
+ /*! \brief An error occurred he functionality checks.
+ * That indicates malfunctioning of the device, driver, or incompatible driver/runtime.
+ */
+ NonFunctional = 4,
+ /*! \brief CUDA devices are busy or unavailable.
+ * typically due to use of \p cudaComputeModeExclusive, \p cudaComputeModeProhibited modes.
+ */
+ Unavailable = 5,
+ //! Enumeration size
+ Count = 6
+};
/*! \brief Names of the GPU detection/check results
- *
- * \todo Make a proper class enumeration with helper string */
-extern const char* const gpu_detect_res_str[egpuNR];
+ */
+static const gmx::EnumerationArray<DeviceStatus, const char*> c_deviceStateString = {
+ "compatible", "nonexistent",
+ "incompatible", "incompatible (please recompile with GMX_OPENCL_NB_CLUSTER_SIZE=4)",
+ "non-functional", "unavailable"
+};
/*! \brief Information about GPU devices on this physical node.
*