/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015,2017 The GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team.
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/*! \internal \file
* \brief Defines the CUDA implementations of the device management.
*
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Szilárd Páll <pall.szilard@gmail.com>
* \author Artem Zhmurov <zhmurov@gmail.com>
*
* \ingroup module_hardware
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "device_information.h"
/*! \internal \brief
* Max number of devices supported by CUDA (for consistency checking).
*
* In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
*/
-static int cuda_max_device_count = 32;
+static const int c_cudaMaxDeviceCount = 32;
/** Dummy kernel used for sanity checking. */
-static __global__ void k_dummy_test(void) {}
+static __global__ void dummy_kernel() {}
static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp)
{
cudaFuncAttributes attributes;
- cudaError_t stat = cudaFuncGetAttributes(&attributes, k_dummy_test);
+ cudaError_t stat = cudaFuncGetAttributes(&attributes, dummy_kernel);
if (cudaErrorInvalidDeviceFunction == stat)
{
"\nWARNING: The %s binary does not include support for the CUDA architecture of "
"the GPU ID #%d (compute capability %d.%d) detected during detection. "
"By default, GROMACS supports all architectures of compute "
- "capability >= 3.0, so your GPU "
+ "capability >= 3.5, so your GPU "
"might be rare, or some architectures were disabled in the build. \n"
"Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
"GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture. \n",
- gmx::getProgramContext().displayName(), deviceId, deviceProp.major, deviceProp.minor);
+ gmx::getProgramContext().displayName(),
+ deviceId,
+ deviceProp.major,
+ deviceProp.minor);
}
return stat;
* \todo Introduce errors codes and handle errors more smoothly.
*
*
- * \param[in] dev_id the device ID of the GPU or -1 if the device has already been initialized
- * \param[in] dev_prop The device properties structure
- * \returns 0 if the device looks OK, -1 if it sanity checks failed, and -2 if the device is busy
+ * \param[in] deviceInfo Device information on the device to check.
+ * \returns The status enumeration value for the checked device:
*/
-static DeviceStatus isDeviceFunctional(int dev_id, const cudaDeviceProp& dev_prop)
+static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
{
cudaError_t cu_err;
- int dev_count, id;
-
- cu_err = cudaGetDeviceCount(&dev_count);
- if (cu_err != cudaSuccess)
- {
- fprintf(stderr, "Error %d while querying device count: %s\n", cu_err, cudaGetErrorString(cu_err));
- return DeviceStatus::NonFunctional;
- }
-
- /* no CUDA compatible device at all */
- if (dev_count == 0)
- {
- return DeviceStatus::NonFunctional;
- }
-
- /* things might go horribly wrong if cudart is not compatible with the driver */
- if (dev_count < 0 || dev_count > cuda_max_device_count)
- {
- return DeviceStatus::NonFunctional;
- }
-
- if (dev_id == -1) /* device already selected let's not destroy the context */
- {
- cu_err = cudaGetDevice(&id);
- if (cu_err != cudaSuccess)
- {
- fprintf(stderr, "Error %d while querying device id: %s\n", cu_err, cudaGetErrorString(cu_err));
- return DeviceStatus::NonFunctional;
- }
- }
- else
- {
- id = dev_id;
- if (id > dev_count - 1) /* pfff there's no such device */
- {
- fprintf(stderr,
- "The requested device with id %d does not seem to exist (device count=%d)\n",
- dev_id, dev_count);
- return DeviceStatus::NonFunctional;
- }
- }
/* both major & minor is 9999 if no CUDA capable devices are present */
- if (dev_prop.major == 9999 && dev_prop.minor == 9999)
+ if (deviceInfo.prop.major == 9999 && deviceInfo.prop.minor == 9999)
{
return DeviceStatus::NonFunctional;
}
/* we don't care about emulation mode */
- if (dev_prop.major == 0)
+ if (deviceInfo.prop.major == 0)
{
return DeviceStatus::NonFunctional;
}
- if (id != -1)
+ cu_err = cudaSetDevice(deviceInfo.id);
+ if (cu_err != cudaSuccess)
{
- cu_err = cudaSetDevice(id);
- if (cu_err != cudaSuccess)
- {
- fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, id,
- cudaGetErrorString(cu_err));
- return DeviceStatus::NonFunctional;
- }
+ fprintf(stderr,
+ "Error while switching to device #%d. %s\n",
+ deviceInfo.id,
+ gmx::getDeviceErrorString(cu_err).c_str());
+ return DeviceStatus::NonFunctional;
}
- cu_err = checkCompiledTargetCompatibility(dev_id, dev_prop);
+ cu_err = checkCompiledTargetCompatibility(deviceInfo.id, deviceInfo.prop);
// Avoid triggering an error if GPU devices are in exclusive or prohibited mode;
// it is enough to check for cudaErrorDevicesUnavailable only here because
// if we encounter it that will happen in cudaFuncGetAttributes in the above function.
{
KernelLaunchConfig config;
config.blockSize[0] = 512;
- const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
- DeviceInformation deviceInfo;
+ const auto dummyArguments = prepareGpuKernelArguments(dummy_kernel, config);
const DeviceContext deviceContext(deviceInfo);
const DeviceStream deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
- launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
+ launchGpuKernel(dummy_kernel, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
}
catch (gmx::GromacsException& ex)
{
// launchGpuKernel error is not fatal and should continue with marking the device bad
fprintf(stderr,
- "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n", id,
+ "Error occurred while running dummy kernel sanity check on device #%d:\n %s\n",
+ deviceInfo.id,
formatExceptionMessageToString(ex).c_str());
return DeviceStatus::NonFunctional;
}
return DeviceStatus::NonFunctional;
}
- /* destroy context if we created one */
- if (id != -1)
- {
- cu_err = cudaDeviceReset();
- CU_RET_ERR(cu_err, "cudaDeviceReset failed");
- }
+ cu_err = cudaDeviceReset();
+ CU_RET_ERR(cu_err, "cudaDeviceReset failed");
return DeviceStatus::Compatible;
}
-/*! \brief Returns true if the gpu characterized by the device properties is
- * supported by the native gpu acceleration.
+/*! \brief Returns true if the gpu characterized by the device properties is supported
+ * by the native gpu acceleration.
*
- * \param[in] dev_prop the CUDA device properties of the gpus to test.
- * \returns true if the GPU properties passed indicate a compatible
- * GPU, otherwise false.
+ * \param[in] deviceProperties The CUDA device properties of the gpus to test.
+ * \returns True if the GPU properties passed indicate a compatible
+ * GPU, otherwise false.
*/
-static bool is_gmx_supported_gpu(const cudaDeviceProp& dev_prop)
+static bool isDeviceGenerationSupported(const cudaDeviceProp& deviceProperties)
{
- return (dev_prop.major >= 3);
+ return (deviceProperties.major >= 3);
}
/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
* upon return. Note that this also means it is the caller's responsibility to
* reset the CUDA runtime state.
*
- * \param[in] deviceId the ID of the GPU to check.
- * \param[in] deviceProp the CUDA device properties of the device checked.
+ * \param[in] deviceInfo The device information on the device to check.
* \returns the status of the requested device
*/
-static DeviceStatus checkDeviceStatus(int deviceId, const cudaDeviceProp& deviceProp)
+static DeviceStatus checkDeviceStatus(const DeviceInformation& deviceInfo)
{
- if (!is_gmx_supported_gpu(deviceProp))
+ if (!isDeviceGenerationSupported(deviceInfo.prop))
{
return DeviceStatus::Incompatible;
}
- return isDeviceFunctional(deviceId, deviceProp);
+ return isDeviceFunctional(deviceInfo);
}
-bool isGpuDetectionFunctional(std::string* errorMessage)
+bool isDeviceDetectionFunctional(std::string* errorMessage)
{
cudaError_t stat;
int driverVersion = -1;
stat = cudaDriverGetVersion(&driverVersion);
GMX_ASSERT(stat != cudaErrorInvalidValue,
"An impossible null pointer was passed to cudaDriverGetVersion");
- GMX_RELEASE_ASSERT(
- stat == cudaSuccess,
- gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ ("An unexpected value was returned from cudaDriverGetVersion. "
+ + gmx::getDeviceErrorString(stat))
+ .c_str());
bool foundDriver = (driverVersion > 0);
if (!foundDriver)
{
return true;
}
-void findGpus(gmx_gpu_info_t* gpu_info)
+std::vector<std::unique_ptr<DeviceInformation>> findDevices()
{
- assert(gpu_info);
+ int numDevices;
+ cudaError_t stat = cudaGetDeviceCount(&numDevices);
+ gmx::checkDeviceError(stat,
+ "Invalid call of findDevices() when CUDA API returned an error, perhaps "
+ "canPerformDeviceDetection() was not called appropriately beforehand.");
- gpu_info->n_dev_compatible = 0;
-
- int ndev;
- cudaError_t stat = cudaGetDeviceCount(&ndev);
- if (stat != cudaSuccess)
- {
- GMX_THROW(gmx::InternalError(
- "Invalid call of findGpus() when CUDA API returned an error, perhaps "
- "canDetectGpus() was not called appropriately beforehand."));
- }
+ /* things might go horribly wrong if cudart is not compatible with the driver */
+ numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
// We expect to start device support/sanity checks with a clean runtime error state
- gmx::ensureNoPendingCudaError("");
+ gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
- DeviceInformation* devs;
- snew(devs, ndev);
- for (int i = 0; i < ndev; i++)
+ std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
+ for (int i = 0; i < numDevices; i++)
{
cudaDeviceProp prop;
memset(&prop, 0, sizeof(cudaDeviceProp));
stat = cudaGetDeviceProperties(&prop, i);
- const DeviceStatus checkResult =
- (stat != cudaSuccess) ? DeviceStatus::NonFunctional : checkDeviceStatus(i, prop);
- devs[i].id = i;
- devs[i].prop = prop;
- devs[i].stat = checkResult;
+ deviceInfoList[i] = std::make_unique<DeviceInformation>();
+ deviceInfoList[i]->id = i;
+ deviceInfoList[i]->prop = prop;
+ deviceInfoList[i]->deviceVendor = DeviceVendor::Nvidia;
- if (checkResult == DeviceStatus::Compatible)
- {
- gpu_info->n_dev_compatible++;
- }
- else
+ const DeviceStatus checkResult = (stat != cudaSuccess) ? DeviceStatus::NonFunctional
+ : checkDeviceStatus(*deviceInfoList[i]);
+
+ deviceInfoList[i]->status = checkResult;
+
+ if (checkResult != DeviceStatus::Compatible)
{
// TODO:
// - we inspect the CUDA API state to retrieve and record any
//
// Here we also clear the CUDA API error state so potential
// errors during sanity checks don't propagate.
- if ((stat = cudaGetLastError()) != cudaSuccess)
- {
- gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
- devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat));
- }
+ const std::string errorMessage = gmx::formatString(
+ "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
+ gmx::ensureNoPendingDeviceError(errorMessage);
}
}
stat = cudaPeekAtLastError();
- GMX_RELEASE_ASSERT(stat == cudaSuccess,
- gmx::formatString("We promise to return with clean CUDA state, but "
- "non-success state encountered: %s: %s",
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("We promise to return with clean CUDA state, but non-success state encountered. "
+ + gmx::getDeviceErrorString(stat))
+ .c_str());
- gpu_info->n_dev = ndev;
- gpu_info->deviceInfo = devs;
+ return deviceInfoList;
}
-void init_gpu(const DeviceInformation* deviceInfo)
+void setActiveDevice(const DeviceInformation& deviceInfo)
{
+ int deviceId = deviceInfo.id;
cudaError_t stat;
- assert(deviceInfo);
-
- stat = cudaSetDevice(deviceInfo->id);
+ stat = cudaSetDevice(deviceId);
if (stat != cudaSuccess)
{
- auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
- CU_RET_ERR(stat, message.c_str());
+ auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId);
+ CU_RET_ERR(stat, message);
}
if (debug)
{
- fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceInfo->id, deviceInfo->prop.name);
+ fprintf(stderr, "Initialized GPU ID #%d: %s\n", deviceId, deviceInfo.prop.name);
}
}
-void free_gpu(const DeviceInformation* deviceInfo)
+void releaseDevice(DeviceInformation* deviceInfo)
{
- // One should only attempt to clear the device context when
- // it has been used, but currently the only way to know that a GPU
// device was used is that deviceInfo will be non-null.
- if (deviceInfo == nullptr)
+ if (deviceInfo != nullptr)
{
- return;
- }
+ cudaError_t stat;
- cudaError_t stat;
-
- if (debug)
- {
int gpuid;
stat = cudaGetDevice(&gpuid);
- CU_RET_ERR(stat, "cudaGetDevice failed");
- fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
- }
-
- stat = cudaDeviceReset();
- if (stat != cudaSuccess)
- {
- gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
- }
-}
+ if (stat == cudaSuccess)
+ {
+ if (debug)
+ {
+ fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
+ }
-DeviceInformation* getDeviceInfo(const gmx_gpu_info_t& gpu_info, int deviceId)
-{
- if (deviceId < 0 || deviceId >= gpu_info.n_dev)
- {
- gmx_incons("Invalid GPU deviceId requested");
+ stat = cudaDeviceReset();
+ if (stat != cudaSuccess)
+ {
+ gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
+ }
+ }
}
- return &gpu_info.deviceInfo[deviceId];
}
-void get_gpu_device_info_string(char* s, const gmx_gpu_info_t& gpu_info, int index)
+std::string getDeviceInformationString(const DeviceInformation& deviceInfo)
{
- assert(s);
+ bool gpuExists = (deviceInfo.status != DeviceStatus::Nonexistent
+ && deviceInfo.status != DeviceStatus::NonFunctional);
- if (index < 0 && index >= gpu_info.n_dev)
+ if (!gpuExists)
{
- return;
- }
-
- DeviceInformation* dinfo = &gpu_info.deviceInfo[index];
-
- bool bGpuExists =
- (dinfo->stat != DeviceStatus::Nonexistent && dinfo->stat != DeviceStatus::NonFunctional);
-
- if (!bGpuExists)
- {
- sprintf(s, "#%d: %s, stat: %s", dinfo->id, "N/A", c_deviceStateString[dinfo->stat]);
+ return gmx::formatString(
+ "#%d: %s, stat: %s", deviceInfo.id, "N/A", c_deviceStateString[deviceInfo.status]);
}
else
{
- sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s", dinfo->id,
- dinfo->prop.name, dinfo->prop.major, dinfo->prop.minor,
- dinfo->prop.ECCEnabled ? "yes" : " no", c_deviceStateString[dinfo->stat]);
+ return gmx::formatString("#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
+ deviceInfo.id,
+ deviceInfo.prop.name,
+ deviceInfo.prop.major,
+ deviceInfo.prop.minor,
+ deviceInfo.prop.ECCEnabled ? "yes" : " no",
+ c_deviceStateString[deviceInfo.status]);
}
}
-
-size_t sizeof_gpu_dev_info(void)
-{
- return sizeof(DeviceInformation);
-}
-
-DeviceStatus gpu_info_get_stat(const gmx_gpu_info_t& info, int index)
-{
- return info.deviceInfo[index].stat;
-}