/*
* This file is part of the GROMACS molecular simulation package.
*
 * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
/* cudaErrorInvalidDeviceFunction here means that the kernel image in this
 * binary does not cover the compute capability of the selected device, so
 * give the user actionable build advice and abort.
 *
 * Note: the message is built from adjacent string literals that the
 * compiler concatenates; stray `",` separators previously turned the last
 * two fragments into extra gmx_fatal() arguments, leaving the %s/%d
 * conversion specifiers mismatched with their values. */
if (cudaErrorInvalidDeviceFunction == stat)
{
    gmx_fatal(FARGS,
              "The %s binary does not include support for the CUDA architecture "
              "of the selected GPU (device ID #%d, compute capability %d.%d). "
              "By default, GROMACS supports all common architectures, so your GPU "
              "might be rare, or some architectures were disabled in the build. "
              "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
              "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.",
              gmx::getProgramContext().displayName(), devInfo->id,
              devInfo->prop.major, devInfo->prop.minor);
}
}
}
/*! \brief Check whether GPU detection is possible on this node.
 *
 * Returns true when a valid CUDA driver is present and the runtime can
 * enumerate devices, so that findGpus() can safely be called afterwards.
 *
 * \param[out] errorMessage  If non-null and detection is not possible,
 *                           receives a descriptive message; left untouched
 *                           on success.
 * \returns true if GPU detection can proceed, false otherwise.
 */
bool canDetectGpus(std::string *errorMessage)
{
    cudaError_t stat;
    int         driverVersion = -1;
    // NOTE(review): the statement wrapping gmx::formatString() was garbled
    // in this chunk; reconstructed as a driver-version query plus release
    // assertion, consistent with the assertion message text and with
    // foundDriver reading driverVersion below — confirm against history.
    stat = cudaDriverGetVersion(&driverVersion);
    GMX_RELEASE_ASSERT(stat == cudaSuccess,
                       gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
                                         cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
    bool foundDriver = (driverVersion > 0);
    if (!foundDriver)
    {
        // Can't detect GPUs if there is no driver
        if (errorMessage != nullptr)
        {
            errorMessage->assign("No valid CUDA driver found");
        }
        return false;
    }

    int numDevices;
    stat = cudaGetDeviceCount(&numDevices);
    if (stat != cudaSuccess)
    {
        if (errorMessage != nullptr)
        {
            /* cudaGetDeviceCount failed which means that there is
             * something wrong with the machine: driver-runtime
             * mismatch, all GPUs being busy in exclusive mode,
             * invalid CUDA_VISIBLE_DEVICES, or some other condition
             * which should result in GROMACS issuing a warning and
             * falling back to CPUs. */
            errorMessage->assign(cudaGetErrorString(stat));
        }

        // Consume the error now that we have prepared to handle
        // it. This stops it reappearing next time we check for
        // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
        // valid devices, then cudaGetLastError returns the
        // (undocumented) cudaErrorNoDevice, but this should not be a
        // problem as there should be no future CUDA API calls.
        // NVIDIA bug report #2038718 has been filed.
        cudaGetLastError();
        // Can't detect GPUs
        return false;
    }

    // We don't actually use numDevices here, that's not the job of
    // this function.
    return true;
}
/*! \brief Enumerate CUDA devices and record their compatibility status.
 *
 * Fills \c gpu_info with one entry per device reported by the CUDA
 * runtime, marking each entry with the result of
 * is_gmx_supported_gpu_id() and counting the compatible devices.
 *
 * \param[in,out] gpu_info  Struct to fill with device info; must be non-NULL.
 *
 * \throws gmx::InternalError if the CUDA runtime reports an error on
 *         device enumeration, which indicates that canDetectGpus() was
 *         not called (successfully) beforehand.
 */
void findGpus(gmx_gpu_info_t *gpu_info)
{
    int                i, ndev, checkres;
    cudaError_t        stat;
    cudaDeviceProp     prop;
    gmx_device_info_t *devs;

    assert(gpu_info);

    gpu_info->n_dev_compatible = 0;

    stat = cudaGetDeviceCount(&ndev);
    if (stat != cudaSuccess)
    {
        // Callers are expected to have established that detection works
        // via canDetectGpus(), so an error here is a programming error.
        GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
                                     "canDetectGpus() was not called appropriately beforehand."));
    }

    snew(devs, ndev);
    for (i = 0; i < ndev; i++)
    {
        checkres = is_gmx_supported_gpu_id(i, &prop);

        devs[i].id   = i;
        devs[i].prop = prop;
        devs[i].stat = checkres;

        if (checkres == egpuCompatible)
        {
            gpu_info->n_dev_compatible++;
        }
    }
    // The compatibility checks above must not leave a sticky CUDA error
    // behind for later API calls to trip over.
    GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess");

    gpu_info->n_dev   = ndev;
    gpu_info->gpu_dev = devs;
}
std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)