#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
}
}
-bool canDetectGpus()
+bool canDetectGpus(std::string *errorMessage)
{
cudaError_t stat;
int driverVersion = -1;
gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
bool foundDriver = (driverVersion > 0);
- return foundDriver;
+ if (!foundDriver)
+ {
+ // Can't detect GPUs if there is no driver
+ if (errorMessage != nullptr)
+ {
+ errorMessage->assign("No valid CUDA driver found");
+ }
+ return false;
+ }
+
+ int numDevices;
+ stat = cudaGetDeviceCount(&numDevices);
+ if (stat != cudaSuccess)
+ {
+ if (errorMessage != nullptr)
+ {
+ /* cudaGetDeviceCount failed which means that there is
+ * something wrong with the machine: driver-runtime
+ * mismatch, all GPUs being busy in exclusive mode,
+ * invalid CUDA_VISIBLE_DEVICES, or some other condition
+ * which should result in GROMACS issuing a warning and
+ * falling back to CPUs. */
+ errorMessage->assign(cudaGetErrorString(stat));
+ }
+
+ // Consume the error now that we have prepared to handle
+ // it. This stops it reappearing next time we check for
+ // errors. Note that if CUDA_VISIBLE_DEVICES does not contain
+ // valid devices, then cudaGetLastError returns the
+ // (undocumented) cudaErrorNoDevice, but this should not be a
+ // problem as there should be no future CUDA API calls.
+ // NVIDIA bug report #2038718 has been filed.
+ cudaGetLastError();
+ // Can't detect GPUs
+ return false;
+ }
+
+ // We don't actually use numDevices here, that's not the job of
+ // this function.
+ return true;
}
-int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+void findGpus(gmx_gpu_info_t *gpu_info)
{
- int i, ndev, checkres, retval;
+ int i, ndev, checkres;
cudaError_t stat;
cudaDeviceProp prop;
gmx_device_info_t *devs;
assert(gpu_info);
- assert(err_str);
gpu_info->n_dev_compatible = 0;
stat = cudaGetDeviceCount(&ndev);
if (stat != cudaSuccess)
{
- const char *s;
-
- /* cudaGetDeviceCount failed which means that there is something
- * wrong with the machine: driver-runtime mismatch, all GPUs being
- * busy in exclusive mode, or some other condition which should
- * result in us issuing a warning a falling back to CPUs. */
- retval = -1;
- s = cudaGetErrorString(stat);
- strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
-
- // Consume the error now that we have prepared to handle
- // it. This stops it reappearing next time we check for errors.
- cudaGetLastError();
+ GMX_THROW(gmx::InternalError("Invalid call of findGpus() when CUDA API returned an error, perhaps "
+ "canDetectGpus() was not called appropriately beforehand."));
}
- else
+
+ snew(devs, ndev);
+ for (i = 0; i < ndev; i++)
{
- snew(devs, ndev);
- for (i = 0; i < ndev; i++)
- {
- checkres = is_gmx_supported_gpu_id(i, &prop);
+ checkres = is_gmx_supported_gpu_id(i, &prop);
- devs[i].id = i;
- devs[i].prop = prop;
- devs[i].stat = checkres;
+ devs[i].id = i;
+ devs[i].prop = prop;
+ devs[i].stat = checkres;
- if (checkres == egpuCompatible)
- {
- gpu_info->n_dev_compatible++;
- }
+ if (checkres == egpuCompatible)
+ {
+ gpu_info->n_dev_compatible++;
}
- retval = 0;
}
+ GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess");
gpu_info->n_dev = ndev;
gpu_info->gpu_dev = devs;
-
- return retval;
}
std::vector<int> getCompatibleGpus(const gmx_gpu_info_t &gpu_info)
#include <cstdio>
+#include <string>
#include <vector>
#include "gromacs/gpu_utils/gpu_macros.h"
/*! \brief Return whether GPUs can be detected
*
* Returns true when this is a build of \Gromacs configured to support
- * GPU usage, and a valid device driver or ICD was detected by the GPU
- * runtime.
+ * GPU usage, and a valid device driver, ICD, and/or runtime was detected.
+ *
+ * \param[out] errorMessage When returning false and non-nullptr was passed,
+ * the string contains a descriptive message about
+ * why GPUs cannot be detected.
*
* Does not throw. */
GPU_FUNC_QUALIFIER
-bool canDetectGpus() GPU_FUNC_TERM_WITH_RETURN(false);
+bool canDetectGpus(std::string *GPU_FUNC_ARGUMENT(errorMessage)) GPU_FUNC_TERM_WITH_RETURN(false);
-/*! \brief Detect all GPUs in the system.
+/*! \brief Find all GPUs in the system.
*
- * Will detect every GPU supported by the device driver in use. If
- * the device driver is missing or unsuitable, returns the same error
- * as for "no valid devices detected," so generally calling code
- * should have checked the return value from canDetectGpus() first,
- * in order to understand the behaviour of this routine. This routine
+ * Will detect every GPU supported by the device driver in use. Must
+ * only be called if canDetectGpus() has returned true. This routine
* also checks the compatibility of each device and fills the
* gpu_info->gpu_dev array with the required information on each
* device: ID, device properties, status.
*
* \param[in] gpu_info pointer to structure holding GPU information.
- * \param[out] err_str The error message of any GPU API error that caused
- * the detection to fail (if there was any). The memory
- * the pointer points to should be managed externally.
- * \returns non-zero if the detection encountered a failure, zero otherwise.
+ *
+ * \throws InternalError if a GPU API returns an unexpected failure (because
+ * the call to canDetectGpus() should always prevent this occurring)
*/
GPU_FUNC_QUALIFIER
-int detect_gpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), char *GPU_FUNC_ARGUMENT(err_str)) GPU_FUNC_TERM_WITH_RETURN(-1)
+void findGpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
/*! \brief Return a container of the detected GPUs that are compatible.
*
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
-/*! \brief Helper macro for error handling */
-#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
- cl_int opencl_ret = func; \
- if (CL_SUCCESS != opencl_ret) \
- { \
- sprintf(err_str, "OpenCL error %d", opencl_ret); \
- retval = -1; \
- } \
- else{ \
- retval = 0; } \
-}
-
-
/*! \brief Return true if executing on compatible OS for AMD OpenCL.
*
* This is assumed to be true for OS X version of at least 10.10.4 and
//! This function is documented in the header file
-bool canDetectGpus()
+bool canDetectGpus(std::string *errorMessage)
{
cl_uint numPlatforms;
cl_int status = clGetPlatformIDs(0, nullptr, &numPlatforms);
if (status == CL_PLATFORM_NOT_FOUND_KHR)
{
// No valid ICDs found
+ if (errorMessage != nullptr)
+ {
+ errorMessage->assign("No valid OpenCL driver found");
+ }
return false;
}
GMX_RELEASE_ASSERT(status == CL_SUCCESS,
gmx::formatString("An unexpected value was returned from clGetPlatformIDs %u: %s",
status, ocl_get_error_string(status).c_str()).c_str());
bool foundPlatform = (numPlatforms > 0);
+ if (!foundPlatform && errorMessage != nullptr)
+ {
+ errorMessage->assign("No OpenCL platforms found even though the driver was valid");
+ }
return foundPlatform;
}
//! This function is documented in the header file
-int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+void findGpus(gmx_gpu_info_t *gpu_info)
{
- int retval;
cl_uint ocl_platform_count;
cl_platform_id *ocl_platform_ids;
cl_device_type req_dev_type = CL_DEVICE_TYPE_GPU;
- retval = 0;
ocl_platform_ids = NULL;
if (getenv("GMX_OCL_FORCE_CPU") != NULL)
while (1)
{
- CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
- if (0 != retval)
+ cl_int status = clGetPlatformIDs(0, NULL, &ocl_platform_count);
+ if (CL_SUCCESS != status)
{
- break;
+ GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %u was returned from clGetPlatformIDs: ",
+ status) + ocl_get_error_string(status)));
}
if (1 > ocl_platform_count)
{
+ // TODO this should have a descriptive error message that we only support one OpenCL platform
break;
}
snew(ocl_platform_ids, ocl_platform_count);
- CALLOCLFUNC_LOGERROR(clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL), err_str, retval)
- if (0 != retval)
+ status = clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL);
+ if (CL_SUCCESS != status)
{
- break;
+ GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %u was returned from clGetPlatformIDs: ",
+ status) + ocl_get_error_string(status)));
}
for (unsigned int i = 0; i < ocl_platform_count; i++)
}
sfree(ocl_platform_ids);
-
- return retval;
}
//! This function is documented in the header file
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/gpu_hw_info.h"
-#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/smalloc.h"
namespace gmx
GpuTest::GpuTest()
{
snew(gpuInfo_, 1);
- char errorString[STRLEN];
- detect_gpus(gpuInfo_, errorString);
+ if (canDetectGpus(nullptr))
+ {
+ findGpus(gpuInfo_);
+ }
+ // Failing to find valid GPUs does not require further action
}
GpuTest::~GpuTest()
TYPED_TEST(HostAllocatorTest, TransfersWithPinningWorkWithCuda)
{
+ if (!this->haveValidGpus())
+ {
+ return;
+ }
+
typename TestFixture::VectorType input;
changePinningPolicy(&input, PinningPolicy::CanBePinned);
this->fillInput(&input);
TYPED_TEST(HostAllocatorTest, ManualPinningOperationsWorkWithCuda)
{
+ if (!this->haveValidGpus())
+ {
+ return;
+ }
+
typename TestFixture::VectorType input;
changePinningPolicy(&input, PinningPolicy::CanBePinned);
EXPECT_FALSE(isPinned(input));
TEST_F(PinnedMemoryCheckerTest, DefaultContainerIsRecognized)
{
+ if (!haveValidGpus())
+ {
+ return;
+ }
+
std::vector<real> dummy(3, 1.5);
EXPECT_FALSE(isHostMemoryPinned(dummy.data()));
}
TEST_F(PinnedMemoryCheckerTest, NonpinnedContainerIsRecognized)
{
+ if (!haveValidGpus())
+ {
+ return;
+ }
+
HostVector<real> dummy(3, 1.5);
changePinningPolicy(&dummy, PinningPolicy::CannotBePinned);
EXPECT_FALSE(isHostMemoryPinned(dummy.data()));
TEST_F(PinnedMemoryCheckerTest, PinnedContainerIsRecognized)
{
+ if (!haveValidGpus())
+ {
+ return;
+ }
+
HostVector<real> dummy(3, 1.5);
changePinningPolicy(&dummy, PinningPolicy::CanBePinned);
EXPECT_TRUE(isHostMemoryPinned(dummy.data()));
TEST_F(PinnedMemoryCheckerTest, DefaultCBufferIsRecognized)
{
+ if (!haveValidGpus())
+ {
+ return;
+ }
+
real *dummy;
snew(dummy, 3);
EXPECT_FALSE(isHostMemoryPinned(dummy));
TEST_F(PinnedMemoryCheckerTest, PinnedCBufferIsRecognized)
{
+ if (!haveValidGpus())
+ {
+ return;
+ }
+
real *dummy = nullptr;
pmalloc((void **)&dummy, 3 * sizeof(real));
EXPECT_TRUE(isHostMemoryPinned(dummy));
bool gpusCanBeDetected = false;
if (isMasterRankOfNode || isOpenclPpRank)
{
- gpusCanBeDetected = canDetectGpus();
- // No need to tell the user anything at this point, they get a
- // hardware report later.
- }
-
- if (gpusCanBeDetected)
- {
- char detection_error[STRLEN] = "", sbuf[STRLEN];
-
- if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+ std::string errorMessage;
+ gpusCanBeDetected = canDetectGpus(&errorMessage);
+ if (!gpusCanBeDetected)
{
- if (detection_error[0] != '\0')
- {
- sprintf(sbuf, ":\n %s\n", detection_error);
- }
- else
- {
- sprintf(sbuf, ".");
- }
GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
- "NOTE: Error occurred during GPU detection%s"
+ "NOTE: GPUs cannot be detected:\n"
+ " %s\n"
" Can not use GPU acceleration, will fall back to CPU kernels.",
- sbuf);
+ errorMessage.c_str());
}
}
+ if (gpusCanBeDetected)
+ {
+ findGpus(&hwinfo_g->gpu_info);
+ // No need to tell the user anything at this point, they get a
+ // hardware report later.
+ }
+
#if GMX_LIB_MPI
if (!isOpenclPpRank)
{
namespace
{
-//! A basic PME runner
+/*! \brief A basic PME runner
+ *
+ * \todo Consider also using GpuTest class. */
class PmeTest : public MdrunTestFixture
{
public:
void PmeTest::SetUpTestCase()
{
gmx_gpu_info_t gpuInfo {};
- char detection_error[STRLEN];
- GMX_UNUSED_VALUE(detection_error); //TODO
// It would be nicer to do this detection once and have mdrun
// re-use it, but this is OK. Note that this also caters for when
// there is no GPU support in the build.
+ //
+ // TODO report any error messages gracefully.
if (GMX_GPU == GMX_GPU_CUDA &&
- (detect_gpus(&gpuInfo, detection_error) >= 0) &&
- gpuInfo.n_dev_compatible > 0)
+ canDetectGpus(nullptr))
{
- s_hasCompatibleCudaGpus = true;
+ findGpus(&gpuInfo);
+ s_hasCompatibleCudaGpus = (gpuInfo.n_dev_compatible > 0);
}
free_gpu_info(&gpuInfo);
}