namespace
{
+/*! \brief Add the API information on the specific error to the error message.
+ *
+ * \param[in] deviceError The error to assert cudaSuccess on.
+ *
+ * \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess.
+ */
+static inline std::string getDeviceErrorString(const cudaError_t deviceError)
+{
+ return formatString("CUDA error #%d (%s): %s.", deviceError, cudaGetErrorName(deviceError),
+ cudaGetErrorString(deviceError));
+}
+
+/*! \brief Check if API returned an error and throw an exception with information on it.
+ *
+ * \param[in] deviceError The error to assert cudaSuccess on.
+ * \param[in] errorMessage Undecorated error message.
+ *
+ * \throws InternalError if deviceError is not a success.
+ */
+static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
+{
+ if (deviceError != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError(errorMessage + " " + getDeviceErrorString(deviceError)));
+ }
+}
+
/*! \brief Helper function to ensure no pending error silently
* disrupts error handling.
*
* Asserts in a debug build if an unhandled error is present. Issues a
* warning at run time otherwise.
*
- * \todo This is similar to CU_CHECK_PREV_ERR, which should be
- * consolidated.
+ * \param[in] errorMessage Undecorated error message.
*/
-static inline void ensureNoPendingCudaError(const char* errorMessage)
+static inline void ensureNoPendingDeviceError(const std::string& errorMessage)
{
// Ensure there is no pending error that would otherwise affect
// the behaviour of future error handling.
- cudaError_t stat = cudaGetLastError();
- if (stat == cudaSuccess)
+ cudaError_t deviceError = cudaGetLastError();
+ if (deviceError == cudaSuccess)
{
return;
}
// If we would find an error in a release build, we do not know
// what is appropriate to do about it, so assert only for debug
// builds.
- auto fullMessage = formatString(
- "%s An unhandled error from a previous CUDA operation was detected. %s: %s",
- errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat));
- GMX_ASSERT(stat == cudaSuccess, fullMessage.c_str());
+ const std::string fullErrorMessage =
+ errorMessage + " An unhandled error from a previous CUDA operation was detected. "
+ + gmx::getDeviceErrorString(deviceError);
+ GMX_ASSERT(deviceError == cudaSuccess, fullErrorMessage.c_str());
// TODO When we evolve a better logging framework, use that
// for release-build error reporting.
- gmx_warning("%s", fullMessage.c_str());
+ gmx_warning("%s", fullErrorMessage.c_str());
}
} // namespace
#ifdef CHECK_CUDA_ERRORS
/*! Check for CUDA error on the return status of a CUDA RT API call. */
-# define CU_RET_ERR(status, msg) \
- do \
- { \
- if (status != cudaSuccess) \
- { \
- gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \
- } \
- } while (0)
-
-/*! Check for any previously occurred uncaught CUDA error. */
-# define CU_CHECK_PREV_ERR() \
- do \
- { \
- cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError(); \
- if (_CU_CHECK_PREV_ERR_status != cudaSuccess) \
- { \
- gmx_warning( \
- "Just caught a previously occurred CUDA error (%s), will try to " \
- "continue.", \
- cudaGetErrorString(_CU_CHECK_PREV_ERR_status)); \
- } \
+# define CU_RET_ERR(deviceError, msg) \
+ do \
+ { \
+ if (deviceError != cudaSuccess) \
+ { \
+ gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \
+ } \
} while (0)
#else /* CHECK_CUDA_ERRORS */
do \
{ \
} while (0)
-# define CU_CHECK_PREV_ERR() \
- do \
- { \
- } while (0)
#endif /* CHECK_CUDA_ERRORS */
return false;
}
- GMX_ASSERT(stat != cudaErrorInvalidResourceHandle, "Stream identifier not valid");
+ GMX_ASSERT(stat != cudaErrorInvalidResourceHandle,
+ ("Stream identifier not valid. " + gmx::getDeviceErrorString(stat)).c_str());
// cudaSuccess and cudaErrorNotReady are the expected return values
- CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure");
+ CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure. ");
GMX_ASSERT(stat == cudaSuccess,
- "Values other than cudaSuccess should have been explicitly handled");
+ ("Values other than cudaSuccess should have been explicitly handled. "
+ + gmx::getDeviceErrorString(stat))
+ .c_str());
return true;
}
cudaLaunchKernel((void*)kernel, gridSize, blockSize, const_cast<void**>(kernelArgs.data()),
config.sharedMemorySize, deviceStream.stream());
- cudaError_t status = cudaGetLastError();
- if (cudaSuccess != status)
- {
- const std::string errorMessage =
- "GPU kernel (" + std::string(kernelName)
- + ") failed to launch: " + std::string(cudaGetErrorString(status));
- GMX_THROW(gmx::InternalError(errorMessage));
- }
+ gmx::ensureNoPendingDeviceError("GPU kernel (" + std::string(kernelName)
+ + ") failed to launch.");
}
#endif
#include "device_stream.h"
+#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/stringutil.h"
if (priority == DeviceStreamPriority::Normal)
{
stat = cudaStreamCreate(&stream_);
- if (stat != cudaSuccess)
- {
- GMX_THROW(gmx::InternalError(gmx::formatString(
- "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
- }
+ gmx::checkDeviceError(stat, "Could not create CUDA stream.");
}
else if (priority == DeviceStreamPriority::High)
{
// range, which in that case will be a single value.
int highestPriority;
stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
- if (stat != cudaSuccess)
- {
- GMX_THROW(gmx::InternalError(gmx::formatString(
- "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
- cudaGetErrorString(stat))));
- }
+ gmx::checkDeviceError(stat, "Could not query CUDA stream priority range.");
stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
- if (stat != cudaSuccess)
- {
- GMX_THROW(gmx::InternalError(gmx::formatString(
- "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
- cudaGetErrorString(stat))));
- }
+ gmx::checkDeviceError(stat, "Could not create CUDA stream with high priority.");
}
}
{
cudaError_t stat = cudaStreamDestroy(stream_);
GMX_RELEASE_ASSERT(stat == cudaSuccess,
- gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
- stat, cudaGetErrorString(stat))
- .c_str());
+ ("Failed to release CUDA stream. " + gmx::getDeviceErrorString(stat)).c_str());
stream_ = nullptr;
}
}
{
cudaError_t stat = cudaStreamSynchronize(stream_);
GMX_RELEASE_ASSERT(stat == cudaSuccess,
- gmx::formatString("cudaStreamSynchronize failed (CUDA error %d: %s).", stat,
- cudaGetErrorString(stat))
- .c_str());
+ ("cudaStreamSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
*/
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
{
GMX_ASSERT(buffer, "needs a buffer pointer");
cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaMalloc failure");
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
/*! \brief
GMX_ASSERT(buffer, "needs a buffer pointer");
if (*buffer)
{
- GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
+ cudaError_t stat = cudaFree(*buffer);
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Freeing of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
}
"Source host buffer was not pinned for CUDA");
stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
cudaMemcpyHostToDevice, deviceStream.stream());
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Asynchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
break;
case GpuApiCallBehavior::Sync:
stat = cudaMemcpy(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
cudaMemcpyHostToDevice);
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous H2D copy failed");
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
break;
default: throw;
"Destination host buffer was not pinned for CUDA");
stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
cudaMemcpyDeviceToHost, deviceStream.stream());
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed");
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Asynchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
break;
case GpuApiCallBehavior::Sync:
stat = cudaMemcpy(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
cudaMemcpyDeviceToHost);
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous D2H copy failed");
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Synchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
break;
default: throw;
cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes,
deviceStream.stream());
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer");
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ ("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str());
}
/*! \brief Check the validity of the device buffer.
cudaError_t stat =
cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice);
- GMX_RELEASE_ASSERT(
- stat == cudaSuccess,
- gmx::formatString("Synchronous H2D copy failed (CUDA error: %s).", cudaGetErrorName(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
if (!c_disableCudaTextures)
{
memset(&td, 0, sizeof(td));
td.readMode = cudaReadModeElementType;
stat = cudaCreateTextureObject(deviceTexture, &rd, &td, nullptr);
- GMX_RELEASE_ASSERT(stat == cudaSuccess,
- gmx::formatString("cudaCreateTextureObject failed (CUDA error: %s).",
- cudaGetErrorName(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("Binding of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
}
cudaError_t stat = cudaDestroyTextureObject(deviceTexture);
GMX_RELEASE_ASSERT(
stat == cudaSuccess,
- gmx::formatString(
- "cudaDestroyTextureObject on texture object failed (CUDA error: %s).",
- cudaGetErrorName(stat))
- .c_str());
+ ("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
freeDeviceBuffer(deviceBuffer);
}
.asParagraph()
.appendTextFormatted(
"GPU peer access not enabled between GPUs %d and %d due to unexpected "
- "return value from %s: %s",
- gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
+ "return value from %s. %s",
+ gpuA, gpuB, cudaCallName, gmx::getDeviceErrorString(stat).c_str());
}
}
.asParagraph()
.appendTextFormatted(
"GPU peer access not enabled due to unexpected return value from "
- "cudaSetDevice(%d): %s",
- gpuA, cudaGetErrorString(stat));
+ "cudaSetDevice(%d). %s",
+ gpuA, gmx::getDeviceErrorString(stat).c_str());
return;
}
for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
#ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
#define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
+#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/utility/gmxassert.h"
GpuEventSynchronizer()
{
cudaError_t gmx_used_in_debug stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
- GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaEventCreate failed");
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ ("cudaEventCreate failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
~GpuEventSynchronizer()
{
cudaError_t gmx_used_in_debug stat = cudaEventDestroy(event_);
- GMX_ASSERT(stat == cudaSuccess, "cudaEventDestroy failed");
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ ("cudaEventDestroy failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
//! No copying
GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
inline void markEvent(const DeviceStream& deviceStream)
{
cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
- GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
+ GMX_ASSERT(stat == cudaSuccess,
+ ("cudaEventRecord failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
/*! \brief Synchronizes the host thread on the marked event. */
inline void waitForEvent()
{
cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
- GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
+ GMX_ASSERT(stat == cudaSuccess,
+ ("cudaEventSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
/*! \brief Enqueues a wait for the recorded event in stream \p stream */
inline void enqueueWaitEvent(const DeviceStream& deviceStream)
{
cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
- GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed");
+ GMX_ASSERT(stat == cudaSuccess,
+ ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
private:
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
void pinBuffer(void* pointer, std::size_t numBytes) noexcept
{
- const char* errorMessage =
- "Could not register the host memory for page locking for GPU transfers.";
+ const std::string errorMessage =
+ "Could not register the host memory for page locking for GPU transfers. ";
GMX_ASSERT(isAligned(pointer, PageAlignedAllocationPolicy::alignment()),
- formatString("%s Host memory needs to be page aligned.", errorMessage).c_str());
+ (errorMessage + "Host memory needs to be page aligned.").c_str());
numBytes = std::max<size_t>(
1, numBytes); // C++11 3.7.4.1 gurantees that every pointer is different thus at least 1 byte
- ensureNoPendingCudaError(errorMessage);
+ ensureNoPendingDeviceError(errorMessage);
cudaError_t stat = cudaHostRegister(pointer, numBytes, cudaHostRegisterDefault);
// These errors can only arise from a coding error somewhere.
- GMX_RELEASE_ASSERT(
- stat != cudaErrorInvalidValue && stat != cudaErrorNotSupported
- && stat != cudaErrorHostMemoryAlreadyRegistered,
- formatString("%s %s: %s", errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorNotSupported
+ && stat != cudaErrorHostMemoryAlreadyRegistered,
+ (errorMessage + getDeviceErrorString(stat)).c_str());
// We always handle the error, but if it's a type we didn't expect
// (e.g. because CUDA changes the set of errors it returns) then
// we should get a descriptive assertion in Debug mode so we know
// to fix our expectations.
GMX_ASSERT(stat != cudaErrorMemoryAllocation,
- formatString("%s %s: %s which was an unexpected error", errorMessage,
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ (errorMessage + getDeviceErrorString(stat) + " which was an unexpected error").c_str());
// It might be preferable to throw InternalError here, because the
// failing condition can only happen when GROMACS is used with a
// engineer GROMACS to be forward-compatible with future CUDA
// versions, so if this proves to be a problem in practice, then
// GROMACS must be patched, or a supported CUDA version used.
- GMX_RELEASE_ASSERT(stat == cudaSuccess, formatString("%s %s: %s", errorMessage,
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat == cudaSuccess, (errorMessage + getDeviceErrorString(stat)).c_str());
}
void unpinBuffer(void* pointer) noexcept
{
- const char* errorMessage = "Could not unregister pinned host memory used for GPU transfers.";
+ const std::string errorMessage =
+ "Could not unregister pinned host memory used for GPU transfers. ";
- GMX_ASSERT(pointer != nullptr,
- formatString("%s pointer should not be nullptr when pinned.", errorMessage).c_str());
+ GMX_ASSERT(pointer != nullptr, (errorMessage + "Pointer should not be nullptr when pinned.").c_str());
- ensureNoPendingCudaError(errorMessage);
+ ensureNoPendingDeviceError(errorMessage);
cudaError_t stat = cudaHostUnregister(pointer);
// These errors can only arise from a coding error somewhere.
- GMX_RELEASE_ASSERT(
- stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered,
- formatString("%s %s: %s", errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered,
+ (errorMessage + getDeviceErrorString(stat)).c_str());
// If there's an error whose type we didn't expect (e.g. because a
// future CUDA changes the set of errors it returns) then we
// should assert, because our code is wrong.
// unpin() from a destructor, in which case any attempt to throw
// an uncaught exception would anyway terminate the program. A
// release assertion is a better behaviour than that.
- GMX_RELEASE_ASSERT(stat == cudaSuccess,
- formatString("%s %s: %s which was an unexpected error", errorMessage,
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat == cudaSuccess, (errorMessage + getDeviceErrorString(stat)).c_str());
}
} // namespace gmx
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
return;
}
- CU_CHECK_PREV_ERR();
+ gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory.");
stat = cudaMallocHost(h_ptr, nbytes, flag);
sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
return;
}
- CU_CHECK_PREV_ERR();
+ gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory with write-combining.");
stat = cudaMallocHost(h_ptr, nbytes, flag);
sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
return;
}
- CU_CHECK_PREV_ERR();
+ gmx::ensureNoPendingDeviceError("Could not free page-locked memory.");
stat = cudaFreeHost(h_ptr);
CU_RET_ERR(stat, "cudaFreeHost failed");
namespace gmx
{
-namespace
-{
-
-/*! \brief Help give useful diagnostics about error \c status while doing \c message.
- *
- * \throws InternalError If status indicates failure, supplying
- * descriptive text from \c message. */
-static void throwUponFailure(cudaError_t status, const char* message)
-{
- if (status != cudaSuccess)
- {
- GMX_THROW(InternalError(formatString("Failure while %s", message)));
- ;
- }
-}
-
-} // namespace
void doDeviceTransfers(const DeviceInformation& deviceInfo, ArrayRef<const char> input, ArrayRef<char> output)
{
int oldDeviceId;
status = cudaGetDevice(&oldDeviceId);
- throwUponFailure(status, "getting old device id");
+ checkDeviceError(status, "Error while getting old device id.");
status = cudaSetDevice(deviceInfo.id);
- throwUponFailure(status, "setting device id to the first compatible GPU");
+ checkDeviceError(status, "Error while setting device id to the first compatible GPU.");
void* devicePointer;
status = cudaMalloc(&devicePointer, input.size());
- throwUponFailure(status, "creating buffer");
+ checkDeviceError(status, "Error while creating buffer.");
status = cudaMemcpy(devicePointer, input.data(), input.size(), cudaMemcpyHostToDevice);
- throwUponFailure(status, "transferring host to device");
+ checkDeviceError(status, "Error while transferring host to device.");
status = cudaMemcpy(output.data(), devicePointer, output.size(), cudaMemcpyDeviceToHost);
- throwUponFailure(status, "transferring device to host");
+ checkDeviceError(status, "Error while transferring device to host.");
status = cudaFree(devicePointer);
- throwUponFailure(status, "releasing buffer");
+ checkDeviceError(status, "Error while releasing buffer.");
status = cudaSetDevice(oldDeviceId);
- throwUponFailure(status, "setting old device id");
+ checkDeviceError(status, "Error while setting old device id.");
}
} // namespace gmx
cu_err = cudaSetDevice(deviceInfo.id);
if (cu_err != cudaSuccess)
{
- fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, deviceInfo.id,
- cudaGetErrorString(cu_err));
+ fprintf(stderr, "Error while switching to device #%d. %s\n", deviceInfo.id,
+ gmx::getDeviceErrorString(cu_err).c_str());
return DeviceStatus::NonFunctional;
}
stat = cudaDriverGetVersion(&driverVersion);
GMX_ASSERT(stat != cudaErrorInvalidValue,
"An impossible null pointer was passed to cudaDriverGetVersion");
- GMX_RELEASE_ASSERT(
- stat == cudaSuccess,
- gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ ("An unexpected value was returned from cudaDriverGetVersion. "
+ + gmx::getDeviceErrorString(stat))
+ .c_str());
bool foundDriver = (driverVersion > 0);
if (!foundDriver)
{
{
int numDevices;
cudaError_t stat = cudaGetDeviceCount(&numDevices);
- if (stat != cudaSuccess)
- {
- GMX_THROW(gmx::InternalError(
- "Invalid call of findDevices() when CUDA API returned an error, perhaps "
- "canPerformDeviceDetection() was not called appropriately beforehand."));
- }
+ gmx::checkDeviceError(stat,
+ "Invalid call of findDevices() when CUDA API returned an error, perhaps "
+ "canPerformDeviceDetection() was not called appropriately beforehand.");
/* things might go horribly wrong if cudart is not compatible with the driver */
numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
// We expect to start device support/sanity checks with a clean runtime error state
- gmx::ensureNoPendingCudaError("");
+ gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
for (int i = 0; i < numDevices; i++)
//
// Here we also clear the CUDA API error state so potential
// errors during sanity checks don't propagate.
- if ((stat = cudaGetLastError()) != cudaSuccess)
- {
- gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
- deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
- }
+ const std::string errorMessage = gmx::formatString(
+ "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
+ gmx::ensureNoPendingDeviceError(errorMessage);
}
}
stat = cudaPeekAtLastError();
- GMX_RELEASE_ASSERT(stat == cudaSuccess,
- gmx::formatString("We promise to return with clean CUDA state, but "
- "non-success state encountered: %s: %s",
- cudaGetErrorName(stat), cudaGetErrorString(stat))
- .c_str());
+ GMX_RELEASE_ASSERT(
+ stat == cudaSuccess,
+ ("We promise to return with clean CUDA state, but non-success state encountered. "
+ + gmx::getDeviceErrorString(stat))
+ .c_str());
return deviceInfoList;
}
{
if (debug)
{
- fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+ fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
}
stat = cudaDeviceReset();
if (stat != cudaSuccess)
{
- gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
+ gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
}
}
}
const matrix prVelocityScalingMatrix)
{
- ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
+ ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
if (doTemperatureScaling || doParrinelloRahman)
tensor virialScaled,
const PbcAiuc pbcAiuc)
{
- ensureNoPendingCudaError("In CUDA version of LINCS");
+ ensureNoPendingDeviceError("In CUDA version of LINCS");
// Early exit if no constraints
if (kernelParams_.numConstraintsThreads == 0)
const PbcAiuc pbcAiuc)
{
- ensureNoPendingCudaError("In CUDA version SETTLE");
+ ensureNoPendingDeviceError("In CUDA version SETTLE");
// Early exit if no settles
if (numSettles_ == 0)