From: Artem Zhmurov <zhmurov@gmail.com>
Date: Tue, 24 Nov 2020 08:02:48 +0000 (+0000)
Subject: Improve handling of CUDA API errors
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a30cf2a9ff42e025590f2c27b3d57715ae25dc9c;p=alexxy%2Fgromacs.git

Improve handling of CUDA API errors

Consolidate the construction of the printed error message in one
place and make sure that the CUDA API information on error is
printed when the error is detected.
---

diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index d03768a04b..9a69fe9ef0 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -55,21 +55,47 @@ namespace gmx
 namespace
 {
 
+/*! \brief Add the API information on the specific error to the error message.
+ *
+ * \param[in]  deviceError  The error to assert cudaSuccess on.
+ *
+ * \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess.
+ */
+static inline std::string getDeviceErrorString(const cudaError_t deviceError)
+{
+    return formatString("CUDA error #%d (%s): %s.", deviceError, cudaGetErrorName(deviceError),
+                        cudaGetErrorString(deviceError));
+}
+
+/*! \brief Check if API returned an error and throw an exception with information on it.
+ *
+ * \param[in]  deviceError  The error to assert cudaSuccess on.
+ * \param[in]  errorMessage  Undecorated error message.
+ *
+ *  \throws InternalError if deviceError is not a success.
+ */
+static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
+{
+    if (deviceError != cudaSuccess)
+    {
+        GMX_THROW(gmx::InternalError(errorMessage + " " + getDeviceErrorString(deviceError)));
+    }
+}
+
 /*! \brief Helper function to ensure no pending error silently
  * disrupts error handling.
  *
  * Asserts in a debug build if an unhandled error is present. Issues a
  * warning at run time otherwise.
  *
- * \todo This is similar to CU_CHECK_PREV_ERR, which should be
- * consolidated.
+ * \param[in]  errorMessage  Undecorated error message.
  */
-static inline void ensureNoPendingCudaError(const char* errorMessage)
+static inline void ensureNoPendingDeviceError(const std::string& errorMessage)
 {
     // Ensure there is no pending error that would otherwise affect
     // the behaviour of future error handling.
-    cudaError_t stat = cudaGetLastError();
-    if (stat == cudaSuccess)
+    cudaError_t deviceError = cudaGetLastError();
+    if (deviceError == cudaSuccess)
     {
         return;
     }
@@ -77,13 +103,13 @@ static inline void ensureNoPendingCudaError(const char* errorMessage)
     // If we would find an error in a release build, we do not know
     // what is appropriate to do about it, so assert only for debug
     // builds.
-    auto fullMessage = formatString(
-            "%s An unhandled error from a previous CUDA operation was detected. %s: %s",
-            errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat));
-    GMX_ASSERT(stat == cudaSuccess, fullMessage.c_str());
+    const std::string fullErrorMessage =
+            errorMessage + " An unhandled error from a previous CUDA operation was detected. "
+            + gmx::getDeviceErrorString(deviceError);
+    GMX_ASSERT(deviceError == cudaSuccess, fullErrorMessage.c_str());
     // TODO When we evolve a better logging framework, use that
     // for release-build error reporting.
-    gmx_warning("%s", fullMessage.c_str());
+    gmx_warning("%s", fullErrorMessage.c_str());
 }
 
 } // namespace
@@ -107,27 +133,13 @@ enum class GpuApiCallBehavior;
 #ifdef CHECK_CUDA_ERRORS
 
 /*! Check for CUDA error on the return status of a CUDA RT API call. */
-#    define CU_RET_ERR(status, msg)                                            \
-        do                                                                     \
-        {                                                                      \
-            if (status != cudaSuccess)                                         \
-            {                                                                  \
-                gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \
-            }                                                                  \
-        } while (0)
-
-/*! Check for any previously occurred uncaught CUDA error. */
-#    define CU_CHECK_PREV_ERR()                                                           \
-        do                                                                                \
-        {                                                                                 \
-            cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError();                   \
-            if (_CU_CHECK_PREV_ERR_status != cudaSuccess)                                 \
-            {                                                                             \
-                gmx_warning(                                                              \
-                        "Just caught a previously occurred CUDA error (%s), will try to " \
-                        "continue.",                                                      \
-                        cudaGetErrorString(_CU_CHECK_PREV_ERR_status));                   \
-            }                                                                             \
+#    define CU_RET_ERR(deviceError, msg)                                                          \
+        do                                                                                        \
+        {                                                                                         \
+            if (deviceError != cudaSuccess)                                                       \
+            {                                                                                     \
+                gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \
+            }                                                                                     \
         } while (0)
 
 #else /* CHECK_CUDA_ERRORS */
@@ -136,10 +148,6 @@ enum class GpuApiCallBehavior;
         do                          \
         {                           \
         } while (0)
-#    define CU_CHECK_PREV_ERR() \
-        do                      \
-        {                       \
-        } while (0)
 
 #endif /* CHECK_CUDA_ERRORS */
 
@@ -174,13 +182,16 @@ static inline bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
         return false;
     }
 
-    GMX_ASSERT(stat != cudaErrorInvalidResourceHandle, "Stream identifier not valid");
+    GMX_ASSERT(stat != cudaErrorInvalidResourceHandle,
+               ("Stream identifier not valid. " + gmx::getDeviceErrorString(stat)).c_str());
 
     // cudaSuccess and cudaErrorNotReady are the expected return values
-    CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure");
+    CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure. ");
 
     GMX_ASSERT(stat == cudaSuccess,
-               "Values other than cudaSuccess should have been explicitly handled");
+               ("Values other than cudaSuccess should have been explicitly handled. "
+                + gmx::getDeviceErrorString(stat))
+                       .c_str());
 
     return true;
 }
@@ -273,14 +284,8 @@ void launchGpuKernel(void (*kernel)(Args...),
     cudaLaunchKernel((void*)kernel, gridSize, blockSize, const_cast<void**>(kernelArgs.data()),
                      config.sharedMemorySize, deviceStream.stream());
 
-    cudaError_t status = cudaGetLastError();
-    if (cudaSuccess != status)
-    {
-        const std::string errorMessage =
-                "GPU kernel (" + std::string(kernelName)
-                + ") failed to launch: " + std::string(cudaGetErrorString(status));
-        GMX_THROW(gmx::InternalError(errorMessage));
-    }
+    gmx::ensureNoPendingDeviceError("GPU kernel (" + std::string(kernelName)
+                                    + ") failed to launch.");
 }
 
 #endif
diff --git a/src/gromacs/gpu_utils/device_stream.cu b/src/gromacs/gpu_utils/device_stream.cu
index cc1f879862..5a309c1156 100644
--- a/src/gromacs/gpu_utils/device_stream.cu
+++ b/src/gromacs/gpu_utils/device_stream.cu
@@ -44,6 +44,7 @@
 
 #include "device_stream.h"
 
+#include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/stringutil.h"
@@ -57,11 +58,7 @@ DeviceStream::DeviceStream(const DeviceContext& /* deviceContext */,
     if (priority == DeviceStreamPriority::Normal)
     {
         stat = cudaStreamCreate(&stream_);
-        if (stat != cudaSuccess)
-        {
-            GMX_THROW(gmx::InternalError(gmx::formatString(
-                    "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
-        }
+        gmx::checkDeviceError(stat, "Could not create CUDA stream.");
     }
     else if (priority == DeviceStreamPriority::High)
     {
@@ -70,20 +67,10 @@ DeviceStream::DeviceStream(const DeviceContext& /* deviceContext */,
         // range, which in that case will be a single value.
         int highestPriority;
         stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
-        if (stat != cudaSuccess)
-        {
-            GMX_THROW(gmx::InternalError(gmx::formatString(
-                    "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
-                    cudaGetErrorString(stat))));
-        }
+        gmx::checkDeviceError(stat, "Could not query CUDA stream priority range.");
 
         stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
-        if (stat != cudaSuccess)
-        {
-            GMX_THROW(gmx::InternalError(gmx::formatString(
-                    "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
-                    cudaGetErrorString(stat))));
-        }
+        gmx::checkDeviceError(stat, "Could not create CUDA stream with high priority.");
     }
 }
 
@@ -93,9 +80,7 @@ DeviceStream::~DeviceStream()
     {
         cudaError_t stat = cudaStreamDestroy(stream_);
         GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                           gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
-                                             stat, cudaGetErrorString(stat))
-                                   .c_str());
+                           ("Failed to release CUDA stream. " + gmx::getDeviceErrorString(stat)).c_str());
         stream_ = nullptr;
     }
 }
@@ -114,7 +99,5 @@ void DeviceStream::synchronize() const
 {
     cudaError_t stat = cudaStreamSynchronize(stream_);
     GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("cudaStreamSynchronize failed  (CUDA error %d: %s).", stat,
-                                         cudaGetErrorString(stat))
-                               .c_str());
+                       ("cudaStreamSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
 }
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
index 584edc7775..ec544aa3c2 100644
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -46,6 +46,7 @@
  */
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
@@ -68,7 +69,9 @@ void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, con
 {
     GMX_ASSERT(buffer, "needs a buffer pointer");
     cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
-    GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaMalloc failure");
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            ("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
 }
 
 /*! \brief
@@ -85,7 +88,10 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
     GMX_ASSERT(buffer, "needs a buffer pointer");
     if (*buffer)
     {
-        GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
+        cudaError_t stat = cudaFree(*buffer);
+        GMX_RELEASE_ASSERT(
+                stat == cudaSuccess,
+                ("Freeing of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
 }
 
@@ -127,13 +133,17 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                        "Source host buffer was not pinned for CUDA");
             stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
                                    cudaMemcpyHostToDevice, deviceStream.stream());
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Asynchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
             break;
 
         case GpuApiCallBehavior::Sync:
             stat = cudaMemcpy(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
                               cudaMemcpyHostToDevice);
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous H2D copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
             break;
 
         default: throw;
@@ -178,13 +188,17 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
                        "Destination host buffer was not pinned for CUDA");
             stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
                                    cudaMemcpyDeviceToHost, deviceStream.stream());
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Asynchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
             break;
 
         case GpuApiCallBehavior::Sync:
             stat = cudaMemcpy(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
                               cudaMemcpyDeviceToHost);
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous D2H copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Synchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
             break;
 
         default: throw;
@@ -212,7 +226,8 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
 
     cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes,
                                        deviceStream.stream());
-    GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer");
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str());
 }
 
 /*! \brief Check the validity of the device buffer.
@@ -270,10 +285,8 @@ void initParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer,
     cudaError_t stat =
             cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice);
 
-    GMX_RELEASE_ASSERT(
-            stat == cudaSuccess,
-            gmx::formatString("Synchronous H2D copy failed (CUDA error: %s).", cudaGetErrorName(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
 
     if (!c_disableCudaTextures)
     {
@@ -289,10 +302,9 @@ void initParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer,
         memset(&td, 0, sizeof(td));
         td.readMode = cudaReadModeElementType;
         stat        = cudaCreateTextureObject(deviceTexture, &rd, &td, nullptr);
-        GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                           gmx::formatString("cudaCreateTextureObject failed (CUDA error: %s).",
-                                             cudaGetErrorName(stat))
-                                   .c_str());
+        GMX_RELEASE_ASSERT(
+                stat == cudaSuccess,
+                ("Binding of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
 }
 
@@ -311,10 +323,7 @@ void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTextur
         cudaError_t stat = cudaDestroyTextureObject(deviceTexture);
         GMX_RELEASE_ASSERT(
                 stat == cudaSuccess,
-                gmx::formatString(
-                        "cudaDestroyTextureObject on texture object failed (CUDA error: %s).",
-                        cudaGetErrorName(stat))
-                        .c_str());
+                ("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
     freeDeviceBuffer(deviceBuffer);
 }
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index c68a8cda63..8df9282b6b 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -171,8 +171,8 @@ static void peerAccessCheckStat(const cudaError_t    stat,
                 .asParagraph()
                 .appendTextFormatted(
                         "GPU peer access not enabled between GPUs %d and %d due to unexpected "
-                        "return value from %s: %s",
-                        gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
+                        "return value from %s. %s",
+                        gpuA, gpuB, cudaCallName, gmx::getDeviceErrorString(stat).c_str());
     }
 }
 
@@ -199,8 +199,8 @@ void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MD
                     .asParagraph()
                     .appendTextFormatted(
                             "GPU peer access not enabled due to unexpected return value from "
-                            "cudaSetDevice(%d): %s",
-                            gpuA, cudaGetErrorString(stat));
+                            "cudaSetDevice(%d). %s",
+                            gpuA, gmx::getDeviceErrorString(stat).c_str());
             return;
         }
         for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
index 14c3ac93cc..fee73cdb04 100644
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
@@ -41,6 +41,7 @@
 #ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
 #define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
 
+#include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/utility/gmxassert.h"
@@ -66,12 +67,14 @@ public:
     GpuEventSynchronizer()
     {
         cudaError_t gmx_used_in_debug stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
-        GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaEventCreate failed");
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           ("cudaEventCreate failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
     ~GpuEventSynchronizer()
     {
         cudaError_t gmx_used_in_debug stat = cudaEventDestroy(event_);
-        GMX_ASSERT(stat == cudaSuccess, "cudaEventDestroy failed");
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           ("cudaEventDestroy failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
     //! No copying
     GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
@@ -86,19 +89,22 @@ public:
     inline void markEvent(const DeviceStream& deviceStream)
     {
         cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
-        GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
+        GMX_ASSERT(stat == cudaSuccess,
+                   ("cudaEventRecord failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
     /*! \brief Synchronizes the host thread on the marked event. */
     inline void waitForEvent()
     {
         cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
-        GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
+        GMX_ASSERT(stat == cudaSuccess,
+                   ("cudaEventSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
     /*! \brief Enqueues a wait for the recorded event in stream \p stream */
     inline void enqueueWaitEvent(const DeviceStream& deviceStream)
     {
         cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
-        GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed");
+        GMX_ASSERT(stat == cudaSuccess,
+                   ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
     }
 
 private:
diff --git a/src/gromacs/gpu_utils/pinning.cu b/src/gromacs/gpu_utils/pinning.cu
index 8bed9219b3..adb401165a 100644
--- a/src/gromacs/gpu_utils/pinning.cu
+++ b/src/gromacs/gpu_utils/pinning.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -67,33 +67,29 @@ gmx_unused static inline bool isAligned(const void* ptr, size_t bytes)
 
 void pinBuffer(void* pointer, std::size_t numBytes) noexcept
 {
-    const char* errorMessage =
-            "Could not register the host memory for page locking for GPU transfers.";
+    const std::string errorMessage =
+            "Could not register the host memory for page locking for GPU transfers. ";
 
     GMX_ASSERT(isAligned(pointer, PageAlignedAllocationPolicy::alignment()),
-               formatString("%s Host memory needs to be page aligned.", errorMessage).c_str());
+               (errorMessage + "Host memory needs to be page aligned.").c_str());
 
     numBytes = std::max<size_t>(
             1, numBytes); // C++11 3.7.4.1 gurantees that every pointer is different thus at least 1 byte
 
-    ensureNoPendingCudaError(errorMessage);
+    ensureNoPendingDeviceError(errorMessage);
     cudaError_t stat = cudaHostRegister(pointer, numBytes, cudaHostRegisterDefault);
 
     // These errors can only arise from a coding error somewhere.
-    GMX_RELEASE_ASSERT(
-            stat != cudaErrorInvalidValue && stat != cudaErrorNotSupported
-                    && stat != cudaErrorHostMemoryAlreadyRegistered,
-            formatString("%s %s: %s", errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorNotSupported
+                               && stat != cudaErrorHostMemoryAlreadyRegistered,
+                       (errorMessage + getDeviceErrorString(stat)).c_str());
 
     // We always handle the error, but if it's a type we didn't expect
     // (e.g. because CUDA changes the set of errors it returns) then
     // we should get a descriptive assertion in Debug mode so we know
     // to fix our expectations.
     GMX_ASSERT(stat != cudaErrorMemoryAllocation,
-               formatString("%s %s: %s which was an unexpected error", errorMessage,
-                            cudaGetErrorName(stat), cudaGetErrorString(stat))
-                       .c_str());
+               (errorMessage + getDeviceErrorString(stat) + " which was an unexpected error").c_str());
 
     // It might be preferable to throw InternalError here, because the
     // failing condition can only happen when GROMACS is used with a
@@ -101,25 +97,21 @@ void pinBuffer(void* pointer, std::size_t numBytes) noexcept
     // engineer GROMACS to be forward-compatible with future CUDA
     // versions, so if this proves to be a problem in practice, then
     // GROMACS must be patched, or a supported CUDA version used.
-    GMX_RELEASE_ASSERT(stat == cudaSuccess, formatString("%s %s: %s", errorMessage,
-                                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
-                                                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess, (errorMessage + getDeviceErrorString(stat)).c_str());
 }
 
 void unpinBuffer(void* pointer) noexcept
 {
-    const char* errorMessage = "Could not unregister pinned host memory used for GPU transfers.";
+    const std::string errorMessage =
+            "Could not unregister pinned host memory used for GPU transfers. ";
 
-    GMX_ASSERT(pointer != nullptr,
-               formatString("%s pointer should not be nullptr when pinned.", errorMessage).c_str());
+    GMX_ASSERT(pointer != nullptr, (errorMessage + "Pointer should not be nullptr when pinned.").c_str());
 
-    ensureNoPendingCudaError(errorMessage);
+    ensureNoPendingDeviceError(errorMessage);
     cudaError_t stat = cudaHostUnregister(pointer);
     // These errors can only arise from a coding error somewhere.
-    GMX_RELEASE_ASSERT(
-            stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered,
-            formatString("%s %s: %s", errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered,
+                       (errorMessage + getDeviceErrorString(stat)).c_str());
     // If there's an error whose type we didn't expect (e.g. because a
     // future CUDA changes the set of errors it returns) then we
     // should assert, because our code is wrong.
@@ -128,10 +120,7 @@ void unpinBuffer(void* pointer) noexcept
     // unpin() from a destructor, in which case any attempt to throw
     // an uncaught exception would anyway terminate the program. A
     // release assertion is a better behaviour than that.
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       formatString("%s %s: %s which was an unexpected error", errorMessage,
-                                    cudaGetErrorName(stat), cudaGetErrorString(stat))
-                               .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess, (errorMessage + getDeviceErrorString(stat)).c_str());
 }
 
 } // namespace gmx
diff --git a/src/gromacs/gpu_utils/pmalloc_cuda.cu b/src/gromacs/gpu_utils/pmalloc_cuda.cu
index 0a88abb268..2d5e122052 100644
--- a/src/gromacs/gpu_utils/pmalloc_cuda.cu
+++ b/src/gromacs/gpu_utils/pmalloc_cuda.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -63,7 +63,7 @@ void pmalloc(void** h_ptr, size_t nbytes)
         return;
     }
 
-    CU_CHECK_PREV_ERR();
+    gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory.");
 
     stat = cudaMallocHost(h_ptr, nbytes, flag);
     sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
@@ -86,7 +86,7 @@ void pmalloc_wc(void** h_ptr, size_t nbytes)
         return;
     }
 
-    CU_CHECK_PREV_ERR();
+    gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory with write-combining.");
 
     stat = cudaMallocHost(h_ptr, nbytes, flag);
     sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
@@ -106,7 +106,7 @@ void pfree(void* h_ptr)
         return;
     }
 
-    CU_CHECK_PREV_ERR();
+    gmx::ensureNoPendingDeviceError("Could not free page-locked memory.");
 
     stat = cudaFreeHost(h_ptr);
     CU_RET_ERR(stat, "cudaFreeHost failed");
diff --git a/src/gromacs/gpu_utils/tests/devicetransfers.cu b/src/gromacs/gpu_utils/tests/devicetransfers.cu
index 4e7e14779d..63b1f77a46 100644
--- a/src/gromacs/gpu_utils/tests/devicetransfers.cu
+++ b/src/gromacs/gpu_utils/tests/devicetransfers.cu
@@ -57,23 +57,6 @@
 
 namespace gmx
 {
-namespace
-{
-
-/*! \brief Help give useful diagnostics about error \c status while doing \c message.
- *
- * \throws InternalError  If status indicates failure, supplying
- *                        descriptive text from \c message. */
-static void throwUponFailure(cudaError_t status, const char* message)
-{
-    if (status != cudaSuccess)
-    {
-        GMX_THROW(InternalError(formatString("Failure while %s", message)));
-        ;
-    }
-}
-
-} // namespace
 
 void doDeviceTransfers(const DeviceInformation& deviceInfo, ArrayRef<const char> input, ArrayRef<char> output)
 {
@@ -83,24 +66,24 @@ void doDeviceTransfers(const DeviceInformation& deviceInfo, ArrayRef<const char>
     int oldDeviceId;
 
     status = cudaGetDevice(&oldDeviceId);
-    throwUponFailure(status, "getting old device id");
+    checkDeviceError(status, "Error while getting old device id.");
     status = cudaSetDevice(deviceInfo.id);
-    throwUponFailure(status, "setting device id to the first compatible GPU");
+    checkDeviceError(status, "Error while setting device id to the first compatible GPU.");
 
     void* devicePointer;
     status = cudaMalloc(&devicePointer, input.size());
-    throwUponFailure(status, "creating buffer");
+    checkDeviceError(status, "Error while creating buffer.");
 
     status = cudaMemcpy(devicePointer, input.data(), input.size(), cudaMemcpyHostToDevice);
-    throwUponFailure(status, "transferring host to device");
+    checkDeviceError(status, "Error while transferring host to device.");
     status = cudaMemcpy(output.data(), devicePointer, output.size(), cudaMemcpyDeviceToHost);
-    throwUponFailure(status, "transferring device to host");
+    checkDeviceError(status, "Error while transferring device to host.");
 
     status = cudaFree(devicePointer);
-    throwUponFailure(status, "releasing buffer");
+    checkDeviceError(status, "Error while releasing buffer.");
 
     status = cudaSetDevice(oldDeviceId);
-    throwUponFailure(status, "setting old device id");
+    checkDeviceError(status, "Error while setting old device id.");
 }
 
 } // namespace gmx
diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu
index a09d6bf99a..0e77621a5b 100644
--- a/src/gromacs/hardware/device_management.cu
+++ b/src/gromacs/hardware/device_management.cu
@@ -127,8 +127,8 @@ static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
     cu_err = cudaSetDevice(deviceInfo.id);
     if (cu_err != cudaSuccess)
     {
-        fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, deviceInfo.id,
-                cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error while switching to device #%d. %s\n", deviceInfo.id,
+                gmx::getDeviceErrorString(cu_err).c_str());
         return DeviceStatus::NonFunctional;
     }
 
@@ -216,11 +216,10 @@ bool isDeviceDetectionFunctional(std::string* errorMessage)
     stat                      = cudaDriverGetVersion(&driverVersion);
     GMX_ASSERT(stat != cudaErrorInvalidValue,
                "An impossible null pointer was passed to cudaDriverGetVersion");
-    GMX_RELEASE_ASSERT(
-            stat == cudaSuccess,
-            gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
-                              cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("An unexpected value was returned from cudaDriverGetVersion. "
+                        + gmx::getDeviceErrorString(stat))
+                               .c_str());
     bool foundDriver = (driverVersion > 0);
     if (!foundDriver)
     {
@@ -268,18 +267,15 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
 {
     int         numDevices;
     cudaError_t stat = cudaGetDeviceCount(&numDevices);
-    if (stat != cudaSuccess)
-    {
-        GMX_THROW(gmx::InternalError(
-                "Invalid call of findDevices() when CUDA API returned an error, perhaps "
-                "canPerformDeviceDetection() was not called appropriately beforehand."));
-    }
+    gmx::checkDeviceError(stat,
+                          "Invalid call of findDevices() when CUDA API returned an error, perhaps "
+                          "canPerformDeviceDetection() was not called appropriately beforehand.");
 
     /* things might go horribly wrong if cudart is not compatible with the driver */
     numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
 
     // We expect to start device support/sanity checks with a clean runtime error state
-    gmx::ensureNoPendingCudaError("");
+    gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
 
     std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
     for (int i = 0; i < numDevices; i++)
@@ -311,20 +307,18 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
             //
             // Here we also clear the CUDA API error state so potential
             // errors during sanity checks don't propagate.
-            if ((stat = cudaGetLastError()) != cudaSuccess)
-            {
-                gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
-                            deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
-            }
+            const std::string errorMessage = gmx::formatString(
+                    "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
+            gmx::ensureNoPendingDeviceError(errorMessage);
         }
     }
 
     stat = cudaPeekAtLastError();
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("We promise to return with clean CUDA state, but "
-                                         "non-success state encountered: %s: %s",
-                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
-                               .c_str());
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            ("We promise to return with clean CUDA state, but non-success state encountered. "
+             + gmx::getDeviceErrorString(stat))
+                    .c_str());
 
     return deviceInfoList;
 }
@@ -360,13 +354,13 @@ void releaseDevice(DeviceInformation* deviceInfo)
         {
             if (debug)
             {
-                fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+                fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
             }
 
             stat = cudaDeviceReset();
             if (stat != cudaSuccess)
             {
-                gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
+                gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
             }
         }
     }
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu
index 1793a386b2..a5ac290f6f 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu.cu
@@ -249,7 +249,7 @@ void LeapFrogGpu::integrate(const DeviceBuffer<float3>        d_x,
                             const matrix                      prVelocityScalingMatrix)
 {
 
-    ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
+    ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
 
     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
     if (doTemperatureScaling || doParrinelloRahman)
diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu
index f3b832f18b..edffc43821 100644
--- a/src/gromacs/mdlib/lincs_gpu.cu
+++ b/src/gromacs/mdlib/lincs_gpu.cu
@@ -435,7 +435,7 @@ void LincsGpu::apply(const float3* d_x,
                      tensor        virialScaled,
                      const PbcAiuc pbcAiuc)
 {
-    ensureNoPendingCudaError("In CUDA version of LINCS");
+    ensureNoPendingDeviceError("In CUDA version of LINCS");
 
     // Early exit if no constraints
     if (kernelParams_.numConstraintsThreads == 0)
diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu
index 8b95a9fec5..c02f0d5884 100644
--- a/src/gromacs/mdlib/settle_gpu.cu
+++ b/src/gromacs/mdlib/settle_gpu.cu
@@ -405,7 +405,7 @@ void SettleGpu::apply(const float3* d_x,
                       const PbcAiuc pbcAiuc)
 {
 
-    ensureNoPendingCudaError("In CUDA version SETTLE");
+    ensureNoPendingDeviceError("In CUDA version SETTLE");
 
     // Early exit if no settles
     if (numSettles_ == 0)