Improve handling of CUDA API errors

author Artem Zhmurov <zhmurov@gmail.com>

Tue, 24 Nov 2020 08:02:48 +0000 (08:02 +0000)

committer Paul Bauer <paul.bauer.q@gmail.com>

Tue, 24 Nov 2020 08:02:48 +0000 (08:02 +0000)
author Artem Zhmurov <zhmurov@gmail.com>
Tue, 24 Nov 2020 08:02:48 +0000 (08:02 +0000)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 24 Nov 2020 08:02:48 +0000 (08:02 +0000)
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh

index d03768a04b949011e523ca0067f37ddcf68c1f70..9a69fe9ef0b5b6da6fb8e8ddfb36b9600835fd7c 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -55,21 +55,47 @@ namespace gmx
  namespace
  {
  
+/*! \brief Add the API information on the specific error to the error message.
+ *
+ * \param[in]  deviceError  The error to assert cudaSuccess on.
+ *
+ * \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess.
+ */
+static inline std::string getDeviceErrorString(const cudaError_t deviceError)
+{
+    return formatString("CUDA error #%d (%s): %s.", deviceError, cudaGetErrorName(deviceError),
+                        cudaGetErrorString(deviceError));
+}
+
+/*! \brief Check if API returned an error and throw an exception with information on it.
+ *
+ * \param[in]  deviceError  The error to assert cudaSuccess on.
+ * \param[in]  errorMessage  Undecorated error message.
+ *
+ *  \throws InternalError if deviceError is not a success.
+ */
+static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage)
+{
+    if (deviceError != cudaSuccess)
+    {
+        GMX_THROW(gmx::InternalError(errorMessage + " " + getDeviceErrorString(deviceError)));
+    }
+}
+
  /*! \brief Helper function to ensure no pending error silently
   * disrupts error handling.
   *
   * Asserts in a debug build if an unhandled error is present. Issues a
   * warning at run time otherwise.
   *
- * \todo This is similar to CU_CHECK_PREV_ERR, which should be
- * consolidated.
+ * \param[in]  errorMessage  Undecorated error message.
   */
-static inline void ensureNoPendingCudaError(const char* errorMessage)
+static inline void ensureNoPendingDeviceError(const std::string& errorMessage)
  {
      // Ensure there is no pending error that would otherwise affect
      // the behaviour of future error handling.
-    cudaError_t stat = cudaGetLastError();
-    if (stat == cudaSuccess)
+    cudaError_t deviceError = cudaGetLastError();
+    if (deviceError == cudaSuccess)
      {
          return;
      }
@@ -77,13 +103,13 @@ static inline void ensureNoPendingCudaError(const char* errorMessage)
      // If we would find an error in a release build, we do not know
      // what is appropriate to do about it, so assert only for debug
      // builds.
-    auto fullMessage = formatString(
-            "%s An unhandled error from a previous CUDA operation was detected. %s: %s",
-            errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat));
-    GMX_ASSERT(stat == cudaSuccess, fullMessage.c_str());
+    const std::string fullErrorMessage =
+            errorMessage + " An unhandled error from a previous CUDA operation was detected. "
+            + gmx::getDeviceErrorString(deviceError);
+    GMX_ASSERT(deviceError == cudaSuccess, fullErrorMessage.c_str());
      // TODO When we evolve a better logging framework, use that
      // for release-build error reporting.
-    gmx_warning("%s", fullMessage.c_str());
+    gmx_warning("%s", fullErrorMessage.c_str());
  }
  
  } // namespace
@@ -107,27 +133,13 @@ enum class GpuApiCallBehavior;
  #ifdef CHECK_CUDA_ERRORS
  
  /*! Check for CUDA error on the return status of a CUDA RT API call. */
-#    define CU_RET_ERR(status, msg)                                            \
-        do                                                                     \
-        {                                                                      \
-            if (status != cudaSuccess)                                         \
-            {                                                                  \
-                gmx_fatal(FARGS, "%s: %s\n", msg, cudaGetErrorString(status)); \
-            }                                                                  \
-        } while (0)
-
-/*! Check for any previously occurred uncaught CUDA error. */
-#    define CU_CHECK_PREV_ERR()                                                           \
-        do                                                                                \
-        {                                                                                 \
-            cudaError_t _CU_CHECK_PREV_ERR_status = cudaGetLastError();                   \
-            if (_CU_CHECK_PREV_ERR_status != cudaSuccess)                                 \
-            {                                                                             \
-                gmx_warning(                                                              \
-                        "Just caught a previously occurred CUDA error (%s), will try to " \
-                        "continue.",                                                      \
-                        cudaGetErrorString(_CU_CHECK_PREV_ERR_status));                   \
-            }                                                                             \
+#    define CU_RET_ERR(deviceError, msg)                                                          \
+        do                                                                                        \
+        {                                                                                         \
+            if (deviceError != cudaSuccess)                                                       \
+            {                                                                                     \
+                gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \
+            }                                                                                     \
          } while (0)
  
  #else /* CHECK_CUDA_ERRORS */
@@ -136,10 +148,6 @@ enum class GpuApiCallBehavior;
          do                          \
          {                           \
          } while (0)
-#    define CU_CHECK_PREV_ERR() \
-        do                      \
-        {                       \
-        } while (0)
  
  #endif /* CHECK_CUDA_ERRORS */
  
@@ -174,13 +182,16 @@ static inline bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
          return false;
      }
  
-    GMX_ASSERT(stat != cudaErrorInvalidResourceHandle, "Stream identifier not valid");
+    GMX_ASSERT(stat != cudaErrorInvalidResourceHandle,
+               ("Stream identifier not valid. " + gmx::getDeviceErrorString(stat)).c_str());
  
      // cudaSuccess and cudaErrorNotReady are the expected return values
-    CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure");
+    CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure. ");
  
      GMX_ASSERT(stat == cudaSuccess,
-               "Values other than cudaSuccess should have been explicitly handled");
+               ("Values other than cudaSuccess should have been explicitly handled. "
+                + gmx::getDeviceErrorString(stat))
+                       .c_str());
  
      return true;
  }
@@ -273,14 +284,8 @@ void launchGpuKernel(void (*kernel)(Args...),
      cudaLaunchKernel((void*)kernel, gridSize, blockSize, const_cast<void**>(kernelArgs.data()),
                       config.sharedMemorySize, deviceStream.stream());
  
-    cudaError_t status = cudaGetLastError();
-    if (cudaSuccess != status)
-    {
-        const std::string errorMessage =
-                "GPU kernel (" + std::string(kernelName)
-                + ") failed to launch: " + std::string(cudaGetErrorString(status));
-        GMX_THROW(gmx::InternalError(errorMessage));
-    }
+    gmx::ensureNoPendingDeviceError("GPU kernel (" + std::string(kernelName)
+                                    + ") failed to launch.");
  }
  
  #endif
diff --git a/src/gromacs/gpu_utils/device_stream.cu b/src/gromacs/gpu_utils/device_stream.cu

index cc1f8798622bc30a284cdef90f3967cef8eae88e..5a309c1156256563a512eb8d501759cc66a2dd00 100644 (file)
--- a/src/gromacs/gpu_utils/device_stream.cu
+++ b/src/gromacs/gpu_utils/device_stream.cu
@@ -44,6 +44,7 @@
  
  #include "device_stream.h"
  
+#include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/stringutil.h"
@@ -57,11 +58,7 @@ DeviceStream::DeviceStream(const DeviceContext& /* deviceContext */,
      if (priority == DeviceStreamPriority::Normal)
      {
          stat = cudaStreamCreate(&stream_);
-        if (stat != cudaSuccess)
-        {
-            GMX_THROW(gmx::InternalError(gmx::formatString(
-                    "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
-        }
+        gmx::checkDeviceError(stat, "Could not create CUDA stream.");
      }
      else if (priority == DeviceStreamPriority::High)
      {
@@ -70,20 +67,10 @@ DeviceStream::DeviceStream(const DeviceContext& /* deviceContext */,
          // range, which in that case will be a single value.
          int highestPriority;
          stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
-        if (stat != cudaSuccess)
-        {
-            GMX_THROW(gmx::InternalError(gmx::formatString(
-                    "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
-                    cudaGetErrorString(stat))));
-        }
+        gmx::checkDeviceError(stat, "Could not query CUDA stream priority range.");
  
          stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
-        if (stat != cudaSuccess)
-        {
-            GMX_THROW(gmx::InternalError(gmx::formatString(
-                    "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
-                    cudaGetErrorString(stat))));
-        }
+        gmx::checkDeviceError(stat, "Could not create CUDA stream with high priority.");
      }
  }
  
@@ -93,9 +80,7 @@ DeviceStream::~DeviceStream()
      {
          cudaError_t stat = cudaStreamDestroy(stream_);
          GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                           gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
-                                             stat, cudaGetErrorString(stat))
-                                   .c_str());
+                           ("Failed to release CUDA stream. " + gmx::getDeviceErrorString(stat)).c_str());
          stream_ = nullptr;
      }
  }
@@ -114,7 +99,5 @@ void DeviceStream::synchronize() const
  {
      cudaError_t stat = cudaStreamSynchronize(stream_);
      GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("cudaStreamSynchronize failed  (CUDA error %d: %s).", stat,
-                                         cudaGetErrorString(stat))
-                               .c_str());
+                       ("cudaStreamSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
  }
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh

index 584edc7775f1def740132f9427fcbde10385745e..ec544aa3c2a58cdc693a9d117f7d79dd9f900957 100644 (file)
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -46,6 +46,7 @@
   */
  
  #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/device_context.h"
  #include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
@@ -68,7 +69,9 @@ void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, con
  {
      GMX_ASSERT(buffer, "needs a buffer pointer");
      cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
-    GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaMalloc failure");
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            ("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
  }
  
  /*! \brief
@@ -85,7 +88,10 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
      GMX_ASSERT(buffer, "needs a buffer pointer");
      if (*buffer)
      {
-        GMX_RELEASE_ASSERT(cudaFree(*buffer) == cudaSuccess, "cudaFree failed");
+        cudaError_t stat = cudaFree(*buffer);
+        GMX_RELEASE_ASSERT(
+                stat == cudaSuccess,
+                ("Freeing of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
  }
  
@@ -127,13 +133,17 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                         "Source host buffer was not pinned for CUDA");
              stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
                                     cudaMemcpyHostToDevice, deviceStream.stream());
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Asynchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
              break;
  
          case GpuApiCallBehavior::Sync:
              stat = cudaMemcpy(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
                                cudaMemcpyHostToDevice);
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous H2D copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
              break;
  
          default: throw;
@@ -178,13 +188,17 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
                         "Destination host buffer was not pinned for CUDA");
              stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
                                     cudaMemcpyDeviceToHost, deviceStream.stream());
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Asynchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
              break;
  
          case GpuApiCallBehavior::Sync:
              stat = cudaMemcpy(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
                                cudaMemcpyDeviceToHost);
-            GMX_RELEASE_ASSERT(stat == cudaSuccess, "Synchronous D2H copy failed");
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Synchronous D2H copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
              break;
  
          default: throw;
@@ -212,7 +226,8 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
  
      cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes,
                                         deviceStream.stream());
-    GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer");
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str());
  }
  
  /*! \brief Check the validity of the device buffer.
@@ -270,10 +285,8 @@ void initParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer,
      cudaError_t stat =
              cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice);
  
-    GMX_RELEASE_ASSERT(
-            stat == cudaSuccess,
-            gmx::formatString("Synchronous H2D copy failed (CUDA error: %s).", cudaGetErrorName(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
  
      if (!c_disableCudaTextures)
      {
@@ -289,10 +302,9 @@ void initParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer,
          memset(&td, 0, sizeof(td));
          td.readMode = cudaReadModeElementType;
          stat        = cudaCreateTextureObject(deviceTexture, &rd, &td, nullptr);
-        GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                           gmx::formatString("cudaCreateTextureObject failed (CUDA error: %s).",
-                                             cudaGetErrorName(stat))
-                                   .c_str());
+        GMX_RELEASE_ASSERT(
+                stat == cudaSuccess,
+                ("Binding of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
  }
  
@@ -311,10 +323,7 @@ void destroyParamLookupTable(DeviceBuffer<ValueType>* deviceBuffer, DeviceTextur
          cudaError_t stat = cudaDestroyTextureObject(deviceTexture);
          GMX_RELEASE_ASSERT(
                  stat == cudaSuccess,
-                gmx::formatString(
-                        "cudaDestroyTextureObject on texture object failed (CUDA error: %s).",
-                        cudaGetErrorName(stat))
-                        .c_str());
+                ("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
      freeDeviceBuffer(deviceBuffer);
  }
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index c68a8cda63b216212c4c33ec854566536967c688..8df9282b6bc708d5bc77ffb37ce876d8644d2e72 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -171,8 +171,8 @@ static void peerAccessCheckStat(const cudaError_t    stat,
                  .asParagraph()
                  .appendTextFormatted(
                          "GPU peer access not enabled between GPUs %d and %d due to unexpected "
-                        "return value from %s: %s",
-                        gpuA, gpuB, cudaCallName, cudaGetErrorString(stat));
+                        "return value from %s. %s",
+                        gpuA, gpuB, cudaCallName, gmx::getDeviceErrorString(stat).c_str());
      }
  }
  
@@ -199,8 +199,8 @@ void setupGpuDevicePeerAccess(const std::vector<int>& gpuIdsToUse, const gmx::MD
                      .asParagraph()
                      .appendTextFormatted(
                              "GPU peer access not enabled due to unexpected return value from "
-                            "cudaSetDevice(%d): %s",
-                            gpuA, cudaGetErrorString(stat));
+                            "cudaSetDevice(%d). %s",
+                            gpuA, gmx::getDeviceErrorString(stat).c_str());
              return;
          }
          for (unsigned int j = 0; j < gpuIdsToUse.size(); j++)
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh

index 14c3ac93cc2a9bc2c9f44333b141090eb2f6f401..fee73cdb04b9c77dc503b97cac777afdeef4d822 100644 (file)
--- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh
@@ -41,6 +41,7 @@
  #ifndef GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
  #define GMX_GPU_UTILS_GPUEVENTSYNCHRONIZER_CUH
  
+#include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/gputraits.cuh"
  #include "gromacs/utility/gmxassert.h"
@@ -66,12 +67,14 @@ public:
      GpuEventSynchronizer()
      {
          cudaError_t gmx_used_in_debug stat = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
-        GMX_RELEASE_ASSERT(stat == cudaSuccess, "cudaEventCreate failed");
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           ("cudaEventCreate failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
      ~GpuEventSynchronizer()
      {
          cudaError_t gmx_used_in_debug stat = cudaEventDestroy(event_);
-        GMX_ASSERT(stat == cudaSuccess, "cudaEventDestroy failed");
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           ("cudaEventDestroy failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
      //! No copying
      GpuEventSynchronizer(const GpuEventSynchronizer&) = delete;
@@ -86,19 +89,22 @@ public:
      inline void markEvent(const DeviceStream& deviceStream)
      {
          cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
-        GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
+        GMX_ASSERT(stat == cudaSuccess,
+                   ("cudaEventRecord failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
      /*! \brief Synchronizes the host thread on the marked event. */
      inline void waitForEvent()
      {
          cudaError_t gmx_used_in_debug stat = cudaEventSynchronize(event_);
-        GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
+        GMX_ASSERT(stat == cudaSuccess,
+                   ("cudaEventSynchronize failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
      /*! \brief Enqueues a wait for the recorded event in stream \p stream */
      inline void enqueueWaitEvent(const DeviceStream& deviceStream)
      {
          cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
-        GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed");
+        GMX_ASSERT(stat == cudaSuccess,
+                   ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
      }
  
  private:
diff --git a/src/gromacs/gpu_utils/pinning.cu b/src/gromacs/gpu_utils/pinning.cu

index 8bed9219b34ac3b3960b8520ceeabb23bfc2fad6..adb401165a495bcb5d71c2fdfe0a057b59fd3535 100644 (file)
--- a/src/gromacs/gpu_utils/pinning.cu
+++ b/src/gromacs/gpu_utils/pinning.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -67,33 +67,29 @@ gmx_unused static inline bool isAligned(const void* ptr, size_t bytes)
  
  void pinBuffer(void* pointer, std::size_t numBytes) noexcept
  {
-    const char* errorMessage =
-            "Could not register the host memory for page locking for GPU transfers.";
+    const std::string errorMessage =
+            "Could not register the host memory for page locking for GPU transfers. ";
  
      GMX_ASSERT(isAligned(pointer, PageAlignedAllocationPolicy::alignment()),
-               formatString("%s Host memory needs to be page aligned.", errorMessage).c_str());
+               (errorMessage + "Host memory needs to be page aligned.").c_str());
  
      numBytes = std::max<size_t>(
              1, numBytes); // C++11 3.7.4.1 gurantees that every pointer is different thus at least 1 byte
  
-    ensureNoPendingCudaError(errorMessage);
+    ensureNoPendingDeviceError(errorMessage);
      cudaError_t stat = cudaHostRegister(pointer, numBytes, cudaHostRegisterDefault);
  
      // These errors can only arise from a coding error somewhere.
-    GMX_RELEASE_ASSERT(
-            stat != cudaErrorInvalidValue && stat != cudaErrorNotSupported
-                    && stat != cudaErrorHostMemoryAlreadyRegistered,
-            formatString("%s %s: %s", errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorNotSupported
+                               && stat != cudaErrorHostMemoryAlreadyRegistered,
+                       (errorMessage + getDeviceErrorString(stat)).c_str());
  
      // We always handle the error, but if it's a type we didn't expect
      // (e.g. because CUDA changes the set of errors it returns) then
      // we should get a descriptive assertion in Debug mode so we know
      // to fix our expectations.
      GMX_ASSERT(stat != cudaErrorMemoryAllocation,
-               formatString("%s %s: %s which was an unexpected error", errorMessage,
-                            cudaGetErrorName(stat), cudaGetErrorString(stat))
-                       .c_str());
+               (errorMessage + getDeviceErrorString(stat) + " which was an unexpected error").c_str());
  
      // It might be preferable to throw InternalError here, because the
      // failing condition can only happen when GROMACS is used with a
@@ -101,25 +97,21 @@ void pinBuffer(void* pointer, std::size_t numBytes) noexcept
      // engineer GROMACS to be forward-compatible with future CUDA
      // versions, so if this proves to be a problem in practice, then
      // GROMACS must be patched, or a supported CUDA version used.
-    GMX_RELEASE_ASSERT(stat == cudaSuccess, formatString("%s %s: %s", errorMessage,
-                                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
-                                                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess, (errorMessage + getDeviceErrorString(stat)).c_str());
  }
  
  void unpinBuffer(void* pointer) noexcept
  {
-    const char* errorMessage = "Could not unregister pinned host memory used for GPU transfers.";
+    const std::string errorMessage =
+            "Could not unregister pinned host memory used for GPU transfers. ";
  
-    GMX_ASSERT(pointer != nullptr,
-               formatString("%s pointer should not be nullptr when pinned.", errorMessage).c_str());
+    GMX_ASSERT(pointer != nullptr, (errorMessage + "Pointer should not be nullptr when pinned.").c_str());
  
-    ensureNoPendingCudaError(errorMessage);
+    ensureNoPendingDeviceError(errorMessage);
      cudaError_t stat = cudaHostUnregister(pointer);
      // These errors can only arise from a coding error somewhere.
-    GMX_RELEASE_ASSERT(
-            stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered,
-            formatString("%s %s: %s", errorMessage, cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat != cudaErrorInvalidValue && stat != cudaErrorHostMemoryNotRegistered,
+                       (errorMessage + getDeviceErrorString(stat)).c_str());
      // If there's an error whose type we didn't expect (e.g. because a
      // future CUDA changes the set of errors it returns) then we
      // should assert, because our code is wrong.
@@ -128,10 +120,7 @@ void unpinBuffer(void* pointer) noexcept
      // unpin() from a destructor, in which case any attempt to throw
      // an uncaught exception would anyway terminate the program. A
      // release assertion is a better behaviour than that.
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       formatString("%s %s: %s which was an unexpected error", errorMessage,
-                                    cudaGetErrorName(stat), cudaGetErrorString(stat))
-                               .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess, (errorMessage + getDeviceErrorString(stat)).c_str());
  }
  
  } // namespace gmx
diff --git a/src/gromacs/gpu_utils/pmalloc_cuda.cu b/src/gromacs/gpu_utils/pmalloc_cuda.cu

index 0a88abb268595c838af6899272ca73d1e7d859c6..2d5e1220529a245011656c69e6c66d40b9ab3b5c 100644 (file)
--- a/src/gromacs/gpu_utils/pmalloc_cuda.cu
+++ b/src/gromacs/gpu_utils/pmalloc_cuda.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -63,7 +63,7 @@ void pmalloc(void** h_ptr, size_t nbytes)
          return;
      }
  
-    CU_CHECK_PREV_ERR();
+    gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory.");
  
      stat = cudaMallocHost(h_ptr, nbytes, flag);
      sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
@@ -86,7 +86,7 @@ void pmalloc_wc(void** h_ptr, size_t nbytes)
          return;
      }
  
-    CU_CHECK_PREV_ERR();
+    gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory with write-combining.");
  
      stat = cudaMallocHost(h_ptr, nbytes, flag);
      sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes);
@@ -106,7 +106,7 @@ void pfree(void* h_ptr)
          return;
      }
  
-    CU_CHECK_PREV_ERR();
+    gmx::ensureNoPendingDeviceError("Could not free page-locked memory.");
  
      stat = cudaFreeHost(h_ptr);
      CU_RET_ERR(stat, "cudaFreeHost failed");
diff --git a/src/gromacs/gpu_utils/tests/devicetransfers.cu b/src/gromacs/gpu_utils/tests/devicetransfers.cu

index 4e7e14779d16ddfa991b6fbb2f39c9098707c9cf..63b1f77a4646502fdb572a2f961b59d1966e0438 100644 (file)
--- a/src/gromacs/gpu_utils/tests/devicetransfers.cu
+++ b/src/gromacs/gpu_utils/tests/devicetransfers.cu
@@ -57,23 +57,6 @@
  
  namespace gmx
  {
-namespace
-{
-
-/*! \brief Help give useful diagnostics about error \c status while doing \c message.
- *
- * \throws InternalError  If status indicates failure, supplying
- *                        descriptive text from \c message. */
-static void throwUponFailure(cudaError_t status, const char* message)
-{
-    if (status != cudaSuccess)
-    {
-        GMX_THROW(InternalError(formatString("Failure while %s", message)));
-        ;
-    }
-}
-
-} // namespace
  
  void doDeviceTransfers(const DeviceInformation& deviceInfo, ArrayRef<const char> input, ArrayRef<char> output)
  {
@@ -83,24 +66,24 @@ void doDeviceTransfers(const DeviceInformation& deviceInfo, ArrayRef<const char>
      int oldDeviceId;
  
      status = cudaGetDevice(&oldDeviceId);
-    throwUponFailure(status, "getting old device id");
+    checkDeviceError(status, "Error while getting old device id.");
      status = cudaSetDevice(deviceInfo.id);
-    throwUponFailure(status, "setting device id to the first compatible GPU");
+    checkDeviceError(status, "Error while setting device id to the first compatible GPU.");
  
      void* devicePointer;
      status = cudaMalloc(&devicePointer, input.size());
-    throwUponFailure(status, "creating buffer");
+    checkDeviceError(status, "Error while creating buffer.");
  
      status = cudaMemcpy(devicePointer, input.data(), input.size(), cudaMemcpyHostToDevice);
-    throwUponFailure(status, "transferring host to device");
+    checkDeviceError(status, "Error while transferring host to device.");
      status = cudaMemcpy(output.data(), devicePointer, output.size(), cudaMemcpyDeviceToHost);
-    throwUponFailure(status, "transferring device to host");
+    checkDeviceError(status, "Error while transferring device to host.");
  
      status = cudaFree(devicePointer);
-    throwUponFailure(status, "releasing buffer");
+    checkDeviceError(status, "Error while releasing buffer.");
  
      status = cudaSetDevice(oldDeviceId);
-    throwUponFailure(status, "setting old device id");
+    checkDeviceError(status, "Error while setting old device id.");
  }
  
  } // namespace gmx
diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu

index a09d6bf99a2d729c6d37ee4739106da95cf2b58c..0e77621a5bdf61ba82aadcea9cd882eae77b8759 100644 (file)
--- a/src/gromacs/hardware/device_management.cu
+++ b/src/gromacs/hardware/device_management.cu
@@ -127,8 +127,8 @@ static DeviceStatus isDeviceFunctional(const DeviceInformation& deviceInfo)
      cu_err = cudaSetDevice(deviceInfo.id);
      if (cu_err != cudaSuccess)
      {
-        fprintf(stderr, "Error %d while switching to device #%d: %s\n", cu_err, deviceInfo.id,
-                cudaGetErrorString(cu_err));
+        fprintf(stderr, "Error while switching to device #%d. %s\n", deviceInfo.id,
+                gmx::getDeviceErrorString(cu_err).c_str());
          return DeviceStatus::NonFunctional;
      }
  
@@ -216,11 +216,10 @@ bool isDeviceDetectionFunctional(std::string* errorMessage)
      stat                      = cudaDriverGetVersion(&driverVersion);
      GMX_ASSERT(stat != cudaErrorInvalidValue,
                 "An impossible null pointer was passed to cudaDriverGetVersion");
-    GMX_RELEASE_ASSERT(
-            stat == cudaSuccess,
-            gmx::formatString("An unexpected value was returned from cudaDriverGetVersion %s: %s",
-                              cudaGetErrorName(stat), cudaGetErrorString(stat))
-                    .c_str());
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       ("An unexpected value was returned from cudaDriverGetVersion. "
+                        + gmx::getDeviceErrorString(stat))
+                               .c_str());
      bool foundDriver = (driverVersion > 0);
      if (!foundDriver)
      {
@@ -268,18 +267,15 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
  {
      int         numDevices;
      cudaError_t stat = cudaGetDeviceCount(&numDevices);
-    if (stat != cudaSuccess)
-    {
-        GMX_THROW(gmx::InternalError(
-                "Invalid call of findDevices() when CUDA API returned an error, perhaps "
-                "canPerformDeviceDetection() was not called appropriately beforehand."));
-    }
+    gmx::checkDeviceError(stat,
+                          "Invalid call of findDevices() when CUDA API returned an error, perhaps "
+                          "canPerformDeviceDetection() was not called appropriately beforehand.");
  
      /* things might go horribly wrong if cudart is not compatible with the driver */
      numDevices = std::min(numDevices, c_cudaMaxDeviceCount);
  
      // We expect to start device support/sanity checks with a clean runtime error state
-    gmx::ensureNoPendingCudaError("");
+    gmx::ensureNoPendingDeviceError("Trying to find available CUDA devices.");
  
      std::vector<std::unique_ptr<DeviceInformation>> deviceInfoList(numDevices);
      for (int i = 0; i < numDevices; i++)
@@ -311,20 +307,18 @@ std::vector<std::unique_ptr<DeviceInformation>> findDevices()
              //
              // Here we also clear the CUDA API error state so potential
              // errors during sanity checks don't propagate.
-            if ((stat = cudaGetLastError()) != cudaSuccess)
-            {
-                gmx_warning("An error occurred while sanity checking device #%d; %s: %s",
-                            deviceInfoList[i]->id, cudaGetErrorName(stat), cudaGetErrorString(stat));
-            }
+            const std::string errorMessage = gmx::formatString(
+                    "An error occurred while sanity checking device #%d.", deviceInfoList[i]->id);
+            gmx::ensureNoPendingDeviceError(errorMessage);
          }
      }
  
      stat = cudaPeekAtLastError();
-    GMX_RELEASE_ASSERT(stat == cudaSuccess,
-                       gmx::formatString("We promise to return with clean CUDA state, but "
-                                         "non-success state encountered: %s: %s",
-                                         cudaGetErrorName(stat), cudaGetErrorString(stat))
-                               .c_str());
+    GMX_RELEASE_ASSERT(
+            stat == cudaSuccess,
+            ("We promise to return with clean CUDA state, but non-success state encountered. "
+             + gmx::getDeviceErrorString(stat))
+                    .c_str());
  
      return deviceInfoList;
  }
@@ -360,13 +354,13 @@ void releaseDevice(DeviceInformation* deviceInfo)
          {
              if (debug)
              {
-                fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
+                fprintf(stderr, "Cleaning up context on GPU ID #%d.\n", gpuid);
              }
  
              stat = cudaDeviceReset();
              if (stat != cudaSuccess)
              {
-                gmx_warning("Failed to free GPU #%d: %s", gpuid, cudaGetErrorString(stat));
+                gmx_warning("Failed to free GPU #%d. %s", gpuid, gmx::getDeviceErrorString(stat).c_str());
              }
          }
      }
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu

index 1793a386b2e4e428eda25a7b688b1163e5c65b98..a5ac290f6f7725390e3dd8d3483cdd87b57e7992 100644 (file)
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu.cu
@@ -249,7 +249,7 @@ void LeapFrogGpu::integrate(const DeviceBuffer<float3>        d_x,
                              const matrix                      prVelocityScalingMatrix)
  {
  
-    ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
+    ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
  
      auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
      if (doTemperatureScaling || doParrinelloRahman)
diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu

index f3b832f18bfdcd59113a68b44f5158f92fc3d1a1..edffc43821a5af85e58132e5707061383759704f 100644 (file)
--- a/src/gromacs/mdlib/lincs_gpu.cu
+++ b/src/gromacs/mdlib/lincs_gpu.cu
@@ -435,7 +435,7 @@ void LincsGpu::apply(const float3* d_x,
                       tensor        virialScaled,
                       const PbcAiuc pbcAiuc)
  {
-    ensureNoPendingCudaError("In CUDA version of LINCS");
+    ensureNoPendingDeviceError("In CUDA version of LINCS");
  
      // Early exit if no constraints
      if (kernelParams_.numConstraintsThreads == 0)
diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu

index 8b95a9fec5df227b842e8cda895897cc705103f7..c02f0d588438d5a45698a4ec65b35f6f16820d9a 100644 (file)
--- a/src/gromacs/mdlib/settle_gpu.cu
+++ b/src/gromacs/mdlib/settle_gpu.cu
@@ -405,7 +405,7 @@ void SettleGpu::apply(const float3* d_x,
                        const PbcAiuc pbcAiuc)
  {
  
-    ensureNoPendingCudaError("In CUDA version SETTLE");
+    ensureNoPendingDeviceError("In CUDA version SETTLE");
  
      // Early exit if no settles
      if (numSettles_ == 0)
author	Artem Zhmurov <zhmurov@gmail.com>
	Tue, 24 Nov 2020 08:02:48 +0000 (08:02 +0000)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Tue, 24 Nov 2020 08:02:48 +0000 (08:02 +0000)
src/gromacs/gpu_utils/cudautils.cuh		patch \| blob \| history
src/gromacs/gpu_utils/device_stream.cu		patch \| blob \| history
src/gromacs/gpu_utils/devicebuffer.cuh		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gpu_utils/gpueventsynchronizer.cuh		patch \| blob \| history
src/gromacs/gpu_utils/pinning.cu		patch \| blob \| history
src/gromacs/gpu_utils/pmalloc_cuda.cu		patch \| blob \| history
src/gromacs/gpu_utils/tests/devicetransfers.cu		patch \| blob \| history
src/gromacs/hardware/device_management.cu		patch \| blob \| history
src/gromacs/mdlib/leapfrog_gpu.cu		patch \| blob \| history
src/gromacs/mdlib/lincs_gpu.cu		patch \| blob \| history
src/gromacs/mdlib/settle_gpu.cu		patch \| blob \| history