From 74400c159757f63b2fb7ace7a16428afec7bf456 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Fri, 16 Feb 2018 19:48:46 +0100
Subject: [PATCH] Avoid aborting mdrun when GPU sanity check detects errors

A release assertion was added which assumed that the GPU
compatibility/sanity checks return with a clean CUDA API state.
Consequently, any run that encountered a non-success return value from
the CUDA API would abort the run instead of continuing the run without
using the GPU in question.
This change adds code to handle the error encountered and issue a note
about it, and ensures that the CUDA API error state is cleared when
the GPU detection returns.

Fixes #2415

Change-Id: I5d7ed59ef8e4052a75b51c9a526b8dcb465ff611
---
 src/gromacs/gpu_utils/gpu_utils.cu | 31 ++++++++++++++++++++++++++++--
 src/gromacs/gpu_utils/gpu_utils.h  |  7 ++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 9ca379ee96..7660d7cc24 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -603,6 +603,11 @@ static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
  *  errors: incompatibility, insistence, or insanity (=unexpected behavior).
  *  It also returns the respective device's properties in \dev_prop (if applicable).
  *
+ *  As the error handling only permits returning the state of the GPU, this function
+ *  does not clear the CUDA runtime API status, allowing the caller to inspect the error
+ *  upon return. Note that this also means it is the caller's responsibility to
+ *  reset the CUDA runtime state.
+ *
  *  \param[in]  dev_id   the ID of the GPU to check.
  *  \param[out] dev_prop the CUDA device properties of the device checked.
  *  \returns             the status of the requested device
@@ -718,6 +723,9 @@ void findGpus(gmx_gpu_info_t *gpu_info)
                                      "canDetectGpus() was not called appropriately beforehand."));
     }
 
+    // We expect to start device support/sanity checks with a clean runtime error state
+    gmx::ensureNoPendingCudaError("");
+
     snew(devs, ndev);
     for (i = 0; i < ndev; i++)
     {
@@ -731,8 +739,27 @@ void findGpus(gmx_gpu_info_t *gpu_info)
         {
             gpu_info->n_dev_compatible++;
         }
+        else
+        {
+            // TODO:
+            //  - we inspect the CUDA API state to retrieve and record any
+            //    errors that occurred during is_gmx_supported_gpu_id() here,
+            //    but this would be more elegant done within is_gmx_supported_gpu_id()
+            //    and only return a string with the error if one was encountered.
+            //  - we'll be reporting without rank information which is not ideal.
+            //  - we'll end up warning also in cases where users would already
+            //    get an error before mdrun aborts.
+            //
+            // Here we also clear the CUDA API error state so potential
+            // errors during sanity checks don't propagate.
+            if ((stat = cudaGetLastError()) != cudaSuccess)
+            {
+                gmx_warning(gmx::formatString("An error occurred while sanity checking device #%d; %s: %s",
+                                              devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str());
+            }
+        }
     }
-    GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess");
+    GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "We promise to return with clean CUDA state!");
 
     gpu_info->n_dev   = ndev;
     gpu_info->gpu_dev = devs;
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 1a5ced2c0b..9634c38034 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2010, The GROMACS development team.
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -96,6 +96,11 @@ bool canDetectGpus(std::string *GPU_FUNC_ARGUMENT(errorMessage)) GPU_FUNC_TERM_W
  *  gpu_info->gpu_dev array with the required information on each the
  *  device: ID, device properties, status.
  *
+ *  Note that this function leaves the GPU runtime API error state clean;
+ *  this is currently implemented in the CUDA flavor.
+ *  TODO: check if errors do propagate in OpenCL as they do in CUDA and
+ *  whether there is a mechanism to "clear" them.
+ *
  *  \param[in] gpu_info    pointer to structure holding GPU information.
  *
  *  \throws                InternalError if a GPU API returns an unexpected failure (because
-- 
2.22.0