From 74400c159757f63b2fb7ace7a16428afec7bf456 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Fri, 16 Feb 2018 19:48:46 +0100 Subject: [PATCH] Avoid aborting mdrun when GPU sanity check detects errors A release assertion was added which assumed that the GPU compatibility/sanity checks return with a clean CUDA API state. Consequently, any run that encountered a non-success return value from the CUDA API would abort the run instead of continuing the run without using the GPU in question. This change adds code to handle and issue a note on the error encountered as well as ensures that the CUDA API error state is cleared at the return of the GPU detection. Fixes #2415 Change-Id: I5d7ed59ef8e4052a75b51c9a526b8dcb465ff611 --- src/gromacs/gpu_utils/gpu_utils.cu | 31 ++++++++++++++++++++++++++++-- src/gromacs/gpu_utils/gpu_utils.h | 7 ++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index 9ca379ee96..7660d7cc24 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Copyright (c) 2010,2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -603,6 +603,11 @@ static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop) * errors: incompatibility, insistence, or insanity (=unexpected behavior). * It also returns the respective device's properties in \dev_prop (if applicable). 
* + * As the error handling only permits returning the state of the GPU, this function + * does not clear the CUDA runtime API status allowing the caller to inspect the error + * upon return. Note that this also means it is the caller's responsibility to + * reset the CUDA runtime state. + * * \param[in] dev_id the ID of the GPU to check. * \param[out] dev_prop the CUDA device properties of the device checked. * \returns the status of the requested device @@ -718,6 +723,9 @@ void findGpus(gmx_gpu_info_t *gpu_info) "canDetectGpus() was not called appropriately beforehand.")); } + // We expect to start device support/sanity checks with a clean runtime error state + gmx::ensureNoPendingCudaError(""); + snew(devs, ndev); for (i = 0; i < ndev; i++) { @@ -731,8 +739,27 @@ void findGpus(gmx_gpu_info_t *gpu_info) { gpu_info->n_dev_compatible++; } + else + { + // TODO: + // - we inspect the CUDA API state to retrieve and record any + // errors that occurred during is_gmx_supported_gpu_id() here, + // but this would be more elegant done within is_gmx_supported_gpu_id() + // and only return a string with the error if one was encountered. + // - we'll be reporting without rank information which is not ideal. + // - we'll end up warning also in cases where users would already + // get an error before mdrun aborts. + // + // Here we also clear the CUDA API error state so potential + // errors during sanity checks don't propagate. 
+ if ((stat = cudaGetLastError()) != cudaSuccess) + { + gmx_warning(gmx::formatString("An error occurred while sanity checking device #%d; %s: %s", + devs[i].id, cudaGetErrorName(stat), cudaGetErrorString(stat)).c_str()); + } + } } - GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "Should be cudaSuccess"); + GMX_RELEASE_ASSERT(cudaSuccess == cudaPeekAtLastError(), "We promise to return with clean CUDA state!"); gpu_info->n_dev = ndev; gpu_info->gpu_dev = devs; diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 1a5ced2c0b..9634c38034 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2010, The GROMACS development team. - * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -96,6 +96,11 @@ bool canDetectGpus(std::string *GPU_FUNC_ARGUMENT(errorMessage)) GPU_FUNC_TERM_W * gpu_info->gpu_dev array with the required information on each the * device: ID, device properties, status. * + * Note that this function leaves the GPU runtime API error state clean; + * this is implemented ATM in the CUDA flavor. + * TODO: check if errors do propagate in OpenCL as they do in CUDA and + * whether there is a mechanism to "clear" them. + * * \param[in] gpu_info pointer to structure holding GPU information. * * \throws InternalError if a GPU API returns an unexpected failure (because -- 2.22.0