Refactor error handling in init_gpu

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 30 Jun 2017 23:12:58 +0000 (01:12 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Fri, 7 Jul 2017 13:08:35 +0000 (15:08 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 30 Jun 2017 23:12:58 +0000 (01:12 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 7 Jul 2017 13:08:35 +0000 (15:08 +0200)
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index c2f5527efb2f696451448cacb2039af514e7afdd..260001f006b46c863d97833d0c533e347c42b203 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -57,6 +57,7 @@
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/snprintf.h"
  
  #if HAVE_NVML
  #include <nvml.h>
@@ -452,29 +453,34 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused
  #endif /* HAVE_NVML_APPLICATION_CLOCKS */
  }
  
-gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
-                  const struct gmx_gpu_info_t *gpu_info,
-                  const struct gmx_gpu_opt_t *gpu_opt)
+void init_gpu(const gmx::MDLogger &mdlog, int rank, int mygpu,
+              const struct gmx_gpu_info_t *gpu_info,
+              const struct gmx_gpu_opt_t *gpu_opt)
  {
      cudaError_t stat;
      char        sbuf[STRLEN];
      int         gpuid;
  
      assert(gpu_info);
-    assert(result_str);
+    assert(gpu_opt);
  
      if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
      {
-        sprintf(sbuf, "Trying to initialize an non-existent GPU: "
-                "there are %d selected GPU(s), but #%d was requested.",
-                gpu_opt->n_dev_use, mygpu);
+        snprintf(sbuf, STRLEN, "On rank %d trying to initialize an non-existent GPU: "
+                 "there are %d selected GPU(s), but #%d was requested.",
+                 rank, gpu_opt->n_dev_use, mygpu);
          gmx_incons(sbuf);
      }
  
      gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
  
      stat = cudaSetDevice(gpuid);
-    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
+    if (stat != cudaSuccess)
+    {
+        snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d",
+                 rank, mygpu);
+        CU_RET_ERR(stat, sbuf);
+    }
  
      if (debug)
      {
@@ -482,11 +488,7 @@ gmx_bool init_gpu(const gmx::MDLogger &mdlog, int mygpu, char *result_str,
      }
  
      //Ignoring return value as NVML errors should be treated not critical.
-    if (stat == cudaSuccess)
-    {
-        init_gpu_application_clocks(mdlog, gpuid, gpu_info);
-    }
-    return (stat == cudaSuccess);
+    init_gpu_application_clocks(mdlog, gpuid, gpu_info);
  }
  
  gmx_bool free_cuda_gpu(
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h

index 86001cbdf1dfca93ae851bcf9d352b493dd19bc2..bcafdf7f7ff4598b8261b17f684af4ba925c5f46 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -107,19 +107,20 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
   * gpu_info.gpu_dev array.
   *
   * \param      mdlog        log file to write to
+ * \param[in]  rank         MPI rank of this process (for error output)
   * \param[in]  mygpu        index of the GPU to initialize
- * \param[out] result_str   the message related to the error that occurred
- *                          during the initialization (if there was any).
   * \param[in] gpu_info      GPU info of all detected devices in the system.
   * \param[in] gpu_opt       options for using the GPUs in gpu_info
- * \returns                 true if no error occurs during initialization.
+ *
+ * Issues a fatal error for any critical errors that occur during
+ * initialization.
   */
  GPU_FUNC_QUALIFIER
-gmx_bool init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
-                  int GPU_FUNC_ARGUMENT(mygpu),
-                  char *GPU_FUNC_ARGUMENT(result_str),
-                  const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
-                  const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
+void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
+              int GPU_FUNC_ARGUMENT(rank),
+              int GPU_FUNC_ARGUMENT(mygpu),
+              const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+              const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM
  
  /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
   *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp

index 2b245395d80bc678699e3c280e51c65dfe24a304..8d02efa51dd3d593c45bb690568fef033b06d587 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -397,23 +397,21 @@ void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unu
  }
  
  //! This function is documented in the header file
-gmx_bool init_gpu(const gmx::MDLogger              & /*mdlog*/,
-                  int                              mygpu,
-                  char                            *result_str,
-                  const gmx_gpu_info_t gmx_unused *gpu_info,
-                  const gmx_gpu_opt_t             *gpu_opt
-                  )
+void init_gpu(const gmx::MDLogger               & /*mdlog*/,
+              int                               rank,
+              int                               mygpu,
+              const gmx_gpu_info_t             *gpu_info,
+              const gmx_gpu_opt_t              *gpu_opt
+              )
  {
-    assert(result_str);
-
-    result_str[0] = 0;
+    assert(gpu_opt);
  
      if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
      {
          char        sbuf[STRLEN];
-        sprintf(sbuf, "Trying to initialize an non-existent GPU: "
+        sprintf(sbuf, "On rank %d trying to initialize an non-existent GPU: "
                  "there are %d selected GPU(s), but #%d was requested.",
-                gpu_opt->n_dev_use, mygpu);
+                rank, gpu_opt->n_dev_use, mygpu);
          gmx_incons(sbuf);
      }
  
@@ -433,8 +431,6 @@ gmx_bool init_gpu(const gmx::MDLogger              & /*mdlog*/,
          setenv("CUDA_CACHE_DISABLE", "1", 0);
  #endif
      }
-
-    return TRUE;
  }
  
  //! This function is documented in the header file
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 77591bda1a1b5fbb2cdff28cac8f975be7743841..451822d9944c1640b5299bc36872a622670ec249 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -1768,8 +1768,6 @@ static void pick_nbnxn_resources(const gmx::MDLogger &mdlog,
                                   bool                 emulateGpu,
                                   const gmx_gpu_opt_t *gpu_opt)
  {
-    char     gpu_err_str[STRLEN];
-
      *bUseGPU = FALSE;
  
      /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
@@ -1777,21 +1775,15 @@ static void pick_nbnxn_resources(const gmx::MDLogger &mdlog,
      if (gpu_opt->n_dev_use > 0 && !emulateGpu)
      {
          /* Each PP node will use the intra-node id-th device from the
-         * list of detected/selected GPUs. */
-        if (!init_gpu(mdlog, cr->rank_pp_intranode, gpu_err_str,
-                      &hwinfo->gpu_info, gpu_opt))
-        {
-            /* At this point the init should never fail as we made sure that
-             * we have all the GPUs we need. If it still does, we'll bail. */
-            /* TODO the decorating of gpu_err_str is nicer if it
-               happens inside init_gpu. Out here, the decorating with
-               the MPI rank makes sense. */
-            gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
-                      cr->nodeid,
-                      get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
-                                        cr->rank_pp_intranode),
-                      gpu_err_str);
-        }
+         * list of detected/selected GPUs.
+         *
+         * At this point the init should never fail as we made sure that
+         * we have all the GPUs we need. If it still does, we'll exit.
+         *
+         * TODO The error reporting will be nicer when the logger is
+         * aware of MPI ranks. */
+        init_gpu(mdlog, cr->nodeid, cr->rank_pp_intranode,
+                 &hwinfo->gpu_info, gpu_opt);
  
          /* Here we actually turn on hardware GPU acceleration */
          *bUseGPU = TRUE;
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 30 Jun 2017 23:12:58 +0000 (01:12 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 7 Jul 2017 13:08:35 +0000 (15:08 +0200)
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils_ocl.cpp		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history