don't use polling with new driver

author Szilard Pall <pszilard@cbr.su.se>

Fri, 11 Jan 2013 06:11:55 +0000 (07:11 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Thu, 17 Jan 2013 00:32:23 +0000 (01:32 +0100)
author Szilard Pall <pszilard@cbr.su.se>
Fri, 11 Jan 2013 06:11:55 +0000 (07:11 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 17 Jan 2013 00:32:23 +0000 (01:32 +0100)
diff --git a/include/types/hw_info.h b/include/types/hw_info.h

index e800856e098f68044f571fa16fe9d34bbceb024a..809111d35b375e2ecab9cc2ab5b7d91439cd39db 100644 (file)
--- a/include/types/hw_info.h
+++ b/include/types/hw_info.h
@@ -68,6 +68,8 @@ static const char * const gpu_detect_res_str[] =
  typedef struct 
  {
      gmx_bool            bUserSet;       /* true if the GPUs in cuda_dev_use are manually provided by the user */
+    gmx_bool            bDevShare;      /* true if any of the devices is shared by
+                                           (t)MPI ranks, with auto-detection always FALSE */
  
      int                 ncuda_dev_use;  /* number of devices selected to be used */
      int                 *cuda_dev_use;  /* index of the devices selected to be used */
diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c

index fe4ca975a4df59d120570293e406a9039c1308ba..b9da2bd3267e0d2764691ab5b7035bf26558d061 100644 (file)
--- a/src/gmxlib/gmx_detect_hardware.c
+++ b/src/gmxlib/gmx_detect_hardware.c
@@ -332,7 +332,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                      md_print_warn(cr,fplog,
                                    "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
                                    "      PP %s%s%s than GPU%s available.\n"
-                                  "      Each PP %s can only use one GPU, so only %d GPU%s%s will be used.",
+                                  "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
                                    ShortProgram(),
                                    th_or_proc, th_or_proc_plural, pernode, gpu_plural,
                                    th_or_proc, npppn, gpu_plural, pernode);
@@ -370,12 +370,13 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
              }
          }
  
+        hwinfo->gpu_info.bDevShare = FALSE;
          if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
          {
              int i, j, same_count;
              gmx_bool bSomeSame, bAllDifferent;
  
-            same_count = 0;
+            same_count = 0; /* number of GPUs shared among ranks */
              bSomeSame = FALSE;
              bAllDifferent = TRUE;
  
@@ -389,6 +390,9 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                  }
              }
  
+            /* store whether any GPU is shared/oversubscribed */
+            hwinfo->gpu_info.bDevShare = bSomeSame;
+
              if (btMPI && !bAllDifferent)
              {
                  gmx_fatal(FARGS,
@@ -400,8 +404,8 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
              {
                  md_print_warn(cr,fplog,
                                "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                              "      multiple %s%s; this should be avoided as it generally\n"
-                              "      causes performance loss.",
+                              "      multiple %s%s; this should be avoided as it can cause\n"
+                              "      performance loss.\n",
                                same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
              }
          }
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index 57f3bd09d2bc2ae8c0e913928f474062a6e3b152..ad69c1eac50eac3228a78244cba29bf30db00525 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -71,6 +71,21 @@ extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
  extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
  extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
  
+/* We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in CUDA code.
+ */
+static void md_print_warn(FILE *fplog, const char *buf)
+{
+    if (fplog != NULL)
+    {
+        /* We should only print to stderr on the master node;
+         * in most cases fplog is only set on the master node, so this works.
+         */
+        fprintf(stderr, "\n%s\n", buf);
+        fprintf(fplog,  "\n%s\n", buf);
+    }
+}
+
  /* Fw. decl. */
  static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
  
@@ -395,7 +410,8 @@ void nbnxn_cuda_init(FILE *fplog,
      cudaError_t stat;
      nbnxn_cuda_ptr_t  nb;
      char sbuf[STRLEN];
-    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86;
+    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+    int cuda_drv_ver;
  
      assert(gpu_info);
  
@@ -447,6 +463,13 @@ void nbnxn_cuda_init(FILE *fplog,
       * waiting to preserve performance. This requires support for atomic
       * operations and only works on x86/x86_64.
       * With polling wait event-timing also needs to be disabled.
+     *
+     * The overhead is greatly reduced in 304.xx drivers (independent of runtime ver).
+     * The corresponding driver API version (which is what we can query) should
+     * be at least 5.0. Hence we will not switch to polling when >=5.0 is returned.
+     *
+     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
+     * ranks, so we will also disable it in that case.
       */
  
      bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
@@ -469,61 +492,54 @@ void nbnxn_cuda_init(FILE *fplog,
          gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
      }
  
+    stat = cudaDriverGetVersion(&cuda_drv_ver);
+    CU_RET_ERR(stat, "cudaDriverGetVersion failed");
+    bOldDriver = (cuda_drv_ver < 5000);
+
      if (nb->dev_info->prop.ECCEnabled == 1)
      {
          if (bStreamSync)
          {
              nb->bUseStreamSync = true;
  
-            sprintf(sbuf,
-                    "NOTE: Using a GPU with ECC enabled, but cudaStreamSynchronize-based waiting is\n"
-                    "      forced by the GMX_CUDA_STREAMSYNC env. var. Due to a CUDA bug, this \n"
-                    "      combination causes performance loss.");
-            fprintf(stderr, "\n%s\n", sbuf);
-            if (fplog)
+            /* only warn if polling should be used */
+            if (bOldDriver && !gpu_info->bDevShare)
              {
-                fprintf(fplog, "\n%s\n", sbuf);
+                md_print_warn(fplog,
+                              "NOTE: Using a GPU with ECC enabled and a driver older than 5.0, but\n"
+                              "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
              }
          }
          else
          {
-            /* can use polling wait only on x86/x86_64 *if* atomics are available */
-            nb->bUseStreamSync = ((bX86 && bTMPIAtomics) == false);
-
-            if (!bX86)
+            /* Can/should turn off cudaStreamSynchronize wait only if
+             *   - we're on x86/x86_64
+             *   - atomics are available
+             *   - GPUs are not being shared
+             *   - and driver is old. */
+            nb->bUseStreamSync =
+                (bX86 && bTMPIAtomics && !gpu_info->bDevShare && bOldDriver) ?
+                true : false;
+
+            if (nb->bUseStreamSync)
              {
-                sprintf(sbuf,
-                        "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n"
-                        "      CUDA bug, causes performance loss when used in combination with ECC.\n"
-                        "      However, the polling waiting workaround can not be used as it is only\n"
-                        "      supported on x86/x86_64, but not on the current architecture.");
-                gmx_warning("%s\n", sbuf);
-                if (fplog)
-                {
-                    fprintf(fplog, "\n%s\n", sbuf);
-                }
-
+                md_print_warn(fplog,
+                              "NOTE: Using a GPU with ECC enabled and CUDA driver version <5.0, will switch to\n"
+                              "      polling wait to avoid performance loss. If you encounter issues, set the\n"
+                              "      GMX_CUDA_STREAMSYNC env. var. to switch back to standard GPU waiting.\n");
              }
-            else if (bTMPIAtomics)
-            {
-                if (fplog)
-                {
-                    fprintf(fplog,
-                            "NOTE: Using a GPU with ECC enabled; will use polling waiting.\n");
-                }
-            }
-            else
+            else if (bOldDriver)
              {
+                /* Tell the user that the ECC+old driver combination can be bad */
                  sprintf(sbuf,
-                        "Using a GPU with ECC on; the standard cudaStreamSynchronize waiting, due to a\n"
-                        "      CUDA bug, causes performance loss when used in combination with ECC.\n"
-                        "      However, the polling waiting workaround can not be used as atomic\n"
-                        "      operations are not supported by the current CPU+compiler combination.");
-                gmx_warning("%s\n", sbuf);
-                if (fplog)
-                {
-                    fprintf(fplog, "\n%s\n", sbuf);
-                }
+                        "NOTE: Using a GPU with ECC enabled and driver version <5.0. A bug in this\n"
+                        "      driver can cause performance loss.\n"
+                        "      However, the polling waiting workaround can not be used because\n%s\n"
+                        "      Consider updating the driver or turning ECC off.",
+                        (!bX86 || !bTMPIAtomics) ?
+                           "         atomic operations are not supported by the platform/CPU+compiler." :
+                           "         GPU(s) are being oversubscribed.");
+                md_print_warn(fplog, sbuf);
              }
          }
      }
@@ -533,14 +549,8 @@ void nbnxn_cuda_init(FILE *fplog,
          {
              nb->bUseStreamSync = false;
  
-            sprintf(sbuf,
-                    "NOTE: Using a GPU with no/disabled ECC, but cudaStreamSynchronize-based waiting\n"
-                    "      is turned off and polling turned on by the GMX_NO_CUDA_STREAMSYNC env. var.");
-            fprintf(stderr, "\n%s\n", sbuf);
-            if (fplog)
-            {
-                fprintf(fplog, "\n%s\n", sbuf);
-            }
+            md_print_warn(fplog,
+                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
          }
          else
          {
author	Szilard Pall <pszilard@cbr.su.se>
	Fri, 11 Jan 2013 06:11:55 +0000 (07:11 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Thu, 17 Jan 2013 00:32:23 +0000 (01:32 +0100)
include/types/hw_info.h		patch \| blob \| history
src/gmxlib/gmx_detect_hardware.c		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history