Add support for CUDA CC 6.0/6.1

author Szilárd Páll <pall.szilard@gmail.com>

Tue, 31 May 2016 12:47:30 +0000 (14:47 +0200)

committer Erik Lindahl <erik.lindahl@gmail.com>

Sun, 5 Jun 2016 07:49:12 +0000 (09:49 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Tue, 31 May 2016 12:47:30 +0000 (14:47 +0200)
committer Erik Lindahl <erik.lindahl@gmail.com>
Sun, 5 Jun 2016 07:49:12 +0000 (09:49 +0200)
diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake

index c82e56c291fd9859ad0c7e1fe3a7563841c61375..028a83ff1c857dc9495157b78db4ea42b640b141 100644 (file)
--- a/cmake/gmxManageNvccConfig.cmake
+++ b/cmake/gmxManageNvccConfig.cmake
@@ -173,6 +173,9 @@ else()
      #     => compile sm_20, sm_30, sm_35, sm_37 sm_50, SASS, and compute_50 PTX
      # - with CUDA >=7.0         CC 5.2 is supported (5.3, Tegra X1 we don't generate code for)
      #     => compile sm_20, sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
+    # - with CUDA >=8.0         CC 6.0-6.2 is supported (but we know nothing about CC 6.2, so we won't generate code or it)
+    #     => compile sm_20, sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
+    #
      #
      #   Note that CUDA 6.5.19 second patch release supports cc 5.2 too, but
      #   CUDA_VERSION does not contain patch version and having PTX 5.0 JIT-ed is
@@ -194,6 +197,10 @@ else()
      if(NOT CUDA_VERSION VERSION_LESS "7.0") # >= 7.0
          list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=sm_52")
      endif()
+    if(NOT CUDA_VERSION VERSION_LESS "8.0") # >= 8.0
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=sm_60")
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=sm_61")
+    endif()
  
      # Next add flags that trigger PTX code generation for the newest supported virtual arch
      # that's useful to JIT to future architectures
@@ -205,8 +212,11 @@ else()
          list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=compute_35")
      elseif(CUDA_VERSION VERSION_LESS "7.0")
          list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_50,code=compute_50")
-    else() # version >= 7.0
+    elseif(CUDA_VERSION VERSION_LESS "8.0")
          list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_52,code=compute_52")
+    else() # version >= 8.0
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_60,code=compute_60")
+        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_61,code=compute_61")
      endif()
  endif()
  
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index feac290b8eebefa7ba18927172128f4ea6ca02a3..0429445dd034b982a95101ffcfa4e678ccd81b36 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -82,36 +82,6 @@ texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  
-/* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
- * warp-pairs per block.
- *
- * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
- * blocks/multiproc, is the fastest even though this setup gives low occupancy.
- * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
- * per multiprocessor is reduced proportionally to get the original number of max
- * threads in flight (and slightly lower performance).
- * - On CC 3.7 there are enough registers to double the number of threads; using
- * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
- * with low-register use).
- *
- * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
- * shuffle-based reduction, hence CC >= 3.0.
- */
-
-/* Kernel launch bounds as function of NTHREAD_Z.
- * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
- * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
- */
-#if __CUDA_ARCH__ == 370
-#define NTHREAD_Z           (2)
-#define MIN_BLOCKS_PER_MP   (16)
-#else
-#define NTHREAD_Z           (1)
-#define MIN_BLOCKS_PER_MP   (16)
-#endif
-#define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
-
-
  /***** The kernels come here *****/
  #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
  
@@ -432,7 +402,8 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
       * - The 1D block-grid contains as many blocks as super-clusters.
       */
      int num_threads_z = 1;
-    if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
+    if ((nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7) ||
+        (nb->dev_info->prop.major == 6 && nb->dev_info->prop.minor == 0))
      {
          num_threads_z = 2;
      }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 01d848a93b2fd41df06b21b0c93643bccf1a6e87..91ab0c6967edb70d9d8f45ca218041ef69ee88ae 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -93,13 +93,29 @@
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
  
+/* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
+ * warp-pairs per block.
+ *
+ * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
+ * blocks/multiproc, is the fastest even though this setup gives low occupancy.
+ * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
+ * per multiprocessor is reduced proportionally to get the original number of max
+ * threads in flight (and slightly lower performance).
+ * - On CC 3.7 and 6.0 there are enough registers to double the number of threads; using
+ * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
+ * with low-register use).
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+
  /* Kernel launch bounds as function of NTHREAD_Z.
- * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
- * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
+ * - CC 3.0/3.5/5.x, >=6.1: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7, 6.0:           NTHREAD_Z=2, (128, 16) bounds
   *
   * Note: convenience macros, need to be undef-ed at the end of the file.
   */
-#if __CUDA_ARCH__ == 370
+#if __CUDA_ARCH__ == 370 || __CUDA_ARCH__ == 600
  #define NTHREAD_Z           (2)
  #define MIN_BLOCKS_PER_MP   (16)
  #else
author	Szilárd Páll <pall.szilard@gmail.com>
	Tue, 31 May 2016 12:47:30 +0000 (14:47 +0200)
committer	Erik Lindahl <erik.lindahl@gmail.com>
	Sun, 5 Jun 2016 07:49:12 +0000 (09:49 +0200)
cmake/gmxManageNvccConfig.cmake		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history