Allow compilation to optimize for CUDA compute capability 3.5
author: Szilard Pall <pall.szilard@gmail.com>
Thu, 26 Sep 2013 10:45:55 +0000 (12:45 +0200)
committer: Gerrit Code Review <gerrit@gerrit.gromacs.org>
Sun, 29 Sep 2013 21:39:17 +0000 (23:39 +0200)
Enabling optimizations targeting compute capability 3.5 devices
(GK110) slightly improves the performance of both the PME and RF kernels.
This requires giving the compiler an optimization hint
(__launch_bounds__) that specifies the maximum number of threads per
block and the minimum number of blocks per multiprocessor. With this
change, nvcc >= 5.0 generates code for CC 3.5 devices and embeds
PTX 3.5 code (instead of PTX 3.0) in the binary.

Change-Id: If7e14d31165bc05859250db7468bf6bd8c186264

cmake/gmxManageNvccConfig.cmake
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh

index 59c8516f8664645635b94e91c29e88832b50dd14..3c1c9bc08cf9de0c11b70888aa747678a15abfe0 100644 (file)
@@ -143,8 +143,10 @@ if (NOT DEFINED CUDA_NVCC_FLAGS_SET)
     #   optimized for sm_35 results in lower performance than with sm_30.
     if(CUDA_VERSION VERSION_LESS "4.2")
         set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20")
-    else()
+    elseif(CUDA_VERSION VERSION_LESS "5.0")
         set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30")
+    else()
+        set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_35,code=compute_35")
     endif()
 
     # finally set the damn flags
index 73acdfecd594e222c40e00b25e0127de1419c839..437849d70e9e460277f1c62c9e875ff1ccdf2d4d 100644 (file)
@@ -62,6 +62,9 @@
 
     Each thread calculates an i force-component taking one pair of i-j atoms.
  */
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
 #ifdef PRUNE_NBL
 #ifdef CALC_ENERGIES
 __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
index cff062d7a1381d3f909c2489e8de614ee527676c..b0012ee3619b34af8ac8630a9f2448e588cb803b 100644 (file)
@@ -56,6 +56,9 @@
 
     Each thread calculates an i force-component taking one pair of i-j atoms.
  */
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
 #ifdef PRUNE_NBL
 #ifdef CALC_ENERGIES
 __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)