From: Szilard Pall <pall.szilard@gmail.com>
Date: Thu, 26 Sep 2013 10:45:55 +0000 (+0200)
Subject: allow compilation to optimize for CUDA compute cap. 3.5
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=cd01238b6b0eca4ddf115efc3abda44e98eabe6d;p=alexxy%2Fgromacs.git

allow compilation to optimize for CUDA compute cap. 3.5

Enabling optimizations targeting compute capability 3.5 devices
(GK110) slightly improves performance of both PME and RF kernels.
This requires giving the compiler a launch-bounds hint
(__launch_bounds__) that specifies the maximum number of threads
per block and the minimum number of blocks per multiprocessor.
This change allows nvcc >= 5.0 to generate code for CC 3.5 devices
and switches to including PTX 3.5 code (instead of 3.0) in the binary.

Change-Id: If7e14d31165bc05859250db7468bf6bd8c186264
---

diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake
index 59c8516f86..3c1c9bc08c 100644
--- a/cmake/gmxManageNvccConfig.cmake
+++ b/cmake/gmxManageNvccConfig.cmake
@@ -143,8 +143,10 @@ if (NOT DEFINED CUDA_NVCC_FLAGS_SET)
     #   optimized for sm_35 results in lower performance than with sm_30.
     if(CUDA_VERSION VERSION_LESS "4.2")
         set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20")
-    else()
+    elseif(CUDA_VERSION VERSION_LESS "5.0")
         set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30")
+    else()
+        set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_35,code=compute_35")
     endif()
 
     # finally set the damn flags
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 73acdfecd5..437849d70e 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -62,6 +62,9 @@
 
     Each thread calculates an i force-component taking one pair of i-j atoms.
  */
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
 #ifdef PRUNE_NBL
 #ifdef CALC_ENERGIES
 __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
index cff062d7a1..b0012ee361 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
@@ -56,6 +56,9 @@
 
     Each thread calculates an i force-component taking one pair of i-j atoms.
  */
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
 #ifdef PRUNE_NBL
 #ifdef CALC_ENERGIES
 __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)