Allow compilation to optimize for CUDA compute capability 3.5

author Szilard Pall <pall.szilard@gmail.com>

Thu, 26 Sep 2013 10:45:55 +0000 (12:45 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Sun, 29 Sep 2013 21:39:17 +0000 (23:39 +0200)
author Szilard Pall <pall.szilard@gmail.com>
Thu, 26 Sep 2013 10:45:55 +0000 (12:45 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Sun, 29 Sep 2013 21:39:17 +0000 (23:39 +0200)
diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake

index 59c8516f8664645635b94e91c29e88832b50dd14..3c1c9bc08cf9de0c11b70888aa747678a15abfe0 100644 (file)
--- a/cmake/gmxManageNvccConfig.cmake
+++ b/cmake/gmxManageNvccConfig.cmake
@@ -143,8 +143,10 @@ if (NOT DEFINED CUDA_NVCC_FLAGS_SET)
      #   optimized for sm_35 results in lower performance than with sm_30.
      if(CUDA_VERSION VERSION_LESS "4.2")
          set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20")
-    else()
+    elseif(CUDA_VERSION VERSION_LESS "5.0")
          set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30")
+    else()
+        set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_35,code=compute_35")
      endif()
  
      # finally set the damn flags
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 73acdfecd594e222c40e00b25e0127de1419c839..437849d70e9e460277f1c62c9e875ff1ccdf2d4d 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -62,6 +62,9 @@
  
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
  #ifdef PRUNE_NBL
  #ifdef CALC_ENERGIES
  __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh

index cff062d7a1381d3f909c2489e8de614ee527676c..b0012ee3619b34af8ac8630a9f2448e588cb803b 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
@@ -56,6 +56,9 @@
  
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
  #ifdef PRUNE_NBL
  #ifdef CALC_ENERGIES
  __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)
author	Szilard Pall <pall.szilard@gmail.com>
	Thu, 26 Sep 2013 10:45:55 +0000 (12:45 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Sun, 29 Sep 2013 21:39:17 +0000 (23:39 +0200)
cmake/gmxManageNvccConfig.cmake		patch | blob | history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch | blob | history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh		patch | blob | history