From: Szilard Pall Date: Thu, 26 Sep 2013 10:45:55 +0000 (+0200) Subject: allow compilation to optimize for CUDA compute cap. 3.5 X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=cd01238b6b0eca4ddf115efc3abda44e98eabe6d;p=alexxy%2Fgromacs.git allow compilation to optimize for CUDA compute cap. 3.5 Enabling optimizations targeting compute capability 3.5 devices (GK110) slightly improves performance of both PME and RF kernels. This requires giving the compiler an optimization hint (__launch_bounds__) that states the maximum number of threads per block and the minimum number of blocks per multiprocessor. This change allows nvcc >=5.0 to generate code for CC 3.5 devices and switches to including PTX 3.5 code (instead of 3.0) in the binary. Change-Id: If7e14d31165bc05859250db7468bf6bd8c186264 --- diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake index 59c8516f86..3c1c9bc08c 100644 --- a/cmake/gmxManageNvccConfig.cmake +++ b/cmake/gmxManageNvccConfig.cmake @@ -143,8 +143,10 @@ if (NOT DEFINED CUDA_NVCC_FLAGS_SET) # optimized for sm_35 results in lower performance than with sm_30. 
if(CUDA_VERSION VERSION_LESS "4.2") set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20") - else() + elseif(CUDA_VERSION VERSION_LESS "5.0") set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30") + else() + set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_35,code=compute_35") endif() # finally set the damn flags diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index 73acdfecd5..437849d70e 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -62,6 +62,9 @@ Each thread calculates an i force-component taking one pair of i-j atoms. */ +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(64,16) +#endif #ifdef PRUNE_NBL #ifdef CALC_ENERGIES __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune) diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh index cff062d7a1..b0012ee361 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh @@ -56,6 +56,9 @@ Each thread calculates an i force-component taking one pair of i-j atoms. */ +#if __CUDA_ARCH__ >= 350 +__launch_bounds__(64,16) +#endif #ifdef PRUNE_NBL #ifdef CALC_ENERGIES __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)