Enabling optimizations targeting compute capability 3.5 devices
(GK110) slightly improves performance of both PME and RF kernels.
This requires a compiler hint (__launch_bounds__) specifying the
maximum number of threads per block and the minimum number of
blocks per multiprocessor. With this change, nvcc >= 5.0 generates
code for CC 3.5 devices, and the binary now includes PTX 3.5 code
(instead of PTX 3.0).
Change-Id: If7e14d31165bc05859250db7468bf6bd8c186264
# optimized for sm_35 results in lower performance than with sm_30.
if(CUDA_VERSION VERSION_LESS "4.2")
set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20")
# optimized for sm_35 results in lower performance than with sm_30.
if(CUDA_VERSION VERSION_LESS "4.2")
set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20")
+ elseif(CUDA_VERSION VERSION_LESS "5.0")
set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30")
set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30")
+ else()
+ set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_35,code=compute_35")
endif()
# finally set the damn flags
endif()
# finally set the damn flags
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)