# optimized for sm_35 results in lower performance than with sm_30.
# Pick the ;-separated list of nvcc -gencode options stored in _CUDA_ARCH_STR
# according to CUDA_VERSION:
#   older than 4.2 : code=sm_20, code=sm_21 and code=compute_20
#   4.2 up to 5.0  : code=sm_20, code=sm_21, code=sm_30 and code=compute_30
#   5.0 and newer  : code=sm_20, code=sm_21, code=sm_30, code=sm_35 and
#                    code=compute_35
# In every branch the last entry is the only one whose code= names a
# compute_XX target rather than an sm_XX target.
if(CUDA_VERSION VERSION_LESS "4.2")
set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_20,code=compute_20")
- else()
+ elseif(CUDA_VERSION VERSION_LESS "5.0")
set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_30,code=compute_30")
+ else()
+ set(_CUDA_ARCH_STR "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_35,code=compute_35")
endif()
# finally set the damn flags
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune)
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(64,16)
+#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)