From 290e36927123f3e22992778e9960371d380984a2 Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Thu, 18 Jun 2015 00:59:57 +0200 Subject: [PATCH] fix minor CUDA NB kernel performance regression Commit f2b9db26 introduced the thread index z component as a stride in the middle j4 loop. As this index is not a constant but a value loaded from a special register, this change caused up to a few % performance loss in the force kernels. This went unnoticed because some architectures (cc 3.5/5.2) and some compilers (CUDA 7.0) were barely affected. Change-Id: I423790e8fb01a35f7234d26ff064dcc555e73c48 --- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index ef6fad72c1..b24915c1f9 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -160,7 +160,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) unsigned int tidxi = threadIdx.x; unsigned int tidxj = threadIdx.y; unsigned int tidx = threadIdx.y * blockDim.x + threadIdx.x; +#if NTHREAD_Z == 1 + unsigned int tidxz = 0; +#else unsigned int tidxz = threadIdx.z; +#endif unsigned int bidx = blockIdx.x; unsigned int widx = tidx / WARP_SIZE; /* warp index */ -- 2.22.0