From 290e36927123f3e22992778e9960371d380984a2 Mon Sep 17 00:00:00 2001
From: Szilard Pall <pall.szilard@gmail.com>
Date: Thu, 18 Jun 2015 00:59:57 +0200
Subject: [PATCH] fix minor CUDA NB kernel performance regression

Commit f2b9db26 introduced the thread index z component as a stride in
the middle j4 loop. As this index is not a compile-time constant but a
value loaded from a special register, this change caused a performance
loss of up to a few percent in the force kernels. This went unnoticed
because some architectures (cc 3.5/5.2) and some compilers (CUDA 7.0)
were barely affected.

Fix this by defining tidxz as the literal 0 when NTHREAD_Z == 1, so that
threadIdx.z is only read when more than one thread is used along z.

Change-Id: I423790e8fb01a35f7234d26ff064dcc555e73c48
---
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index ef6fad72c1..b24915c1f9 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -160,7 +160,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
     unsigned int tidxi  = threadIdx.x;
     unsigned int tidxj  = threadIdx.y;
     unsigned int tidx   = threadIdx.y * blockDim.x + threadIdx.x;
+#if NTHREAD_Z == 1
+    unsigned int tidxz  = 0;
+#else
     unsigned int tidxz  = threadIdx.z;
+#endif
     unsigned int bidx   = blockIdx.x;
     unsigned int widx   = tidx / WARP_SIZE; /* warp index */
 
-- 
2.22.0