From 17e38531c7b9eb15654be49cf9d453c2f32a84d9 Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Fri, 28 Feb 2014 22:11:08 +0100 Subject: [PATCH] Improved CUDA non-bonded kernel performance Some old tweak which was supposed to improve performance had in fact the opposite effect. Removing this tweak and with it eliminating shared memory bank conflicts it caused improved performance by up to 2.5% in the force-only CUDA kernel. Change-Id: I7fcb24defed2c68627457522c39805afc83b3276 --- src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index c4a92c59d1..b681ca81f9 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -175,15 +175,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn) cij4_start = nb_sci.cj4_ind_start; /* first ...*/ cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */ - /* Store the i-atom x and q in shared memory */ - /* Note: the thread indexing here is inverted with respect to the - inner-loop as this results in slightly higher performance */ - ci = sci * NCL_PER_SUPERCL + tidxi; - ai = ci * CL_SIZE + tidxj; - xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift]; -#ifdef IATYPE_SHMEM + /* Pre-load i-atom x and q into shared memory */ ci = sci * NCL_PER_SUPERCL + tidxj; ai = ci * CL_SIZE + tidxi; + xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift]; +#ifdef IATYPE_SHMEM + /* Pre-load the i-atom types into shared memory */ atib[tidxj * CL_SIZE + tidxi] = atom_types[ai]; #endif __syncthreads(); -- 2.22.0