Improved CUDA non-bonded kernel performance

author Szilard Pall <pall.szilard@gmail.com>

Fri, 28 Feb 2014 21:11:08 +0000 (22:11 +0100)

committer Szilárd Páll <pall.szilard@gmail.com>

Fri, 28 Feb 2014 21:14:37 +0000 (22:14 +0100)
author Szilard Pall <pall.szilard@gmail.com>
Fri, 28 Feb 2014 21:11:08 +0000 (22:11 +0100)
committer Szilárd Páll <pall.szilard@gmail.com>
Fri, 28 Feb 2014 21:14:37 +0000 (22:14 +0100)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index c4a92c59d1f401f3b1e2a584bf76b77462ef6da5..b681ca81f99f5ead94ae4073fd627dfcaa1dd2ea 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -175,15 +175,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
      cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
      cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
  
-    /* Store the i-atom x and q in shared memory */
-    /* Note: the thread indexing here is inverted with respect to the
-       inner-loop as this results in slightly higher performance */
-    ci = sci * NCL_PER_SUPERCL + tidxi;
-    ai = ci * CL_SIZE + tidxj;
-    xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift];
-#ifdef IATYPE_SHMEM
+    /* Pre-load i-atom x and q into shared memory */
      ci = sci * NCL_PER_SUPERCL + tidxj;
      ai = ci * CL_SIZE + tidxi;
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
+#ifdef IATYPE_SHMEM
+    /* Pre-load the i-atom types into shared memory */
      atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
  #endif
      __syncthreads();
author	Szilard Pall <pall.szilard@gmail.com>
	Fri, 28 Feb 2014 21:11:08 +0000 (22:11 +0100)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Fri, 28 Feb 2014 21:14:37 +0000 (22:14 +0100)