Merge release-4-6 into master

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_kernel.cuh
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 2d813ec61a62e31e3a9a333359c85da4824af507..07520285d97c9d3bb124bc19f9be4c2ec2f29d43 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -191,15 +191,12 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
      cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
  
-    /* Store the i-atom x and q in shared memory */
-    /* Note: the thread indexing here is inverted with respect to the
-       inner-loop as this results in slightly higher performance */
-    ci = sci * NCL_PER_SUPERCL + tidxi;
-    ai = ci * CL_SIZE + tidxj;
-    xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift];
-#ifdef IATYPE_SHMEM
+    /* Pre-load i-atom x and q into shared memory */
      ci = sci * NCL_PER_SUPERCL + tidxj;
      ai = ci * CL_SIZE + tidxi;
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
+#ifdef IATYPE_SHMEM
+    /* Pre-load the i-atom types into shared memory */
      atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
  #endif
      __syncthreads();