An old tweak that was supposed to improve performance in fact had
the opposite effect. Removing this tweak — and with it the shared
memory bank conflicts it caused — improved performance by up to
2.5% in the force-only CUDA kernel.
Change-Id: I7fcb24defed2c68627457522c39805afc83b3276
cij4_start = nb_sci.cj4_ind_start; /* first ...*/
cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
cij4_start = nb_sci.cj4_ind_start; /* first ...*/
cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- /* Store the i-atom x and q in shared memory */
- /* Note: the thread indexing here is inverted with respect to the
- inner-loop as this results in slightly higher performance */
- ci = sci * NCL_PER_SUPERCL + tidxi;
- ai = ci * CL_SIZE + tidxj;
- xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift];
-#ifdef IATYPE_SHMEM
+ /* Pre-load i-atom x and q into shared memory */
ci = sci * NCL_PER_SUPERCL + tidxj;
ai = ci * CL_SIZE + tidxi;
ci = sci * NCL_PER_SUPERCL + tidxj;
ai = ci * CL_SIZE + tidxi;
+ xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
+#ifdef IATYPE_SHMEM
+ /* Pre-load the i-atom types into shared memory */
atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
#endif
__syncthreads();
atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
#endif
__syncthreads();