* \ingroup module_mdlib
*/
-#ifndef _WARPLESS_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
* Note that this should precede the kernel_utils include.
*/
#define USE_CJ_PREFETCH 1
if (tidxz == 0)
{
- /* Pre-load i-atom x and q into shared memory */
- int ci = sci * c_numClPerSupercl + tidxj;
- int ai = ci * c_clSize + tidxi;
+ for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ int ci = sci * c_numClPerSupercl + tidxj+i;
+ int ai = ci * c_clSize + tidxi;
- /* We don't need q, but using float4 in shmem avoids bank conflicts */
- float4 tmp = xq[ai];
- float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
- xib[tidxj * c_clSize + tidxi] = xi;
+ /* We don't need q, but using float4 in shmem avoids bank conflicts */
+ float4 tmp = xq[ai];
+ float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xib[(tidxj + i) * c_clSize + tidxi] = xi;
+ }
}
barrier(CLK_LOCAL_MEM_FENCE);
}
preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck);
-#if defined _WARPLESS_SOURCE_ && USE_CJ_PREFETCH
- /* can't assume wavefront width, need to sync before we can consume cj4 from local memory */
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
if (imaskCheck)
{