Allow OCL CL_SIZE to be set to 4 for Intel

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_ocl / nbnxn_ocl_kernel_pruneonly.clh
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh

index 126c47adc85344bae8de1d683a97243dfd2c61cb..6a5128c7f02738f3567ea452dfae4f7162d5b19e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -42,8 +42,8 @@
   *  \ingroup module_mdlib
   */
  
-#ifndef _WARPLESS_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
   * Note that this should precede the kernel_utils include.
   */
  #define USE_CJ_PREFETCH 1
@@ -150,14 +150,17 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
  
      if (tidxz == 0)
      {
-        /* Pre-load i-atom x and q into shared memory */
-        int ci = sci * c_numClPerSupercl + tidxj;
-        int ai = ci * c_clSize + tidxi;
+        for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+        {
+            /* Pre-load i-atom x and q into shared memory */
+            int ci = sci * c_numClPerSupercl + tidxj+i;
+            int ai = ci * c_clSize + tidxi;
  
-        /* We don't need q, but using float4 in shmem avoids bank conflicts */
-        float4 tmp = xq[ai];
-        float4 xi  = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
-        xib[tidxj * c_clSize + tidxi] = xi;
+            /* We don't need q, but using float4 in shmem avoids bank conflicts */
+            float4 tmp = xq[ai];
+            float4 xi  = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+            xib[(tidxj + i) * c_clSize + tidxi] = xi;
+        }
      }
      barrier(CLK_LOCAL_MEM_FENCE);
  
@@ -186,10 +189,6 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
          }
  
          preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck);
-#if defined _WARPLESS_SOURCE_ && USE_CJ_PREFETCH
-        /* can't assume wavefront width, need to sync before we can consume cj4 from local memory */
-        barrier(CLK_LOCAL_MEM_FENCE);
-#endif
  
          if (imaskCheck)
          {