Allow the OpenCL cluster size (CL_SIZE) to be set to 4 for Intel devices

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_ocl / nbnxn_ocl_kernel.clh
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh

index 41d8e6988467e7703e735849124dadaae460f0a7..a40eb78610feb22add40b52903c0d7f187130438 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -255,24 +255,26 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
      cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
      cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
  
-    /* Pre-load i-atom x and q into shared memory */
-    ci = sci * NCL_PER_SUPERCL + tidxj;
-    ai = ci * CL_SIZE + tidxi;
-
-    xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
-    xqbuf.w *= nbparam->epsfac;
-    xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
+    for (i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+    {
+        /* Pre-load i-atom x and q into shared memory */
+        ci = sci * NCL_PER_SUPERCL + tidxj+i;
+        ai = ci * CL_SIZE + tidxi;
  
+        xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+        xqbuf.w *= nbparam->epsfac;
+        xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
  #ifdef IATYPE_SHMEM
  #ifndef LJ_COMB
-    /* Pre-load the i-atom types into shared memory */
-    atib[tidxj * CL_SIZE + tidxi]   = atom_types[ai];
+        /* Pre-load the i-atom types into shared memory */
+        atib[(tidxj + i) * CL_SIZE + tidxi]   = atom_types[ai];
  #else
-    ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+        ljcpib[(tidxj + i) * CL_SIZE + tidxi] = lj_comb[ai];
  #endif
  #endif
+    }
      /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
-    if (tidx == 0 || tidx == 32)
+    if (tidx == 0 || tidx == WARP_SIZE)
      {
          warp_any[widx] = 0;
      }
@@ -625,7 +627,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
          reduce_force_i(f_buf, f,
                         &fshift_buf, bCalcFshift,
                         tidxi, tidxj, ai);
-        barrier(CLK_LOCAL_MEM_FENCE);
      }
  
      /* add up local shift forces into global mem */
@@ -646,7 +647,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
      f_buf[              tidx] = E_lj;
      f_buf[FBUF_STRIDE + tidx] = E_el;
      reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
-
  #endif
  }