cij4_start = nb_sci.cj4_ind_start; /* first ...*/
cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- /* Pre-load i-atom x and q into shared memory */
- ci = sci * NCL_PER_SUPERCL + tidxj;
- ai = ci * CL_SIZE + tidxi;
-
- xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
- xqbuf.w *= nbparam->epsfac;
- xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
+ for (i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ ci = sci * NCL_PER_SUPERCL + tidxj+i;
+ ai = ci * CL_SIZE + tidxi;
+ xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xqbuf.w *= nbparam->epsfac;
+ xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
- /* Pre-load the i-atom types into shared memory */
- atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+ /* Pre-load the i-atom types into shared memory */
+ atib[(tidxj + i) * CL_SIZE + tidxi] = atom_types[ai];
#else
- ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+ ljcpib[(tidxj + i) * CL_SIZE + tidxi] = lj_comb[ai];
#endif
#endif
+ }
/* Initialise warp vote. (8x8 block) 2 warps for nvidia */
- if (tidx == 0 || tidx == 32)
+ if (tidx == 0 || tidx == WARP_SIZE)
{
warp_any[widx] = 0;
}
reduce_force_i(f_buf, f,
&fshift_buf, bCalcFshift,
tidxi, tidxj, ai);
- barrier(CLK_LOCAL_MEM_FENCE);
}
/* add up local shift forces into global mem */
f_buf[ tidx] = E_lj;
f_buf[FBUF_STRIDE + tidx] = E_el;
reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
-
#endif
}