/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+ /* cj in shared memory, for both warps separately */
+ shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
#ifdef IATYPE_SHMEM
/* i-atom types in shared memory */
shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
/* shmem buffer for i x+q pre-loading */
extern __shared__ float4 xqib[];
+ /* shmem buffer for cj, for both warps separately */
+ int *cjs = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
#ifdef IATYPE_SHMEM
/* shmem buffer for i atom-type pre-loading */
- int *atib = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+ int *atib = (int *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
#endif
#ifndef REDUCE_SHUFFLE
#ifdef IATYPE_SHMEM
float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE);
#else
- float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+ float *f_buf = (float *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
#endif
#endif
if (imask)
#endif
{
+ /* Pre-load cj into shared memory on both warps separately */
+ if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+ {
+ cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+ }
+
/* Unrolling this loop
- with pruning leads to register spilling;
- on Kepler is much slower;
{
mask_ji = (1U << (jm * NCL_PER_SUPERCL));
- cj = pl_cj4[j4].cj[jm];
+ cj = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
aj = cj * CL_SIZE + tidxj;
/* load j atom data */