#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+/* NTHREAD_Z controls the number of j-clusters processed concurrently by the
+ * NTHREAD_Z warp-pairs of a thread block.
+ *
+ * On current architectures NTHREAD_Z == 1, which translates to 64 threads/block
+ * with 16 blocks/multiprocessor, is the fastest even though this setup gives
+ * low occupancy.
+ * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
+ * per multiprocessor is reduced proportionally to get the original number of max
+ * threads in flight.
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+#define NTHREAD_Z (1)
+/* Kernel launch bounds as a function of NTHREAD_Z. On current architectures (up
+ * to CC 3.5/5.2) the (64, 16) bounds are always used.
+ */
+#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
+#define MIN_BLOCKS_PER_MP (16/NTHREAD_Z)
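+/* Illustration (editor's sketch, assuming the usual CL_SIZE == 8): these two
+ * macros keep the maximum number of threads in flight per multiprocessor
+ * constant:
+ *   NTHREAD_Z == 1 ->  64 threads/block * 16 blocks/MP = 1024 threads
+ *   NTHREAD_Z == 2 -> 128 threads/block *  8 blocks/MP = 1024 threads
+ */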
+
+
/***** The kernels come here *****/
#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
- /* cj in shared memory, for both warps separately */
- shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+ /* cj in shared memory, for each warp separately */
+ shmem += NTHREAD_Z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
#ifdef IATYPE_SHMEM
/* i-atom types in shared memory */
shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
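+    /* Worked example (sketch, assuming the usual sizes NCL_PER_SUPERCL == 8,
+     * CL_SIZE == 8, NBNXN_GPU_JGROUP_SIZE == 4 and NTHREAD_Z == 1):
+     *   xqib: 8*8*sizeof(float4) = 1024 B, cjs: 1*2*4*sizeof(int) = 32 B,
+     *   atib: 8*8*sizeof(int) = 256 B, i.e. 1312 B total with IATYPE_SHMEM. */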
bCalcEner,
plist->bDoPrune || always_prune);
- /* kernel launch config */
+ /* Kernel launch config:
+ * - The thread block dimensions match the size of i-clusters, j-clusters,
+ * and j-cluster concurrency, in x, y, and z, respectively.
+ * - The 1D block-grid contains as many blocks as super-clusters.
+ */
nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
- dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+ dim_block = dim3(CL_SIZE, CL_SIZE, NTHREAD_Z);
dim_grid = dim3(nblock, 1, 1);
shmem = calc_shmem_required();
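+    /* Example (sketch, assuming the usual CL_SIZE == 8): with NTHREAD_Z == 2
+     * this yields dim_block = (8, 8, 2), i.e. 128 threads/block, and one block
+     * per super-cluster along the x-dimension of the grid. */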
if (debug)
{
fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
- "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %d\n",
dim_block.x, dim_block.y, dim_block.z,
dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
- NCL_PER_SUPERCL, plist->na_c);
+ NCL_PER_SUPERCL, plist->na_c,
+ shmem);
}
nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
/*
Kernel launch parameters:
- #blocks = #pair lists, blockId = pair list Id
- - #threads = CL_SIZE^2
- - shmem = CL_SIZE^2 * sizeof(float)
+ - #threads = NTHREAD_Z * CL_SIZE^2
+ - shmem = see nbnxn_cuda.cu:calc_shmem_required()
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
+
#if __CUDA_ARCH__ >= 350
-__launch_bounds__(64, 16)
+__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
+#else
+__launch_bounds__(THREADS_PER_BLOCK)
#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
unsigned int tidxi = threadIdx.x;
unsigned int tidxj = threadIdx.y;
unsigned int tidx = threadIdx.y * blockDim.x + threadIdx.x;
+ unsigned int tidxz = threadIdx.z;
unsigned int bidx = blockIdx.x;
unsigned int widx = tidx / WARP_SIZE; /* warp index */
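+    /* Note: tidx (and thus widx) does not fold in tidxz, so widx is the index
+     * of the warp within the current z-slice's warp-pair (0 or 1), matching the
+     * per-warp-pair cj and exclusion-mask storage. */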
/* shmem buffer for i x+q pre-loading */
extern __shared__ float4 xqib[];
- /* shmem buffer for cj, for both warps separately */
- int *cjs = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+ /* shmem buffer for cj, for each warp separately */
+ int *cjs = ((int *)(xqib + NCL_PER_SUPERCL * CL_SIZE)) + tidxz * 2 * NBNXN_GPU_JGROUP_SIZE;
#ifdef IATYPE_SHMEM
/* shmem buffer for i atom-type pre-loading */
- int *atib = (int *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
+ int *atib = ((int *)(xqib + NCL_PER_SUPERCL * CL_SIZE)) + NTHREAD_Z * 2 * NBNXN_GPU_JGROUP_SIZE;
#endif
#ifndef REDUCE_SHUFFLE
#ifdef IATYPE_SHMEM
float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE);
#else
- float *f_buf = (float *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
+ float *f_buf = (float *)(cjs + NTHREAD_Z * 2 * NBNXN_GPU_JGROUP_SIZE);
#endif
#endif
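+    /* Resulting shared-memory layout (sketch): xqib occupies the first
+     * NCL_PER_SUPERCL*CL_SIZE float4's; the following NTHREAD_Z * 2 *
+     * NBNXN_GPU_JGROUP_SIZE ints hold one cj slice per z-slice (cjs points at
+     * the slice of the current tidxz); atib, if used, follows the full cjs
+     * region. The f_buf branches are only reachable without REDUCE_SHUFFLE,
+     * i.e. with NTHREAD_Z == 1, where tidxz == 0 and the tidxz offset in cjs
+     * is zero. */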
cij4_start = nb_sci.cj4_ind_start; /* first ...*/
cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- /* Pre-load i-atom x and q into shared memory */
- ci = sci * NCL_PER_SUPERCL + tidxj;
- ai = ci * CL_SIZE + tidxi;
- xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
+ if (tidxz == 0)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ ci = sci * NCL_PER_SUPERCL + tidxj;
+ ai = ci * CL_SIZE + tidxi;
+ xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
#ifdef IATYPE_SHMEM
- /* Pre-load the i-atom types into shared memory */
- atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+ /* Pre-load the i-atom types into shared memory */
+ atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
#endif
+ }
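+    /* The barrier below also publishes the data pre-loaded by the tidxz == 0
+     * slice to the z-slices with tidxz > 0, which skip the loads above. */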
__syncthreads();
for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
-    /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+    /* divide the self term(s) equally over the j- and z-threads, then multiply with the coefficients. */
#ifdef LJ_EWALD
- E_lj /= CL_SIZE;
+ E_lj /= CL_SIZE*NTHREAD_Z;
E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
#endif /* LJ_EWALD */
#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
- E_el /= CL_SIZE;
+ E_el /= CL_SIZE*NTHREAD_Z;
#if defined EL_RF || defined EL_CUTOFF
E_el *= -nbparam.epsfac*0.5f*c_rf;
#else
fshift_buf = make_float3(0.0f);
/* loop over the j clusters = seen by any of the atoms in the current super-cluster */
- for (j4 = cij4_start; j4 < cij4_end; j4++)
+ for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
{
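+        /* The j4 groups are distributed round-robin over the z-slices: e.g.
+         * with NTHREAD_Z == 2, tidxz == 0 processes j4 = cij4_start,
+         * cij4_start + 2, ... and tidxz == 1 processes j4 = cij4_start + 1,
+         * cij4_start + 3, ... */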
wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
imask = pl_cj4[j4].imei[widx].imask;