Allow increasing CUDA thread block size

author Szilárd Páll <pszilard@kth.se>

Fri, 24 Oct 2014 00:37:34 +0000 (02:37 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 17 Nov 2014 13:26:57 +0000 (14:26 +0100)
author Szilárd Páll <pszilard@kth.se>
Fri, 24 Oct 2014 00:37:34 +0000 (02:37 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 17 Nov 2014 13:26:57 +0000 (14:26 +0100)
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index c024f7b02c628ea814e30ed38c23212fbde6aba4..978b74facbb583cf32aa23b1002c34f781d20dda 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -80,6 +80,26 @@ texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  
+/* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
+ * warp-pairs per block.
+ *
+ * On current architectures NTHREAD_Z == 1, translating to 64 th/block with 16
+ * blocks/multiproc, is the fastest even though this setup gives low occupancy.
+ * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
+ * per multiprocessor is reduced proportionally to get the original number of max
+ * threads in flight.
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+#define NTHREAD_Z           (1)
+/* Kernel launch bounds as function of NTHREAD_Z. On current architectures (up to
+ * CC 3.5/5.2) the (64, 16) bounds are always is used.
+ */
+#define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
+#define MIN_BLOCKS_PER_MP   (16/NTHREAD_Z)
+
+
  /***** The kernels come here *****/
  #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
  
@@ -249,8 +269,8 @@ static inline int calc_shmem_required()
      /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
      /* i-atom x+q in shared memory */
      shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
-    /* cj in shared memory, for both warps separately */
-    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+    /* cj in shared memory, for each warp separately */
+    shmem += NTHREAD_Z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
  #ifdef IATYPE_SHMEM
      /* i-atom types in shared memory */
      shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
@@ -368,19 +388,25 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
                                      bCalcEner,
                                      plist->bDoPrune || always_prune);
  
-    /* kernel launch config */
+    /* Kernel launch config:
+     * - The thread block dimensions match the size of i-clusters, j-clusters,
+     *   and j-cluster concurrency, in x, y, and z, respectively.
+     * - The 1D block-grid contains as many blocks as super-clusters.
+     */
      nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
-    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+    dim_block = dim3(CL_SIZE, CL_SIZE, NTHREAD_Z);
      dim_grid  = dim3(nblock, 1, 1);
      shmem     = calc_shmem_required();
  
      if (debug)
      {
          fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
-                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+                "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+                "\tShMem: %d\n",
                  dim_block.x, dim_block.y, dim_block.z,
                  dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
-                NCL_PER_SUPERCL, plist->na_c);
+                NCL_PER_SUPERCL, plist->na_c,
+                shmem);
      }
  
      nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 43ebddb034d2abe8861f307d693874a521f6e55e..924cc2a357e898fbee12babfd95f32877579f92f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -70,13 +70,16 @@
  /*
     Kernel launch parameters:
      - #blocks   = #pair lists, blockId = pair list Id
-    - #threads  = CL_SIZE^2
-    - shmem     = CL_SIZE^2 * sizeof(float)
+    - #threads  = NTHREAD_Z * CL_SIZE^2
+    - shmem     = see nbnxn_cuda.cu:calc_shmem_required()
  
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
+
  #if __CUDA_ARCH__ >= 350
-__launch_bounds__(64, 16)
+__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
+#else
+__launch_bounds__(THREADS_PER_BLOCK)
  #endif
  #ifdef PRUNE_NBL
  #ifdef CALC_ENERGIES
@@ -145,6 +148,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      unsigned int tidxi  = threadIdx.x;
      unsigned int tidxj  = threadIdx.y;
      unsigned int tidx   = threadIdx.y * blockDim.x + threadIdx.x;
+    unsigned int tidxz  = threadIdx.z;
      unsigned int bidx   = blockIdx.x;
      unsigned int widx   = tidx / WARP_SIZE; /* warp index */
  
@@ -172,11 +176,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
      /* shmem buffer for i x+q pre-loading */
      extern __shared__  float4 xqib[];
-    /* shmem buffer for cj, for both warps separately */
-    int *cjs     = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    /* shmem buffer for cj, for each warp separately */
+    int *cjs     = ((int *)(xqib + NCL_PER_SUPERCL * CL_SIZE)) + tidxz * 2 * NBNXN_GPU_JGROUP_SIZE;
  #ifdef IATYPE_SHMEM
      /* shmem buffer for i atom-type pre-loading */
-    int *atib = (int *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
+    int *atib    = ((int *)(xqib + NCL_PER_SUPERCL * CL_SIZE)) + NTHREAD_Z * 2 * NBNXN_GPU_JGROUP_SIZE;
  #endif
  
  #ifndef REDUCE_SHUFFLE
@@ -184,7 +188,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #ifdef IATYPE_SHMEM
      float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE);
  #else
-    float *f_buf = (float *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
+    float *f_buf = (float *)(cjs + NTHREAD_Z * 2 * NBNXN_GPU_JGROUP_SIZE);
  #endif
  #endif
  
@@ -193,14 +197,17 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
      cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
  
-    /* Pre-load i-atom x and q into shared memory */
-    ci = sci * NCL_PER_SUPERCL + tidxj;
-    ai = ci * CL_SIZE + tidxi;
-    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
+    if (tidxz == 0)
+    {
+        /* Pre-load i-atom x and q into shared memory */
+        ci = sci * NCL_PER_SUPERCL + tidxj;
+        ai = ci * CL_SIZE + tidxi;
+        xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + shift_vec[nb_sci.shift];
  #ifdef IATYPE_SHMEM
-    /* Pre-load the i-atom types into shared memory */
-    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+        /* Pre-load the i-atom types into shared memory */
+        atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
  #endif
+    }
      __syncthreads();
  
      for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
@@ -242,12 +249,12 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
          /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
  #ifdef LJ_EWALD
-        E_lj /= CL_SIZE;
+        E_lj /= CL_SIZE*NTHREAD_Z;
          E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
  #endif  /* LJ_EWALD */
  
  #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
-        E_el /= CL_SIZE;
+        E_el /= CL_SIZE*NTHREAD_Z;
  #if defined EL_RF || defined EL_CUTOFF
          E_el *= -nbparam.epsfac*0.5f*c_rf;
  #else
@@ -268,7 +275,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      fshift_buf = make_float3(0.0f);
  
      /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
-    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
      {
          wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
          imask       = pl_cj4[j4].imei[widx].imask;
author	Szilárd Páll <pszilard@kth.se>
	Fri, 24 Oct 2014 00:37:34 +0000 (02:37 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 17 Nov 2014 13:26:57 +0000 (14:26 +0100)
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history