Improved CUDA kernel performance by pre-loading cj into shared memory

author Berk Hess <hess@kth.se>

Sat, 12 Jan 2013 21:41:15 +0000 (22:41 +0100)

committer Berk Hess <hess@kth.se>

Thu, 17 Jan 2013 16:52:28 +0000 (17:52 +0100)
author Berk Hess <hess@kth.se>
Sat, 12 Jan 2013 21:41:15 +0000 (22:41 +0100)
committer Berk Hess <hess@kth.se>
Thu, 17 Jan 2013 16:52:28 +0000 (17:52 +0100)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 98d28f96b7950a8dbcf319007911fe3daea8cd39..f36104a618b3c2099812c74ba15669cd07bb101c 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -202,6 +202,8 @@ static inline int calc_shmem_required(int kver)
          /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
          /* i-atom x+q in shared memory */
          shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+        /* cj in shared memory, for both warps separately */
+        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
  #ifdef IATYPE_SHMEM
          /* i-atom types in shared memory */
          shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index df19e73adf465d664244c5dd40c723714cac8fd3..964188ae07e3aef393139c64764cee68a512c8c5 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -142,9 +142,11 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  
      /* shmem buffer for i x+q pre-loading */
      extern __shared__  float4 xqib[];
+    /* shmem buffer for cj, for both warps separately */
+    int *cjs     = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
  #ifdef IATYPE_SHMEM
      /* shmem buffer for i atom-type pre-loading */
-    int *atib = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    int *atib = (int *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
  #endif
  
  #ifndef REDUCE_SHUFFLE
@@ -152,7 +154,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef IATYPE_SHMEM
      float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE);
  #else
-    float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    float *f_buf = (float *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
  #endif
  #endif
  
@@ -222,6 +224,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
          if (imask)
  #endif
          {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
              /* Unrolling this loop
                 - with pruning leads to register spilling;
                 - on Kepler is much slower;
@@ -236,7 +244,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                  {
                      mask_ji = (1U << (jm * NCL_PER_SUPERCL));
  
-                    cj      = pl_cj4[j4].cj[jm];
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
                      aj      = cj * CL_SIZE + tidxj;
  
                      /* load j atom data */
author	Berk Hess <hess@kth.se>
	Sat, 12 Jan 2013 21:41:15 +0000 (22:41 +0100)
committer	Berk Hess <hess@kth.se>
	Thu, 17 Jan 2013 16:52:28 +0000 (17:52 +0100)
src/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch | blob | history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch | blob | history