Merge release-4-6 into master

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_kernel.cuh
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 22c6bb931801c58072e04aafe569ac8c8bb14b26..c7ab7cc6c62fc2db701e8b7df5d451ca8f9278fa 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -139,9 +139,11 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  
      /* shmem buffer for i x+q pre-loading */
      extern __shared__  float4 xqib[];
+    /* shmem buffer for cj, for both warps separately */
+    int *cjs     = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
  #ifdef IATYPE_SHMEM
      /* shmem buffer for i atom-type pre-loading */
-    int *atib = (int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    int *atib = (int *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
  #endif
  
  #ifndef REDUCE_SHUFFLE
@@ -149,7 +151,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef IATYPE_SHMEM
      float *f_buf = (float *)(atib + NCL_PER_SUPERCL * CL_SIZE);
  #else
-    float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    float *f_buf = (float *)(cjs + 2 * NBNXN_GPU_JGROUP_SIZE);
  #endif
  #endif
  
@@ -219,6 +221,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
          if (imask)
  #endif
          {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
              /* Unrolling this loop
                 - with pruning leads to register spilling;
                 - on Kepler is much slower;
@@ -233,7 +241,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                  {
                      mask_ji = (1U << (jm * NCL_PER_SUPERCL));
  
-                    cj      = pl_cj4[j4].cj[jm];
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
                      aj      = cj * CL_SIZE + tidxj;
  
                      /* load j atom data */