Fix CUDA architecture dependent issues

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda.cu
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 3ab80b45d6203daa653e36fa94e7e2337447b458..4f3efa349d93375e5f82db1768e742ec9fd5b294 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -69,10 +69,6 @@
  
  #include "nbnxn_cuda_types.h"
  
-#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
-#define USE_TEXOBJ
-#endif
-
  /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  texture<float, 1, cudaReadModeElementType> nbfp_texref;
  
@@ -281,25 +277,31 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
  }
  
  /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required(const int num_threads_z)
+static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo)
  {
      int shmem;
  
+    assert(dinfo);
+
      /* size of shmem (force-buffers/xq/atom type preloading) */
      /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
      /* i-atom x+q in shared memory */
      shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
      /* cj in shared memory, for each warp separately */
      shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
-#ifdef IATYPE_SHMEM
-    /* i-atom types in shared memory */
-    shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
-#endif
-#if __CUDA_ARCH__ < 300
-    /* force reduction buffers in shared memory */
-    shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+    /* CUDA versions below 4.2 won't generate code for sm>=3.0 */
+#if GMX_CUDA_VERSION >= 4200
+    if (dinfo->prop.major >= 3)
+    {
+        /* i-atom types in shared memory */
+        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+    }
+    if (dinfo->prop.major < 3)
  #endif
-
+    {
+        /* force reduction buffers in shared memory */
+        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+    }
      return shmem;
  }
  
@@ -435,7 +437,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
      nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
      dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
      dim_grid  = dim3(nblock, 1, 1);
-    shmem     = calc_shmem_required(num_threads_z);
+    shmem     = calc_shmem_required(num_threads_z, nb->dev_info);
  
      if (debug)
      {