Fix CUDA architecture dependent issues

author Szilard Pall <pall.szilard@gmail.com>

Wed, 24 Jun 2015 21:19:46 +0000 (23:19 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Sat, 27 Jun 2015 22:27:19 +0000 (00:27 +0200)
author Szilard Pall <pall.szilard@gmail.com>
Wed, 24 Jun 2015 21:19:46 +0000 (23:19 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Sat, 27 Jun 2015 22:27:19 +0000 (00:27 +0200)
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 3ab80b45d6203daa653e36fa94e7e2337447b458..4f3efa349d93375e5f82db1768e742ec9fd5b294 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -69,10 +69,6 @@
  
  #include "nbnxn_cuda_types.h"
  
-#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
-#define USE_TEXOBJ
-#endif
-
  /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  texture<float, 1, cudaReadModeElementType> nbfp_texref;
  
@@ -281,25 +277,31 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
  }
  
  /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required(const int num_threads_z)
+static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo)
  {
      int shmem;
  
+    assert(dinfo);
+
      /* size of shmem (force-buffers/xq/atom type preloading) */
      /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
      /* i-atom x+q in shared memory */
      shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
      /* cj in shared memory, for each warp separately */
      shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
-#ifdef IATYPE_SHMEM
-    /* i-atom types in shared memory */
-    shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
-#endif
-#if __CUDA_ARCH__ < 300
-    /* force reduction buffers in shared memory */
-    shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+    /* CUDA versions below 4.2 won't generate code for sm>=3.0 */
+#if GMX_CUDA_VERSION >= 4200
+    if (dinfo->prop.major >= 3)
+    {
+        /* i-atom types in shared memory */
+        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+    }
+    if (dinfo->prop.major < 3)
  #endif
-
+    {
+        /* force reduction buffers in shared memory */
+        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+    }
      return shmem;
  }
  
@@ -435,7 +437,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
      nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
      dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
      dim_grid  = dim3(nblock, 1, 1);
-    shmem     = calc_shmem_required(num_threads_z);
+    shmem     = calc_shmem_required(num_threads_z, nb->dev_info);
  
      if (debug)
      {
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index 3c2b38bcb272712ad3935cc88566ff68cf407075..2923125d8a97265e71b1f854a0d461be48f6f106 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -118,6 +118,15 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
                                            const gmx_device_info_t *dev_info);
  
  
+
+#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
+static bool use_texobj(const gmx_device_info_t *dev_info)
+{
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    return (dev_info->prop.major >= 3);
+}
+#endif
+
  /*! Tabulates the Ewald Coulomb force and initializes the size/scale
      and the table GPU array. If called with an already allocated table,
      it just re-uploads the table.
@@ -141,7 +150,7 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
  
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
      /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-    if (dev_info->prop.major >= 3)
+    if (use_texobj(dev_info))
      {
          cudaResourceDesc rd;
          memset(&rd, 0, sizeof(rd));
@@ -372,7 +381,7 @@ static void init_nbparam(cu_nbparam_t              *nbp,
  
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
      /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-    if (dev_info->prop.major >= 3)
+    if (use_texobj(dev_info))
      {
          cudaResourceDesc rd;
          cudaTextureDesc  td;
@@ -922,7 +931,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
      {
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
          /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (dev_info->prop.major >= 3)
+        if (use_texobj(dev_info))
          {
              stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
              CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
@@ -1009,7 +1018,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
  
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
      /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-    if (nb->dev_info->prop.major >= 3)
+    if (use_texobj(nb->dev_info))
      {
          stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
          CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
@@ -1026,7 +1035,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
      {
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
          /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (nb->dev_info->prop.major >= 3)
+        if (use_texobj(nb->dev_info))
          {
              stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
              CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 4afcd4e7cfa3b3192065c753fd42da610606a0cd..d6793ca658750a1581d94259a8e233c63bcd8c58 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -58,6 +58,9 @@
  /* On Kepler pre-loading i-atom types to shmem gives a few %,
     but on Fermi it does not */
  #define IATYPE_SHMEM
+#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
+#define USE_TEXOBJ
+#endif
  #endif
  
  #if defined EL_EWALD_ANA || defined EL_EWALD_TAB
@@ -90,6 +93,21 @@
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
  
+/* Kernel launch bounds as function of NTHREAD_Z.
+ * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
+ *
+ * Note: convenience macros, need to be undef-ed at the end of the file.
+ */
+#if __CUDA_ARCH__ == 370
+#define NTHREAD_Z           (2)
+#define MIN_BLOCKS_PER_MP   (16)
+#else
+#define NTHREAD_Z           (1)
+#define MIN_BLOCKS_PER_MP   (16)
+#endif
+#define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
+
  #if __CUDA_ARCH__ >= 350
  __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
  #else
@@ -579,6 +597,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
  #undef REDUCE_SHUFFLE
  #undef IATYPE_SHMEM
+#undef USE_TEXOBJ
+
+#undef NTHREAD_Z
+#undef MIN_BLOCKS_PER_MP
+#undef THREADS_PER_BLOCK
  
  #undef EL_EWALD_ANY
  #undef EXCLUSION_FORCES
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index 769a74031b35041888f867f76bb8a17fc5faac16..2b7b4ddcad599acfb16b4396919e3ac9fbf5f648 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -44,6 +44,12 @@
  #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  #define NBNXN_CUDA_KERNEL_UTILS_CUH
  
+
+#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define USE_TEXOBJ
+#endif
+
  #define WARP_SIZE_POW2_EXPONENT     (5)
  #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
@@ -622,4 +628,6 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
  }
  #endif /* __CUDA_ARCH__ */
  
+#undef USE_TEXOBJ
+
  #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
author	Szilard Pall <pall.szilard@gmail.com>
	Wed, 24 Jun 2015 21:19:46 +0000 (23:19 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Sat, 27 Jun 2015 22:27:19 +0000 (00:27 +0200)
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh		patch \| blob \| history