use CUDA texture objects when supported
authorSzilard Pall <pszilard@cbr.su.se>
Wed, 18 Sep 2013 19:59:36 +0000 (21:59 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Mon, 14 Oct 2013 15:00:06 +0000 (17:00 +0200)
CUDA texture objects are more efficient than texture references, their
use reduces the kernel launch overhead by up to 20%. The kernel
performance is not affected.

Change-Id: Ifa7c148eb2eea8e33ed0b2f1d8ef092d59ba768e

src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index de3bdc9bf7a2c3b55d124cfce81d072b3b940641..e65c6ed603460b66dd1f77911e1104a5bc0690a5 100644 (file)
 #include "nbnxn_cuda.h"
 #include "nbnxn_cuda_data_mgmt.h"
 
+#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ > 300
+#define USE_TEXOBJ
+#endif
 
 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
-texture<float, 1, cudaReadModeElementType> tex_nbfp;
+texture<float, 1, cudaReadModeElementType> nbfp_texref;
 
 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
+texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
 
 /* Convenience defines */
 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
@@ -662,13 +665,13 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 /*! Return the reference to the nbfp texture. */
 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 {
-    return tex_nbfp;
+    return nbfp_texref;
 }
 
 /*! Return the reference to the coulomb_tab. */
 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 {
-    return tex_coulomb_tab;
+    return coulomb_tab_texref;
 }
 
 /*! Set up the cache configuration for the non-bonded kernels,
index 67e83ff89727db683593a860e4d914265c06f312..cf9b0f772579e4c3e97077afb9a9e0d873a422cb 100644 (file)
@@ -110,7 +110,8 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
     and the table GPU array. If called with an already allocated table,
     it just re-uploads the table.
  */
-static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
+static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
+                                           const cuda_dev_info_t *dev_info)
 {
     float       *ftmp, *coul_tab;
     int         tabsize;
@@ -137,10 +138,32 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
 
         nbp->coulomb_tab = coul_tab;
 
-        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
-        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
-                               coul_tab, &cd, tabsize*sizeof(*coul_tab));
-        CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed");
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (dev_info->prop.major >= 3)
+        {
+            cudaResourceDesc rd;
+            memset(&rd, 0, sizeof(rd));
+            rd.resType                  = cudaResourceTypeLinear;
+            rd.res.linear.devPtr        = nbp->coulomb_tab;
+            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+            rd.res.linear.desc.x        = 32;
+            rd.res.linear.sizeInBytes   = tabsize*sizeof(*coul_tab);
+
+            cudaTextureDesc td;
+            memset(&td, 0, sizeof(td));
+            td.readMode                 = cudaReadModeElementType;
+            stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
+            CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
+        }
+        else
+#endif
+        {
+            cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
+            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
+                                   coul_tab, &cd, tabsize*sizeof(*coul_tab));
+            CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
+        }
     }
 
     cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
@@ -279,7 +302,7 @@ static void init_nbparam(cu_nbparam_t *nbp,
     nbp->coulomb_tab = NULL;
     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
     {
-        init_ewald_coulomb_force_table(nbp);
+        init_ewald_coulomb_force_table(nbp, dev_info);
     }
 
     nnbfp = 2*ntypes*ntypes;
@@ -287,10 +310,32 @@ static void init_nbparam(cu_nbparam_t *nbp,
     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 
-    cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
-    stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
-                           nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
-    CU_RET_ERR(stat, "cudaBindTexture on nbfp failed");
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (dev_info->prop.major >= 3)
+        {
+            cudaResourceDesc rd;
+            memset(&rd, 0, sizeof(rd));
+            rd.resType                  = cudaResourceTypeLinear;
+            rd.res.linear.devPtr        = nbp->nbfp;
+            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+            rd.res.linear.desc.x        = 32;
+            rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
+
+            cudaTextureDesc td;
+            memset(&td, 0, sizeof(td));
+            td.readMode                 = cudaReadModeElementType;
+            stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
+            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+        }
+        else
+#endif
+        {
+            cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+                                   nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
+            CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+        }
 }
 
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
@@ -307,7 +352,7 @@ void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                  cu_nb->dev_info);
 
-    init_ewald_coulomb_force_table(cu_nb->nbparam);
+    init_ewald_coulomb_force_table(cu_nb->nbparam, cu_nb->dev_info);
 }
 
 /*! Initializes the pair list data structure. */
@@ -848,9 +893,21 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
 
     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
     {
-      stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
-      CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
-      cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
+
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (cu_nb->dev_info->prop.major >= 3)
+        {
+            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
+            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
+        }
+        else
+#endif
+        {
+            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
+        }
+        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
     }
 
     stat = cudaEventDestroy(cu_nb->nonlocal_done);
@@ -893,8 +950,19 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
         }
     }
 
-    stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
-    CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
+#ifdef TEXOBJ_SUPPORTED
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (cu_nb->dev_info->prop.major >= 3)
+    {
+        stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
+        CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
+    }
+    else
+#endif
+    {
+        stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
+        CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
+    }
     cu_free_buffered(nbparam->nbfp);
 
     stat = cudaFree(atdat->shift_vec);
index 437849d70e9e460277f1c62c9e875ff1ccdf2d4d..c4a92c59d1f401f3b1e2a584bf76b77462ef6da5 100644 (file)
@@ -317,8 +317,14 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #endif
 
                                 /* LJ 6*C6 and 12*C12 */
-                                c6      = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej));
-                                c12     = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1);
+#ifdef USE_TEXOBJ
+                                c6      = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
+                                c12     = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+#else
+                                c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
+                                c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
+#endif /* USE_TEXOBJ */
+
 
                                 /* avoid NaN for excluded pairs at r=0 */
                                 r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
@@ -360,7 +366,13 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #if defined EL_EWALD_ANA
                                 F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
 #elif defined EL_EWALD_TAB
-                                F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
 #endif /* EL_EWALD_ANA/TAB */
 
 #ifdef CALC_ENERGIES
index b0012ee3619b34af8ac8630a9f2448e588cb803b..2f6f96b4677308f07961c6d053fdd2b0e4712af0 100644 (file)
@@ -277,8 +277,8 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                             typei   = atom_types[ai];
 
                             /* LJ 6*C6 and 12*C12 */
-                            c6      = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej));
-                            c12     = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1);
+                            c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
+                            c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
 
                             /* avoid NaN for excluded pairs at r=0 */
                             r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
index ad08ffa5e66b8d592506f8414f13dc3f5e858051..eaaf3a48bf0d19291ca4127872b4b2df77b29534 100644 (file)
@@ -65,10 +65,26 @@ float interpolate_coulomb_force_r(float r, float scale)
     float   fract2 = normalized - index;
     float   fract1 = 1.0f - fract2;
 
-    return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
-            + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
+    return  fract1 * tex1Dfetch(coulomb_tab_texref, index)
+            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 }
 
+#ifdef TEXOBJ_SUPPORTED
+static inline __device__
+float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
+                                  float r, float scale)
+{
+    float   normalized = scale * r;
+    int     index = (int) normalized;
+    float   fract2 = normalized - index;
+    float   fract1 = 1.0f - fract2;
+
+    return  fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
+            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
+}
+#endif
+
+
 /*! Calculate analytical Ewald correction term. */
 static inline __device__
 float pmecorrF(float z2)
index 901e784c328860761b006e8cb2ae2f38eefb9316..53cebe4f258aabcb20322461205e6ef2ed013baf 100644 (file)
 #include "types/nbnxn_cuda_types_ext.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
 
+/* CUDA versions from 5.0 above support texture objects. */
+#if CUDA_VERSION >= 5000
+#define TEXOBJ_SUPPORTED
+#else  /* CUDA_VERSION */
+/* This typedef allows us to define only one version of struct cu_nbparam */
+typedef int cudaTextureObject_t;
+#endif /* CUDA_VERSION */
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -129,12 +137,15 @@ struct cu_nbparam
     float    rlist_sq;       /* pair-list cut-off                            */
     float    sh_invrc6;      /* LJ potential correction term                 */
 
-    float   *nbfp;           /* nonbonded parameter table with C6/C12 pairs  */
+    /* Non-bonded parameters - accessed through texture memory */
+    float            *nbfp;        /* nonbonded parameter table with C6/C12 pairs  */
+    cudaTextureObject_t nbfp_texobj; /* texture object bound to nbfp                 */
 
-    /* Ewald Coulomb force table */
-    int      coulomb_tab_size;
-    float    coulomb_tab_scale;
-    float   *coulomb_tab;
+    /* Ewald Coulomb force table data - accessed through texture memory */
+    int               coulomb_tab_size;
+    float             coulomb_tab_scale;
+    float            *coulomb_tab;
+    cudaTextureObject_t  coulomb_tab_texobj; /* texture object bound to coulomb_tab   */
 };
 
 /*! Pair list data */