From 43b41cb8cead1a67339e29ae0775dae2c23d0d3e Mon Sep 17 00:00:00 2001
From: Szilard Pall <pszilard@cbr.su.se>
Date: Wed, 18 Sep 2013 21:59:36 +0200
Subject: [PATCH] use CUDA texture objects when supported

CUDA texture objects are more efficient than texture references, their
use reduces the kernel launch overhead by up to 20%. The kernel
performance is not affected.

Change-Id: Ifa7c148eb2eea8e33ed0b2f1d8ef092d59ba768e
---
 src/mdlib/nbnxn_cuda/nbnxn_cuda.cu            |  11 +-
 src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  | 100 +++++++++++++++---
 src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh    |  18 +++-
 .../nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh   |   4 +-
 .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh    |  20 +++-
 src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h       |  21 +++-
 6 files changed, 142 insertions(+), 32 deletions(-)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index de3bdc9bf7..e65c6ed603 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -61,12 +61,15 @@
 #include "nbnxn_cuda.h"
 #include "nbnxn_cuda_data_mgmt.h"
 
+#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ > 300
+#define USE_TEXOBJ
+#endif
 
 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
-texture<float, 1, cudaReadModeElementType> tex_nbfp;
+texture<float, 1, cudaReadModeElementType> nbfp_texref;
 
 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
+texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
 
 /* Convenience defines */
 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
@@ -662,13 +665,13 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 /*! Return the reference to the nbfp texture. */
 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 {
-    return tex_nbfp;
+    return nbfp_texref;
 }
 
 /*! Return the reference to the coulomb_tab. */
 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 {
-    return tex_coulomb_tab;
+    return coulomb_tab_texref;
 }
 
 /*! Set up the cache configuration for the non-bonded kernels,
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 67e83ff897..cf9b0f7725 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -110,7 +110,8 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
     and the table GPU array. If called with an already allocated table,
     it just re-uploads the table.
  */
-static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
+static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
+                                           const cuda_dev_info_t *dev_info)
 {
     float       *ftmp, *coul_tab;
     int         tabsize;
@@ -137,10 +138,32 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
 
         nbp->coulomb_tab = coul_tab;
 
-        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
-        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
-                               coul_tab, &cd, tabsize*sizeof(*coul_tab));
-        CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed");
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (dev_info->prop.major >= 3)
+        {
+            cudaResourceDesc rd;
+            memset(&rd, 0, sizeof(rd));
+            rd.resType                  = cudaResourceTypeLinear;
+            rd.res.linear.devPtr        = nbp->coulomb_tab;
+            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+            rd.res.linear.desc.x        = 32;
+            rd.res.linear.sizeInBytes   = tabsize*sizeof(*coul_tab);
+
+            cudaTextureDesc td;
+            memset(&td, 0, sizeof(td));
+            td.readMode                 = cudaReadModeElementType;
+            stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
+            CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
+        }
+        else
+#endif
+        {
+            cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
+            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
+                                   coul_tab, &cd, tabsize*sizeof(*coul_tab));
+            CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
+        }
     }
 
     cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
@@ -279,7 +302,7 @@ static void init_nbparam(cu_nbparam_t *nbp,
     nbp->coulomb_tab = NULL;
     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
     {
-        init_ewald_coulomb_force_table(nbp);
+        init_ewald_coulomb_force_table(nbp, dev_info);
     }
 
     nnbfp = 2*ntypes*ntypes;
@@ -287,10 +310,32 @@ static void init_nbparam(cu_nbparam_t *nbp,
     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 
-    cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
-    stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
-                           nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
-    CU_RET_ERR(stat, "cudaBindTexture on nbfp failed");
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (dev_info->prop.major >= 3)
+        {
+            cudaResourceDesc rd;
+            memset(&rd, 0, sizeof(rd));
+            rd.resType                  = cudaResourceTypeLinear;
+            rd.res.linear.devPtr        = nbp->nbfp;
+            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+            rd.res.linear.desc.x        = 32;
+            rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
+
+            cudaTextureDesc td;
+            memset(&td, 0, sizeof(td));
+            td.readMode                 = cudaReadModeElementType;
+            stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
+            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+        }
+        else
+#endif
+        {
+            cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+                                   nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
+            CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+        }
 }
 
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
@@ -307,7 +352,7 @@ void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                  cu_nb->dev_info);
 
-    init_ewald_coulomb_force_table(cu_nb->nbparam);
+    init_ewald_coulomb_force_table(cu_nb->nbparam, cu_nb->dev_info);
 }
 
 /*! Initializes the pair list data structure. */
@@ -848,9 +893,21 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
 
     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
     {
-      stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
-      CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
-      cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
+
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (cu_nb->dev_info->prop.major >= 3)
+        {
+            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
+            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
+        }
+        else
+#endif
+        {
+            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
+        }
+        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
     }
 
     stat = cudaEventDestroy(cu_nb->nonlocal_done);
@@ -893,8 +950,19 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
         }
     }
 
-    stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
-    CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
+#ifdef TEXOBJ_SUPPORTED
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (cu_nb->dev_info->prop.major >= 3)
+    {
+        stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
+        CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
+    }
+    else
+#endif
+    {
+        stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
+        CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
+    }
     cu_free_buffered(nbparam->nbfp);
 
     stat = cudaFree(atdat->shift_vec);
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 437849d70e..c4a92c59d1 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -317,8 +317,14 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #endif
 
                                 /* LJ 6*C6 and 12*C12 */
-                                c6      = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej));
-                                c12     = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1);
+#ifdef USE_TEXOBJ
+                                c6      = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
+                                c12     = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+#else
+                                c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
+                                c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
+#endif /* USE_TEXOBJ */
+
 
                                 /* avoid NaN for excluded pairs at r=0 */
                                 r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
@@ -360,7 +366,13 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #if defined EL_EWALD_ANA
                                 F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
 #elif defined EL_EWALD_TAB
-                                F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
 #endif /* EL_EWALD_ANA/TAB */
 
 #ifdef CALC_ENERGIES
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
index b0012ee361..2f6f96b467 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
@@ -277,8 +277,8 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                             typei   = atom_types[ai];
 
                             /* LJ 6*C6 and 12*C12 */
-                            c6      = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej));
-                            c12     = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1);
+                            c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
+                            c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
 
                             /* avoid NaN for excluded pairs at r=0 */
                             r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index ad08ffa5e6..eaaf3a48bf 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -65,10 +65,26 @@ float interpolate_coulomb_force_r(float r, float scale)
     float   fract2 = normalized - index;
     float   fract1 = 1.0f - fract2;
 
-    return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
-            + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
+    return  fract1 * tex1Dfetch(coulomb_tab_texref, index)
+            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 }
 
+#ifdef TEXOBJ_SUPPORTED
+static inline __device__
+float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
+                                  float r, float scale)
+{
+    float   normalized = scale * r;
+    int     index = (int) normalized;
+    float   fract2 = normalized - index;
+    float   fract1 = 1.0f - fract2;
+
+    return  fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
+            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
+}
+#endif
+
+
 /*! Calculate analytical Ewald correction term. */
 static inline __device__
 float pmecorrF(float z2)
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 901e784c32..53cebe4f25 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -43,6 +43,14 @@
 #include "types/nbnxn_cuda_types_ext.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
 
+/* CUDA versions from 5.0 above support texture objects. */
+#if CUDA_VERSION >= 5000
+#define TEXOBJ_SUPPORTED
+#else  /* CUDA_VERSION */
+/* This typedef allows us to define only one version of struct cu_nbparam */
+typedef int cudaTextureObject_t;
+#endif /* CUDA_VERSION */
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -129,12 +137,15 @@ struct cu_nbparam
     float    rlist_sq;       /* pair-list cut-off                            */
     float    sh_invrc6;      /* LJ potential correction term                 */
 
-    float   *nbfp;           /* nonbonded parameter table with C6/C12 pairs  */
+    /* Non-bonded parameters - accessed through texture memory */
+    float            *nbfp;        /* nonbonded parameter table with C6/C12 pairs  */
+    cudaTextureObject_t nbfp_texobj; /* texture object bound to nbfp                 */
 
-    /* Ewald Coulomb force table */
-    int      coulomb_tab_size;
-    float    coulomb_tab_scale;
-    float   *coulomb_tab;
+    /* Ewald Coulomb force table data - accessed through texture memory */
+    int               coulomb_tab_size;
+    float             coulomb_tab_scale;
+    float            *coulomb_tab;
+    cudaTextureObject_t  coulomb_tab_texobj; /* texture object bound to coulomb_tab   */
 };
 
 /*! Pair list data */
-- 
2.22.0