From 43b41cb8cead1a67339e29ae0775dae2c23d0d3e Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Wed, 18 Sep 2013 21:59:36 +0200 Subject: [PATCH] use CUDA texture objects when supported CUDA texture objects are more efficient than texture references, their use reduces the kernel launch overhead by up to 20%. The kernel performance is not affected. Change-Id: Ifa7c148eb2eea8e33ed0b2f1d8ef092d59ba768e --- src/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 11 +- src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 100 +++++++++++++++--- src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 18 +++- .../nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh | 4 +- .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh | 20 +++- src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 21 +++- 6 files changed, 142 insertions(+), 32 deletions(-) diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu index de3bdc9bf7..e65c6ed603 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -61,12 +61,15 @@ #include "nbnxn_cuda.h" #include "nbnxn_cuda_data_mgmt.h" +#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ > 300 +#define USE_TEXOBJ +#endif /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/ -texture tex_nbfp; +texture nbfp_texref; /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */ -texture tex_coulomb_tab; +texture coulomb_tab_texref; /* Convenience defines */ #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) @@ -662,13 +665,13 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb, /*! Return the reference to the nbfp texture. */ const struct texture& nbnxn_cuda_get_nbfp_texref() { - return tex_nbfp; + return nbfp_texref; } /*! Return the reference to the coulomb_tab. */ const struct texture& nbnxn_cuda_get_coulomb_tab_texref() { - return tex_coulomb_tab; + return coulomb_tab_texref; } /*! Set up the cache configuration for the non-bonded kernels, diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 67e83ff897..cf9b0f7725 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -110,7 +110,8 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb); and the table GPU array. If called with an already allocated table, it just re-uploads the table. */ -static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp) +static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp, + const cuda_dev_info_t *dev_info) { float *ftmp, *coul_tab; int tabsize; @@ -137,10 +138,32 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp) nbp->coulomb_tab = coul_tab; - cudaChannelFormatDesc cd = cudaCreateChannelDesc(); - stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(), - coul_tab, &cd, tabsize*sizeof(*coul_tab)); - CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed"); +#ifdef TEXOBJ_SUPPORTED + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (dev_info->prop.major >= 3) + { + cudaResourceDesc rd; + memset(&rd, 0, sizeof(rd)); + rd.resType = cudaResourceTypeLinear; + rd.res.linear.devPtr = nbp->coulomb_tab; + rd.res.linear.desc.f = cudaChannelFormatKindFloat; + rd.res.linear.desc.x = 32; + rd.res.linear.sizeInBytes = tabsize*sizeof(*coul_tab); + + cudaTextureDesc td; + memset(&td, 0, sizeof(td)); + td.readMode = cudaReadModeElementType; + stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL); + CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed"); + } + else +#endif + { + cudaChannelFormatDesc cd = cudaCreateChannelDesc(); + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(), + coul_tab, &cd, tabsize*sizeof(*coul_tab)); + CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed"); + } } cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab)); @@ -279,7 +302,7 @@ static void init_nbparam(cu_nbparam_t *nbp, nbp->coulomb_tab = NULL; if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN) { - init_ewald_coulomb_force_table(nbp); + init_ewald_coulomb_force_table(nbp, dev_info); } nnbfp = 2*ntypes*ntypes; @@ -287,10 +310,32 @@ static void init_nbparam(cu_nbparam_t *nbp, CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp"); cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp)); - cudaChannelFormatDesc cd = cudaCreateChannelDesc(); - stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(), - nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp)); - CU_RET_ERR(stat, "cudaBindTexture on nbfp failed"); +#ifdef TEXOBJ_SUPPORTED + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (dev_info->prop.major >= 3) + { + cudaResourceDesc rd; + memset(&rd, 0, sizeof(rd)); + rd.resType = cudaResourceTypeLinear; + rd.res.linear.devPtr = nbp->nbfp; + rd.res.linear.desc.f = cudaChannelFormatKindFloat; + rd.res.linear.desc.x = 32; + rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp); + + cudaTextureDesc td; + memset(&td, 0, sizeof(td)); + td.readMode = cudaReadModeElementType; + stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL); + CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed"); + } + else +#endif + { + cudaChannelFormatDesc cd = cudaCreateChannelDesc(); + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(), + nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp)); + CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed"); + } } /*! Re-generate the GPU Ewald force table, resets rlist, and update the @@ -307,7 +352,7 @@ void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb, nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw, cu_nb->dev_info); - init_ewald_coulomb_force_table(cu_nb->nbparam); + init_ewald_coulomb_force_table(cu_nb->nbparam, cu_nb->dev_info); } /*! Initializes the pair list data structure. */ @@ -848,9 +893,21 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb) if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN) { - stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref()); - CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed"); - cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size); + +#ifdef TEXOBJ_SUPPORTED + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (cu_nb->dev_info->prop.major >= 3) + { + stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj); + CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed"); + } + else +#endif + { + stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed"); + } + cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size); } stat = cudaEventDestroy(cu_nb->nonlocal_done); @@ -893,8 +950,19 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb) } } - stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref()); - CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed"); +#ifdef TEXOBJ_SUPPORTED + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (cu_nb->dev_info->prop.major >= 3) + { + stat = cudaDestroyTextureObject(nbparam->nbfp_texobj); + CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed"); + } + else +#endif + { + stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed"); + } cu_free_buffered(nbparam->nbfp); stat = cudaFree(atdat->shift_vec); diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index 437849d70e..c4a92c59d1 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -317,8 +317,14 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn) #endif /* LJ 6*C6 and 12*C12 */ - c6 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej)); - c12 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1); +#ifdef USE_TEXOBJ + c6 = tex1Dfetch(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej)); + c12 = tex1Dfetch(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1); +#else + c6 = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej)); + c12 = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1); +#endif /* USE_TEXOBJ */ + /* avoid NaN for excluded pairs at r=0 */ r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC; @@ -360,7 +366,13 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn) #if defined EL_EWALD_ANA F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3); #elif defined EL_EWALD_TAB - F_invr += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r; + F_invr += qi * qj_f * (int_bit*inv_r2 - +#ifdef USE_TEXOBJ + interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale) +#else + interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale) +#endif /* USE_TEXOBJ */ + ) * inv_r; #endif /* EL_EWALD_ANA/TAB */ #ifdef CALC_ENERGIES diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh index b0012ee361..2f6f96b467 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh @@ -277,8 +277,8 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy) typei = atom_types[ai]; /* LJ 6*C6 and 12*C12 */ - c6 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej)); - c12 = tex1Dfetch(tex_nbfp, 2 * (ntypes * typei + typej) + 1); + c6 = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej)); + c12 = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1); /* avoid NaN for excluded pairs at r=0 */ r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC; diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh index ad08ffa5e6..eaaf3a48bf 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh @@ -65,10 +65,26 @@ float interpolate_coulomb_force_r(float r, float scale) float fract2 = normalized - index; float fract1 = 1.0f - fract2; - return fract1 * tex1Dfetch(tex_coulomb_tab, index) - + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1); + return fract1 * tex1Dfetch(coulomb_tab_texref, index) + + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1); } +#ifdef TEXOBJ_SUPPORTED +static inline __device__ +float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab, + float r, float scale) +{ + float normalized = scale * r; + int index = (int) normalized; + float fract2 = normalized - index; + float fract1 = 1.0f - fract2; + + return fract1 * tex1Dfetch(texobj_coulomb_tab, index) + + fract2 * tex1Dfetch(texobj_coulomb_tab, index + 1); +} +#endif + + /*! Calculate analytical Ewald correction term. */ static inline __device__ float pmecorrF(float z2) diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h index 901e784c32..53cebe4f25 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h @@ -43,6 +43,14 @@ #include "types/nbnxn_cuda_types_ext.h" #include "../../gmxlib/cuda_tools/cudautils.cuh" +/* CUDA versions from 5.0 above support texture objects. */ +#if CUDA_VERSION >= 5000 +#define TEXOBJ_SUPPORTED +#else /* CUDA_VERSION */ +/* This typedef allows us to define only one version of struct cu_nbparam */ +typedef int cudaTextureObject_t; +#endif /* CUDA_VERSION */ + #ifdef __cplusplus extern "C" { #endif @@ -129,12 +137,15 @@ struct cu_nbparam float rlist_sq; /* pair-list cut-off */ float sh_invrc6; /* LJ potential correction term */ - float *nbfp; /* nonbonded parameter table with C6/C12 pairs */ + /* Non-bonded parameters - accessed through texture memory */ + float *nbfp; /* nonbonded parameter table with C6/C12 pairs */ + cudaTextureObject_t nbfp_texobj; /* texture object bound to nbfp */ - /* Ewald Coulomb force table */ - int coulomb_tab_size; - float coulomb_tab_scale; - float *coulomb_tab; + /* Ewald Coulomb force table data - accessed through texture memory */ + int coulomb_tab_size; + float coulomb_tab_scale; + float *coulomb_tab; + cudaTextureObject_t coulomb_tab_texobj; /* texture object bound to coulomb_tab */ }; /*! Pair list data */ -- 2.22.0