#include "nbnxn_cuda_types.h"
-#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
-#define USE_TEXOBJ
-#endif
-
/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
texture<float, 1, cudaReadModeElementType> nbfp_texref;
}
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required(const int num_threads_z)
+static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo)
{
int shmem;
+ /* The device info is required here: the compute capability queried from it
+ * selects which of the CC-dependent shmem contributions below apply. */
+ assert(dinfo);
+
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
/* cj in shared memory, for each warp separately */
shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
-#ifdef IATYPE_SHMEM
- /* i-atom types in shared memory */
- shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
-#endif
-#if __CUDA_ARCH__ < 300
- /* force reduction buffers in shared memory */
- shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ /* CUDA versions below 4.2 won't generate code for sm>=3.0 */
+#if GMX_CUDA_VERSION >= 4200
+ if (dinfo->prop.major >= 3)
+ {
+ /* i-atom types in shared memory */
+ shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+ }
+ /* NOTE: deliberately a second "if" rather than "else": when
+ * GMX_CUDA_VERSION < 4200 this conditional is compiled out and the
+ * braced block below (placed after the #endif on purpose) must then
+ * execute unconditionally, since only pre-sm_30 code is generated. */
+ if (dinfo->prop.major < 3)
#endif
-
+ {
+ /* force reduction buffers in shared memory */
+ shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ }
return shmem;
}
nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
dim_grid = dim3(nblock, 1, 1);
- shmem = calc_shmem_required(num_threads_z);
+ shmem = calc_shmem_required(num_threads_z, nb->dev_info);
if (debug)
{
const gmx_device_info_t *dev_info);
+
+#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
+/*! \brief Returns true if the device given supports texture objects.
+ *
+ * Centralizes the compute-capability check so every texture-object
+ * create/destroy call site applies the same condition. */
+static bool use_texobj(const gmx_device_info_t *dev_info)
+{
+ /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+ return (dev_info->prop.major >= 3);
+}
+#endif
+
/*! Tabulates the Ewald Coulomb force and initializes the size/scale
and the table GPU array. If called with an already allocated table,
it just re-uploads the table.
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (dev_info->prop.major >= 3)
+ if (use_texobj(dev_info))
{
cudaResourceDesc rd;
memset(&rd, 0, sizeof(rd));
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (dev_info->prop.major >= 3)
+ if (use_texobj(dev_info))
{
cudaResourceDesc rd;
cudaTextureDesc td;
{
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (dev_info->prop.major >= 3)
+ if (use_texobj(dev_info))
{
stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (nb->dev_info->prop.major >= 3)
+ if (use_texobj(nb->dev_info))
{
stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
{
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (nb->dev_info->prop.major >= 3)
+ if (use_texobj(nb->dev_info))
{
stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
/* On Kepler pre-loading i-atom types to shmem gives a few %,
but on Fermi it does not */
#define IATYPE_SHMEM
+#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
+#define USE_TEXOBJ
+#endif
#endif
#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
+/* Kernel launch bounds as function of NTHREAD_Z.
+ * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7: NTHREAD_Z=2, (128, 16) bounds
+ *
+ * Note: convenience macros, need to be undef-ed at the end of the file.
+ */
+#if __CUDA_ARCH__ == 370
+#define NTHREAD_Z (2)
+#define MIN_BLOCKS_PER_MP (16)
+#else
+#define NTHREAD_Z (1)
+#define MIN_BLOCKS_PER_MP (16)
+#endif
+#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
+
#if __CUDA_ARCH__ >= 350
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
#else
#undef REDUCE_SHUFFLE
#undef IATYPE_SHMEM
+#undef USE_TEXOBJ
+
+#undef NTHREAD_Z
+#undef MIN_BLOCKS_PER_MP
+#undef THREADS_PER_BLOCK
#undef EL_EWALD_ANY
#undef EXCLUSION_FORCES
#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH
+
+#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define USE_TEXOBJ
+#endif
+
#define WARP_SIZE_POW2_EXPONENT (5)
#define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
}
#endif /* __CUDA_ARCH__ */
+#undef USE_TEXOBJ
+
#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */