#include "nbnxn_cuda_types.h"
-/*
- * Texture references are created at compile-time and need to be declared
- * at file scope as global variables (see http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#texture-reference-api).
- * The texture references below are used in two translation units;
- * we declare them here along the kernels that use them (when compiling legacy Fermi kernels),
- * and provide getters (see below) used by the data_mgmt module where the
- * textures are bound/unbound.
- * (In principle we could do it the other way arond, but that would likely require
- * device linking and we'd rather avoid technical hurdles.)
- */
-/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
-texture<float, 1, cudaReadModeElementType> nbfp_texref;
-
-/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
-texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
-
-/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
-
/***** The kernel declarations/definitions come here *****/
#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_VF_noprune.cu"
#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_VF_prune.cu"
#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cu"
-#else
-/* Prevent compilation in multiple compilation unit mode for CC 2.x. Although we have
- * build-time checks to prevent this, the user could manually tweaks nvcc flags
- * which would lead to buggy kernels getting compiled.
- */
-#if GMX_PTX_ARCH > 0 && GMX_PTX_ARCH <= 210 && !defined(__clang__)
-#error Due to an CUDA nvcc compiler bug, the CUDA non-bonded module can not be compiled with multiple compilation units for CC 2.x devices. If you have changed the nvcc flags manually, either use the GMX_CUDA_TARGET_* variables instead or set GMX_CUDA_NB_SINGLE_COMPILATION_UNIT=ON CMake option.
-#endif
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
/*********************************/
-/* XXX switch between chevron and cudaLaunch (supported only in CUDA >=7.0)
- -- only for benchmarking purposes */
-static const bool bUseCudaLaunchKernel =
- (GMX_CUDA_VERSION >= 7000) && (getenv("GMX_DISABLE_CUDALAUNCH") == NULL);
-
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
{
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* CUDA kernel launch-related stuff */
- int shmem, nblock;
+ int nblock;
dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
num_threads_z = 2;
}
nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
- dim_block = dim3(c_clSize, c_clSize, num_threads_z);
- dim_grid = dim3(nblock, 1, 1);
- shmem = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = nblock;
+ config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
+ config.stream = stream;
if (debug)
{
- fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %ux%ux%u\n\t"
- "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
- "\tShMem: %d\n",
- dim_block.x, dim_block.y, dim_block.z,
- dim_grid.x, dim_grid.y, plist->nsci*c_numClPerSupercl,
+ fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t"
+ "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %zu\n",
+ config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ config.gridSize[0], config.gridSize[1], plist->nsci*c_numClPerSupercl,
c_numClPerSupercl, plist->na_c,
- shmem);
+ config.sharedMemorySize);
}
- if (bUseCudaLaunchKernel)
- {
- gmx_unused void* kernel_args[4];
- kernel_args[0] = adat;
- kernel_args[1] = nbp;
- kernel_args[2] = plist;
- kernel_args[3] = &bCalcFshift;
-
-#if GMX_CUDA_VERSION >= 7000
- cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream);
-#endif
- }
- else
- {
- nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
- }
- CU_LAUNCH_ERR("k_calc_nb");
+ auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+ const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift);
+ launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs);
if (bDoTime)
{
* and j-cluster concurrency, in x, y, and z, respectively.
* - The 1D block-grid contains as many blocks as super-clusters.
*/
- int num_threads_z = c_cudaPruneKernelJ4Concurrency;
- int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
- dim3 dim_block = dim3(c_clSize, c_clSize, num_threads_z);
- dim3 dim_grid = dim3(nblock, 1, 1);
- int shmem = calc_shmem_required_prune(num_threads_z);
+ int num_threads_z = c_cudaPruneKernelJ4Concurrency;
+ int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = nblock;
+ config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
+ config.stream = stream;
if (debug)
{
- fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %ux%ux%u\n\t"
- "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
- "\tShMem: %d\n",
- dim_block.x, dim_block.y, dim_block.z,
- dim_grid.x, dim_grid.y, numSciInPart*c_numClPerSupercl,
+ fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t"
+ "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %zu\n",
+ config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ config.gridSize[0], config.gridSize[1], numSciInPart*c_numClPerSupercl,
c_numClPerSupercl, plist->na_c,
- shmem);
+ config.sharedMemorySize);
}
- if (bUseCudaLaunchKernel)
- {
- gmx_unused void* kernel_args[5];
- kernel_args[0] = adat;
- kernel_args[1] = nbp;
- kernel_args[2] = plist;
- kernel_args[3] = &numParts;
- kernel_args[4] = ∂
-
-#if GMX_CUDA_VERSION >= 7000
- if (plist->haveFreshList)
- {
- cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream);
- }
- else
- {
- cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream);
- }
-#endif
- }
- else
- {
- if (plist->haveFreshList)
- {
- nbnxn_kernel_prune_cuda<true><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
- }
- else
- {
- nbnxn_kernel_prune_cuda<false><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
- }
- }
- CU_LAUNCH_ERR("k_pruneonly");
+ auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
+ constexpr char kernelName[] = "k_pruneonly";
+ const auto &kernel = plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
+ const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
+ launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
/* TODO: consider a more elegant way to track which kernel has been called
(combined or separate 1st pass prune, rolling prune). */
}
}
-const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
-{
- return nbfp_texref;
-}
-
-const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
-{
- return nbfp_comb_texref;
-}
-
-const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
-{
- return coulomb_tab_texref;
-}
-
void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo)
{
cudaError_t stat;