Merge release-2018 into master
[alexxy/gromacs.git] / src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 85d0feb35502a7e6d05bd93ba53fe66595779d79..960649956abe6a938cd5566fdd9e7bd3344e72c8 100644
 
 #include "nbnxn_cuda_types.h"
 
-/*
- * Texture references are created at compile-time and need to be declared
- * at file scope as global variables (see http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#texture-reference-api).
- * The texture references below are used in two translation units;
- * we declare them here along the kernels that use them (when compiling legacy Fermi kernels),
- * and provide getters (see below) used by the data_mgmt module where the
- * textures are bound/unbound.
- * (In principle we could do it the other way around, but that would likely require
- * device linking and we'd rather avoid technical hurdles.)
- */
-/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
-texture<float, 1, cudaReadModeElementType> nbfp_texref;
-
-/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
-texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
-
-/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
-
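The texture references deleted above were needed only because the legacy texture-reference API requires compile-time, file-scope globals. Their modern replacement, texture objects, are created at runtime and passed to kernels as ordinary arguments, so neither global declarations nor device linking is needed. A minimal sketch of that style follows; the helper name is illustrative, not code from this commit:

    #include <cuda_runtime.h>

    /* Hypothetical helper: wraps a linear device buffer of floats in a
     * texture object that kernels take as a plain argument, removing the
     * need for file-scope texture references. */
    static cudaTextureObject_t makeFloatTextureObject(const float *d_data, size_t numElements)
    {
        cudaResourceDesc rd = {};
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = const_cast<float *>(d_data);
        rd.res.linear.desc        = cudaCreateChannelDesc<float>();
        rd.res.linear.sizeInBytes = numElements*sizeof(float);

        cudaTextureDesc td = {};
        td.readMode = cudaReadModeElementType;

        cudaTextureObject_t tex = 0;
        /* Error handling omitted; real code would check the returned cudaError_t. */
        cudaCreateTextureObject(&tex, &rd, &td, nullptr);
        return tex;
    }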
 
 /***** The kernel declarations/definitions come here *****/
 
@@ -124,14 +105,6 @@ texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_VF_noprune.cu"
 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_VF_prune.cu"
 #include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_pruneonly.cu"
-#else
-/* Prevent compilation in multiple compilation unit mode for CC 2.x. Although we have
- * build-time checks to prevent this, the user could manually tweak nvcc flags
- * which would lead to buggy kernels getting compiled.
- */
-#if GMX_PTX_ARCH > 0 && GMX_PTX_ARCH <= 210 && !defined(__clang__)
-#error Due to a CUDA nvcc compiler bug, the CUDA non-bonded module cannot be compiled with multiple compilation units for CC 2.x devices. If you have changed the nvcc flags manually, either use the GMX_CUDA_TARGET_* variables instead or set the GMX_CUDA_NB_SINGLE_COMPILATION_UNIT=ON CMake option.
-#endif
 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
 
 
@@ -143,11 +116,6 @@ typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 
 /*********************************/
 
-/* XXX switch between chevron and cudaLaunchKernel (supported only in CUDA >=7.0)
-   -- only for benchmarking purposes */
-static const bool bUseCudaLaunchKernel =
-    (GMX_CUDA_VERSION >= 7000) && (getenv("GMX_DISABLE_CUDALAUNCH") == NULL);
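For context, bUseCudaLaunchKernel toggled between the two launch styles sketched below; the kernel and argument names here are illustrative, not taken from this file:

    __global__ void k_example(int n) { (void)n; /* kernel body elided */ }

    static void launchBothWays(int n, dim3 grid, dim3 block, size_t shmem, cudaStream_t s)
    {
        /* Runtime-API launch (CUDA >= 7.0): arguments are passed via an
         * array of pointers to each kernel argument. */
        void *kernelArgs[] = { &n };
        cudaLaunchKernel((void *)k_example, grid, block, kernelArgs, shmem, s);

        /* Equivalent triple-chevron launch. */
        k_example<<< grid, block, shmem, s >>> (n);
    }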
-
 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
 {
@@ -331,7 +299,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
     cudaError_t          stat;
     int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
     /* CUDA kernel launch-related stuff */
-    int                  shmem, nblock;
+    int                  nblock;
     dim3                 dim_block, dim_grid;
     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 
@@ -444,38 +412,29 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
         num_threads_z = 2;
     }
     nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
-    dim_block = dim3(c_clSize, c_clSize, num_threads_z);
-    dim_grid  = dim3(nblock, 1, 1);
-    shmem     = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_clSize;
+    config.blockSize[1]     = c_clSize;
+    config.blockSize[2]     = num_threads_z;
+    config.gridSize[0]      = nblock;
+    config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
+    config.stream           = stream;
 
     if (debug)
     {
-        fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %ux%ux%u\n\t"
-                "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
-                "\tShMem: %d\n",
-                dim_block.x, dim_block.y, dim_block.z,
-                dim_grid.x, dim_grid.y, plist->nsci*c_numClPerSupercl,
+        fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n"
+                "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+                "\tShMem: %zu\n",
+                config.blockSize[0], config.blockSize[1], config.blockSize[2],
+                config.gridSize[0], config.gridSize[1], plist->nsci*c_numClPerSupercl,
                 c_numClPerSupercl, plist->na_c,
-                shmem);
+                config.sharedMemorySize);
     }
 
-    if (bUseCudaLaunchKernel)
-    {
-        gmx_unused void* kernel_args[4];
-        kernel_args[0] = adat;
-        kernel_args[1] = nbp;
-        kernel_args[2] = plist;
-        kernel_args[3] = &bCalcFshift;
-
-#if GMX_CUDA_VERSION >= 7000
-        cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream);
-#endif
-    }
-    else
-    {
-        nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
-    }
-    CU_LAUNCH_ERR("k_calc_nb");
+    auto      *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+    const auto kernelArgs  = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift);
+    launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs);
 
     if (bDoTime)
     {
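KernelLaunchConfig gathers what used to be the separate dim_block/dim_grid/shmem/stream launch parameters into one aggregate. A plausible shape for it, inferred from the fields this diff assigns and the %zu format specifiers (the defaults are assumptions, not the actual GROMACS definition):

    struct KernelLaunchConfig
    {
        size_t       gridSize[3]      = { 1, 1, 1 }; /* blocks along x, y, z */
        size_t       blockSize[3]     = { 1, 1, 1 }; /* threads per block along x, y, z */
        size_t       sharedMemorySize = 0;           /* dynamic shared memory per block, bytes */
        cudaStream_t stream           = nullptr;     /* stream the kernel is enqueued in */
    };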
@@ -573,55 +532,32 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t       *nb,
      *   and j-cluster concurrency, in x, y, and z, respectively.
      * - The 1D block-grid contains as many blocks as super-clusters.
      */
-    int  num_threads_z  = c_cudaPruneKernelJ4Concurrency;
-    int  nblock         = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
-    dim3 dim_block      = dim3(c_clSize, c_clSize, num_threads_z);
-    dim3 dim_grid       = dim3(nblock, 1, 1);
-    int  shmem          = calc_shmem_required_prune(num_threads_z);
+    int                num_threads_z  = c_cudaPruneKernelJ4Concurrency;
+    int                nblock         = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_clSize;
+    config.blockSize[1]     = c_clSize;
+    config.blockSize[2]     = num_threads_z;
+    config.gridSize[0]      = nblock;
+    config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
+    config.stream           = stream;
 
     if (debug)
     {
-        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %ux%ux%u\n\t"
-                "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
-                "\tShMem: %d\n",
-                dim_block.x, dim_block.y, dim_block.z,
-                dim_grid.x, dim_grid.y, numSciInPart*c_numClPerSupercl,
+        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n"
+                "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+                "\tShMem: %zu\n",
+                config.blockSize[0], config.blockSize[1], config.blockSize[2],
+                config.gridSize[0], config.gridSize[1], numSciInPart*c_numClPerSupercl,
                 c_numClPerSupercl, plist->na_c,
-                shmem);
+                config.sharedMemorySize);
     }
 
-    if (bUseCudaLaunchKernel)
-    {
-        gmx_unused void* kernel_args[5];
-        kernel_args[0] = adat;
-        kernel_args[1] = nbp;
-        kernel_args[2] = plist;
-        kernel_args[3] = &numParts;
-        kernel_args[4] = &part;
-
-#if GMX_CUDA_VERSION >= 7000
-        if (plist->haveFreshList)
-        {
-            cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream);
-        }
-        else
-        {
-            cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream);
-        }
-#endif
-    }
-    else
-    {
-        if (plist->haveFreshList)
-        {
-            nbnxn_kernel_prune_cuda<true><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
-        }
-        else
-        {
-            nbnxn_kernel_prune_cuda<false><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
-        }
-    }
-    CU_LAUNCH_ERR("k_pruneonly");
+    auto          *timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
+    constexpr char kernelName[] = "k_pruneonly";
+    const auto    &kernel       = plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
+    const auto     kernelArgs   = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
+    launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
 
     /* TODO: consider a more elegant way to track which kernel has been called
        (combined or separate 1st pass prune, rolling prune). */
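Taking nbnxn_kernel_prune_cuda&lt;true&gt; or &lt;false&gt; by reference selects the template instantiation once, so a single prepare/launch pair replaces the four hand-written launch branches removed above. A minimal sketch of what a type-safe launcher along these lines can look like, assuming the KernelLaunchConfig shape shown earlier (not the actual GROMACS implementation):

    template<typename... Args>
    static void launchGpuKernelSketch(void (*kernel)(Args...),
                                      const KernelLaunchConfig &config,
                                      const char               *kernelName,
                                      Args... args)
    {
        dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
        dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
        /* A function pointer to a __global__ function can be launched with
         * chevrons, exactly as the removed nb_kernel<<<...>>> call did. */
        kernel<<< gridSize, blockSize, config.sharedMemorySize, config.stream >>> (args...);
        CU_LAUNCH_ERR(kernelName);
    }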
@@ -730,21 +666,6 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
     }
 }
 
-const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
-{
-    return nbfp_texref;
-}
-
-const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
-{
-    return nbfp_comb_texref;
-}
-
-const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
-{
-    return coulomb_tab_texref;
-}
-
 void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo)
 {
     cudaError_t stat;
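The body of nbnxn_cuda_set_cacheconfig is truncated in this hunk. As a hedged sketch, a routine like this usually configures the L1/shared-memory split on devices where it is adjustable; the call below is illustrative only and is not taken from this commit:

    /* Illustrative only: request a larger shared-memory partition, which
     * suits kernels that stage cluster data in shared memory. Real code
     * would choose per device generation and per kernel. */
    stat = cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
    CU_RET_ERR(stat, "cudaDeviceSetCacheConfig failed");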