* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+/*! \file
+ * \brief Define CUDA implementation of nbnxn_gpu.h
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
-#include <stdlib.h>
#include <assert.h>
+#include <stdlib.h>
+
+#include "gromacs/mdlib/nbnxn_gpu.h"
#if defined(_MSVC)
#include <limits>
#endif

#include <cuda.h>
-#include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
-#include "types/ishift.h"
-#include "types/force_flags.h"
-#include "../nbnxn_consts.h"
-
#ifdef TMPI_ATOMICS
#include "thread_mpi/atomic.h"
#endif
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+
#include "nbnxn_cuda_types.h"
-#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
#define USE_TEXOBJ
#endif
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+/* NTHREAD_Z controls the number of j-clusters processed concurrently per
+ * block: each block runs NTHREAD_Z warp-pairs, and each warp-pair works on
+ * its own j-cluster.
+ *
+ * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 threads/block
+ * with 16 blocks/multiprocessor, is the fastest even though this setup gives low
+ * occupancy. NTHREAD_Z > 1 results in excessive register spilling unless the
+ * minimum number of blocks per multiprocessor is reduced proportionally to keep
+ * the original maximum number of threads in flight (at slightly lower
+ * performance).
+ * - On CC 3.7 there are enough registers to double the number of threads; using
+ * NTHREAD_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
+ * with low-register use).
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+
+/* Kernel launch bounds as a function of NTHREAD_Z.
+ * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7: NTHREAD_Z=2, (128, 16) bounds
+ */
+#if __CUDA_ARCH__ == 370
+#define NTHREAD_Z (2)
+#define MIN_BLOCKS_PER_MP (16)
+#else
+#define NTHREAD_Z (1)
+#define MIN_BLOCKS_PER_MP (16)
+#endif
+#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
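+
+/* Illustrative sketch only (hypothetical kernel name, not the actual contents
+ * of the kernel headers): the two macros above are consumed roughly as
+ *
+ *   __global__ void
+ *   __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
+ *   nbnxn_kernel_example(const cu_atomdata_t atdat,
+ *                        const cu_nbparam_t  nbparam,
+ *                        const cu_plist_t    plist,
+ *                        bool                bCalcFshift)
+ *   { ... }
+ *
+ * which caps per-thread register use so that MIN_BLOCKS_PER_MP blocks of
+ * THREADS_PER_BLOCK threads can stay resident on a multiprocessor.
+ */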
+
+
/***** The kernels come here *****/
-#include "nbnxn_cuda_kernel_utils.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
/* Top-level kernel generation: will generate through multiple inclusion the
 * following flavors for all kernels:
 * - force-only output;
 * - force and energy output;
 * - force-only output with pair list pruning;
 * - force and energy output with pair list pruning.
 */
/** Force only **/
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
+
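+/* Illustrative sketch of the multiple-inclusion trick above (hypothetical
+ * macro and kernel names, not the actual contents of nbnxn_cuda_kernels.cuh):
+ * on each pass the header token-pastes a flavor-specific kernel name, along
+ * the lines of
+ *
+ *   #define CONCAT_(a, b) a ## b
+ *   #define CONCAT(a, b)  CONCAT_(a, b)
+ *   #ifdef CALC_ENERGIES
+ *   #define FLAVOR ener
+ *   #else
+ *   #define FLAVOR noener
+ *   #endif
+ *   __global__ void CONCAT(nbnxn_kernel_example_, FLAVOR)(...)
+ *   { ... }
+ *   #undef FLAVOR
+ *
+ * so that four distinct kernels are emitted from a single body as
+ * CALC_ENERGIES and PRUNE_NBL are toggled.
+ */
+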
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
const cu_nbparam_t,
static unsigned int poll_wait_pattern = (0x7FU << 23);
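+/* Illustrative sketch (hypothetical helper, not the verbatim selection code):
+ * given the kernel pointer tables used further below
+ * (nb_kfunc_*_{prune,noprune}_ptr[i][j]), picking a flavor amounts to
+ *
+ *   static inline nbnxn_cu_kfunc_ptr_t
+ *   select_kernel_example(int i, int j, bool bEner, bool bPrune)
+ *   {
+ *       return bEner ? (bPrune ? nb_kfunc_ener_prune_ptr[i][j]
+ *                              : nb_kfunc_ener_noprune_ptr[i][j])
+ *                    : (bPrune ? nb_kfunc_noener_prune_ptr[i][j]
+ *                              : nb_kfunc_noener_noprune_ptr[i][j]);
+ *   }
+ */
+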
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
-static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
{
int max_grid_x_size;
}
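+/* A minimal sketch of the kind of check the elided body above performs (an
+ * assumption, not the verbatim code): the work-unit count maps 1:1 to the
+ * block count and must fit the device's 1D grid limit,
+ *
+ *   nblock          = nwork_units;
+ *   max_grid_x_size = dinfo->prop.maxGridSize[0];
+ *   if (nblock > max_grid_x_size)
+ *   {
+ *       // too many work units for a single 1D grid: report a fatal error
+ *   }
+ *   return nblock;
+ */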
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required()
+static inline int calc_shmem_required(const int num_threads_z)
{
int shmem;
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
- /* cj in shared memory, for both warps separately */
- shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+ /* cj in shared memory, for each warp separately */
+ shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
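+    /* Worked example (assuming the usual 8x8 cluster setup: NCL_PER_SUPERCL
+     * == 8, CL_SIZE == 8, NBNXN_GPU_JGROUP_SIZE == 4):
+     *   xq: 8 * 8 * sizeof(float4)              = 1024 bytes
+     *   cj: num_threads_z * 2 * 4 * sizeof(int) = num_threads_z * 32 bytes
+     */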
#ifdef IATYPE_SHMEM
/* i-atom types in shared memory */
shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int iloc)
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int iloc)
{
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_nbparam_t *nbp = cu_nb->nbparam;
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *t = cu_nb->timers;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_nbparam_t *nbp = nb->nbparam;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *t = nb->timers;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- bool bDoTime = cu_nb->bDoTime;
+ bool bDoTime = nb->bDoTime;
    /* force the energy calculation always on or always off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
    /* When we get here all misc operations issued in the local stream are done,
so we record that in the local stream and wait for it in the nonlocal one. */
- if (cu_nb->bUseTwoStreams)
+ if (nb->bUseTwoStreams)
{
if (iloc == eintLocal)
{
- stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
+ stat = cudaEventRecord(nb->misc_ops_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
}
else
{
- stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->misc_ops_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
}
}
bCalcEner,
plist->bDoPrune || always_prune);
- /* kernel launch config */
- nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
- dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+    /* Kernel launch config:
+     * - The thread block dimensions are the i-cluster size in x, the j-cluster
+     *   size in y, and the j-cluster concurrency (num_threads_z) in z.
+     * - The 1D block-grid contains as many blocks as super-clusters.
+     */
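+    /* CC 3.7 runs the kernels with NTHREAD_Z == 2 (see the launch-bounds
+     * macros above); this host-side value must stay in sync with that
+     * device-side choice. */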
+ int num_threads_z = 1;
+ if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
+ {
+ num_threads_z = 2;
+ }
+ nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
+ dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
dim_grid = dim3(nblock, 1, 1);
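+    /* Example (assuming CL_SIZE == 8): dim_block is 8x8x1 = 64 threads/block
+     * on most devices and 8x8x2 = 128 threads/block on CC 3.7. */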
- shmem = calc_shmem_required();
+ shmem = calc_shmem_required(num_threads_z);
if (debug)
{
fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
- "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+            "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %d\n",
dim_block.x, dim_block.y, dim_block.z,
dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
- NCL_PER_SUPERCL, plist->na_c);
+ NCL_PER_SUPERCL, plist->na_c,
+ shmem);
}
nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
}
}
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc)
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int aloc)
{
cudaError_t stat;
int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
gmx_incons(stmp);
}
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_timers_t *t = cu_nb->timers;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_timers_t *t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch non-local copy-back if there was no non-local work to do */
- if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
{
return;
}
{
adat_begin = 0;
adat_len = adat->natoms_local;
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
adat_begin = adat->natoms_local;
adat_len = adat->natoms - adat->natoms_local;
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
/* beginning of timed D2H section */
CU_RET_ERR(stat, "cudaEventRecord failed");
}
- if (!cu_nb->bUseStreamSync)
+ if (!nb->bUseStreamSync)
{
/* For safety reasons set a few (5%) forces to NaN. This way even if the
polling "hack" fails with some future NVIDIA driver we'll get a crash. */
/* With DD the local D2H transfer can only start after the non-local
kernel has finished. */
- if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+ if (iloc == eintLocal && nb->bUseTwoStreams)
{
- stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
back first. */
if (iloc == eintNonlocal)
{
- stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+ stat = cudaEventRecord(nb->nonlocal_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
}
/* DtoH fshift */
if (bCalcFshift)
{
- cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
- SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+ cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
+ SHIFTS * sizeof(*nb->nbst.fshift), stream);
}
/* DtoH energies */
if (bCalcEner)
{
- cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
- sizeof(*cu_nb->nbst.e_lj), stream);
- cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
- sizeof(*cu_nb->nbst.e_el), stream);
+ cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
+ sizeof(*nb->nbst.e_lj), stream);
+ cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
+ sizeof(*nb->nbst.e_el), stream);
}
}
#endif
}
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags, int aloc,
- real *e_lj, real *e_el, rvec *fshift)
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags, int aloc,
+ real *e_lj, real *e_el, rvec *fshift)
{
/* NOTE: only implemented for single-precision at this time */
cudaError_t stat;
gmx_incons(stmp);
}
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *timers = cu_nb->timers;
- wallclock_gpu_t *timings = cu_nb->timings;
- nb_staging nbst = cu_nb->nbst;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *timers = nb->timers;
+ struct gmx_wallclock_gpu_t *timings = nb->timings;
+ nb_staging nbst = nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
    /* force the energy calculation always on or always off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
{
return;
}
/* calculate the atom data index range based on locality */
if (LOCAL_A(aloc))
{
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
- if (cu_nb->bUseStreamSync)
+ if (nb->bUseStreamSync)
{
- stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+ stat = cudaStreamSynchronize(nb->stream[iloc]);
CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
}
else
}
/* timing data accumulation */
- if (cu_nb->bDoTime)
+ if (nb->bDoTime)
{
/* only increase counter once (at local F wait) */
if (LOCAL_I(iloc))
/*! Set up the cache configuration for the non-bonded kernels.
*/
-void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
{
cudaError_t stat;
if (devinfo->prop.major >= 3)
{
/* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
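+            /* Only the status of the last flavor is stored and checked via
+             * CU_RET_ERR below; presumably this avoids set-but-unused
+             * warnings for the first three calls. */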
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
}
else
{
            /* On Fermi, preferring L1 gives ~2% higher performance */
/* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
}
CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");