GPU code reorganization and tweaks

author Szilárd Páll <pall.szilard@gmail.com>

Tue, 19 Jan 2016 22:29:45 +0000 (23:29 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Tue, 26 Jan 2016 23:58:34 +0000 (00:58 +0100)
author Szilárd Páll <pall.szilard@gmail.com>
Tue, 19 Jan 2016 22:29:45 +0000 (23:29 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Tue, 26 Jan 2016 23:58:34 +0000 (00:58 +0100)
diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh

index 082499aecebea9b5def90435b4f151b0ae6a2dfd..8be983365c11f7d3111cf0561c24e566fb159eda 100644 (file)
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -32,6 +32,8 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+#ifndef CUDA_ARCH_UTILS_CUH_
+#define CUDA_ARCH_UTILS_CUH_
  
  /*! \file
   *  \brief CUDA arch dependent definitions.
@@ -39,8 +41,6 @@
   *  \author Szilard Pall <pall.szilard@gmail.com>
   */
  
-/* TODO move here other CUDA arch-related defines like WARP_SIZE */
-
  /* GMX_PTX_ARCH is set to the virtual arch (PTX) version targeted by
   * the current compiler pass or zero for the host pass and it is
   * intended to be used instead of __CUDA_ARCH__.
@@ -50,3 +50,12 @@
  #else
      #define GMX_PTX_ARCH __CUDA_ARCH__
  #endif
+
+/* Until CC 5.2 and likely for the near future all NVIDIA architectures
+   have a warp size of 32, but this could change later. If it does, the
+   following constants should depend on the value of GMX_PTX_ARCH.
+ */
+static const int warp_size      = 32;
+static const int warp_size_log2 = 5;
+
+#endif /* CUDA_ARCH_UTILS_CUH_ */
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh

index f5ec468c734af37e129d0b6aad68002a25b859eb..1fa80b4c44e6d1e475d7d501e80804ad25f053e3 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -44,12 +44,6 @@
  
  #include "gromacs/utility/fatalerror.h"
  
-/* CUDA library and hardware related defines */
-/* TODO list some constants instead that can be used for consistency checks to
-   detect future devices with features that make the currect code incompatible
-   with them (e.g. expected warp size = 32, check against the dev_info->props.warpsize). */
-#define WARP_SIZE           32
-
  /* TODO error checking needs to be rewritten. We have 2 types of error checks needed
     based on where they occur in the code:
     - non performance-critical: these errors are unsafe to be ignored and must be
diff --git a/src/gromacs/gpu_utils/vectype_ops.cuh b/src/gromacs/gpu_utils/vectype_ops.cuh

index 03095bff9a7ce408397204ddc1f5d0609edaef65..8d973489fcd5b17ab1c89f336853d55b2bbd4982 100644 (file)
--- a/src/gromacs/gpu_utils/vectype_ops.cuh
+++ b/src/gromacs/gpu_utils/vectype_ops.cuh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,67 +37,67 @@
  #define VECTYPE_OPS_CUH
  
  /**** float3 ****/
-inline __host__ __device__ float3 make_float3(float s)
+__forceinline__ __host__ __device__ float3 make_float3(float s)
  {
      return make_float3(s, s, s);
  }
-inline __host__ __device__ float3 make_float3(float4 a)
+__forceinline__ __host__ __device__ float3 make_float3(float4 a)
  {
      return make_float3(a.x, a.y, a.z);
  }
-inline __host__ __device__ float3 operator-(float3 &a)
+__forceinline__ __host__ __device__ float3 operator-(float3 &a)
  {
      return make_float3(-a.x, -a.y, -a.z);
  }
-inline __host__ __device__ float3 operator+(float3 a, float3 b)
+__forceinline__ __host__ __device__ float3 operator+(float3 a, float3 b)
  {
      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
  }
-inline __host__ __device__ float3 operator-(float3 a, float3 b)
+__forceinline__ __host__ __device__ float3 operator-(float3 a, float3 b)
  {
      return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
  }
-inline __host__ __device__ float3 operator*(float3 a, float k)
+__forceinline__ __host__ __device__ float3 operator*(float3 a, float k)
  {
      return make_float3(k * a.x, k * a.y, k * a.z);
  }
-inline __host__ __device__ float3 operator*(float k, float3 a)
+__forceinline__ __host__ __device__ float3 operator*(float k, float3 a)
  {
      return make_float3(k * a.x, k * a.y, k * a.z);
  }
-inline __host__ __device__ void operator += (float3 &a, float3 b)
+__forceinline__ __host__ __device__ void operator += (float3 &a, float3 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z;
  }
-inline __host__ __device__ void operator += (float3 &a, float4 b)
+__forceinline__ __host__ __device__ void operator += (float3 &a, float4 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z;
  }
-inline __host__ __device__ void operator -= (float3 &a, float3 b)
+__forceinline__ __host__ __device__ void operator -= (float3 &a, float3 b)
  {
      a.x -= b.x; a.y -= b.y; a.z -= b.z;
  }
-inline __host__ __device__ float norm(float3 a)
+__forceinline__ __host__ __device__ float norm(float3 a)
  {
      return sqrt(a.x * a.x + a.y * a.y + a.z * a.z);
  }
-inline __host__ __device__ float norm2(float3 a)
+__forceinline__ __host__ __device__ float norm2(float3 a)
  {
      return (a.x * a.x + a.y * a.y + a.z * a.z);
  }
-inline __host__ __device__ float dist3(float3 a, float3 b)
+__forceinline__ __host__ __device__ float dist3(float3 a, float3 b)
  {
      return norm(b - a);
  }
-inline __host__ __device__ float3 operator*(float3 a, float3 b)
+__forceinline__ __host__ __device__ float3 operator*(float3 a, float3 b)
  {
      return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
  }
-inline __host__ __device__ void operator *= (float3 &a, float3 b)
+__forceinline__ __host__ __device__ void operator *= (float3 &a, float3 b)
  {
      a.x *= b.x; a.y *= b.y; a.z *= b.z;
  }
-inline __device__ void atomicAdd(float3 *addr, float3 val)
+__forceinline__ __device__ void atomicAdd(float3 *addr, float3 val)
  {
      atomicAdd(&addr->x, val.x);
      atomicAdd(&addr->y, val.y);
@@ -106,49 +106,49 @@ inline __device__ void atomicAdd(float3 *addr, float3 val)
  /****************************************************************/
  
  /**** float4 ****/
-inline __host__ __device__ float4 make_float4(float s)
+__forceinline__ __host__ __device__ float4 make_float4(float s)
  {
      return make_float4(s, s, s, s);
  }
-inline __host__ __device__ float4 make_float4(float3 a)
+__forceinline__ __host__ __device__ float4 make_float4(float3 a)
  {
      return make_float4(a.x, a.y, a.z, 0.0f);
  }
-inline __host__ __device__ float4 operator+(float4 a, float4 b)
+__forceinline__ __host__ __device__ float4 operator+(float4 a, float4 b)
  {
      return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
  }
-inline __host__ __device__ float4 operator+(float4 a, float3 b)
+__forceinline__ __host__ __device__ float4 operator+(float4 a, float3 b)
  {
      return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w);
  }
-inline __host__ __device__ float4 operator-(float4 a, float4 b)
+__forceinline__ __host__ __device__ float4 operator-(float4 a, float4 b)
  {
      return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
  }
-inline __host__ __device__ float4 operator*(float4 a, float k)
+__forceinline__ __host__ __device__ float4 operator*(float4 a, float k)
  {
      return make_float4(k * a.x, k * a.y, k * a.z, k * a.w);
  }
-inline __host__ __device__ void operator += (float4 &a, float4 b)
+__forceinline__ __host__ __device__ void operator += (float4 &a, float4 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
  }
-inline __host__ __device__ void operator += (float4 &a, float3 b)
+__forceinline__ __host__ __device__ void operator += (float4 &a, float3 b)
  {
      a.x += b.x; a.y += b.y; a.z += b.z;
  }
-inline __host__ __device__ void operator -= (float4 &a, float3 b)
+__forceinline__ __host__ __device__ void operator -= (float4 &a, float3 b)
  {
      a.x -= b.x; a.y -= b.y; a.z -= b.z;
  }
  
-inline __host__ __device__ float norm(float4 a)
+__forceinline__ __host__ __device__ float norm(float4 a)
  {
      return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w);
  }
  
-inline __host__ __device__ float dist3(float4 a, float4 b)
+__forceinline__ __host__ __device__ float dist3(float4 a, float4 b)
  {
      return norm(b - a);
  }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 5bbc6f63734cf9f4a98bcee4e1d392beeb0c24a5..31afd47ee0de46642afb999ba7c70cf261885647 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -117,38 +117,38 @@
   * - CC 3.7:                NTHREAD_Z=2, (128, 16) bounds
   */
  #if GMX_PTX_ARCH == 370
-#define NTHREAD_Z           (2)
-#define MIN_BLOCKS_PER_MP   (16)
+    #define NTHREAD_Z           (2)
+    #define MIN_BLOCKS_PER_MP   (16)
  #else
-#define NTHREAD_Z           (1)
-#define MIN_BLOCKS_PER_MP   (16)
-#endif
+    #define NTHREAD_Z           (1)
+    #define MIN_BLOCKS_PER_MP   (16)
+#endif /* GMX_PTX_ARCH == 370 */
  #define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
  
  
  #if GMX_PTX_ARCH >= 350
  #if (GMX_PTX_ARCH <= 210) && (NTHREAD_Z > 1)
-#error NTHREAD_Z > 1 will give incorrect results on CC 2.x
+    #error NTHREAD_Z > 1 will give incorrect results on CC 2.x
  #endif
  /**@}*/
  
  __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
  #else
  __launch_bounds__(THREADS_PER_BLOCK)
-#endif
+#endif /* GMX_PTX_ARCH >= 350 */
  #ifdef PRUNE_NBL
  #ifdef CALC_ENERGIES
  __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_cuda)
  #else
  __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_cuda)
-#endif
+#endif /* CALC_ENERGIES */
  #else
  #ifdef CALC_ENERGIES
  __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_cuda)
  #else
  __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
-#endif
-#endif
+#endif /* CALC_ENERGIES */
+#endif /* PRUNE_NBL */
  (const cu_atomdata_t atdat,
   const cu_nbparam_t nbparam,
   const cu_plist_t plist,
@@ -212,7 +212,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      unsigned int tidxz  = threadIdx.z;
  #endif
      unsigned int bidx   = blockIdx.x;
-    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    unsigned int widx   = tidx / warp_size; /* warp index */
  
      int          sci, ci, cj, ci_offset,
                   ai, aj,
@@ -339,7 +339,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      {
          wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
          imask       = pl_cj4[j4].imei[widx].imask;
-        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];
  
  #ifndef PRUNE_NBL
          if (imask)
@@ -617,7 +617,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
      /* flush the energies to shmem and reduce them */
      f_buf[              tidx] = E_lj;
      f_buf[FBUF_STRIDE + tidx] = E_el;
-    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+    reduce_energy_pow2(f_buf + (tidx & warp_size), e_lj, e_el, tidx & ~warp_size);
  #endif
  #endif
  }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index d61f47dbafe08c4d3d136f589013d385cb91dc95..428e0eaccecc466fab0f91a1763d405755c94ec4 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -63,10 +63,9 @@
  #define USE_TEXOBJ
  #endif
  
-#define WARP_SIZE_POW2_EXPONENT     (5)
-#define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
-#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
-#define FBUF_STRIDE                 (CL_SIZE_SQ)
+#define CL_SIZE_LOG2 (3)  /* change this together with CL_SIZE !*/
+#define CL_SIZE_SQ   (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE  (CL_SIZE_SQ)
  
  #define ONE_SIXTH_F     0.16666667f
  #define ONE_TWELVETH_F  0.08333333f
@@ -85,7 +84,7 @@ extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  
  /*! Apply force switch,  force + energy version. */
-static inline __device__
+static __forceinline__ __device__
  void calculate_force_switch_F(const  cu_nbparam_t nbparam,
                                float               c6,
                                float               c12,
@@ -111,7 +110,7 @@ void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  }
  
  /*! Apply force switch, force-only version. */
-static inline __device__
+static __forceinline__ __device__
  void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
                                  float               c6,
                                  float               c12,
@@ -146,7 +145,7 @@ void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
  }
  
  /*! Apply potential switch, force-only version. */
-static inline __device__
+static __forceinline__ __device__
  void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
                                    float               c6,
                                    float               c12,
@@ -180,7 +179,7 @@ void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
  }
  
  /*! Apply potential switch, force + energy version. */
-static inline __device__
+static __forceinline__ __device__
  void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
                                      float               c6,
                                      float               c12,
@@ -215,7 +214,7 @@ void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
  /*! Calculate LJ-PME grid force contribution with
   *  geometric combination rule.
   */
-static inline __device__
+static __forceinline__ __device__
  void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
                                      int                typei,
                                      int                typej,
@@ -246,7 +245,7 @@ void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
  /*! Calculate LJ-PME grid force + energy contribution with
   *  geometric combination rule.
   */
-static inline __device__
+static __forceinline__ __device__
  void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
                                        int                typei,
                                        int                typej,
@@ -285,7 +284,7 @@ void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
   *  We use a single F+E kernel with conditional because the performance impact
   *  of this is pretty small and LB on the CPU is anyway very slow.
   */
-static inline __device__
+static __forceinline__ __device__
  void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
                                      int                typei,
                                      int                typej,
@@ -333,7 +332,7 @@ void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
  /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
   *  Original idea: from the OpenMM project
   */
-static inline __device__
+static __forceinline__ __device__
  float interpolate_coulomb_force_r(float r, float scale)
  {
      float   normalized = scale * r;
@@ -345,7 +344,7 @@ float interpolate_coulomb_force_r(float r, float scale)
             + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
  }
  
-static inline __device__
+static __forceinline__ __device__
  float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                    float r, float scale)
  {
@@ -359,7 +358,7 @@ float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
  }
  
  /*! Calculate analytical Ewald correction term. */
-static inline __device__
+static __forceinline__ __device__
  float pmecorrF(float z2)
  {
      const float FN6 = -1.7357322914161492954e-8f;
@@ -401,7 +400,7 @@ float pmecorrF(float z2)
  /*! Final j-force reduction; this generic implementation works with
   *  arbitrary array sizes.
   */
-static inline __device__
+static __forceinline__ __device__
  void reduce_force_j_generic(float *f_buf, float3 *fout,
                              int tidxi, int tidxj, int aidx)
  {
@@ -421,7 +420,7 @@ void reduce_force_j_generic(float *f_buf, float3 *fout,
   *  array sizes and with sm >= 3.0
   */
  #if GMX_PTX_ARCH >= 300
-static inline __device__
+static __forceinline__ __device__
  void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                                int tidxi, int aidx)
  {
@@ -455,7 +454,7 @@ void reduce_force_j_warp_shfl(float3 f, float3 *fout,
   *  arbitrary array sizes.
   * TODO: add the tidxi < 3 trick
   */
-static inline __device__
+static __forceinline__ __device__
  void reduce_force_i_generic(float *f_buf, float3 *fout,
                              float *fshift_buf, bool bCalcFshift,
                              int tidxi, int tidxj, int aidx)
@@ -480,7 +479,7 @@ void reduce_force_i_generic(float *f_buf, float3 *fout,
  /*! Final i-force reduction; this implementation works only with power of two
   *  array sizes.
   */
-static inline __device__
+static __forceinline__ __device__
  void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                           float *fshift_buf, bool bCalcFshift,
                           int tidxi, int tidxj, int aidx)
@@ -493,8 +492,8 @@ void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
       * Can't just use i as loop variable because than nvcc refuses to unroll.
       */
      i = CL_SIZE/2;
-    # pragma unroll 5
-    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+#pragma unroll 5
+    for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
      {
          if (tidxj < i)
          {
@@ -526,7 +525,7 @@ void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
  /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
   *  on whether the size of the array to be reduced is power of two or not.
   */
-static inline __device__
+static __forceinline__ __device__
  void reduce_force_i(float *f_buf, float3 *f,
                      float *fshift_buf, bool bCalcFshift,
                      int tidxi, int tidxj, int ai)
@@ -545,7 +544,7 @@ void reduce_force_i(float *f_buf, float3 *f,
   *  array sizes and with sm >= 3.0
   */
  #if GMX_PTX_ARCH >= 300
-static inline __device__
+static __forceinline__ __device__
  void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                                float *fshift_buf, bool bCalcFshift,
                                int tidxj, int aidx)
@@ -583,7 +582,7 @@ void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
  /*! Energy reduction; this implementation works only with power of two
   *  array sizes.
   */
-static inline __device__
+static __forceinline__ __device__
  void reduce_energy_pow2(volatile float *buf,
                          float *e_lj, float *e_el,
                          unsigned int tidx)
@@ -591,11 +590,11 @@ void reduce_energy_pow2(volatile float *buf,
      int     i, j;
      float   e1, e2;
  
-    i = WARP_SIZE/2;
+    i = warp_size/2;
  
      /* Can't just use i as loop variable because than nvcc refuses to unroll. */
-# pragma unroll 10
-    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+#pragma unroll 10
+    for (j = warp_size_log2 - 1; j > 0; j--)
      {
          if (tidx < i)
          {
@@ -622,7 +621,7 @@ void reduce_energy_pow2(volatile float *buf,
   *  array sizes and with sm >= 3.0
   */
  #if GMX_PTX_ARCH >= 300
-static inline __device__
+static __forceinline__ __device__
  void reduce_energy_warp_shfl(float E_lj, float E_el,
                               float *e_lj, float *e_el,
                               int tidx)
@@ -639,7 +638,7 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
      }
  
      /* The first thread in the warp writes the reduced energies */
-    if (tidx == 0 || tidx == WARP_SIZE)
+    if (tidx == 0 || tidx == warp_size)
      {
          atomicAdd(e_lj, E_lj);
          atomicAdd(e_el, E_el);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

index 721a5d863c7db6cdf89f85c0e542275d2b2de6b3..bf4bdf14ba67091f84f290f31ebcad0fa8e1c6f9 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -61,10 +61,10 @@ __constant sampler_t generic_sampler     = CLK_NORMALIZED_COORDS_FALSE  /* Natur
  
  #define __device__
  
-#define WARP_SIZE_POW2_EXPONENT     (5)
-#define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
-#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
-#define FBUF_STRIDE                 (CL_SIZE_SQ)
+#define WARP_SIZE_LOG2  (5)
+#define CL_SIZE_LOG2    (3)  /* change this together with CL_SIZE !*/
+#define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE     (CL_SIZE_SQ)
  
  #define ONE_SIXTH_F     0.16666667f
  #define ONE_TWELVETH_F  0.08333333f
author	Szilárd Páll <pall.szilard@gmail.com>
	Tue, 19 Jan 2016 22:29:45 +0000 (23:29 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Tue, 26 Jan 2016 23:58:34 +0000 (00:58 +0100)
src/gromacs/gpu_utils/cuda_arch_utils.cuh		patch \| blob \| history
src/gromacs/gpu_utils/cudautils.cuh		patch \| blob \| history
src/gromacs/gpu_utils/vectype_ops.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh		patch \| blob \| history