From 2fed6b4c53a5b48fe748d93ab2960d9e77ecc42a Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland.schulz@intel.com>
Date: Tue, 2 Oct 2018 17:08:14 -0700
Subject: [PATCH] Fix OCL compiler warnings

Related #2661

Change-Id: I715553a40d2d913bd484f455d8e39e46632a8372
---
 .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh      |  34 +++---
 .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh  |   3 +-
 .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh      | 113 +++++++++---------
 .../mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl      |   2 -
 4 files changed, 77 insertions(+), 75 deletions(-)

diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
index aba1c026bf..c19fcf8dcd 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -110,24 +110,24 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #endif
 (
 #ifndef LJ_COMB
-    int ntypes,                               /* IN  */
-#endif
-    cl_nbparam_params_t nbparam_params,       /* IN  */
-    const __global float4 *restrict xq,       /* IN  */
-    __global float *restrict f,               /* OUT stores float3 values */
-    __global float *restrict e_lj,            /* OUT */
-    __global float *restrict e_el,            /* OUT */
-    __global float *restrict fshift,          /* OUT stores float3 values */
+    int ntypes,                                       /* IN  */
+#endif
+    cl_nbparam_params_t nbparam_params,               /* IN  */
+    const __global float4 *restrict xq,               /* IN  */
+    __global float *restrict f,                       /* OUT stores float3 values */
+    __global float *restrict gmx_unused e_lj,         /* OUT */
+    __global float *restrict gmx_unused e_el,         /* OUT */
+    __global float *restrict fshift,                  /* OUT stores float3 values */
 #ifdef LJ_COMB
-    const __global float2 *restrict lj_comb,  /* IN stores float2 values */
+    const __global float2 *restrict lj_comb,          /* IN stores float2 values */
 #else
-    const __global int *restrict atom_types,  /* IN  */
+    const __global int *restrict atom_types,          /* IN  */
 #endif
-    const __global float *restrict shift_vec, /* IN stores float3 values */
-    __constant float* nbfp_climg2d,           /* IN  */
-    __constant float* nbfp_comb_climg2d,      /* IN  */
-    __constant float* coulomb_tab_climg2d,    /* IN  */
-    const __global nbnxn_sci_t* pl_sci,       /* IN  */
+    const __global float *restrict shift_vec,         /* IN stores float3 values */
+    __constant float* gmx_unused nbfp_climg2d,        /* IN  */
+    __constant float* gmx_unused nbfp_comb_climg2d,   /* IN  */
+    __constant float* gmx_unused coulomb_tab_climg2d, /* IN  */
+    const __global nbnxn_sci_t* pl_sci,               /* IN  */
 #ifndef PRUNE_NBL
     const
 #endif
@@ -178,12 +178,12 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
     const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
 
 #define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
-    CjType cjs;
+    CjType cjs = 0;
 #if USE_CJ_PREFETCH
     /* shmem buffer for cj, for both warps separately */
     cjs = (__local int *)(LOCAL_OFFSET);
     #undef LOCAL_OFFSET
-    #define LOCAL_OFFSET cjs + 2 * c_nbnxnGpuJgroupSize
+    #define LOCAL_OFFSET (cjs + 2 * c_nbnxnGpuJgroupSize)
 #endif //USE_CJ_PREFETCH
 
 #ifdef IATYPE_SHMEM
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
index ce4b885b2f..14d993c66b 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -106,14 +106,13 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
 
     // TODO pass this value at compile-time as a macro
     const int c_nbnxnGpuClusterpairSplit = 2;
-    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 
     /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
     const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
 
     #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize)
     /* shmem buffer for i cj pre-loading */
-    CjType cjs;
+    CjType cjs = 0;
 #if USE_CJ_PREFETCH
     cjs = (((__local int *)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize);
     #undef LOCAL_OFFSET
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
index be24e7906e..753e0b897f 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -33,6 +33,8 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+#define GMX_DOUBLE 0
+
 #include "gromacs/gpu_utils/vectype_ops.clh"
 #include "gromacs/gpu_utils/device_utils.clh"
 #include "gromacs/mdlib/nbnxn_consts.h"
@@ -103,6 +105,13 @@
 #define ONE_TWELVETH_F  0.08333333f
 
 
+#ifdef __GNUC__
+/* GCC, clang, and some ICC pretending to be GCC */
+#  define gmx_unused __attribute__ ((unused))
+#else
+#  define gmx_unused
+#endif
+
 // Data structures shared between OpenCL device code and OpenCL host code
 // TODO: review, improve
 // Replaced real by float for now, to avoid including any other header
@@ -188,8 +197,7 @@ void preloadCj4Generic(__local int        *sm_cjPreload,
                        const __global int *gm_cj,
                        int                 tidxi,
                        int                 tidxj,
-                       bool                iMaskCond)
-
+                       bool gmx_unused     iMaskCond)
 {
     /* Pre-load cj into shared memory */
 #if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
@@ -201,7 +209,6 @@ void preloadCj4Generic(__local int        *sm_cjPreload,
     const int c_clSize                   = CL_SIZE;
     const int c_nbnxnGpuClusterpairSplit = 2;
     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
-
     if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
     {
         sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
@@ -236,11 +243,11 @@ typedef __local int* CjType;
  * it's ready. This function does not call a barrier.
  */
 gmx_opencl_inline
-void preloadCj4(CjType             *cjs,
-                const __global int *gm_cj,
-                int                 tidxi,
-                int                 tidxj,
-                bool                iMaskCond)
+void preloadCj4(CjType gmx_unused             *cjs,
+                const __global int gmx_unused *gm_cj,
+                int                            tidxi,
+                int                            tidxj,
+                bool                           iMaskCond)
 {
 #if USE_SUBGROUP_PRELOAD
     *cjs = preloadCj4Subgroup(gm_cj);
@@ -252,10 +259,10 @@ void preloadCj4(CjType             *cjs,
 }
 
 gmx_opencl_inline
-int loadCjPreload(__local int*        sm_cjPreload,
-                  int                 jm,
-                  int                 tidxi,
-                  int                 tidxj)
+int loadCjPreload(__local int           *        sm_cjPreload,
+                  int                            jm,
+                  int gmx_unused                 tidxi,
+                  int gmx_unused                 tidxj)
 {
 #if defined _AMD_SOURCE_
     int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
@@ -263,8 +270,7 @@ int loadCjPreload(__local int*        sm_cjPreload,
     const int c_clSize                   = CL_SIZE;
     const int c_nbnxnGpuClusterpairSplit = 2;
     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
-
-    int       warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
+    int       warpLoadOffset             = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
 #endif
     return sm_cjPreload[jm + warpLoadOffset];
 }
@@ -369,7 +375,7 @@ void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
                                   float                inv_r,
                                   float                r2,
                                   float               *F_invr,
-                                  float               *E_lj)
+                                  const float         *E_lj)
 {
     float r, r_switch;
     float sw, dsw;
@@ -430,14 +436,14 @@ void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
  *  geometric combination rule.
  */
 gmx_opencl_inline
-void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
-                                    int                typei,
-                                    int                typej,
-                                    float              r2,
-                                    float              inv_r2,
-                                    float              lje_coeff2,
-                                    float              lje_coeff6_6,
-                                    float             *F_invr)
+void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
+                                    int                     typei,
+                                    int                     typej,
+                                    float                   r2,
+                                    float                   inv_r2,
+                                    float                   lje_coeff2,
+                                    float                   lje_coeff6_6,
+                                    float                  *F_invr)
 {
     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 
@@ -457,17 +463,17 @@ void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
  *  geometric combination rule.
  */
 gmx_opencl_inline
-void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
-                                      cl_nbparam_params_t *nbparam,
-                                      int                  typei,
-                                      int                  typej,
-                                      float                r2,
-                                      float                inv_r2,
-                                      float                lje_coeff2,
-                                      float                lje_coeff6_6,
-                                      float                int_bit,
-                                      float               *F_invr,
-                                      float               *E_lj)
+void calculate_lj_ewald_comb_geom_F_E(__constant const float *nbfp_comb_climg2d,
+                                      cl_nbparam_params_t    *nbparam,
+                                      int                     typei,
+                                      int                     typej,
+                                      float                   r2,
+                                      float                   inv_r2,
+                                      float                   lje_coeff2,
+                                      float                   lje_coeff6_6,
+                                      float                   int_bit,
+                                      float                  *F_invr,
+                                      float                  *E_lj)
 {
     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 
@@ -493,18 +499,18 @@ void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
  *  of this is pretty small and LB on the CPU is anyway very slow.
  */
 gmx_opencl_inline
-void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
-                                    cl_nbparam_params_t *nbparam,
-                                    int                  typei,
-                                    int                  typej,
-                                    float                r2,
-                                    float                inv_r2,
-                                    float                lje_coeff2,
-                                    float                lje_coeff6_6,
-                                    float                int_bit,
-                                    bool                 with_E_lj,
-                                    float               *F_invr,
-                                    float               *E_lj)
+void calculate_lj_ewald_comb_LB_F_E(__constant const float *nbfp_comb_climg2d,
+                                    cl_nbparam_params_t    *nbparam,
+                                    int                     typei,
+                                    int                     typej,
+                                    float                   r2,
+                                    float                   inv_r2,
+                                    float                   lje_coeff2,
+                                    float                   lje_coeff6_6,
+                                    float                   int_bit,
+                                    bool                    with_E_lj,
+                                    float                  *F_invr,
+                                    float                  *E_lj)
 {
     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
     float sigma, sigma2, epsilon;
@@ -540,9 +546,9 @@ void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
  *  Original idea: from the OpenMM project
  */
 gmx_opencl_inline float
-interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
-                            float             r,
-                            float             scale)
+interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
+                            float                   r,
+                            float                   scale)
 {
     float   normalized = scale * r;
     int     index      = (int) normalized;
@@ -835,10 +841,9 @@ void reduce_energy_pow2(volatile __local float  *buf,
                         volatile __global float *e_el,
                         unsigned int             tidx)
 {
-    int     i, j;
-    float   e1, e2;
+    int          j;
 
-    i = WARP_SIZE/2;
+    unsigned int i = WARP_SIZE/2;
 
     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
     for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
@@ -854,8 +859,8 @@ void reduce_energy_pow2(volatile __local float  *buf,
     /* last reduction step, writing to global mem */
     if (tidx == 0)
     {
-        e1 = buf[              tidx] + buf[              tidx + i];
-        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+        float e1 = buf[              tidx] + buf[              tidx + i];
+        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 
         atomicAdd_g_f(e_lj, e1);
         atomicAdd_g_f(e_el, e2);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
index 0cdb06128f..72edb2c9c5 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
@@ -32,8 +32,6 @@
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#define __IN_OPENCL_KERNEL__
-
 /* Auxiliary kernels */
 __kernel void
 memset_f3(__global float3 *buf, const float value, const unsigned int Nbuf)
-- 
2.22.0