#endif /* CALC_ENERGIES */
/* thread/block/warp IDs */
- const unsigned int tidxi = get_local_id(0);
- const unsigned int tidxj = get_local_id(1);
- const unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
- const unsigned int bidx = get_group_id(0);
- const unsigned int widx = tidx / WARP_SIZE; /* warp index */
+ const int tidxi = get_local_id(0);
+ const int tidxj = get_local_id(1);
+ const int tidx = (int)(get_local_id(1) * get_local_size(0) + get_local_id(0));
+ const int bidx = get_group_id(0);
+ const int widx = tidx / WARP_SIZE; /* warp index */
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
# undef LOCAL_OFFSET
# define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3)
#else
- __local float* f_buf = 0;
+ __local float* f_buf = 0;
#endif
#if !USE_SUBGROUP_ANY
/* Local buffer used to implement the __any warp vote function from CUDA.
   volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint* warp_any = (__local uint*)(LOCAL_OFFSET);
+ //NOLINTNEXTLINE(google-readability-casting)
+ volatile __local int* warp_any = (__local int*)(LOCAL_OFFSET);
#else
- __local uint gmx_unused* warp_any = 0;
+ __local int gmx_unused* warp_any = 0;
#endif
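/* In essence, the local-memory vote implemented by gmx_sub_group_any_localmem()
 * further below amounts to the following sketch:
 *   if (pred) { warp_any[widx] = 1; }  // any work item raises its warp's flag
 *   bool any = (warp_any[widx] != 0);  // all work items read the flag back
 *   warp_any[widx] = 0;                // reset the slot for the next vote
 * which emulates CUDA's __any() across the WARP_SIZE work items sharing widx.
 */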
#undef LOCAL_OFFSET
float4 xqbuf = xq[ai]
+ (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1],
- shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ shift_vec[3 * nb_sci.shift + 2], 0.0F);
xqbuf.w *= nbparam->epsfac;
xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
#ifdef IATYPE_SHMEM
float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
{
- fci_buf[ci_offset] = (float3)(0.0f);
+ fci_buf[ci_offset] = (float3)(0.0F);
}
#ifdef LJ_EWALD
#ifdef CALC_ENERGIES
- float E_lj = 0.0f;
- float E_el = 0.0f;
+ float E_lj = 0.0F;
+ float E_el = 0.0F;
# if defined EXCLUSION_FORCES /* Ewald or RF */
if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * NCL_PER_SUPERCL)
/* Divide the self term(s) equally over the j-threads, then multiply by the coefficients. */
# ifdef LJ_EWALD
E_lj /= CL_SIZE;
- E_lj *= 0.5f * ONE_SIXTH_F * lje_coeff6_6;
+ E_lj *= HALF_F * ONE_SIXTH_F * lje_coeff6_6;
# endif /* LJ_EWALD */
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
/* Correct for epsfac^2 due to adding qi^2 */
E_el /= nbparam->epsfac * CL_SIZE;
# if defined EL_RF || defined EL_CUTOFF
- E_el *= -0.5f * c_rf;
+ E_el *= -HALF_F * c_rf;
# else
E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
# endif
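/* Worked example for the prefactors above: with reaction-field the per-atom
 * self energy is -0.5 * c_rf * epsfac * qi^2. The charges were pre-scaled by
 * epsfac (xqbuf.w *= nbparam->epsfac), so the accumulated qi*qi carries
 * epsfac^2; dividing by epsfac * CL_SIZE removes one factor together with the
 * CL_SIZE-fold duplication over the j-threads before -HALF_F * c_rf is applied.
 */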
unsigned int imask = pl_cj4[j4].imei[widx].imask;
const unsigned int wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
- preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
+ preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0U);
#ifndef PRUNE_NBL
if (imask)
const float2 ljcp_j = lj_comb[aj];
#endif
- float3 fcj_buf = (float3)(0.0f);
+ float3 fcj_buf = (float3)(0.0F);
#if !defined PRUNE_NBL
# pragma unroll 8
}
#endif
- const float int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+ const float int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F;
/* cutoff & exclusion check */
#ifdef EXCLUSION_FORCES
if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
#else
- if ((r2 < rcoulomb_sq) * int_bit)
+ if ((float)(r2 < rcoulomb_sq) * int_bit != 0.0F)
#endif
{
/* load the rest of the i-atom parameters */
sig_r6 *= int_bit;
# endif /* EXCLUSION_FORCES */
- float F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+ float F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2;
#endif /* ! LJ_COMB_LB || CALC_ENERGIES */
/* Separate VDW cut-off check to enable twin-range cut-offs
* (rvdw < rcoulomb <= rlist)
*/
- const float vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ const float vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F;
F_invr *= vdw_in_range;
# ifdef CALC_ENERGIES
E_lj_p *= vdw_in_range;
E_el += qi * qj_f * (int_bit * inv_r - c_rf);
# endif
# ifdef EL_RF
- E_el += qi * qj_f * (int_bit * inv_r + 0.5f * two_k_rf * r2 - c_rf);
+ E_el += qi * qj_f * (int_bit * inv_r + HALF_F * two_k_rf * r2 - c_rf);
# endif
# ifdef EL_EWALD_ANY
- /* 1.0f - erff is faster than erfcf */
+ /* 1.0F - erff is faster than erfcf */
E_el += qi * qj_f
* (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
# endif /* EL_EWALD_ANY */
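/* For an included pair (int_bit == 1) the factor (int_bit - erf(beta*r)) * inv_r
 * equals erfc(beta*r)/r, the real-space Ewald pair energy; for an excluded pair
 * (int_bit == 0) it reduces to -erf(beta*r)/r, cancelling the reciprocal-space
 * contribution of that pair (note that r2 * inv_r == r).
 */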
const float rlistInner_sq = nbparam->rlistInner_sq;
/* thread/block/warp IDs */
- const unsigned int tidxi = get_local_id(0);
- const unsigned int tidxj = get_local_id(1);
- const unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+ const int tidxi = get_local_id(0);
+ const int tidxj = get_local_id(1);
+ const int tidx = (int)(get_local_id(1) * get_local_size(0) + get_local_id(0));
#if NTHREAD_Z == 1
- const unsigned int tidxz = 0;
+ const int tidxz = 0;
#else
- const unsigned int tidxz = get_local_id(2);
+ const int tidxz = get_local_id(2);
#endif
- const unsigned int bidx = get_group_id(0);
- const unsigned int widx = tidx / WARP_SIZE;
+ const int bidx = get_group_id(0);
+ const int widx = tidx / WARP_SIZE;
#ifdef HAVE_FRESH_LIST
const bool haveFreshList = true;
#if !USE_SUBGROUP_ANY
/* Local buffer used to implement the __any warp vote function from CUDA.
   volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint* const warp_any = (__local uint*)(LOCAL_OFFSET);
- const unsigned int warpVoteSlot = NTHREAD_Z * tidxz + widx;
+ //NOLINTNEXTLINE(google-readability-casting)
+ volatile __local int* const warp_any = (__local int*)(LOCAL_OFFSET);
+ const int warpVoteSlot = NTHREAD_Z * tidxz + widx;
/* Initialise warp vote. */
- if (tidx == 0 || tidx == 32)
+ if (tidx % (c_clSize * c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
{
warp_any[warpVoteSlot] = 0;
}
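/* For the current kernel layout (c_clSize = 8, c_nbnxnGpuClusterpairSplit = 2)
 * the modulus is 8 * 8 / 2 = 32, so work items 0 and 32 clear their warp's
 * slot, exactly the pair the old hard-coded check selected, but now derived
 * from the cluster constants.
 */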
#else
- __local uint* const warp_any = 0;
- const unsigned int warpVoteSlot = 0;
+ __local int* const warp_any = 0;
+ const int warpVoteSlot = 0;
#endif
#undef LOCAL_OFFSET
const float4 tmp = xq[ai];
const float4 xi = tmp
+ (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1],
- shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ shift_vec[3 * nb_sci.shift + 2], 0.0F);
xib[(tidxj + i) * c_clSize + tidxi] = xi;
}
}
imaskCheck = (imaskNew ^ imaskFull);
}
- preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
+ preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0U);
if (imaskCheck)
{
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
/* 1.0 / sqrt(M_PI) */
-#define M_FLOAT_1_SQRTPI 0.564189583547756f
+#define M_FLOAT_1_SQRTPI 0.564189583547756F
//-------------------
# define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
# define FBUF_STRIDE (CL_SIZE_SQ)
-# define ONE_SIXTH_F 0.16666667f
-# define ONE_TWELVETH_F 0.08333333f
+# define ONE_SIXTH_F 0.16666667F
+# define ONE_TWELVETH_F 0.08333333F
+# define HALF_F 0.5F
# ifdef __GNUC__
r = r2 * inv_r;
r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
r = r2 * inv_r;
r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
r_switch = r - nbparam->rvdw_switch;
/* Unlike in the F+E kernel, conditional is faster here */
- if (r_switch > 0.0f)
+ if (r_switch > 0.0F)
{
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
r = r2 * inv_r;
r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
/* Unlike in the F-only kernel, masking is faster here */
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
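/* The two blocks above implement the potential switch V(r) -> V(r) * sw(r)
 * with the quintic sw = 1 + V3*rs^3 + V4*rs^4 + V5*rs^5, rs = r - rvdw_switch.
 * The force inherits the product-rule term: since F_invr stores F/r, the
 * update F_invr = F_invr * sw - inv_r * E_lj * dsw is F -> F * sw - V * dsw,
 * with dsw collecting the derivative coefficients F2..F4.
 */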
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = exp(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = exp(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam->sh_lj_ewald * int_bit;
- *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = exp(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam->sh_lj_ewald * int_bit;
- *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
}
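/* In the grid-correction blocks above, poly = 1 + cr2 + cr2^2/2 is the
 * truncated Taylor series of exp(cr2), so expmcr2 * poly is the standard
 * LJ-PME damping factor g(beta_LJ * r) with cr2 = lje_coeff2 * r2; the energy
 * term is then proportional to (1 - g)/r^6, the part of the dispersion
 * handled on the grid.
 */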
{
float normalized = scale * r;
int index = (int)normalized;
- float fract2 = normalized - index;
- float fract1 = 1.0f - fract2;
+ float fract2 = normalized - (float)index;
+ float fract1 = 1.0F - fract2;
return fract1 * coulomb_tab_climg2d[index] + fract2 * coulomb_tab_climg2d[index + 1];
}
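/* Usage sketch for the lookup above: with scale = 1/spacing, a distance r
 * falling between table points index and index + 1 returns the linear
 * interpolation (1 - f) * tab[index] + f * tab[index + 1], where
 * f = scale * r - index is the fractional position (fract2 above).
 */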
/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline float pmecorrF(float z2)
{
- const float FN6 = -1.7357322914161492954e-8f;
- const float FN5 = 1.4703624142580877519e-6f;
- const float FN4 = -0.000053401640219807709149f;
- const float FN3 = 0.0010054721316683106153f;
- const float FN2 = -0.019278317264888380590f;
- const float FN1 = 0.069670166153766424023f;
- const float FN0 = -0.75225204789749321333f;
+ const float FN6 = -1.7357322914161492954e-8F;
+ const float FN5 = 1.4703624142580877519e-6F;
+ const float FN4 = -0.000053401640219807709149F;
+ const float FN3 = 0.0010054721316683106153F;
+ const float FN2 = -0.019278317264888380590F;
+ const float FN1 = 0.069670166153766424023F;
+ const float FN0 = -0.75225204789749321333F;
- const float FD4 = 0.0011193462567257629232f;
- const float FD3 = 0.014866955030185295499f;
- const float FD2 = 0.11583842382862377919f;
- const float FD1 = 0.50736591960530292870f;
- const float FD0 = 1.0f;
+ const float FD4 = 0.0011193462567257629232F;
+ const float FD3 = 0.014866955030185295499F;
+ const float FD2 = 0.11583842382862377919F;
+ const float FD1 = 0.50736591960530292870F;
+ const float FD0 = 1.0F;
float z4;
float polyFN0, polyFN1, polyFD0, polyFD1;
polyFD0 = polyFD0 * z4 + FD0;
polyFD0 = polyFD1 * z2 + polyFD0;
- polyFD0 = 1.0f / polyFD0;
+ polyFD0 = 1.0F / polyFD0;
polyFN0 = FN6 * z4 + FN4;
polyFN1 = FN5 * z4 + FN3;
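/* pmecorrF() evaluates the correction as a rational approximation
 * F(z2) ~ FN(z2) / FD(z2): both polynomials are Horner-evaluated in an
 * even/odd split over z2 and z4 as seen above, and the single division
 * (polyFD0 = 1.0F / polyFD0) forms the quotient at the end.
 */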
The reduction is performed for each line tidxj of f_buf. */
if (tidxi < 3)
{
- float f = 0.0f;
+ float f = 0.0F;
for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
{
f += f_buf[FBUF_STRIDE * tidxi + j];
f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
barrier(CLK_LOCAL_MEM_FENCE);
- int i, j;
/* Reduce the initial CL_SIZE values for each i atom to half
* every step by using CL_SIZE * i threads.
* Can't just use i as loop variable because then nvcc refuses to unroll.
*/
- i = CL_SIZE / 2;
- for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+ int i = CL_SIZE / 2;
+ for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
{
if (tidxj < i)
{
float E_el,
volatile __global float* e_lj,
volatile __global float* e_el,
- unsigned int tidx)
+ int tidx)
{
E_lj = sub_group_reduce_add(E_lj);
E_el = sub_group_reduce_add(E_el);
gmx_opencl_inline void reduce_energy_pow2(volatile __local float* buf,
volatile __global float* e_lj,
volatile __global float* e_el,
- unsigned int tidx)
+ int tidx)
{
int j;
- unsigned int i = WARP_SIZE / 2;
+ int i = WARP_SIZE / 2;
/* Can't just use i as loop variable because then nvcc refuses to unroll. */
for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
float E_el,
volatile __global float* e_lj,
volatile __global float* e_el,
- unsigned int tidx)
+ int tidx)
{
# if REDUCE_SHUFFLE
reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
# endif
}
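/* Reference sketch of what both reductions compute (E_lj_partial is a
 * hypothetical per-work-item array, for illustration only):
 *   float E_lj_total = 0.0F;
 *   for (int t = 0; t < WARP_SIZE; t++) { E_lj_total += E_lj_partial[t]; }
 * reduce_energy_shfl() obtains this via sub_group_reduce_add(), while
 * reduce_energy_pow2() halves the number of active work items in each of the
 * WARP_SIZE_LOG2 tree steps.
 */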
-gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local uint* warp_any, int widx, bool pred)
+gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
{
if (pred)
{
}
//! Returns true if the predicate is true for any work item in the warp
-gmx_opencl_inline bool gmx_sub_group_any(volatile __local uint gmx_unused* warp_any,
- int gmx_unused widx,
- bool pred)
+gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
{
# if USE_SUBGROUP_ANY
return sub_group_any(pred);