From: Szilárd Páll Date: Wed, 11 Sep 2019 22:41:01 +0000 (+0200) Subject: Fix clang-tidy 8 warnings in OpenCL nbnxm kernels X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=e3d577301e895c2a69311e341ba3bc16fa86c514;p=alexxy%2Fgromacs.git Fix clang-tidy 8 warnings in OpenCL nbnxm kernels Since the bump from v7 to 8 new warnings need to be dealt with. Change-Id: I392308a01087797dcbf3522df53c7fbd7f1fa31a --- diff --git a/src/gromacs/nbnxm/opencl/CMakeLists.txt b/src/gromacs/nbnxm/opencl/CMakeLists.txt index 7f05efbd71..d9e5b32e42 100644 --- a/src/gromacs/nbnxm/opencl/CMakeLists.txt +++ b/src/gromacs/nbnxm/opencl/CMakeLists.txt @@ -54,7 +54,8 @@ set(VDW_DEFS "-DLJ_EWALD_COMB_LB\;-DVDWNAME=_VdwLJEwCombLB") if(CLANG_TIDY_EXE) set(OCL_COMPILER "${CLANG_TIDY_EXE}") - set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo;--;${CMAKE_C_COMPILER}") + # TODO: remove readability-isolate-declaration when the nbnxm OpenCL kernels get modernized + set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-isolate-declaration,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo;--;${CMAKE_C_COMPILER}") else() set(OCL_COMPILER "${CMAKE_C_COMPILER}") endif() diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh index abaff9bc5c..cbdb45f01a 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh @@ -171,11 +171,11 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif /* CALC_ENERGIES */ /* thread/block/warp id-s */ - const unsigned int tidxi = get_local_id(0); - const unsigned int tidxj = get_local_id(1); - const unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - const unsigned int bidx = get_group_id(0); - const unsigned int widx = tidx / WARP_SIZE; /* warp index */ + const int tidxi = get_local_id(0); + const int tidxj = get_local_id(1); + const int tidx = (int)(get_local_id(1) * get_local_size(0) + get_local_id(0)); + const int bidx = get_group_id(0); + const int widx = tidx / WARP_SIZE; /* warp index */ /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */ const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U); @@ -208,14 +208,15 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) # undef LOCAL_OFFSET # define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3) #else - __local float* f_buf = 0; + __local float* f_buf = 0; #endif #if !USE_SUBGROUP_ANY /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. */ - volatile __local uint* warp_any = (__local uint*)(LOCAL_OFFSET); + //NOLINTNEXTLINE(google-readability-casting) + volatile __local int* warp_any = (__local int*)(LOCAL_OFFSET); #else - __local uint gmx_unused* warp_any = 0; + __local int gmx_unused* warp_any = 0; #endif #undef LOCAL_OFFSET @@ -232,7 +233,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) float4 xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], - shift_vec[3 * nb_sci.shift + 2], 0.0f); + shift_vec[3 * nb_sci.shift + 2], 0.0F); xqbuf.w *= nbparam->epsfac; xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf; #ifdef IATYPE_SHMEM @@ -256,7 +257,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) { - fci_buf[ci_offset] = (float3)(0.0f); + fci_buf[ci_offset] = (float3)(0.0F); } #ifdef LJ_EWALD @@ -267,8 +268,8 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #ifdef CALC_ENERGIES - float E_lj = 0.0f; - float E_el = 0.0f; + float E_lj = 0.0F; + float E_el = 0.0F; # if defined EXCLUSION_FORCES /* Ewald or RF */ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * NCL_PER_SUPERCL) @@ -288,14 +289,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */ # ifdef LJ_EWALD E_lj /= CL_SIZE; - E_lj *= 0.5f * ONE_SIXTH_F * lje_coeff6_6; + E_lj *= HALF_F * ONE_SIXTH_F * lje_coeff6_6; # endif /* LJ_EWALD */ # if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF /* Correct for epsfac^2 due to adding qi^2 */ E_el /= nbparam->epsfac * CL_SIZE; # if defined EL_RF || defined EL_CUTOFF - E_el *= -0.5f * c_rf; + E_el *= -HALF_F * c_rf; # else E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ # endif @@ -316,7 +317,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) unsigned int imask = pl_cj4[j4].imei[widx].imask; const unsigned int wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)]; - preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); + preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0U); #ifndef PRUNE_NBL if (imask) @@ -351,7 +352,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) const float2 ljcp_j = lj_comb[aj]; #endif - float3 fcj_buf = (float3)(0.0f); + float3 fcj_buf = (float3)(0.0F); #if !defined PRUNE_NBL # pragma unroll 8 @@ -377,13 +378,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) } #endif - const float int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f; + const float int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F; /* cutoff & exclusion check */ #ifdef EXCLUSION_FORCES if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) #else - if ((r2 < rcoulomb_sq) * int_bit) + if ((float)(r2 < rcoulomb_sq) * int_bit != 0.0F) #endif { /* load the rest of the i-atom parameters */ @@ -451,7 +452,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) sig_r6 *= int_bit; # endif /* EXCLUSION_FORCES */ - float F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2; + float F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2; #endif /* ! LJ_COMB_LB || CALC_ENERGIES */ @@ -498,7 +499,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) /* Separate VDW cut-off check to enable twin-range cut-offs * (rvdw < rcoulomb <= rlist) */ - const float vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f; + const float vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F; F_invr *= vdw_in_range; # ifdef CALC_ENERGIES E_lj_p *= vdw_in_range; @@ -537,10 +538,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) E_el += qi * qj_f * (int_bit * inv_r - c_rf); # endif # ifdef EL_RF - E_el += qi * qj_f * (int_bit * inv_r + 0.5f * two_k_rf * r2 - c_rf); + E_el += qi * qj_f * (int_bit * inv_r + HALF_F * two_k_rf * r2 - c_rf); # endif # ifdef EL_EWALD_ANY - /* 1.0f - erff is faster than erfcf */ + /* 1.0F - erff is faster than erfcf */ E_el += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift); # endif /* EL_EWALD_ANY */ diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh index 96a40b60d6..475c1d68b0 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_pruneonly.clh @@ -83,16 +83,16 @@ nbnxn_kernel_prune_rolling_opencl const float rlistInner_sq = nbparam->rlistInner_sq; /* thread/block/warp id-s */ - const unsigned int tidxi = get_local_id(0); - const unsigned int tidxj = get_local_id(1); - const unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + const int tidxi = get_local_id(0); + const int tidxj = get_local_id(1); + const int tidx = (int)(get_local_id(1) * get_local_size(0) + get_local_id(0)); #if NTHREAD_Z == 1 - const unsigned int tidxz = 0; + const int tidxz = 0; #else - const unsigned int tidxz = get_local_id(2); + const int tidxz = get_local_id(2); #endif - const unsigned int bidx = get_group_id(0); - const unsigned int widx = tidx / WARP_SIZE; + const int bidx = get_group_id(0); + const int widx = tidx / WARP_SIZE; #ifdef HAVE_FRESH_LIST const bool haveFreshList = true; @@ -124,16 +124,18 @@ nbnxn_kernel_prune_rolling_opencl #if !USE_SUBGROUP_ANY /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. */ - volatile __local uint* const warp_any = (__local uint*)(LOCAL_OFFSET); - const unsigned int warpVoteSlot = NTHREAD_Z * tidxz + widx; + //NOLINTNEXTLINE(google-readability-casting) + volatile __local int* const warp_any = (__local int*)(LOCAL_OFFSET); + const int warpVoteSlot = NTHREAD_Z * tidxz + widx; /* Initialise warp vote.*/ - if (tidx == 0 || tidx == 32) + // if (tidx == 0 || tidx == 32) + if (tidx % (c_clSize * c_clSize / c_nbnxnGpuClusterpairSplit) == 0) { warp_any[warpVoteSlot] = 0; } #else - __local uint* const warp_any = 0; - const unsigned int warpVoteSlot = 0; + __local int* const warp_any = 0; + const int warpVoteSlot = 0; #endif #undef LOCAL_OFFSET @@ -155,7 +157,7 @@ nbnxn_kernel_prune_rolling_opencl const float4 tmp = xq[ai]; const float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], - shift_vec[3 * nb_sci.shift + 2], 0.0f); + shift_vec[3 * nb_sci.shift + 2], 0.0F); xib[(tidxj + i) * c_clSize + tidxi] = xi; } } @@ -185,7 +187,7 @@ nbnxn_kernel_prune_rolling_opencl imaskCheck = (imaskNew ^ imaskFull); } - preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); + preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0U); if (imaskCheck) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh index 73b81efb51..ec5d40f83d 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh @@ -82,7 +82,7 @@ #define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP /* 1.0 / sqrt(M_PI) */ -#define M_FLOAT_1_SQRTPI 0.564189583547756f +#define M_FLOAT_1_SQRTPI 0.564189583547756F //------------------- @@ -102,8 +102,9 @@ # define CL_SIZE_SQ (CL_SIZE * CL_SIZE) # define FBUF_STRIDE (CL_SIZE_SQ) -# define ONE_SIXTH_F 0.16666667f -# define ONE_TWELVETH_F 0.08333333f +# define ONE_SIXTH_F 0.16666667F +# define ONE_TWELVETH_F 0.08333333F +# define HALF_F 0.5F # ifdef __GNUC__ @@ -321,7 +322,7 @@ gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbpar r = r2 * inv_r; r_switch = r - nbparam->rvdw_switch; - r_switch = r_switch >= 0.0f ? r_switch : 0.0f; + r_switch = r_switch >= 0.0F ? r_switch : 0.0F; *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r; @@ -351,7 +352,7 @@ gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbp r = r2 * inv_r; r_switch = r - nbparam->rvdw_switch; - r_switch = r_switch >= 0.0f ? r_switch : 0.0f; + r_switch = r_switch >= 0.0F ? r_switch : 0.0F; *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r; @@ -381,9 +382,9 @@ gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* n r_switch = r - nbparam->rvdw_switch; /* Unlike in the F+E kernel, conditional is faster here */ - if (r_switch > 0.0f) + if (r_switch > 0.0F) { - sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch; + sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch; dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch; *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw; @@ -410,10 +411,10 @@ gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* r = r2 * inv_r; r_switch = r - nbparam->rvdw_switch; - r_switch = r_switch >= 0.0f ? r_switch : 0.0f; + r_switch = r_switch >= 0.0F ? r_switch : 0.0F; /* Unlike in the F-only kernel, masking is faster here */ - sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch; + sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch; dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch; *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw; @@ -440,7 +441,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nb inv_r6_nm = inv_r2 * inv_r2 * inv_r2; cr2 = lje_coeff2 * r2; expmcr2 = exp(-cr2); - poly = 1.0f + cr2 + 0.5f * cr2 * cr2; + poly = 1.0F + cr2 + HALF_F * cr2 * cr2; /* Subtract the grid force from the total LJ force */ *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2; @@ -469,14 +470,14 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float* inv_r6_nm = inv_r2 * inv_r2 * inv_r2; cr2 = lje_coeff2 * r2; expmcr2 = exp(-cr2); - poly = 1.0f + cr2 + 0.5f * cr2 * cr2; + poly = 1.0F + cr2 + HALF_F * cr2 * cr2; /* Subtract the grid force from the total LJ force */ *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2; /* Shift should be applied only to real LJ pairs */ sh_mask = nbparam->sh_lj_ewald * int_bit; - *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask); + *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask); } /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with @@ -512,7 +513,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float* inv_r6_nm = inv_r2 * inv_r2 * inv_r2; cr2 = lje_coeff2 * r2; expmcr2 = exp(-cr2); - poly = 1.0f + cr2 + 0.5f * cr2 * cr2; + poly = 1.0F + cr2 + HALF_F * cr2 * cr2; /* Subtract the grid force from the total LJ force */ *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2; @@ -523,7 +524,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float* /* Shift should be applied only to real LJ pairs */ sh_mask = nbparam->sh_lj_ewald * int_bit; - *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask); + *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask); } } @@ -536,8 +537,8 @@ gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coul { float normalized = scale * r; int index = (int)normalized; - float fract2 = normalized - index; - float fract1 = 1.0f - fract2; + float fract2 = normalized - (float)index; + float fract1 = 1.0F - fract2; return fract1 * coulomb_tab_climg2d[index] + fract2 * coulomb_tab_climg2d[index + 1]; } @@ -545,19 +546,19 @@ gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coul /*! Calculate analytical Ewald correction term. */ gmx_opencl_inline float pmecorrF(float z2) { - const float FN6 = -1.7357322914161492954e-8f; - const float FN5 = 1.4703624142580877519e-6f; - const float FN4 = -0.000053401640219807709149f; - const float FN3 = 0.0010054721316683106153f; - const float FN2 = -0.019278317264888380590f; - const float FN1 = 0.069670166153766424023f; - const float FN0 = -0.75225204789749321333f; + const float FN6 = -1.7357322914161492954e-8F; + const float FN5 = 1.4703624142580877519e-6F; + const float FN4 = -0.000053401640219807709149F; + const float FN3 = 0.0010054721316683106153F; + const float FN2 = -0.019278317264888380590F; + const float FN1 = 0.069670166153766424023F; + const float FN0 = -0.75225204789749321333F; - const float FD4 = 0.0011193462567257629232f; - const float FD3 = 0.014866955030185295499f; - const float FD2 = 0.11583842382862377919f; - const float FD1 = 0.50736591960530292870f; - const float FD0 = 1.0f; + const float FD4 = 0.0011193462567257629232F; + const float FD3 = 0.014866955030185295499F; + const float FD2 = 0.11583842382862377919F; + const float FD1 = 0.50736591960530292870F; + const float FD0 = 1.0F; float z4; float polyFN0, polyFN1, polyFD0, polyFD1; @@ -569,7 +570,7 @@ gmx_opencl_inline float pmecorrF(float z2) polyFD0 = polyFD0 * z4 + FD0; polyFD0 = polyFD1 * z2 + polyFD0; - polyFD0 = 1.0f / polyFD0; + polyFD0 = 1.0F / polyFD0; polyFN0 = FN6 * z4 + FN4; polyFN1 = FN5 * z4 + FN3; @@ -622,7 +623,7 @@ reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fou The reduction is performed for each line tidxj of f_buf. */ if (tidxi < 3) { - float f = 0.0f; + float f = 0.0F; for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++) { f += f_buf[FBUF_STRIDE * tidxi + j]; @@ -727,13 +728,12 @@ gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_b f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z; barrier(CLK_LOCAL_MEM_FENCE); - int i, j; /* Reduce the initial CL_SIZE values for each i atom to half * every step by using CL_SIZE * i threads. * Can't just use i as loop variable because than nvcc refuses to unroll. */ - i = CL_SIZE / 2; - for (j = CL_SIZE_LOG2 - 1; j > 0; j--) + int i = CL_SIZE / 2; + for (int j = CL_SIZE_LOG2 - 1; j > 0; j--) { if (tidxj < i) { @@ -809,7 +809,7 @@ gmx_opencl_inline void reduce_energy_shfl(float E_lj, float E_el, volatile __global float* e_lj, volatile __global float* e_el, - unsigned int tidx) + int tidx) { E_lj = sub_group_reduce_add(E_lj); E_el = sub_group_reduce_add(E_el); @@ -832,11 +832,11 @@ gmx_opencl_inline void reduce_energy_shfl(float E_lj, gmx_opencl_inline void reduce_energy_pow2(volatile __local float* buf, volatile __global float* e_lj, volatile __global float* e_el, - unsigned int tidx) + int tidx) { int j; - unsigned int i = WARP_SIZE / 2; + int i = WARP_SIZE / 2; /* Can't just use i as loop variable because than nvcc refuses to unroll. */ for (j = WARP_SIZE_LOG2 - 1; j > 0; j--) @@ -865,7 +865,7 @@ gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf, float E_el, volatile __global float* e_lj, volatile __global float* e_el, - unsigned int tidx) + int tidx) { # if REDUCE_SHUFFLE reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx); @@ -877,7 +877,7 @@ gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf, # endif } -gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local uint* warp_any, int widx, bool pred) +gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred) { if (pred) { @@ -892,9 +892,7 @@ gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local uint* warp_an } //! Returns a true if predicate is true for any work item in warp -gmx_opencl_inline bool gmx_sub_group_any(volatile __local uint gmx_unused* warp_any, - int gmx_unused widx, - bool pred) +gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred) { # if USE_SUBGROUP_ANY return sub_group_any(pred); diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl index b60d18a848..8aa7f45120 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernels.cl @@ -40,12 +40,12 @@ __kernel void zero_e_fshift(__global float* fshift, __global float* e_lj, __glob unsigned int tidx = get_global_id(0); if (tidx < Nbuf) { - fshift[tidx] = 0.0f; + fshift[tidx] = 0.0F; } if (tidx == 0) { - *e_lj = 0.0f; - *e_el = 0.0f; + *e_lj = 0.0F; + *e_el = 0.0F; } }