#endif /* CALC_ENERGIES */
/* thread/block/warp IDs */
- const unsigned int tidxi = get_local_id(0);
- const unsigned int tidxj = get_local_id(1);
- const unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
- const unsigned int bidx = get_group_id(0);
- const unsigned int widx = tidx / WARP_SIZE; /* warp index */
+ const int tidxi = get_local_id(0);
+ const int tidxj = get_local_id(1);
+ const int tidx = (int)(get_local_id(1) * get_local_size(0) + get_local_id(0));
+ const int bidx = get_group_id(0);
+ const int widx = tidx / WARP_SIZE; /* warp index */
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
# undef LOCAL_OFFSET
# define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3)
#else
- __local float* f_buf = 0;
+ __local float* f_buf = 0;
#endif
#if !USE_SUBGROUP_ANY
/* Local buffer used to implement the __any warp vote function from CUDA.
   volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint* warp_any = (__local uint*)(LOCAL_OFFSET);
+ //NOLINTNEXTLINE(google-readability-casting)
+ volatile __local int* warp_any = (__local int*)(LOCAL_OFFSET);
#else
- __local uint gmx_unused* warp_any = 0;
+ __local int gmx_unused* warp_any = 0;
#endif
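/* In essence, the local-memory vote implemented by gmx_sub_group_any_localmem()
 * further below amounts to the following sketch:
 *   if (pred) { warp_any[widx] = 1; }  // any work item raises its warp's flag
 *   bool any = (warp_any[widx] != 0);  // all work items read the flag back
 *   warp_any[widx] = 0;                // reset the slot for the next vote
 * which emulates CUDA's __any() across the WARP_SIZE work items sharing widx.
 */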
#undef LOCAL_OFFSET
float4 xqbuf = xq[ai]
+ (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1],
- shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ shift_vec[3 * nb_sci.shift + 2], 0.0F);
xqbuf.w *= nbparam->epsfac;
xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
#ifdef IATYPE_SHMEM
float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
{
- fci_buf[ci_offset] = (float3)(0.0f);
+ fci_buf[ci_offset] = (float3)(0.0F);
}
#ifdef LJ_EWALD
#ifdef CALC_ENERGIES
- float E_lj = 0.0f;
- float E_el = 0.0f;
+ float E_lj = 0.0F;
+ float E_el = 0.0F;
# if defined EXCLUSION_FORCES /* Ewald or RF */
if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * NCL_PER_SUPERCL)
/* Divide the self term(s) equally over the j-threads, then multiply by the coefficients. */
# ifdef LJ_EWALD
E_lj /= CL_SIZE;
- E_lj *= 0.5f * ONE_SIXTH_F * lje_coeff6_6;
+ E_lj *= HALF_F * ONE_SIXTH_F * lje_coeff6_6;
# endif /* LJ_EWALD */
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
/* Correct for epsfac^2 due to adding qi^2 */
E_el /= nbparam->epsfac * CL_SIZE;
# if defined EL_RF || defined EL_CUTOFF
- E_el *= -0.5f * c_rf;
+ E_el *= -HALF_F * c_rf;
# else
E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
# endif
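/* Worked example for the prefactors above: with reaction-field the per-atom
 * self energy is -0.5 * c_rf * epsfac * qi^2. The charges were pre-scaled by
 * epsfac (xqbuf.w *= nbparam->epsfac), so the accumulated qi*qi carries
 * epsfac^2; dividing by epsfac * CL_SIZE removes one factor together with the
 * CL_SIZE-fold duplication over the j-threads before -HALF_F * c_rf is applied.
 */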
unsigned int imask = pl_cj4[j4].imei[widx].imask;
const unsigned int wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
- preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
+ preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0U);
#ifndef PRUNE_NBL
if (imask)
const float2 ljcp_j = lj_comb[aj];
#endif
- float3 fcj_buf = (float3)(0.0f);
+ float3 fcj_buf = (float3)(0.0F);
#if !defined PRUNE_NBL
# pragma unroll 8
}
#endif
- const float int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+ const float int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F;
/* cutoff & exclusion check */
#ifdef EXCLUSION_FORCES
if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
#else
- if ((r2 < rcoulomb_sq) * int_bit)
+ if ((float)(r2 < rcoulomb_sq) * int_bit != 0.0F)
#endif
{
/* load the rest of the i-atom parameters */
sig_r6 *= int_bit;
# endif /* EXCLUSION_FORCES */
- float F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+ float F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2;
#endif /* ! LJ_COMB_LB || CALC_ENERGIES */
/* Separate VDW cut-off check to enable twin-range cut-offs
* (rvdw < rcoulomb <= rlist)
*/
- const float vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ const float vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F;
F_invr *= vdw_in_range;
# ifdef CALC_ENERGIES
E_lj_p *= vdw_in_range;
E_el += qi * qj_f * (int_bit * inv_r - c_rf);
# endif
# ifdef EL_RF
- E_el += qi * qj_f * (int_bit * inv_r + 0.5f * two_k_rf * r2 - c_rf);
+ E_el += qi * qj_f * (int_bit * inv_r + HALF_F * two_k_rf * r2 - c_rf);
# endif
# ifdef EL_EWALD_ANY
- /* 1.0f - erff is faster than erfcf */
+ /* 1.0F - erff is faster than erfcf */
E_el += qi * qj_f
* (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
# endif /* EL_EWALD_ANY */
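/* For an included pair (int_bit == 1) the factor (int_bit - erf(beta*r)) * inv_r
 * equals erfc(beta*r)/r, the real-space Ewald pair energy; for an excluded pair
 * (int_bit == 0) it reduces to -erf(beta*r)/r, cancelling the reciprocal-space
 * contribution of that pair (note that r2 * inv_r == r).
 */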
const float rlistInner_sq = nbparam->rlistInner_sq;
/* thread/block/warp IDs */
- const unsigned int tidxi = get_local_id(0);
- const unsigned int tidxj = get_local_id(1);
- const unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+ const int tidxi = get_local_id(0);
+ const int tidxj = get_local_id(1);
+ const int tidx = (int)(get_local_id(1) * get_local_size(0) + get_local_id(0));
#if NTHREAD_Z == 1
- const unsigned int tidxz = 0;
+ const int tidxz = 0;
#else
- const unsigned int tidxz = get_local_id(2);
+ const int tidxz = get_local_id(2);
#endif
- const unsigned int bidx = get_group_id(0);
- const unsigned int widx = tidx / WARP_SIZE;
+ const int bidx = get_group_id(0);
+ const int widx = tidx / WARP_SIZE;
#ifdef HAVE_FRESH_LIST
const bool haveFreshList = true;
#if !USE_SUBGROUP_ANY
/* Local buffer used to implement the __any warp vote function from CUDA.
   volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint* const warp_any = (__local uint*)(LOCAL_OFFSET);
- const unsigned int warpVoteSlot = NTHREAD_Z * tidxz + widx;
+ //NOLINTNEXTLINE(google-readability-casting)
+ volatile __local int* const warp_any = (__local int*)(LOCAL_OFFSET);
+ const int warpVoteSlot = NTHREAD_Z * tidxz + widx;
/* Initialise warp vote. */
- if (tidx == 0 || tidx == 32)
+ if (tidx % (c_clSize * c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
{
warp_any[warpVoteSlot] = 0;
}
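/* For the current kernel layout (c_clSize = 8, c_nbnxnGpuClusterpairSplit = 2)
 * the modulus is 8 * 8 / 2 = 32, so work items 0 and 32 clear their warp's
 * slot, exactly the pair the old hard-coded check selected, but now derived
 * from the cluster constants.
 */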
#else
- __local uint* const warp_any = 0;
- const unsigned int warpVoteSlot = 0;
+ __local int* const warp_any = 0;
+ const int warpVoteSlot = 0;
#endif
#undef LOCAL_OFFSET
const float4 tmp = xq[ai];
const float4 xi = tmp
+ (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1],
- shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ shift_vec[3 * nb_sci.shift + 2], 0.0F);
xib[(tidxj + i) * c_clSize + tidxi] = xi;
}
}
imaskCheck = (imaskNew ^ imaskFull);
}
- preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
+ preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0U);
if (imaskCheck)
{
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
/* 1.0 / sqrt(M_PI) */
-#define M_FLOAT_1_SQRTPI 0.564189583547756f
+#define M_FLOAT_1_SQRTPI 0.564189583547756F
//-------------------
# define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
# define FBUF_STRIDE (CL_SIZE_SQ)
-# define ONE_SIXTH_F 0.16666667f
-# define ONE_TWELVETH_F 0.08333333f
+# define ONE_SIXTH_F 0.16666667F
+# define ONE_TWELVETH_F 0.08333333F
+# define HALF_F 0.5F
# ifdef __GNUC__
r = r2 * inv_r;
r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
r = r2 * inv_r;
r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
r_switch = r - nbparam->rvdw_switch;
/* Unlike in the F+E kernel, conditional is faster here */
- if (r_switch > 0.0f)
+ if (r_switch > 0.0F)
{
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
r = r2 * inv_r;
r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
/* Unlike in the F-only kernel, masking is faster here */
- sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
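/* The two blocks above implement the potential switch V(r) -> V(r) * sw(r)
 * with the quintic sw = 1 + V3*rs^3 + V4*rs^4 + V5*rs^5, rs = r - rvdw_switch.
 * The force inherits the product-rule term: since F_invr stores F/r, the
 * update F_invr = F_invr * sw - inv_r * E_lj * dsw is F -> F * sw - V * dsw,
 * with dsw collecting the derivative coefficients F2..F4.
 */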
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = exp(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = exp(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam->sh_lj_ewald * int_bit;
- *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
cr2 = lje_coeff2 * r2;
expmcr2 = exp(-cr2);
- poly = 1.0f + cr2 + 0.5f * cr2 * cr2;
+ poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
sh_mask = nbparam->sh_lj_ewald * int_bit;
- *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+ *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
}
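/* In the grid-correction blocks above, poly = 1 + cr2 + cr2^2/2 is the
 * truncated Taylor series of exp(cr2), so expmcr2 * poly is the standard
 * LJ-PME damping factor g(beta_LJ * r) with cr2 = lje_coeff2 * r2; the energy
 * term is then proportional to (1 - g)/r^6, the part of the dispersion
 * handled on the grid.
 */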
{
float normalized = scale * r;
int index = (int)normalized;
- float fract2 = normalized - index;
- float fract1 = 1.0f - fract2;
+ float fract2 = normalized - (float)index;
+ float fract1 = 1.0F - fract2;
return fract1 * coulomb_tab_climg2d[index] + fract2 * coulomb_tab_climg2d[index + 1];
}
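/* Usage sketch for the lookup above: with scale = 1/spacing, a distance r
 * falling between table points index and index + 1 returns the linear
 * interpolation (1 - f) * tab[index] + f * tab[index + 1], where
 * f = scale * r - index is the fractional position (fract2 above).
 */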
/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline float pmecorrF(float z2)
{
- const float FN6 = -1.7357322914161492954e-8f;
- const float FN5 = 1.4703624142580877519e-6f;
- const float FN4 = -0.000053401640219807709149f;
- const float FN3 = 0.0010054721316683106153f;
- const float FN2 = -0.019278317264888380590f;
- const float FN1 = 0.069670166153766424023f;
- const float FN0 = -0.75225204789749321333f;
+ const float FN6 = -1.7357322914161492954e-8F;
+ const float FN5 = 1.4703624142580877519e-6F;
+ const float FN4 = -0.000053401640219807709149F;
+ const float FN3 = 0.0010054721316683106153F;
+ const float FN2 = -0.019278317264888380590F;
+ const float FN1 = 0.069670166153766424023F;
+ const float FN0 = -0.75225204789749321333F;
- const float FD4 = 0.0011193462567257629232f;
- const float FD3 = 0.014866955030185295499f;
- const float FD2 = 0.11583842382862377919f;
- const float FD1 = 0.50736591960530292870f;
- const float FD0 = 1.0f;
+ const float FD4 = 0.0011193462567257629232F;
+ const float FD3 = 0.014866955030185295499F;
+ const float FD2 = 0.11583842382862377919F;
+ const float FD1 = 0.50736591960530292870F;
+ const float FD0 = 1.0F;
float z4;
float polyFN0, polyFN1, polyFD0, polyFD1;
polyFD0 = polyFD0 * z4 + FD0;
polyFD0 = polyFD1 * z2 + polyFD0;
- polyFD0 = 1.0f / polyFD0;
+ polyFD0 = 1.0F / polyFD0;
polyFN0 = FN6 * z4 + FN4;
polyFN1 = FN5 * z4 + FN3;
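/* pmecorrF() evaluates the correction as a rational approximation
 * F(z2) ~ FN(z2) / FD(z2): both polynomials are Horner-evaluated in an
 * even/odd split over z2 and z4 as seen above, and the single division
 * (polyFD0 = 1.0F / polyFD0) forms the quotient at the end.
 */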
The reduction is performed for each line tidxj of f_buf. */
if (tidxi < 3)
{
- float f = 0.0f;
+ float f = 0.0F;
for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
{
f += f_buf[FBUF_STRIDE * tidxi + j];
f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
barrier(CLK_LOCAL_MEM_FENCE);
- int i, j;
/* Reduce the initial CL_SIZE values for each i atom to half
* every step by using CL_SIZE * i threads.
* Can't just use i as loop variable because then nvcc refuses to unroll.
*/
- i = CL_SIZE / 2;
- for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+ int i = CL_SIZE / 2;
+ for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
{
if (tidxj < i)
{
float E_el,
volatile __global float* e_lj,
volatile __global float* e_el,
- unsigned int tidx)
+ int tidx)
{
E_lj = sub_group_reduce_add(E_lj);
E_el = sub_group_reduce_add(E_el);
gmx_opencl_inline void reduce_energy_pow2(volatile __local float* buf,
volatile __global float* e_lj,
volatile __global float* e_el,
- unsigned int tidx)
+ int tidx)
{
int j;
- unsigned int i = WARP_SIZE / 2;
+ int i = WARP_SIZE / 2;
/* Can't just use i as loop variable because then nvcc refuses to unroll. */
for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
float E_el,
volatile __global float* e_lj,
volatile __global float* e_el,
- unsigned int tidx)
+ int tidx)
{
# if REDUCE_SHUFFLE
reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
# endif
}
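/* Reference sketch of what both reductions compute (E_lj_partial is a
 * hypothetical per-work-item array, for illustration only):
 *   float E_lj_total = 0.0F;
 *   for (int t = 0; t < WARP_SIZE; t++) { E_lj_total += E_lj_partial[t]; }
 * reduce_energy_shfl() obtains this via sub_group_reduce_add(), while
 * reduce_energy_pow2() halves the number of active work items in each of the
 * WARP_SIZE_LOG2 tree steps.
 */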
-gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local uint* warp_any, int widx, bool pred)
+gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
{
if (pred)
{
}
//! Returns true if the predicate is true for any work item in the warp
-gmx_opencl_inline bool gmx_sub_group_any(volatile __local uint gmx_unused* warp_any,
- int gmx_unused widx,
- bool pred)
+gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
{
# if USE_SUBGROUP_ANY
return sub_group_any(pred);