Also made convert_sigma_epsilon_to_c6_c12() return a float2 with c6 and
c12 instead of returning results in pointers.
* \author Aleksei Iupinov <a.yupinov@gmail.com>
*/
* \author Aleksei Iupinov <a.yupinov@gmail.com>
*/
-#include "pme_gpu_types.h"
#include "pme_gpu_calculate_splines.clh"
#include "pme_gpu_calculate_splines.clh"
+#include "pme_gpu_types.h"
#ifndef COMPILE_GATHER_HELPERS_ONCE
# define COMPILE_GATHER_HELPERS_ONCE
#ifndef COMPILE_GATHER_HELPERS_ONCE
# define COMPILE_GATHER_HELPERS_ONCE
if (numGrids == 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (numGrids == 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
- fx = 0.0f;
- fy = 0.0f;
- fz = 0.0f;
+ fx = 0.0F;
+ fy = 0.0F;
+ fz = 0.0F;
chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
if (chargeCheck)
{
chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
if (chargeCheck)
{
#pragma unroll
for (int i = 0; i < numIter; i++)
{
#pragma unroll
for (int i = 0; i < numIter; i++)
{
- const int outputIndexLocal = i * iterThreads + threadLocalId;
- const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
+ const int outputIndexLocal = i * iterThreads + threadLocalId;
+ const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
const float outputForceComponent = sm_forces[outputIndexLocal];
gm_forces[outputIndexGlobal] += outputForceComponent;
}
const float outputForceComponent = sm_forces[outputIndexLocal];
gm_forces[outputIndexGlobal] += outputForceComponent;
}
__global float2* __restrict__ gm_grid)
{
/* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
__global float2* __restrict__ gm_grid)
{
/* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
- int majorDim;
- int middleDim;
- int minorDim;
+ int majorDim = 0;
+ int middleDim = 0;
+ int minorDim = 0;
if (gridOrdering == YZX)
{
majorDim = YY;
if (gridOrdering == YZX)
{
majorDim = YY;
/* We should skip the k-space point (0,0,0) */
const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
/* We should skip the k-space point (0,0,0) */
const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
- float mX;
- float mY;
- float mZ;
+ float mX = 0.0F;
+ float mY = 0.0F;
+ float mZ = 0.0F;
if (gridOrdering == YZX)
{
mX = mMinor;
if (gridOrdering == YZX)
{
mX = mMinor;
#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"
-#include "pme_gpu_types.h"
#include "pme_gpu_calculate_splines.clh"
#include "pme_gpu_calculate_splines.clh"
+#include "pme_gpu_types.h"
/*
* This define affects the spline calculation behaviour in the kernel.
/*
* This define affects the spline calculation behaviour in the kernel.
/* Indices interpolation */
if (orderIndex == 0)
{
/* Indices interpolation */
if (orderIndex == 0)
{
- int tableIndex;
- int tInt;
- float n;
- float t;
- const float3 x = vload3(atomIndexLocal, sm_coordinates);
+ int tableIndex = 0;
+ float n = 0.0F;
+ float t = 0.0F;
+ const float3 x = vload3(atomIndexLocal, sm_coordinates);
/* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
* puts them into local memory(!) instead of accessing the constant memory directly.
/* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
* puts them into local memory(!) instead of accessing the constant memory directly.
/* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
t = (t + shift) * n;
/* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
t = (t + shift) * n;
+ const int tInt = (int)t;
sm_fractCoords[sharedMemoryIndex] = t - (float)tInt;
tableIndex += tInt;
assert(tInt >= 0);
sm_fractCoords[sharedMemoryIndex] = t - (float)tInt;
tableIndex += tInt;
assert(tInt >= 0);
const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
if (chargeCheck)
{
const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
if (chargeCheck)
{
int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
const float dr = sm_fractCoords[sharedMemoryIndex];
int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
const float dr = sm_fractCoords[sharedMemoryIndex];
# pragma unroll order
for (int k = 3; k < order; k++)
{
# pragma unroll order
for (int k = 3; k < order; k++)
{
- div = 1.0F / ((float)k - 1.0F);
+ const float div = 1.0F / ((float)k - 1.0F);
*SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
# pragma unroll
for (int l = 1; l < (k - 1); l++)
*SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
# pragma unroll
for (int l = 1; l < (k - 1); l++)
gm_dtheta[thetaGlobalIndex] = dtheta;
}
gm_dtheta[thetaGlobalIndex] = dtheta;
}
- div = 1.0F / (order - 1.0F);
+ const float div = 1.0F / (order - 1.0F);
*SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
# pragma unroll
for (int k = 1; k < (order - 1); k++)
*SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
# pragma unroll
for (int k = 1; k < (order - 1); k++)
-Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
-DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
-Dc_nbnxnGpuClusterSize=${CLUSTER_SIZE}
-Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
-DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
-Dc_nbnxnGpuClusterSize=${CLUSTER_SIZE}
- -Dc_nbnxnMinDistanceSquared=3.82e-07F
+ -DNBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT=3.82e-07
-Dc_nbnxnGpuNumClusterPerSupercluster=8
-Dc_nbnxnGpuJgroupSize=4
-DIATYPE_SHMEM
-Dc_nbnxnGpuNumClusterPerSupercluster=8
-Dc_nbnxnGpuJgroupSize=4
-DIATYPE_SHMEM
/* Here we pass macros and static const/constexpr int variables defined
* in include files outside the opencl as macros, to avoid
* including those files in the plain-C JIT compilation that happens
/* Here we pass macros and static const/constexpr int variables defined
* in include files outside the opencl as macros, to avoid
* including those files in the plain-C JIT compilation that happens
+ * at runtime.
+ * Note that we need to re-add the the suffix to the floating point literals
+ * passed the to the kernel to avoid type ambiguity.
+ */
extraDefines += gmx::formatString(
" -Dc_nbnxnGpuClusterSize=%d"
extraDefines += gmx::formatString(
" -Dc_nbnxnGpuClusterSize=%d"
- " -Dc_nbnxnMinDistanceSquared=%g"
+ " -DNBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT=%g"
" -Dc_nbnxnGpuNumClusterPerSupercluster=%d"
" -Dc_nbnxnGpuJgroupSize=%d"
"%s",
" -Dc_nbnxnGpuNumClusterPerSupercluster=%d"
" -Dc_nbnxnGpuJgroupSize=%d"
"%s",
const float c12 = ljcp_i.y * ljcp_j.y;
# else
/* LJ 2^(1/6)*sigma and 12*epsilon */
const float c12 = ljcp_i.y * ljcp_j.y;
# else
/* LJ 2^(1/6)*sigma and 12*epsilon */
- const float sigma = ljcp_i.x + ljcp_j.x;
- const float epsilon = ljcp_i.y * ljcp_j.y;
+ const float sigma = ljcp_i.x + ljcp_j.x;
+ const float epsilon = ljcp_i.y * ljcp_j.y;
# if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
# if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
- float c6, c12;
- convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
+ const float2 c6c12 = convert_sigma_epsilon_to_c6_c12(sigma, epsilon);
+ const float c6 = c6c12.x;
+ const float c12 = c6c12.y;
# endif
# endif /* LJ_COMB_GEOM */
#endif /* LJ_COMB */
// Ensure distance do not become so small that r^-12 overflows.
# endif
# endif /* LJ_COMB_GEOM */
#endif /* LJ_COMB */
// Ensure distance do not become so small that r^-12 overflows.
- // Cast to float to ensure the correct built-in max() function
- // is called.
- r2 = max(r2, (float)c_nbnxnMinDistanceSquared);
+ r2 = max(r2, c_nbnxnMinDistanceSquared);
const float inv_r = rsqrt(r2);
const float inv_r2 = inv_r * inv_r;
const float inv_r = rsqrt(r2);
const float inv_r2 = inv_r * inv_r;
/* loop over the j clusters = seen by any of the atoms in the current super-cluster */
for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
{
/* loop over the j clusters = seen by any of the atoms in the current super-cluster */
for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
{
- unsigned int imaskFull, imaskCheck, imaskNew;
+ unsigned int imaskFull = 0, imaskCheck = 0, imaskNew = 0;
/*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
__constant unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
/*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
__constant unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
+/*! Minimum single precision threshold for r^2 to avoid r^-12 overflow. */
+__constant float c_nbnxnMinDistanceSquared = NBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT;
+
gmx_opencl_inline void preloadCj4Generic(__local int* sm_cjPreload,
const __global int* gm_cj,
int tidxi,
gmx_opencl_inline void preloadCj4Generic(__local int* sm_cjPreload,
const __global int* gm_cj,
int tidxi,
}
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
}
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
-gmx_opencl_inline void convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon, float* c6, float* c12)
-{
- float sigma2, sigma6;
-
- sigma2 = sigma * sigma;
- sigma6 = sigma2 * sigma2 * sigma2;
- *c6 = epsilon * sigma6;
- *c12 = *c6 * sigma6;
+gmx_opencl_inline float2 convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon)
+{
+ const float sigma2 = sigma * sigma;
+ const float sigma6 = sigma2 * sigma2 * sigma2;
+ const float c6 = epsilon * sigma6;
+ const float2 c6c12 = (float2)(c6, /* c6 */
+ c6 * sigma6); /* c12 */
+ return c6c12;
float r2,
float* F_invr)
{
float r2,
float* F_invr)
{
/* force switch constants */
/* force switch constants */
- float disp_shift_V2 = nbparam->dispersion_shift.c2;
- float disp_shift_V3 = nbparam->dispersion_shift.c3;
- float repu_shift_V2 = nbparam->repulsion_shift.c2;
- float repu_shift_V3 = nbparam->repulsion_shift.c3;
+ const float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ const float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ const float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ const float repu_shift_V3 = nbparam->repulsion_shift.c3;
- r = r2 * inv_r;
- r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
+ const float r = r2 * inv_r;
+ float r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
float* F_invr,
float* E_lj)
{
float* F_invr,
float* E_lj)
{
/* force switch constants */
/* force switch constants */
- float disp_shift_V2 = nbparam->dispersion_shift.c2;
- float disp_shift_V3 = nbparam->dispersion_shift.c3;
- float repu_shift_V2 = nbparam->repulsion_shift.c2;
- float repu_shift_V3 = nbparam->repulsion_shift.c3;
+ const float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ const float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ const float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ const float repu_shift_V3 = nbparam->repulsion_shift.c3;
- float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
- float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
- float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
- float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;
+ const float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
+ const float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
+ const float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
+ const float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;
- r = r2 * inv_r;
- r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
+ const float r = r2 * inv_r;
+ float r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
*F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
+ c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
float* F_invr,
const float* E_lj)
{
float* F_invr,
const float* E_lj)
{
- float r, r_switch;
- float sw, dsw;
-
/* potential switch constants */
/* potential switch constants */
- float switch_V3 = nbparam->vdw_switch.c3;
- float switch_V4 = nbparam->vdw_switch.c4;
- float switch_V5 = nbparam->vdw_switch.c5;
- float switch_F2 = nbparam->vdw_switch.c3;
- float switch_F3 = nbparam->vdw_switch.c4;
- float switch_F4 = nbparam->vdw_switch.c5;
+ const float switch_V3 = nbparam->vdw_switch.c3;
+ const float switch_V4 = nbparam->vdw_switch.c4;
+ const float switch_V5 = nbparam->vdw_switch.c5;
+ const float switch_F2 = nbparam->vdw_switch.c3;
+ const float switch_F3 = nbparam->vdw_switch.c4;
+ const float switch_F4 = nbparam->vdw_switch.c5;
- r = r2 * inv_r;
- r_switch = r - nbparam->rvdw_switch;
+ const float r = r2 * inv_r;
+ const float r_switch = r - nbparam->rvdw_switch;
/* Unlike in the F+E kernel, conditional is faster here */
if (r_switch > 0.0F)
{
/* Unlike in the F+E kernel, conditional is faster here */
if (r_switch > 0.0F)
{
- sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
- dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
+ const float sw = 1.0F
+ + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
+ * r_switch * r_switch;
+ const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
}
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
}
float* F_invr,
float* E_lj)
{
float* F_invr,
float* E_lj)
{
- float r, r_switch;
- float sw, dsw;
-
/* potential switch constants */
/* potential switch constants */
- float switch_V3 = nbparam->vdw_switch.c3;
- float switch_V4 = nbparam->vdw_switch.c4;
- float switch_V5 = nbparam->vdw_switch.c5;
- float switch_F2 = nbparam->vdw_switch.c3;
- float switch_F3 = nbparam->vdw_switch.c4;
- float switch_F4 = nbparam->vdw_switch.c5;
+ const float switch_V3 = nbparam->vdw_switch.c3;
+ const float switch_V4 = nbparam->vdw_switch.c4;
+ const float switch_V5 = nbparam->vdw_switch.c5;
+ const float switch_F2 = nbparam->vdw_switch.c3;
+ const float switch_F3 = nbparam->vdw_switch.c4;
+ const float switch_F4 = nbparam->vdw_switch.c5;
- r = r2 * inv_r;
- r_switch = r - nbparam->rvdw_switch;
- r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
+ const float r = r2 * inv_r;
+ float r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
/* Unlike in the F-only kernel, masking is faster here */
/* Unlike in the F-only kernel, masking is faster here */
- sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
- dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
+ const float sw =
+ 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+ const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
*E_lj *= sw;
*F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
*E_lj *= sw;
float lje_coeff6_6,
float* F_invr)
{
float lje_coeff6_6,
float* F_invr)
{
- float c6grid, inv_r6_nm, cr2, expmcr2, poly;
-
- c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];
+ const float c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];
/* Recalculate inv_r6 without exclusion mask */
/* Recalculate inv_r6 without exclusion mask */
- inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
- cr2 = lje_coeff2 * r2;
- expmcr2 = exp(-cr2);
- poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
+ const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
+ const float cr2 = lje_coeff2 * r2;
+ const float expmcr2 = exp(-cr2);
+ const float poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
float* F_invr,
float* E_lj)
{
float* F_invr,
float* E_lj)
{
- float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
-
- c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];
+ const float c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];
/* Recalculate inv_r6 without exclusion mask */
/* Recalculate inv_r6 without exclusion mask */
- inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
- cr2 = lje_coeff2 * r2;
- expmcr2 = exp(-cr2);
- poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
+ const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
+ const float cr2 = lje_coeff2 * r2;
+ const float expmcr2 = exp(-cr2);
+ const float poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
/* Shift should be applied only to real LJ pairs */
- sh_mask = nbparam->sh_lj_ewald * int_bit;
+ const float sh_mask = nbparam->sh_lj_ewald * int_bit;
*E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
*E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
float* F_invr,
float* E_lj)
{
float* F_invr,
float* E_lj)
{
- float c6grid, inv_r6_nm, cr2, expmcr2, poly;
- float sigma, sigma2, epsilon;
-
/* sigma and epsilon are scaled to give 6*C6 */
/* sigma and epsilon are scaled to give 6*C6 */
- sigma = nbfp_comb_climg2d[2 * typei] + nbfp_comb_climg2d[2 * typej];
+ const float sigma = nbfp_comb_climg2d[2 * typei] + nbfp_comb_climg2d[2 * typej];
- epsilon = nbfp_comb_climg2d[2 * typei + 1] * nbfp_comb_climg2d[2 * typej + 1];
+ const float epsilon = nbfp_comb_climg2d[2 * typei + 1] * nbfp_comb_climg2d[2 * typej + 1];
- sigma2 = sigma * sigma;
- c6grid = epsilon * sigma2 * sigma2 * sigma2;
+ const float sigma2 = sigma * sigma;
+ const float c6grid = epsilon * sigma2 * sigma2 * sigma2;
/* Recalculate inv_r6 without exclusion mask */
/* Recalculate inv_r6 without exclusion mask */
- inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
- cr2 = lje_coeff2 * r2;
- expmcr2 = exp(-cr2);
- poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
+ const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
+ const float cr2 = lje_coeff2 * r2;
+ const float expmcr2 = exp(-cr2);
+ const float poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
if (with_E_lj)
{
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
if (with_E_lj)
{
/* Shift should be applied only to real LJ pairs */
/* Shift should be applied only to real LJ pairs */
- sh_mask = nbparam->sh_lj_ewald * int_bit;
+ const float sh_mask = nbparam->sh_lj_ewald * int_bit;
*E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
}
*E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
}
const float FD1 = 0.50736591960530292870F;
const float FD0 = 1.0F;
const float FD1 = 0.50736591960530292870F;
const float FD0 = 1.0F;
- float z4;
- float polyFN0, polyFN1, polyFD0, polyFD1;
-
- z4 = z2 * z2;
+ const float z4 = z2 * z2;
- polyFD0 = FD4 * z4 + FD2;
- polyFD1 = FD3 * z4 + FD1;
- polyFD0 = polyFD0 * z4 + FD0;
- polyFD0 = polyFD1 * z2 + polyFD0;
+ float polyFD0 = FD4 * z4 + FD2;
+ float polyFD1 = FD3 * z4 + FD1;
+ polyFD0 = polyFD0 * z4 + FD0;
+ polyFD0 = polyFD1 * z2 + polyFD0;
polyFD0 = 1.0F / polyFD0;
polyFD0 = 1.0F / polyFD0;
- polyFN0 = FN6 * z4 + FN4;
- polyFN1 = FN5 * z4 + FN3;
- polyFN0 = polyFN0 * z4 + FN2;
- polyFN1 = polyFN1 * z4 + FN1;
- polyFN0 = polyFN0 * z4 + FN0;
- polyFN0 = polyFN1 * z2 + polyFN0;
+ float polyFN0 = FN6 * z4 + FN4;
+ float polyFN1 = FN5 * z4 + FN3;
+ polyFN0 = polyFN0 * z4 + FN2;
+ polyFN1 = polyFN1 * z4 + FN1;
+ polyFN0 = polyFN0 * z4 + FN0;
+ polyFN0 = polyFN1 * z2 + polyFN0;
return polyFN0 * polyFD0;
}
return polyFN0 * polyFD0;
}
volatile __global float* e_el,
int tidx)
{
volatile __global float* e_el,
int tidx)
{
int i = WARP_SIZE / 2;
/* Can't just use i as loop variable because than nvcc refuses to unroll. */
int i = WARP_SIZE / 2;
/* Can't just use i as loop variable because than nvcc refuses to unroll. */
- for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
+ for (int j = WARP_SIZE_LOG2 - 1; j > 0; j--)