#endif
(
#ifndef LJ_COMB
- int ntypes, /* IN */
-#endif
- cl_nbparam_params_t nbparam_params, /* IN */
- const __global float4 *restrict xq, /* IN */
- __global float *restrict f, /* OUT stores float3 values */
- __global float *restrict e_lj, /* OUT */
- __global float *restrict e_el, /* OUT */
- __global float *restrict fshift, /* OUT stores float3 values */
+ int ntypes, /* IN */
+#endif
+ cl_nbparam_params_t nbparam_params, /* IN */
+ const __global float4 *restrict xq, /* IN */
+ __global float *restrict f, /* OUT stores float3 values */
+ __global float *restrict gmx_unused e_lj, /* OUT */
+ __global float *restrict gmx_unused e_el, /* OUT */
+ __global float *restrict fshift, /* OUT stores float3 values */
#ifdef LJ_COMB
- const __global float2 *restrict lj_comb, /* IN stores float2 values */
+ const __global float2 *restrict lj_comb, /* IN stores float2 values */
#else
- const __global int *restrict atom_types, /* IN */
+ const __global int *restrict atom_types, /* IN */
#endif
- const __global float *restrict shift_vec, /* IN stores float3 values */
- __constant float* nbfp_climg2d, /* IN */
- __constant float* nbfp_comb_climg2d, /* IN */
- __constant float* coulomb_tab_climg2d, /* IN */
- const __global nbnxn_sci_t* pl_sci, /* IN */
+ const __global float *restrict shift_vec, /* IN stores float3 values */
+ __constant float* gmx_unused nbfp_climg2d, /* IN */
+ __constant float* gmx_unused nbfp_comb_climg2d, /* IN */
+ __constant float* gmx_unused coulomb_tab_climg2d, /* IN */
+ const __global nbnxn_sci_t* pl_sci, /* IN */
#ifndef PRUNE_NBL
const
#endif
const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
- CjType cjs;
+ CjType cjs = 0;
#if USE_CJ_PREFETCH
/* shmem buffer for cj, for both warps separately */
cjs = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET cjs + 2 * c_nbnxnGpuJgroupSize
+ #define LOCAL_OFFSET (cjs + 2 * c_nbnxnGpuJgroupSize)
#endif //USE_CJ_PREFETCH
#ifdef IATYPE_SHMEM
* the research papers on the package. Check out http://www.gromacs.org.
*/
+#define GMX_DOUBLE 0
+
#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/mdlib/nbnxn_consts.h"
#define ONE_TWELVETH_F 0.08333333f
+#ifdef __GNUC__
+/* GCC, clang, and some ICC pretending to be GCC:
+ * gmx_unused expands to __attribute__ ((unused)) here and to nothing otherwise. */
+# define gmx_unused __attribute__ ((unused))
+#else
+# define gmx_unused
+#endif
+
// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
const __global int *gm_cj,
int tidxi,
int tidxj,
- bool iMaskCond)
-
+ bool gmx_unused iMaskCond)
{
/* Pre-load cj into shared memory */
#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
const int c_clSize = CL_SIZE;
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
-
if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
{
sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
* it's ready. This function does not call a barrier.
*/
gmx_opencl_inline
-void preloadCj4(CjType *cjs,
- const __global int *gm_cj,
- int tidxi,
- int tidxj,
- bool iMaskCond)
+void preloadCj4(CjType gmx_unused *cjs,
+ const __global int gmx_unused *gm_cj,
+ int tidxi,
+ int tidxj,
+ bool iMaskCond)
{
#if USE_SUBGROUP_PRELOAD
*cjs = preloadCj4Subgroup(gm_cj);
}
gmx_opencl_inline
-int loadCjPreload(__local int* sm_cjPreload,
- int jm,
- int tidxi,
- int tidxj)
+int loadCjPreload(__local int * sm_cjPreload,
+ int jm,
+ int gmx_unused tidxi,
+ int gmx_unused tidxj)
{
#if defined _AMD_SOURCE_
int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
const int c_clSize = CL_SIZE;
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
-
- int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
+ int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
#endif
return sm_cjPreload[jm + warpLoadOffset];
}
float inv_r,
float r2,
float *F_invr,
- float *E_lj)
+ const float *E_lj)
{
float r, r_switch;
float sw, dsw;
* geometric combination rule.
*/
gmx_opencl_inline
-void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
- int typei,
- int typej,
- float r2,
- float inv_r2,
- float lje_coeff2,
- float lje_coeff6_6,
- float *F_invr)
+void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
{
float c6grid, inv_r6_nm, cr2, expmcr2, poly;
* geometric combination rule.
*/
gmx_opencl_inline
-void calculate_lj_ewald_comb_geom_F_E(__constant float *nbfp_comb_climg2d,
- cl_nbparam_params_t *nbparam,
- int typei,
- int typej,
- float r2,
- float inv_r2,
- float lje_coeff2,
- float lje_coeff6_6,
- float int_bit,
- float *F_invr,
- float *E_lj)
+void calculate_lj_ewald_comb_geom_F_E(__constant const float *nbfp_comb_climg2d,
+ cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
{
float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
* of this is pretty small and LB on the CPU is anyway very slow.
*/
gmx_opencl_inline
-void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d,
- cl_nbparam_params_t *nbparam,
- int typei,
- int typej,
- float r2,
- float inv_r2,
- float lje_coeff2,
- float lje_coeff6_6,
- float int_bit,
- bool with_E_lj,
- float *F_invr,
- float *E_lj)
+void calculate_lj_ewald_comb_LB_F_E(__constant const float *nbfp_comb_climg2d,
+ cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ bool with_E_lj,
+ float *F_invr,
+ float *E_lj)
{
float c6grid, inv_r6_nm, cr2, expmcr2, poly;
float sigma, sigma2, epsilon;
* Original idea: from the OpenMM project
*/
gmx_opencl_inline float
-interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
- float r,
- float scale)
+interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
+ float r,
+ float scale)
{
float normalized = scale * r;
int index = (int) normalized;
volatile __global float *e_el,
unsigned int tidx)
{
- int i, j;
- float e1, e2;
+ int j;
- i = WARP_SIZE/2;
+ unsigned int i = WARP_SIZE/2;
/* Can't just use i as loop variable because then nvcc refuses to unroll. */
for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
/* last reduction step, writing to global mem */
if (tidx == 0)
{
- e1 = buf[ tidx] + buf[ tidx + i];
- e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+ float e1 = buf[ tidx] + buf[ tidx + i];
+ float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
atomicAdd_g_f(e_lj, e1);
atomicAdd_g_f(e_el, e2);