#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015,2018, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
file(GLOB MDLIB_OPENCL_KERNELS *.cl *.clh)
set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
endif()
+
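+# Electrostatics and van der Waals kernel flavors to generate. Each entry
+# bundles the preprocessor defines selecting one flavor with the name suffix
+# used for the generated object file; the escaped semicolons keep each
+# compound entry a single CMake list element.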
+set(ELEC_DEFS
+ "-DEL_CUTOFF\;-DEELNAME=_ElecCut"
+ "-DEL_RF\;-DEELNAME=_ElecRF"
+ "-DEL_EWALD_TAB\;-DEELNAME=_ElecEwQSTab"
+ "-DEL_EWALD_TAB\;-DVDW_CUTOFF_CHECK\;-DEELNAME=_ElecEwQSTabTwinCut"
+ "-DEL_EWALD_ANA\;-DEELNAME=_ElecEw"
+ "-DEL_EWALD_ANA\;-DVDW_CUTOFF_CHECK\;-DEELNAME=_ElecEwTwinCut")
+set(VDW_DEFS
+ "-DVDWNAME=_VdwLJ"
+ "-DLJ_COMB_GEOM\;-DVDWNAME=_VdwLJCombGeom"
+ "-DLJ_COMB_LB\;-DVDWNAME=_VdwLJCombLB"
+ "-DLJ_FORCE_SWITCH\;-DVDWNAME=_VdwLJFsw"
+ "-DLJ_POT_SWITCH\;-DVDWNAME=_VdwLJPsw"
+ "-DLJ_EWALD_COMB_GEOM\;-DVDWNAME=_VdwLJEwCombGeom"
+ "-DLJ_EWALD_COMB_LB\;-DVDWNAME=_VdwLJEwCombLB")
+if(CLANG_TIDY_EXE)
+ set(OCL_COMPILER "${CLANG_TIDY_EXE}")
+ set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo;--;${CMAKE_C_COMPILER}")
+else()
+ set(OCL_COMPILER "${CMAKE_C_COMPILER}")
+endif()
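+# Test-compile (or, when clang-tidy is available, clang-tidy-check) every
+# combination of electrostatics flavor, VdW flavor and vendor-specific
+# configuration of the nbnxn OpenCL kernels, so that kernel errors and
+# warnings surface at build time rather than at run time.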
+foreach(ELEC_DEF IN LISTS ELEC_DEFS)
+ foreach(VDW_DEF IN LISTS VDW_DEFS)
+ foreach(VENDOR AMD NVIDIA INTEL)
+ if(VENDOR STREQUAL INTEL)
+ set(CLUSTER_SIZE 4)
+ else()
+ set(CLUSTER_SIZE 8)
+ endif()
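+      # Extract the kernel name suffixes (e.g. _ElecCut, _VdwLJ) by stripping
+      # everything up to the last '=' in the define strings.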
+ string(REGEX REPLACE ".*=" "" ELEC_NAME "${ELEC_DEF}")
+ string(REGEX REPLACE ".*=" "" VDW_NAME "${VDW_DEF}")
+ set(OBJ_FILE nbnxn_ocl_kernel${ELEC_NAME}${VDW_NAME}_${VENDOR}.o)
+ add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
+ ${CMAKE_CURRENT_SOURCE_DIR}/nbnxn_ocl_kernels.cl ${CLANG_TIDY_ARGS}
+ -Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
+ -DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
+ -DNBNXN_GPU_CLUSTER_SIZE=${CLUSTER_SIZE} -DIATYPE_SHMEM
+ -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
+ -Weverything -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
+ -Wno-cast-align -Wno-incompatible-pointer-types
+ -o${OBJ_FILE}
+ )
+ list(APPEND NBNXN_OCL_KERNELS ${OBJ_FILE})
+ endforeach()
+ endforeach()
+endforeach()
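+# Aggregate target that collects all generated kernel test-compilations.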
+add_custom_target(ocl_kernel DEPENDS ${NBNXN_OCL_KERNELS})
#endif
(
#ifndef LJ_COMB
- int ntypes, /* IN */
-#endif
- cl_nbparam_params_t nbparam_params, /* IN */
- const __global float4 *restrict xq, /* IN */
- __global float *restrict f, /* OUT stores float3 values */
- __global float *restrict gmx_unused e_lj, /* OUT */
- __global float *restrict gmx_unused e_el, /* OUT */
- __global float *restrict fshift, /* OUT stores float3 values */
+ int ntypes, /* IN */
+#endif
+ cl_nbparam_params_t nbparam_params, /* IN */
+ const __global float4 *restrict xq, /* IN */
+ __global float *restrict f, /* OUT stores float3 values */
+ __global float *restrict gmx_unused e_lj, /* OUT */
+ __global float *restrict gmx_unused e_el, /* OUT */
+ __global float *restrict fshift, /* OUT stores float3 values */
#ifdef LJ_COMB
- const __global float2 *restrict lj_comb, /* IN stores float2 values */
+ const __global float2 *restrict lj_comb, /* IN stores float2 values */
#else
- const __global int *restrict atom_types, /* IN */
+ const __global int *restrict atom_types, /* IN */
#endif
- const __global float *restrict shift_vec, /* IN stores float3 values */
- __constant float* gmx_unused nbfp_climg2d, /* IN */
- __constant float* gmx_unused nbfp_comb_climg2d, /* IN */
- __constant float* gmx_unused coulomb_tab_climg2d, /* IN */
- const __global nbnxn_sci_t* pl_sci, /* IN */
+ const __global float *restrict shift_vec, /* IN stores float3 values */
+ __constant const float* gmx_unused nbfp_climg2d, /* IN */
+ __constant const float* gmx_unused nbfp_comb_climg2d, /* IN */
+ __constant const float* gmx_unused coulomb_tab_climg2d, /* IN */
+ const __global nbnxn_sci_t* pl_sci, /* IN */
#ifndef PRUNE_NBL
const
#endif
#ifdef CALC_ENERGIES
#ifdef EL_EWALD_ANY
- const float beta = nbparam->ewald_beta;
- const float ewald_shift = nbparam->sh_ewald;
+ const float beta = nbparam->ewald_beta;
+ const float ewald_shift = nbparam->sh_ewald;
#else
- const float c_rf = nbparam->c_rf;
+ const float gmx_unused c_rf = nbparam->c_rf;
#endif /* EL_EWALD_ANY */
#endif /* CALC_ENERGIES */
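+    /* The LOCAL_OFFSET macro tracks the current end of the kernel's __local
+     * scratch area: each buffer below is placed at LOCAL_OFFSET, after which
+     * the macro is redefined to point just past that buffer. */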
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
- __local int *atib = (__local int *)(LOCAL_OFFSET);
+ __local int *atib = (__local int *)(LOCAL_OFFSET); //NOLINT(google-readability-casting)
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+ #define LOCAL_OFFSET (atib + NCL_PER_SUPERCL * CL_SIZE)
#else
__local float2 *ljcpib = (__local float2 *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET ljcpib + NCL_PER_SUPERCL * CL_SIZE
+ #define LOCAL_OFFSET (ljcpib + NCL_PER_SUPERCL * CL_SIZE)
#endif
#endif
#if !USE_SUBGROUP_ANY
/* Local buffer used to implement __any warp vote function from CUDA.
volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+ volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
#else
- __local uint *warp_any = 0;
+ __local uint gmx_unused *warp_any = 0;
#endif
#undef LOCAL_OFFSET
const int aj = cj * CL_SIZE + tidxj;
/* load j atom data */
- const float4 xqbuf = xq[aj];
- const float3 xj = (float3)(xqbuf.xyz);
- const float qj_f = xqbuf.w;
+ const float4 xjqbuf = xq[aj];
+ const float3 xj = (float3)(xjqbuf.xyz);
+ const float qj_f = xjqbuf.w;
#ifndef LJ_COMB
const int typej = atom_types[aj];
#else
{
if (imask & mask_ji)
{
- const int ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
- const int ai = ci * CL_SIZE + tidxi; /* i atom index */
+ const int gmx_unused ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
/* all threads load an atom from i cluster ci into shmem! */
- const float4 xqbuf = xqib[i * CL_SIZE + tidxi];
- const float3 xi = (float3)(xqbuf.xyz);
+ const float4 xiqbuf = xqib[i * CL_SIZE + tidxi];
+ const float3 xi = (float3)(xiqbuf.xyz);
/* distance between i and j atoms */
const float3 rv = xi - xj;
#endif
{
/* load the rest of the i-atom parameters */
- const float qi = xqbuf.w;
+ const float qi = xiqbuf.w;
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
const int typei = atib[i * CL_SIZE + tidxi];
#else
const float2 ljcp_i = ljcpib[i * CL_SIZE + tidxi];
#endif
-#else /* IATYPE_SHMEM */
+#else /* IATYPE_SHMEM */
+ const int ai = ci * CL_SIZE + tidxi; /* i atom index */
+
#ifndef LJ_COMB
const int typei = atom_types[ai];
#else
const float c12 = ljcp_i.y * ljcp_j.y;
#else
/* LJ 2^(1/6)*sigma and 12*epsilon */
- float c6, c12;
const float sigma = ljcp_i.x + ljcp_j.x;
const float epsilon = ljcp_i.y * ljcp_j.y;
#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ float c6, c12;
convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
#endif
#endif /* LJ_COMB_GEOM */
#define GMX_DOUBLE 0
-#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/pbcutil/ishift.h"
#define USE_CJ_PREFETCH 0
#endif
-#if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210)
+#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
#define HAVE_SUBGROUP 1
#else
#define HAVE_SUBGROUP 0
#define HAVE_INTEL_SUBGROUP 0
#endif
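+/* Subgroup width assumed for the target device: 8 on Intel, 64 on AMD
+ * (wavefront), 32 otherwise (matching the NVIDIA warp size). */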
-#if _INTEL_SOURCE_
+#if defined _INTEL_SOURCE_
#define SUBGROUP_SIZE 8
-#elif _AMD_SOURCE_
+#elif defined _AMD_SOURCE_
#define SUBGROUP_SIZE 64
#else
#define SUBGROUP_SIZE 32
#endif //USE_SUBGROUP_PRELOAD
#if USE_SUBGROUP_PRELOAD
-typedef int CjType;
+typedef size_t CjType;
#else
typedef __local int* CjType;
#endif
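+/* CjType thus holds the preloaded j-cluster indices either as a value that is
+ * broadcast across the subgroup, or as a pointer into a local-memory buffer. */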
gmx_opencl_inline
void preloadCj4(CjType gmx_unused *cjs,
const __global int gmx_unused *gm_cj,
- int tidxi,
- int tidxj,
- bool iMaskCond)
+ int gmx_unused tidxi,
+ int gmx_unused tidxj,
+ bool gmx_unused iMaskCond)
{
#if USE_SUBGROUP_PRELOAD
*cjs = preloadCj4Subgroup(gm_cj);
* If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
*/
gmx_opencl_inline
-int loadCj(CjType cjs, const __global int *gm_cj,
- int jm, int tidxi, int tidxj)
+int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
+ int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#if USE_SUBGROUP_PRELOAD
return sub_group_broadcast(cjs, jm);
#if REDUCE_SHUFFLE
gmx_opencl_inline
void reduce_force_j_shfl(float3 fin, __global float *fout,
- int tidxi, int tidxj, int aidx)
+ int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
/* Only does reduction over 4 elements in cluster. Needs to be changed
* for CL_SIZE>4. See CUDA code for required code */
/*! Final j-force reduction
*/
gmx_opencl_inline
-void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout,
+void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
int tidxi, int tidxj, int aidx)
{
#if REDUCE_SHUFFLE
/*! Final i-force reduction
*/
gmx_opencl_inline
-void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f,
+void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
bool bCalcFshift, int tidxi, int tidxj, int sci,
int shift, __global float *fshift)
{
}
gmx_opencl_inline
-void reduce_energy(volatile __local float *buf,
+void reduce_energy(volatile __local float gmx_unused *buf,
float E_lj, float E_el,
volatile __global float *e_lj,
volatile __global float *e_el,
#endif
}
+gmx_opencl_inline
bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
{
if (pred)
}
//! Returns true if the predicate is true for any work item in the warp
-bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred)
+gmx_opencl_inline
+bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
{
#if USE_SUBGROUP_ANY
return sub_group_any(pred);