#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015,2018, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
file(GLOB MDLIB_OPENCL_KERNELS *.cl *.clh)
set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
endif()
+
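+# Electrostatics and van der Waals kernel flavors to generate. Each entry
+# bundles the preprocessor defines selecting one flavor with the name suffix
+# used for the generated object file; the escaped semicolons keep each
+# compound entry a single CMake list element.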
+set(ELEC_DEFS
+ "-DEL_CUTOFF\;-DEELNAME=_ElecCut"
+ "-DEL_RF\;-DEELNAME=_ElecRF"
+ "-DEL_EWALD_TAB\;-DEELNAME=_ElecEwQSTab"
+ "-DEL_EWALD_TAB\;-DVDW_CUTOFF_CHECK\;-DEELNAME=_ElecEwQSTabTwinCut"
+ "-DEL_EWALD_ANA\;-DEELNAME=_ElecEw"
+ "-DEL_EWALD_ANA\;-DVDW_CUTOFF_CHECK\;-DEELNAME=_ElecEwTwinCut")
+set(VDW_DEFS
+ "-DVDWNAME=_VdwLJ"
+ "-DLJ_COMB_GEOM\;-DVDWNAME=_VdwLJCombGeom"
+ "-DLJ_COMB_LB\;-DVDWNAME=_VdwLJCombLB"
+ "-DLJ_FORCE_SWITCH\;-DVDWNAME=_VdwLJFsw"
+ "-DLJ_POT_SWITCH\;-DVDWNAME=_VdwLJPsw"
+ "-DLJ_EWALD_COMB_GEOM\;-DVDWNAME=_VdwLJEwCombGeom"
+ "-DLJ_EWALD_COMB_LB\;-DVDWNAME=_VdwLJEwCombLB")
+if(CLANG_TIDY_EXE)
+ set(OCL_COMPILER "${CLANG_TIDY_EXE}")
+ set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo;--;${CMAKE_C_COMPILER}")
+else()
+ set(OCL_COMPILER "${CMAKE_C_COMPILER}")
+endif()
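+# Test-compile (or, when clang-tidy is available, clang-tidy-check) every
+# combination of electrostatics flavor, VdW flavor and vendor-specific
+# configuration of the nbnxn OpenCL kernels, so that kernel errors and
+# warnings surface at build time rather than at run time.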
+foreach(ELEC_DEF IN LISTS ELEC_DEFS)
+ foreach(VDW_DEF IN LISTS VDW_DEFS)
+ foreach(VENDOR AMD NVIDIA INTEL)
+ if(VENDOR STREQUAL INTEL)
+ set(CLUSTER_SIZE 4)
+ else()
+ set(CLUSTER_SIZE 8)
+ endif()
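+      # Extract the kernel name suffixes (e.g. _ElecCut, _VdwLJ) by stripping
+      # everything up to the last '=' in the define strings.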
+ string(REGEX REPLACE ".*=" "" ELEC_NAME "${ELEC_DEF}")
+ string(REGEX REPLACE ".*=" "" VDW_NAME "${VDW_DEF}")
+ set(OBJ_FILE nbnxn_ocl_kernel${ELEC_NAME}${VDW_NAME}_${VENDOR}.o)
+ add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
+ ${CMAKE_CURRENT_SOURCE_DIR}/nbnxn_ocl_kernels.cl ${CLANG_TIDY_ARGS}
+ -Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
+ -DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
+ -DNBNXN_GPU_CLUSTER_SIZE=${CLUSTER_SIZE} -DIATYPE_SHMEM
+ -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
+ -Weverything -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
+ -Wno-cast-align -Wno-incompatible-pointer-types
+ -o${OBJ_FILE}
+ )
+ list(APPEND NBNXN_OCL_KERNELS ${OBJ_FILE})
+ endforeach()
+ endforeach()
+endforeach()
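+# Aggregate target that collects all generated kernel test-compilations.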
+add_custom_target(ocl_kernel DEPENDS ${NBNXN_OCL_KERNELS})
#endif
(
#ifndef LJ_COMB
- int ntypes, /* IN */
-#endif
- cl_nbparam_params_t nbparam_params, /* IN */
- const __global float4 *restrict xq, /* IN */
- __global float *restrict f, /* OUT stores float3 values */
- __global float *restrict gmx_unused e_lj, /* OUT */
- __global float *restrict gmx_unused e_el, /* OUT */
- __global float *restrict fshift, /* OUT stores float3 values */
+ int ntypes, /* IN */
+#endif
+ cl_nbparam_params_t nbparam_params, /* IN */
+ const __global float4 *restrict xq, /* IN */
+ __global float *restrict f, /* OUT stores float3 values */
+ __global float *restrict gmx_unused e_lj, /* OUT */
+ __global float *restrict gmx_unused e_el, /* OUT */
+ __global float *restrict fshift, /* OUT stores float3 values */
#ifdef LJ_COMB
- const __global float2 *restrict lj_comb, /* IN stores float2 values */
+ const __global float2 *restrict lj_comb, /* IN stores float2 values */
#else
- const __global int *restrict atom_types, /* IN */
+ const __global int *restrict atom_types, /* IN */
#endif
- const __global float *restrict shift_vec, /* IN stores float3 values */
- __constant float* gmx_unused nbfp_climg2d, /* IN */
- __constant float* gmx_unused nbfp_comb_climg2d, /* IN */
- __constant float* gmx_unused coulomb_tab_climg2d, /* IN */
- const __global nbnxn_sci_t* pl_sci, /* IN */
+ const __global float *restrict shift_vec, /* IN stores float3 values */
+ __constant const float* gmx_unused nbfp_climg2d, /* IN */
+ __constant const float* gmx_unused nbfp_comb_climg2d, /* IN */
+ __constant const float* gmx_unused coulomb_tab_climg2d, /* IN */
+ const __global nbnxn_sci_t* pl_sci, /* IN */
#ifndef PRUNE_NBL
const
#endif
#ifdef CALC_ENERGIES
#ifdef EL_EWALD_ANY
- const float beta = nbparam->ewald_beta;
- const float ewald_shift = nbparam->sh_ewald;
+ const float beta = nbparam->ewald_beta;
+ const float ewald_shift = nbparam->sh_ewald;
#else
- const float c_rf = nbparam->c_rf;
+ const float gmx_unused c_rf = nbparam->c_rf;
#endif /* EL_EWALD_ANY */
#endif /* CALC_ENERGIES */
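+    /* The LOCAL_OFFSET macro tracks the current end of the kernel's __local
+     * scratch area: each buffer below is placed at LOCAL_OFFSET, after which
+     * the macro is redefined to point just past that buffer. */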
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
- __local int *atib = (__local int *)(LOCAL_OFFSET);
+ __local int *atib = (__local int *)(LOCAL_OFFSET); //NOLINT(google-readability-casting)
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+ #define LOCAL_OFFSET (atib + NCL_PER_SUPERCL * CL_SIZE)
#else
__local float2 *ljcpib = (__local float2 *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET ljcpib + NCL_PER_SUPERCL * CL_SIZE
+ #define LOCAL_OFFSET (ljcpib + NCL_PER_SUPERCL * CL_SIZE)
#endif
#endif
#if !USE_SUBGROUP_ANY
/* Local buffer used to implement __any warp vote function from CUDA.
volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+ volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
#else
- __local uint *warp_any = 0;
+ __local uint gmx_unused *warp_any = 0;
#endif
#undef LOCAL_OFFSET
const int aj = cj * CL_SIZE + tidxj;
/* load j atom data */
- const float4 xqbuf = xq[aj];
- const float3 xj = (float3)(xqbuf.xyz);
- const float qj_f = xqbuf.w;
+ const float4 xjqbuf = xq[aj];
+ const float3 xj = (float3)(xjqbuf.xyz);
+ const float qj_f = xjqbuf.w;
#ifndef LJ_COMB
const int typej = atom_types[aj];
#else
{
if (imask & mask_ji)
{
- const int ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
- const int ai = ci * CL_SIZE + tidxi; /* i atom index */
+ const int gmx_unused ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
/* all threads load an atom from i cluster ci into shmem! */
- const float4 xqbuf = xqib[i * CL_SIZE + tidxi];
- const float3 xi = (float3)(xqbuf.xyz);
+ const float4 xiqbuf = xqib[i * CL_SIZE + tidxi];
+ const float3 xi = (float3)(xiqbuf.xyz);
/* distance between i and j atoms */
const float3 rv = xi - xj;
#endif
{
/* load the rest of the i-atom parameters */
- const float qi = xqbuf.w;
+ const float qi = xiqbuf.w;
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
const int typei = atib[i * CL_SIZE + tidxi];
#else
const float2 ljcp_i = ljcpib[i * CL_SIZE + tidxi];
#endif
-#else /* IATYPE_SHMEM */
+#else /* IATYPE_SHMEM */
+ const int ai = ci * CL_SIZE + tidxi; /* i atom index */
+
#ifndef LJ_COMB
const int typei = atom_types[ai];
#else
const float c12 = ljcp_i.y * ljcp_j.y;
#else
/* LJ 2^(1/6)*sigma and 12*epsilon */
- float c6, c12;
const float sigma = ljcp_i.x + ljcp_j.x;
const float epsilon = ljcp_i.y * ljcp_j.y;
#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ float c6, c12;
convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
#endif
#endif /* LJ_COMB_GEOM */
#define GMX_DOUBLE 0
-#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/pbcutil/ishift.h"
#define USE_CJ_PREFETCH 0
#endif
-#if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210)
+#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
#define HAVE_SUBGROUP 1
#else
#define HAVE_SUBGROUP 0
#define HAVE_INTEL_SUBGROUP 0
#endif
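+/* Subgroup width assumed for the target device: 8 on Intel, 64 on AMD
+ * (wavefront), 32 otherwise (matching the NVIDIA warp size). */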
-#if _INTEL_SOURCE_
+#if defined _INTEL_SOURCE_
#define SUBGROUP_SIZE 8
-#elif _AMD_SOURCE_
+#elif defined _AMD_SOURCE_
#define SUBGROUP_SIZE 64
#else
#define SUBGROUP_SIZE 32
#endif //USE_SUBGROUP_PRELOAD
#if USE_SUBGROUP_PRELOAD
-typedef int CjType;
+typedef size_t CjType;
#else
typedef __local int* CjType;
#endif
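+/* CjType thus holds the preloaded j-cluster indices either as a value that is
+ * broadcast across the subgroup, or as a pointer into a local-memory buffer. */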
gmx_opencl_inline
void preloadCj4(CjType gmx_unused *cjs,
const __global int gmx_unused *gm_cj,
- int tidxi,
- int tidxj,
- bool iMaskCond)
+ int gmx_unused tidxi,
+ int gmx_unused tidxj,
+ bool gmx_unused iMaskCond)
{
#if USE_SUBGROUP_PRELOAD
*cjs = preloadCj4Subgroup(gm_cj);
* If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
*/
gmx_opencl_inline
-int loadCj(CjType cjs, const __global int *gm_cj,
- int jm, int tidxi, int tidxj)
+int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
+ int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#if USE_SUBGROUP_PRELOAD
return sub_group_broadcast(cjs, jm);
#if REDUCE_SHUFFLE
gmx_opencl_inline
void reduce_force_j_shfl(float3 fin, __global float *fout,
- int tidxi, int tidxj, int aidx)
+ int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
/* Only does reduction over 4 elements in cluster. Needs to be changed
* for CL_SIZE>4. See CUDA code for required code */
/*! Final j-force reduction
*/
gmx_opencl_inline
-void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout,
+void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
int tidxi, int tidxj, int aidx)
{
#if REDUCE_SHUFFLE
/*! Final i-force reduction
*/
gmx_opencl_inline
-void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f,
+void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
bool bCalcFshift, int tidxi, int tidxj, int sci,
int shift, __global float *fshift)
{
}
gmx_opencl_inline
-void reduce_energy(volatile __local float *buf,
+void reduce_energy(volatile __local float gmx_unused *buf,
float E_lj, float E_el,
volatile __global float *e_lj,
volatile __global float *e_el,
#endif
}
+gmx_opencl_inline
bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
{
if (pred)
}
//! Returns true if the predicate is true for any work item in the warp
-bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred)
+gmx_opencl_inline
+bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
{
#if USE_SUBGROUP_ANY
return sub_group_any(pred);