From a168d2e692fa1228998e222076de8f47956569b3 Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Wed, 28 Mar 2018 18:55:52 -0700 Subject: [PATCH] Allow OCL CL_SIZE to be set to 4 for Intel Add GMX_OCL_CLUSTER_SIZE which can be set to 4 for e.g. Intel. The kernel should now work on any HW with at least CL_SIZE*CL_SIZE/2 wide sub-groups (warp-sync execution). This is 8(/32) for CL_SIZE 4(/8). Not tested for CL_SIZE other than 4 or 8. Fixes: - make_fep_list_supersub was incorrect for CL_SIZE!=8. - reduce_force_i_pow2 was incorrect for CL_SIZE<8 and 2 warps. - i-atom preload, nbnxn_excl_t, warp-any init for CL_SIZE!=8. - gpu_ref for CL_SIZE!=8. Change-Id: I1114e408d28b9eb6306722c41fd6a6ccec52211b --- cmake/gmxManageOpenCL.cmake | 3 ++ src/config.h.cmakein | 3 ++ src/gromacs/gpu_utils/ocl_compiler.cpp | 5 ++- .../nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp | 6 ++-- .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh | 26 +++++++-------- .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh | 25 +++++++------- .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh | 33 ++++++++++++++----- src/gromacs/mdlib/nbnxn_pairlist.h | 8 ++++- src/gromacs/mdlib/nbnxn_search.cpp | 3 +- 9 files changed, 72 insertions(+), 40 deletions(-) diff --git a/cmake/gmxManageOpenCL.cmake b/cmake/gmxManageOpenCL.cmake index 4b77330a12..447e02ecad 100644 --- a/cmake/gmxManageOpenCL.cmake +++ b/cmake/gmxManageOpenCL.cmake @@ -64,6 +64,9 @@ add_definitions(${OpenCL_DEFINITIONS}) include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS}) +set(GMX_OCL_CLUSTER_SIZE 8 CACHE STRING "Cluster size used by OpenCL kernel. Set to 4 for Intel GPUs.") +mark_as_advanced(GMX_OCL_CLUSTER_SIZE) + macro(gmx_gpu_setup) # no OpenMP is no good! if(NOT GMX_OPENMP) diff --git a/src/config.h.cmakein b/src/config.h.cmakein index a5b3598601..347e31acee 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -241,6 +241,9 @@ /* Use a single compilation unit when compiling the CUDA (non-bonded) kernels. */ #cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT +/* Cluster size used by OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */ +#define GMX_OCL_CLUSTER_SIZE @GMX_OCL_CLUSTER_SIZE@ + /* Use NVML */ #cmakedefine01 HAVE_NVML diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp index 182ec20192..62f8a4e75f 100644 --- a/src/gromacs/gpu_utils/ocl_compiler.cpp +++ b/src/gromacs/gpu_utils/ocl_compiler.cpp @@ -326,8 +326,11 @@ makeVendorFlavorChoice(ocl_vendor_id_t vendorId) case OCL_VENDOR_NVIDIA: choice = "-D_NVIDIA_SOURCE_"; break; + case OCL_VENDOR_INTEL: + choice = "-D_INTEL_SOURCE_"; + break; default: - choice = "-D_WARPLESS_SOURCE_"; + choice = ""; break; } return choice; diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp index 309d9a33e4..9ccd46a4eb 100644 --- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp +++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -234,7 +234,9 @@ nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t *nbl, continue; } - int_bit = ((excl[jc >> 2]->pair[(jc & 3)*c_clSize + ic] >> (jm*c_numClPerSupercl + im)) & 1); + constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit; + int_bit = ((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic] + >> (jm*c_numClPerSupercl + im)) & 1); js = ja*nbat->xstride; jfs = ja*nbat->fstride; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh index 41d8e69884..a40eb78610 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh @@ -255,24 +255,26 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) cij4_start = nb_sci.cj4_ind_start; /* first ...*/ cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */ - /* Pre-load i-atom x and q into shared memory */ - ci = sci * NCL_PER_SUPERCL + tidxj; - ai = ci * CL_SIZE + tidxi; - - xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); - xqbuf.w *= nbparam->epsfac; - xqib[tidxj * CL_SIZE + tidxi] = xqbuf; + for (i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE) + { + /* Pre-load i-atom x and q into shared memory */ + ci = sci * NCL_PER_SUPERCL + tidxj+i; + ai = ci * CL_SIZE + tidxi; + xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xqbuf.w *= nbparam->epsfac; + xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf; #ifdef IATYPE_SHMEM #ifndef LJ_COMB - /* Pre-load the i-atom types into shared memory */ - atib[tidxj * CL_SIZE + tidxi] = atom_types[ai]; + /* Pre-load the i-atom types into shared memory */ + atib[(tidxj + i) * CL_SIZE + tidxi] = atom_types[ai]; #else - ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai]; + ljcpib[(tidxj + i) * CL_SIZE + tidxi] = lj_comb[ai]; #endif #endif + } /* Initialise warp vote. (8x8 block) 2 warps for nvidia */ - if (tidx == 0 || tidx == 32) + if (tidx == 0 || tidx == WARP_SIZE) { warp_any[widx] = 0; } @@ -625,7 +627,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) reduce_force_i(f_buf, f, &fshift_buf, bCalcFshift, tidxi, tidxj, ai); - barrier(CLK_LOCAL_MEM_FENCE); } /* add up local shift forces into global mem */ @@ -646,7 +647,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) f_buf[ tidx] = E_lj; f_buf[FBUF_STRIDE + tidx] = E_el; reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE); - #endif } diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh index 126c47adc8..6a5128c7f0 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh @@ -42,8 +42,8 @@ * \ingroup module_mdlib */ -#ifndef _WARPLESS_SOURCE_ -/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel +#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_ +/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors * Note that this should precede the kernel_utils include. */ #define USE_CJ_PREFETCH 1 @@ -150,14 +150,17 @@ __kernel void nbnxn_kernel_prune_rolling_opencl if (tidxz == 0) { - /* Pre-load i-atom x and q into shared memory */ - int ci = sci * c_numClPerSupercl + tidxj; - int ai = ci * c_clSize + tidxi; + for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE) + { + /* Pre-load i-atom x and q into shared memory */ + int ci = sci * c_numClPerSupercl + tidxj+i; + int ai = ci * c_clSize + tidxi; - /* We don't need q, but using float4 in shmem avoids bank conflicts */ - float4 tmp = xq[ai]; - float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); - xib[tidxj * c_clSize + tidxi] = xi; + /* We don't need q, but using float4 in shmem avoids bank conflicts */ + float4 tmp = xq[ai]; + float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f); + xib[(tidxj + i) * c_clSize + tidxi] = xi; + } } barrier(CLK_LOCAL_MEM_FENCE); @@ -186,10 +189,6 @@ __kernel void nbnxn_kernel_prune_rolling_opencl } preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck); -#if defined _WARPLESS_SOURCE_ && USE_CJ_PREFETCH - /* can't assume wavefront width, need to sync before we can consume cj4 from local memory */ - barrier(CLK_LOCAL_MEM_FENCE); -#endif if (imaskCheck) { diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh index 63d13a3bc6..99ac7e1e30 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh @@ -38,7 +38,7 @@ #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) -#define WARP_SIZE 32 +#define WARP_SIZE (CL_SIZE*CL_SIZE/2) #undef KERNEL_UTILS_INLINE #ifdef KERNEL_UTILS_INLINE @@ -61,8 +61,16 @@ __constant sampler_t generic_sampler = (CLK_NORMALIZED_COORDS_FALSE /* Natur #define __device__ +#if CL_SIZE == 8 #define WARP_SIZE_LOG2 (5) -#define CL_SIZE_LOG2 (3) /* change this together with CL_SIZE !*/ +#define CL_SIZE_LOG2 (3) +#elif CL_SIZE == 4 +#define WARP_SIZE_LOG2 (3) +#define CL_SIZE_LOG2 (2) +#else +#error unsupported CL_SIZE +#endif + #define CL_SIZE_SQ (CL_SIZE * CL_SIZE) #define FBUF_STRIDE (CL_SIZE_SQ) @@ -142,9 +150,9 @@ typedef struct { typedef struct { - unsigned int pair[32]; /* Topology exclusion interaction bits for one warp, - * each unsigned has bitS for 4*8 i clusters - */ + unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp, + * each unsigned has bitS for 4*8 i clusters + */ } nbnxn_excl_t; /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */ @@ -210,13 +218,14 @@ int loadCj(__local int *sm_cjPreload, const int c_nbnxnGpuClusterpairSplit = 2; const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; -#if _NVIDIA_SOURCE_ +#if USE_CJ_PREFETCH +#if defined _NVIDIA_SOURCE_ int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize; -#else // defined _WARPLESS_SOURCE_ || defined _AMD_SOURCE_ +#elif defined _AMD_SOURCE_ int warpLoadOffset = 0; +#else +#error Not supported #endif - -#if USE_CJ_PREFETCH return sm_cjPreload[jm + warpLoadOffset]; #else return gm_cj[jm]; @@ -583,6 +592,7 @@ void reduce_force_i_generic(__local float *f_buf, __global float *fout, (*fshift_buf) += f; } } + barrier(CLK_LOCAL_MEM_FENCE); } /*! Final i-force reduction; this implementation works only with power of two @@ -610,6 +620,11 @@ void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout, } i >>= 1; } + /* needed because + * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp + * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call + */ + barrier(CLK_LOCAL_MEM_FENCE); /* i == 1, last reduction step, writing to global mem */ /* Split the reduction between the first 3 line threads diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h index 4e064ec910..352c679146 100644 --- a/src/gromacs/mdlib/nbnxn_pairlist.h +++ b/src/gromacs/mdlib/nbnxn_pairlist.h @@ -36,6 +36,8 @@ #ifndef _nbnxn_pairlist_h #define _nbnxn_pairlist_h +#include "config.h" + #include #include "gromacs/math/vectypes.h" @@ -74,8 +76,12 @@ struct NbnxnListParameters /*! \endcond */ -/* With GPU kernels the i and j cluster size is 8 atoms */ +/* With GPU kernels the i and j cluster size is 8 atoms for CUDA and can be set at compile time for OpenCL */ +#if GMX_GPU == GMX_GPU_OPENCL +static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_CLUSTER_SIZE; +#else static constexpr int c_nbnxnGpuClusterSize = 8; +#endif /* The number of clusters in a super-cluster, used for GPU */ static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8; diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp index 917319f910..e9d990bd22 100644 --- a/src/gromacs/mdlib/nbnxn_search.cpp +++ b/src/gromacs/mdlib/nbnxn_search.cpp @@ -1986,7 +1986,8 @@ static void make_fep_list_supersub(const nbnxn_search_t nbs, unsigned int excl_bit; real dx, dy, dz; - get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl); + const int jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit); + get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl); excl_pair = a_mod_wj(j)*nbl->na_ci + i; excl_bit = (1U << (gcj*c_gpuNumClusterPerCell + c)); -- 2.22.0