From: Roland Schulz Date: Sun, 29 Apr 2018 21:19:26 +0000 (-0700) Subject: Use subgroup for warp_any and CJ4 prefetch X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=1df76148809fed6687dfc66ce887e442c882b767;p=alexxy%2Fgromacs.git Use subgroup for warp_any and CJ4 prefetch Change-Id: I548d669d0125084d3b6533ad072e758ff9fe5cc1 --- diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh index eb7e43fc60..e58efc6121 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh @@ -46,12 +46,6 @@ /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel * Note that this should precede the kernel_utils include. */ -#if defined _AMD_SOURCE_ || defined _NVIDIA_SOURCE_ -#define USE_CJ_PREFETCH 1 -#else -#define USE_CJ_PREFETCH 0 -#endif - #include "nbnxn_ocl_kernel_utils.clh" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -98,8 +92,8 @@ Thus if more strings need to be appended a new macro must be written or it must be directly appended here. */ __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1))) -#if REDUCE_SHUFFLE -__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl +#ifdef cl_intel_required_subgroup_size +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) #endif #ifdef PRUNE_NBL #ifdef CALC_ENERGIES @@ -220,13 +214,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U); #define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE) - __local int *cjs; + CjType cjs; #if USE_CJ_PREFETCH /* shmem buffer for cj, for both warps separately */ cjs = (__local int *)(LOCAL_OFFSET); #undef LOCAL_OFFSET #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE -#endif +#endif //USE_CJ_PREFETCH #ifdef IATYPE_SHMEM #ifndef LJ_COMB @@ -249,9 +243,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #else __local float *f_buf = 0; #endif +#if !USE_SUBGROUP_ANY /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. */ volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET); +#else + __local uint *warp_any = 0; +#endif #undef LOCAL_OFFSET nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */ @@ -277,12 +275,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif #endif } +#if !USE_SUBGROUP_ANY /* Initialise warp vote. (8x8 block) 2 warps for nvidia */ if (tidx == 0 || tidx == WARP_SIZE) { warp_any[widx] = 0; } - +#endif barrier(CLK_LOCAL_MEM_FENCE); for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) @@ -347,7 +346,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) imask = pl_cj4[j4].imei[widx].imask; wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)]; - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); + preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); #ifndef PRUNE_NBL if (imask) @@ -363,7 +362,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_ #pragma unroll 4 #endif - for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++) { if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL))) @@ -404,21 +402,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) r2 = norm2(rv); #ifdef PRUNE_NBL - /* vote.. 
should code shmem serialisation, wonder what the hit will be */ - if (r2 < rlist_sq) - { - warp_any[widx] = 1; - } - - /* If _none_ of the atoms pairs are in cutoff range, - the bit corresponding to the current - cluster-pair in imask gets set to 0. */ - if (!warp_any[widx]) + if (!gmx_sub_group_any(warp_any, widx, r2 < rlist_sq)) { imask &= ~mask_ji; } - - warp_any[widx] = 0; #endif int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh index 7d31a58e53..4c253f7992 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh @@ -42,15 +42,6 @@ * \ingroup module_mdlib */ -#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_ -/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors - * Note that this should precede the kernel_utils include. - */ -#define USE_CJ_PREFETCH 1 -#else -#define USE_CJ_PREFETCH 0 -#endif - #include "nbnxn_ocl_kernel_utils.clh" /* Note: the AMD compiler testing was done with (fglrx 15.12) performs best with wg @@ -123,25 +114,28 @@ __kernel void nbnxn_kernel_prune_rolling_opencl #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize) /* shmem buffer for i cj pre-loading */ - __local int *cjs; + CjType cjs; #if USE_CJ_PREFETCH cjs = (((__local int *)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize); #undef LOCAL_OFFSET /* Offset calculated using xib because cjs depends on on tidxz! */ #define LOCAL_OFFSET (((__local int *)(xib + c_numClPerSupercl * c_clSize)) + (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize)) #endif - +#if !USE_SUBGROUP_ANY /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. */ - volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET); - #undef LOCAL_OFFSET - - unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx; + volatile __local uint *const warp_any = (__local uint*)(LOCAL_OFFSET); + const unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx; /* Initialise warp vote.*/ if (tidx == 0 || tidx == 32) { warp_any[warpVoteSlot] = 0; } +#else + __local uint *const warp_any = 0; + const unsigned int warpVoteSlot = 0; +#endif + #undef LOCAL_OFFSET nbnxn_sci_t nb_sci = pl_sci[bidx*numParts + part]; /* my i super-cluster's index = sciOffset + current bidx * numParts + part */ int sci = nb_sci.sci; /* super-cluster */ @@ -188,7 +182,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl imaskCheck = (imaskNew ^ imaskFull); } - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); + preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); if (imaskCheck) { @@ -223,27 +217,17 @@ __kernel void nbnxn_kernel_prune_rolling_opencl /* If _none_ of the atoms pairs are in cutoff range, the bit corresponding to the current cluster-pair in imask gets set to 0. */ - if (r2 < rlistOuter_sq) - { - warp_any[warpVoteSlot] = 1; - } - if (!warp_any[warpVoteSlot]) + if (!gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistOuter_sq)) { imaskFull &= ~mask_ji; } - warp_any[warpVoteSlot] = 0; } /* If any atom pair is within range, set the bit corresponding to the current cluster-pair. 
*/ - if (r2 < rlistInner_sq) - { - warp_any[warpVoteSlot] = 1; - } - if (warp_any[warpVoteSlot]) + if (gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistInner_sq)) { imaskNew |= mask_ji; } - warp_any[warpVoteSlot] = 0; } /* shift the mask bit by 1 */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh index 88aa20bfc1..4d5e7c9ba2 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh @@ -41,17 +41,39 @@ #define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported -/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups). - Code doesn't support CL_SIZE=8. - cl_intel_required_subgroup_size required for intel_reqd_sub_group_size. - cl_intel_subgroups required for intel_sub_group_shuffle_up/down. +#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_ +/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors + * Note that this should precede the kernel_utils include. */ -#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4 -#define REDUCE_SHUFFLE 1 +#define USE_CJ_PREFETCH 1 #else -#define REDUCE_SHUFFLE 0 +#define USE_CJ_PREFETCH 0 #endif +#if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210) +#define HAVE_SUBGROUP 1 +#else +#define HAVE_SUBGROUP 0 +#endif + +#ifdef cl_intel_subgroups +#define HAVE_INTEL_SUBGROUP 1 +#else +#define HAVE_INTEL_SUBGROUP 0 +#endif + +#if _INTEL_SOURCE_ +#define SUBGROUP_SIZE 8 +#elif _AMD_SOURCE_ +#define SUBGROUP_SIZE 64 +#else +#define SUBGROUP_SIZE 32 +#endif + +#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE) +#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE) +#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP + /* 1.0 / sqrt(M_PI) */ #define M_FLOAT_1_SQRTPI 0.564189583547756f @@ -157,48 +179,92 @@ typedef struct { /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */ __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U); +gmx_opencl_inline +void preloadCj4Generic(__local int *sm_cjPreload, + const __global int *gm_cj, + int tidxi, + int tidxj, + bool iMaskCond) -/*! \brief Preload cj4 into local memory. +{ + /* Pre-load cj into shared memory */ +#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly + if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE) + { + sm_cjPreload[tidxi] = gm_cj[tidxi]; + } +#else + const int c_clSize = CL_SIZE; + const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; + const int c_nbnxnGpuClusterpairSplit = 2; + const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; + + if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize)) + { + sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi]; + } +#endif +} + + +#if USE_SUBGROUP_PRELOAD +gmx_opencl_inline +int preloadCj4Subgroup(const __global int *gm_cj) +{ + //loads subgroup-size # of elements (8) instead of the 4 required + //equivalent to *cjs = *gm_cj + return intel_sub_group_block_read((const __global uint *)gm_cj); +} +#endif //USE_SUBGROUP_PRELOAD + +#if USE_SUBGROUP_PRELOAD +typedef int CjType; +#else +typedef __local int* CjType; +#endif + +/*! 
\brief Preload cj4 * * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z) * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z) - * - Same as AMD in the nowarp kernel; we do not assume execution width and therefore - * the caller needs to sync. + * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory * * It is the caller's responsibility to make sure that data is consumed only when * it's ready. This function does not call a barrier. */ gmx_opencl_inline -void preloadCj4(__local int *sm_cjPreload, +void preloadCj4(CjType *cjs, const __global int *gm_cj, int tidxi, int tidxj, bool iMaskCond) - { -#if !USE_CJ_PREFETCH - return; +#if USE_SUBGROUP_PRELOAD + *cjs = preloadCj4Subgroup(gm_cj); +#elif USE_CJ_PREFETCH + preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond); +#else + //nothing to do #endif +} +gmx_opencl_inline +int loadCjPreload(__local int* sm_cjPreload, + int jm, + int tidxi, + int tidxj) +{ +#if defined _AMD_SOURCE_ + int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly +#else const int c_clSize = CL_SIZE; const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; const int c_nbnxnGpuClusterpairSplit = 2; const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; - /* Pre-load cj into shared memory */ -#if defined _NVIDIA_SOURCE_ - /* on both warps separately for NVIDIA */ - if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize)) - { - sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi]; - } -#else // AMD or nowarp - /* Note that with "nowarp" / on hardware with wavefronts <64 a barrier is needed after preload. */ - if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize) - { - sm_cjPreload[tidxi] = gm_cj[tidxi]; - } + int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize; #endif + return sm_cjPreload[jm + warpLoadOffset]; } /* \brief Load a cj given a jm index. @@ -206,32 +272,18 @@ void preloadCj4(__local int *sm_cjPreload, * If cj4 preloading is enabled, it loads from the local memory, otherwise from global. */ gmx_opencl_inline -int loadCj(__local int *sm_cjPreload, - const __global int *gm_cj, - int jm, - int tidxi, - int tidxj) +int loadCj(CjType cjs, const __global int *gm_cj, + int jm, int tidxi, int tidxj) { - const int c_clSize = CL_SIZE; - const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; - const int c_nbnxnGpuClusterpairSplit = 2; - const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; - -#if USE_CJ_PREFETCH -#if defined _NVIDIA_SOURCE_ - int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize; -#elif defined _AMD_SOURCE_ - int warpLoadOffset = 0; -#else -#error Not supported -#endif - return sm_cjPreload[jm + warpLoadOffset]; +#if USE_SUBGROUP_PRELOAD + return sub_group_broadcast(cjs, jm); +#elif USE_CJ_PREFETCH + return loadCjPreload(cjs, jm, tidxi, tidxj); #else return gm_cj[jm]; #endif } - /*! Convert LJ sigma,epsilon parameters to C6,C12. 
*/ gmx_opencl_inline void convert_sigma_epsilon_to_c6_c12(const float sigma, @@ -549,7 +601,7 @@ void reduce_force_j_shfl(float3 fin, __global float *fout, fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1); fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1); fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1); - if (tidxi & 1 == 1) + if ((tidxi & 1) == 1) { fin.x = fin.y; } @@ -753,8 +805,8 @@ void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global fl #if REDUCE_SHUFFLE gmx_opencl_inline void reduce_energy_shfl(float E_lj, float E_el, - __global float *e_lj, - __global float *e_el, + volatile __global float *e_lj, + volatile __global float *e_el, unsigned int tidx) { E_lj = sub_group_reduce_add(E_lj); @@ -825,4 +877,28 @@ void reduce_energy(volatile __local float *buf, #endif } +bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred) +{ + if (pred) + { + warp_any[widx] = 1; + } + + bool ret = warp_any[widx]; + + warp_any[widx] = 0; + + return ret; +} + +//! Returns a true if predicate is true for any work item in warp +bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred) +{ +#if USE_SUBGROUP_ANY + return sub_group_any(pred); +#else + return gmx_sub_group_any_localmem(warp_any, widx, pred); +#endif +} + #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
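For reference, the two sub-group patterns this change adopts can be shown in a minimal standalone sketch. Everything below is illustrative and not part of the patch: the kernel names, buffer names, output layout, and the fixed loop bound of 4 are made up; the built-ins themselves (sub_group_any, intel_sub_group_block_read, sub_group_broadcast) are the cl_khr_subgroups / cl_intel_subgroups functions the kernels rely on when USE_SUBGROUP_ANY and USE_SUBGROUP_PRELOAD evaluate to 1.

/* Illustrative sketch only -- assumes a device exposing cl_khr_subgroups
 * and cl_intel_subgroups; all names here are hypothetical. */
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_intel_subgroups : enable

/* Pattern 1: the warp vote. Where the old code staged a flag through __local
 * memory (set, test, reset per warp slot), sub_group_any() returns a non-zero
 * value on every work item of the sub-group if the predicate holds for at
 * least one of them, with no local memory and no explicit reset. */
__kernel void subgroup_any_sketch(__global const float *r2,
                                  const float           rlist_sq,
                                  __global int          *anyWithinCutoff)
{
    const size_t gid  = get_global_id(0);
    const int    pred = (r2[gid] < rlist_sq);

    if (sub_group_any(pred))
    {
        /* All work items of a sub-group that saw at least one pair in range
         * take this branch, mirroring the gmx_sub_group_any() pruning test. */
        anyWithinCutoff[get_group_id(0)] = 1;
    }
}

/* Pattern 2: the cj prefetch. intel_sub_group_block_read() pulls
 * sub-group-size consecutive ints into registers, one per work item, and
 * sub_group_broadcast() later hands any element to the whole sub-group,
 * replacing the __local staging buffer of preloadCj4Generic(). */
__kernel void subgroup_preload_sketch(__global const int *gm_cj,
                                      __global int       *out)
{
    /* Each work item holds one element of the packed j-cluster group. */
    const int cjPacked = as_int(intel_sub_group_block_read((const __global uint *)gm_cj));

    for (int jm = 0; jm < 4; jm++) /* 4 stands in for NBNXN_GPU_JGROUP_SIZE */
    {
        /* jm is uniform across the sub-group, so the broadcast is valid;
         * out needs room for 4 ints per work item in this sketch. */
        out[get_global_id(0)*4 + jm] = sub_group_broadcast(cjPacked, jm);
    }
}

Note that the sub-group built-ins only substitute for the per-warp logic when the sub-group width matches it, which is why USE_SUBGROUP_ANY and REDUCE_SHUFFLE above additionally require SUBGROUP_SIZE == WARP_SIZE; where that check fails, the kernels keep the original local-memory fall-backs (gmx_sub_group_any_localmem, preloadCj4Generic).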