From 67837a421a087ce05f9ef17c3a1de0fc6ce99870 Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Wed, 25 Apr 2018 22:25:40 -0700 Subject: [PATCH] Use subgroup shuffle for reduction Change-Id: I121821e70836111adccd0619db992037af6bee3c --- .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh | 65 +--- .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh | 4 +- .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh | 280 +++++++++++++----- 3 files changed, 226 insertions(+), 123 deletions(-) diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh index a40eb78610..eb7e43fc60 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh @@ -98,6 +98,9 @@ Thus if more strings need to be appended a new macro must be written or it must be directly appended here. */ __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1))) +#if REDUCE_SHUFFLE +__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl +#endif #ifdef PRUNE_NBL #ifdef CALC_ENERGIES __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl) @@ -209,15 +212,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif unsigned int wexcl, imask, mask_ji; float4 xqbuf; - float3 xi, xj, rv, f_ij, fcj_buf /*, fshift_buf*/; - float fshift_buf; + float3 xi, xj, rv, f_ij, fcj_buf; float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */ nbnxn_sci_t nb_sci; /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */ const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U); -#define LOCAL_OFFSET xqib + NCL_PER_SUPERCL * CL_SIZE +#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE) __local int *cjs; #if USE_CJ_PREFETCH /* shmem buffer for cj, for both warps separately */ @@ -239,11 +241,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif #endif -#ifndef REDUCE_SHUFFLE +#if !REDUCE_SHUFFLE /* shmem j force buffer */ __local float *f_buf = (__local float *)(LOCAL_OFFSET); #undef LOCAL_OFFSET - #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3 + #define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3) +#else + __local float *f_buf = 0; #endif /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. 
*/ @@ -343,7 +347,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) imask = pl_cj4[j4].imei[widx].imask; wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)]; - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask); + preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); #ifndef PRUNE_NBL if (imask) @@ -388,8 +392,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) { if (imask & mask_ji) { - ci_offset = i; /* i force buffer offset */ - ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */ ai = ci * CL_SIZE + tidxi; /* i atom index */ @@ -580,7 +582,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) fcj_buf -= f_ij; /* accumulate i forces in registers */ - fci_buf[ci_offset] += f_ij; + fci_buf[i] += f_ij; } } @@ -589,13 +591,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) } /* reduce j forces */ - - /* store j forces in shmem */ - f_buf[ tidx] = fcj_buf.x; - f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y; - f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z; - - reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj); + reduce_force_j(f_buf, fcj_buf, f, tidxi, tidxj, aj); } } #ifdef PRUNE_NBL @@ -610,43 +606,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) /* skip central shifts when summing shift forces */ if (nb_sci.shift == CENTRAL) { - bCalcFshift = false; + bCalcFshift = 0; } - - fshift_buf = 0.0f; - /* reduce i forces */ - for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) - { - ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; - - f_buf[ tidx] = fci_buf[ci_offset].x; - f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y; - f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z; - barrier(CLK_LOCAL_MEM_FENCE); - reduce_force_i(f_buf, f, - &fshift_buf, bCalcFshift, - tidxi, tidxj, ai); - } - - /* add up local shift forces into global mem */ - if (bCalcFshift) - { - /* Only threads with tidxj < 3 will update fshift. - The threads performing the update must be the same with the threads - which stored the reduction result in reduce_force_i function - */ - if (tidxj < 3) - { - atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf); - } - } + reduce_force_i_and_shift(f_buf, fci_buf, f, bCalcFshift != 0, tidxi, tidxj, sci, + nb_sci.shift, fshift); #ifdef CALC_ENERGIES - /* flush the energies to shmem and reduce them */ - f_buf[ tidx] = E_lj; - f_buf[FBUF_STRIDE + tidx] = E_el; - reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE); + reduce_energy(f_buf, E_lj, E_el, e_lj, e_el, tidx); #endif } diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh index 6a5128c7f0..7d31a58e53 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh @@ -121,7 +121,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl /*! 
i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */ const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U); - #define LOCAL_OFFSET xib + c_numClPerSupercl * c_clSize + #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize) /* shmem buffer for i cj pre-loading */ __local int *cjs; #if USE_CJ_PREFETCH @@ -188,7 +188,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl imaskCheck = (imaskNew ^ imaskFull); } - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck); + preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); if (imaskCheck) { diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh index b882f59012..88aa20bfc1 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh @@ -39,7 +39,18 @@ #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) -#define WARP_SIZE (CL_SIZE*CL_SIZE/2) +#define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported + +/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups). + Code doesn't support CL_SIZE=8. + cl_intel_required_subgroup_size required for intel_reqd_sub_group_size. + cl_intel_subgroups required for intel_sub_group_shuffle_up/down. + */ +#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4 +#define REDUCE_SHUFFLE 1 +#else +#define REDUCE_SHUFFLE 0 +#endif /* 1.0 / sqrt(M_PI) */ #define M_FLOAT_1_SQRTPI 0.564189583547756f @@ -49,10 +60,6 @@ #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH #define NBNXN_OPENCL_KERNEL_UTILS_CLH -__constant sampler_t generic_sampler = (CLK_NORMALIZED_COORDS_FALSE /* Natural coords */ - | CLK_ADDRESS_NONE /* No clamp/repeat*/ - | CLK_FILTER_NEAREST); /* No interpolation */ - #if CL_SIZE == 8 #define WARP_SIZE_LOG2 (5) #define CL_SIZE_LOG2 (3) @@ -465,7 +472,7 @@ void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d, /* Subtract the grid force from the total LJ force */ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; - if (with_E_lj == true) + if (with_E_lj) { float sh_mask; @@ -532,13 +539,42 @@ float pmecorrF(float z2) return polyFN0*polyFD0; } -/*! Final j-force reduction; this generic implementation works with - * arbitrary array sizes. - */ +#if REDUCE_SHUFFLE +gmx_opencl_inline +void reduce_force_j_shfl(float3 fin, __global float *fout, + int tidxi, int tidxj, int aidx) +{ + /* Only does reduction over 4 elements in cluster. Needs to be changed + * for CL_SIZE>4. 
See CUDA code for required code */ + fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1); + fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1); + fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1); + if (tidxi & 1 == 1) + { + fin.x = fin.y; + } + fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2); + fin.z += intel_sub_group_shuffle_up (fin.z, fin.z, 2); + if (tidxi == 2) + { + fin.x = fin.z; + } + if (tidxi < 3) + { + atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x); + } +} +#endif + gmx_opencl_inline -void reduce_force_j_generic(__local float *f_buf, __global float *fout, +void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout, int tidxi, int tidxj, int aidx) { + int tidx = tidxi + tidxj*CL_SIZE; + f_buf[ tidx] = fcj_buf.x; + f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y; + f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z; + /* Split the reduction between the first 3 column threads Threads with column id 0 will do the reduction for (float3).x components Threads with column id 1 will do the reduction for (float3).y components @@ -556,102 +592,185 @@ void reduce_force_j_generic(__local float *f_buf, __global float *fout, } } -/*! Final i-force reduction; this generic implementation works with - * arbitrary array sizes. +/*! Final j-force reduction */ gmx_opencl_inline -void reduce_force_i_generic(__local float *f_buf, __global float *fout, - float *fshift_buf, bool bCalcFshift, - int tidxi, int tidxj, int aidx) +void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout, + int tidxi, int tidxj, int aidx) +{ +#if REDUCE_SHUFFLE + reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx); +#else + reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx); +#endif +} + +#if REDUCE_SHUFFLE +gmx_opencl_inline +void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout, + bool bCalcFshift, int tidxi, int tidxj, + int sci, int shift, __global float *fshift) { - /* Split the reduction between the first 3 line threads - Threads with line id 0 will do the reduction for (float3).x components - Threads with line id 1 will do the reduction for (float3).y components - Threads with line id 2 will do the reduction for (float3).z components. */ - if (tidxj < 3) + /* Only does reduction over 4 elements in cluster (2 per warp). 
Needs to be changed + * for CL_SIZE>4.*/ + float2 fshift_buf = 0; + for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) { - float f = 0.0f; - for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE) + int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; + float3 fin = fci_buf[ci_offset]; + fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE); + fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, CL_SIZE); + fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE); + + if (tidxj & 1) { - f += f_buf[tidxj * FBUF_STRIDE + j]; + fin.x = fin.y; } - - atomicAdd_g_f(&fout[3 * aidx + tidxj], f); - + /* Threads 0,1 and 2,3 increment x,y for their warp */ + atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x); if (bCalcFshift) { - (*fshift_buf) += f; + fshift_buf[0] += fin.x; + } + /* Threads 0 and 2 increment z for their warp */ + if ((tidxj & 1) == 0) + { + atomicAdd_g_f(&fout[3*aidx+2], fin.z); + if (bCalcFshift) + { + fshift_buf[1] += fin.z; + } + } + } + /* add up local shift forces into global mem */ + if (bCalcFshift) + { + //Threads 0,1 and 2,3 update x,y + atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]); + //Threads 0 and 2 update z + if ((tidxj & 1) == 0) + { + atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]); } } - barrier(CLK_LOCAL_MEM_FENCE); } +#endif /*! Final i-force reduction; this implementation works only with power of two * array sizes. */ gmx_opencl_inline -void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout, - float *fshift_buf, bool bCalcFshift, - int tidxi, int tidxj, int aidx) +void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf, + __global float *fout, + bool bCalcFshift, + int tidxi, int tidxj, + int sci, int shift, __global float *fshift) { - int i, j; - /* Reduce the initial CL_SIZE values for each i atom to half - * every step by using CL_SIZE * i threads. - * Can't just use i as loop variable because than nvcc refuses to unroll. - */ - i = CL_SIZE/2; - for (j = CL_SIZE_LOG2 - 1; j > 0; j--) + float fshift_buf = 0; + for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) { - if (tidxj < i) + int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; + int tidx = tidxi + tidxj*CL_SIZE; + /* store i forces in shmem */ + f_buf[ tidx] = fci_buf[ci_offset].x; + f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y; + f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z; + barrier(CLK_LOCAL_MEM_FENCE); + + int i, j; + /* Reduce the initial CL_SIZE values for each i atom to half + * every step by using CL_SIZE * i threads. + * Can't just use i as loop variable because than nvcc refuses to unroll. 
+ */ + i = CL_SIZE/2; + for (j = CL_SIZE_LOG2 - 1; j > 0; j--) { + if (tidxj < i) + { + + f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi]; + f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; + f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; + } + i >>= 1; + } + /* needed because + * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp + * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call + * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd + */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* i == 1, last reduction step, writing to global mem */ + /* Split the reduction between the first 3 line threads + Threads with line id 0 will do the reduction for (float3).x components + Threads with line id 1 will do the reduction for (float3).y components + Threads with line id 2 will do the reduction for (float3).z components. */ + if (tidxj < 3) + { + float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi]; - f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi]; - f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; - f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; + atomicAdd_g_f(&fout[3 * aidx + tidxj], f); + + if (bCalcFshift) + { + fshift_buf += f; + } } - i >>= 1; } - /* needed because - * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp - * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call - */ - barrier(CLK_LOCAL_MEM_FENCE); - - /* i == 1, last reduction step, writing to global mem */ - /* Split the reduction between the first 3 line threads - Threads with line id 0 will do the reduction for (float3).x components - Threads with line id 1 will do the reduction for (float3).y components - Threads with line id 2 will do the reduction for (float3).z components. */ - if (tidxj < 3) + /* add up local shift forces into global mem */ + if (bCalcFshift) { - float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi]; - - atomicAdd_g_f(&fout[3 * aidx + tidxj], f); - - if (bCalcFshift) + /* Only threads with tidxj < 3 will update fshift. + The threads performing the update, must be the same as the threads + storing the reduction result above. + */ + if (tidxj < 3) { - (*fshift_buf) += f; + atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf); } } } -/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending - * on whether the size of the array to be reduced is power of two or not. +/*! 
Final i-force reduction */ gmx_opencl_inline -void reduce_force_i(__local float *f_buf, __global float *f, - float *fshift_buf, bool bCalcFshift, - int tidxi, int tidxj, int ai) +void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f, + bool bCalcFshift, int tidxi, int tidxj, int sci, + int shift, __global float *fshift) { - if ((CL_SIZE & (CL_SIZE - 1))) - { - reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai); - } - else +#if REDUCE_SHUFFLE + reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, + sci, shift, fshift); +#else + reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, + sci, shift, fshift); +#endif +} + + + +#if REDUCE_SHUFFLE +gmx_opencl_inline +void reduce_energy_shfl(float E_lj, float E_el, + __global float *e_lj, + __global float *e_el, + unsigned int tidx) +{ + E_lj = sub_group_reduce_add(E_lj); + E_el = sub_group_reduce_add(E_el); + /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver. + * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair + * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or + * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed + * (by supporting c_nbnxnGpuClusterpairSplit=1). */ + if (tidx == 0 || tidx == WARP_SIZE) { - reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai); + atomicAdd_g_f(e_lj, E_lj); + atomicAdd_g_f(e_el, E_el); } } +#endif /*! Energy reduction; this implementation works only with power of two * array sizes. @@ -689,4 +808,21 @@ void reduce_energy_pow2(volatile __local float *buf, } } +gmx_opencl_inline +void reduce_energy(volatile __local float *buf, + float E_lj, float E_el, + volatile __global float *e_lj, + volatile __global float *e_el, + unsigned int tidx) +{ +#if REDUCE_SHUFFLE + reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx); +#else + /* flush the energies to shmem and reduce them */ + buf[ tidx] = E_lj; + buf[FBUF_STRIDE + tidx] = E_el; + reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE); +#endif +} + #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */ -- 2.22.0
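
Appendix (illustrative only, not part of the patch): the shuffle folding introduced in reduce_force_j_shfl can be read in isolation as the minimal OpenCL sketch below. The kernel name, buffer arguments, and atom indexing are hypothetical; only the intel_sub_group_shuffle_up/down pattern that folds the x/y/z components of a 4-lane cluster mirrors the patch. It assumes CL_SIZE == 4, a work-group holding a single cluster on consecutive sub-group lanes, and the cl_intel_subgroups extension; the real kernel accumulates into global memory with atomicAdd_g_f, while a plain store keeps the sketch standalone.

/* Minimal sketch of the reduce_force_j_shfl folding pattern.
 * Hypothetical kernel/buffer names; assumes CL_SIZE == 4 and cl_intel_subgroups. */
#pragma OPENCL EXTENSION cl_intel_subgroups : enable

__attribute__((reqd_work_group_size(4, 1, 1)))
__kernel void sketch_reduce_force_j(__global const float4 *fin_xyz, /* per-lane partial forces */
                                    __global float        *fout)    /* 3 floats per cluster    */
{
    const int lane = (int)get_local_id(0);   /* position within the 4-lane cluster   */
    const int aidx = (int)get_group_id(0);   /* hypothetical j-atom index            */
    float3    fin  = fin_xyz[get_global_id(0)].xyz;

    /* Fold lane pairs: x and z move down, y moves up, so after this step the
     * even lanes hold 2-lane partial sums of x and z, the odd lanes of y. */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((lane & 1) == 1)
    {
        fin.x = fin.y;                       /* odd lanes now carry the y sum */
    }

    /* Fold across the remaining stride of 2; lane 2 picks up the z sum.
     * Lane 3 (and anything read from outside the cluster) ends up unused. */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
    if (lane == 2)
    {
        fin.x = fin.z;
    }

    /* Lanes 0, 1 and 2 now hold the cluster-wide x, y and z sums. */
    if (lane < 3)
    {
        fout[3 * aidx + lane] = fin.x;
    }
}

After the two folding steps only lanes 0, 1 and 2 hold meaningful data (the summed x, y and z), which is why the patched kernel issues its atomic adds only from threads with tidxi < 3.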