Thus, if more strings need to be appended, a new macro must be written, or they must be appended directly here.
*/
__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#if REDUCE_SHUFFLE
+__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) // A size of 2*WARP_SIZE could also be enabled; see the comment in reduce_energy_shfl
+#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
#endif
unsigned int wexcl, imask, mask_ji;
float4 xqbuf;
- float3 xi, xj, rv, f_ij, fcj_buf /*, fshift_buf*/;
- float fshift_buf;
+ float3 xi, xj, rv, f_ij, fcj_buf;
float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
nbnxn_sci_t nb_sci;
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
-#define LOCAL_OFFSET xqib + NCL_PER_SUPERCL * CL_SIZE
+#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
__local int *cjs;
#if USE_CJ_PREFETCH
/* shmem buffer for cj, for both warps separately */
#endif
#endif
-#ifndef REDUCE_SHUFFLE
+#if !REDUCE_SHUFFLE
/* shmem j force buffer */
__local float *f_buf = (__local float *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
- #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+ #define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3)
+#else
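+    /* With REDUCE_SHUFFLE the shared-memory force buffer is not needed; the
+     * null pointer only keeps the reduce_* call signatures uniform. */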
+ __local float *f_buf = 0;
#endif
/* Local buffer used to implement the __any warp vote function from CUDA.
volatile is used to avoid compiler optimizations for AMD builds. */
imask = pl_cj4[j4].imei[widx].imask;
wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
- preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask);
+ preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
#ifndef PRUNE_NBL
if (imask)
{
if (imask & mask_ji)
{
- ci_offset = i; /* i force buffer offset */
-
ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
ai = ci * CL_SIZE + tidxi; /* i atom index */
fcj_buf -= f_ij;
/* accumulate i forces in registers */
- fci_buf[ci_offset] += f_ij;
+ fci_buf[i] += f_ij;
}
}
}
/* reduce j forces */
-
- /* store j forces in shmem */
- f_buf[ tidx] = fcj_buf.x;
- f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y;
- f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
-
- reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+ reduce_force_j(f_buf, fcj_buf, f, tidxi, tidxj, aj);
}
}
#ifdef PRUNE_NBL
/* skip central shifts when summing shift forces */
if (nb_sci.shift == CENTRAL)
{
- bCalcFshift = false;
+ bCalcFshift = 0;
}
-
- fshift_buf = 0.0f;
-
/* reduce i forces */
- for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
- {
- ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
-
- f_buf[ tidx] = fci_buf[ci_offset].x;
- f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
- f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
- barrier(CLK_LOCAL_MEM_FENCE);
- reduce_force_i(f_buf, f,
- &fshift_buf, bCalcFshift,
- tidxi, tidxj, ai);
- }
-
- /* add up local shift forces into global mem */
- if (bCalcFshift)
- {
- /* Only threads with tidxj < 3 will update fshift.
- The threads performing the update must be the same with the threads
- which stored the reduction result in reduce_force_i function
- */
- if (tidxj < 3)
- {
- atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
- }
- }
+ reduce_force_i_and_shift(f_buf, fci_buf, f, bCalcFshift != 0, tidxi, tidxj, sci,
+ nb_sci.shift, fshift);
#ifdef CALC_ENERGIES
- /* flush the energies to shmem and reduce them */
- f_buf[ tidx] = E_lj;
- f_buf[FBUF_STRIDE + tidx] = E_el;
- reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+ reduce_energy(f_buf, E_lj, E_el, e_lj, e_el, tidx);
#endif
}
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
-#define WARP_SIZE (CL_SIZE*CL_SIZE/2)
+#define WARP_SIZE (CL_SIZE*CL_SIZE/2) // Currently only c_nbnxnGpuClusterpairSplit=2 is supported
+
+/* Nvidia and AMD don't support any subgroup extension (neither OpenCL 2.1 core
+   subgroups nor cl_khr_subgroups).
+   The shuffle code doesn't support CL_SIZE=8.
+   cl_intel_required_subgroup_size is required for intel_reqd_sub_group_size.
+   cl_intel_subgroups is required for intel_sub_group_shuffle_up/down.
+ */
+#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4
+#define REDUCE_SHUFFLE 1
+#else
+#define REDUCE_SHUFFLE 0
+#endif
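+/* For example, with CL_SIZE = 4 a warp (subgroup) is 4*4/2 = 8 work-items wide,
+   so each 4x4 i-j cluster pair is processed by two 8-wide subgroups; the
+   *_shfl reductions below are hard-coded for this layout. */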
/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756f
#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
#define NBNXN_OPENCL_KERNEL_UTILS_CLH
-__constant sampler_t generic_sampler = (CLK_NORMALIZED_COORDS_FALSE /* Natural coords */
- | CLK_ADDRESS_NONE /* No clamp/repeat*/
- | CLK_FILTER_NEAREST); /* No interpolation */
-
#if CL_SIZE == 8
#define WARP_SIZE_LOG2 (5)
#define CL_SIZE_LOG2 (3)
/* Subtract the grid force from the total LJ force */
*F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
- if (with_E_lj == true)
+ if (with_E_lj)
{
float sh_mask;
return polyFN0*polyFD0;
}
-/*! Final j-force reduction; this generic implementation works with
- * arbitrary array sizes.
- */
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_j_shfl(float3 fin, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+    /* Only performs the reduction over 4 elements in the cluster. Needs to be
+     * changed for CL_SIZE>4; see the CUDA code for the required changes. */
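+    /* Data flow for the 4-wide case: after the first shuffle step the even
+     * tidxi lanes hold pairwise x and z sums and the odd lanes hold pairwise
+     * y sums (copied into .x below); after the second step tidxi 0, 1 and 2
+     * hold the full x, y and z sums, so only the tidxi < 3 lanes write to
+     * global memory. */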
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
+ fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1);
+ fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
+    if ((tidxi & 1) == 1)
+ {
+ fin.x = fin.y;
+ }
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
+ fin.z += intel_sub_group_shuffle_up (fin.z, fin.z, 2);
+ if (tidxi == 2)
+ {
+ fin.x = fin.z;
+ }
+ if (tidxi < 3)
+ {
+ atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
+ }
+}
+#endif
+
gmx_opencl_inline
-void reduce_force_j_generic(__local float *f_buf, __global float *fout,
+void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
int tidxi, int tidxj, int aidx)
{
+ int tidx = tidxi + tidxj*CL_SIZE;
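+    /* store j forces in shmem */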
+ f_buf[ tidx] = fcj_buf.x;
+ f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
/* Split the reduction between the first 3 column threads
Threads with column id 0 will do the reduction for (float3).x components
Threads with column id 1 will do the reduction for (float3).y components
}
}
-/*! Final i-force reduction; this generic implementation works with
- * arbitrary array sizes.
+/*! Final j-force reduction; dispatches to the subgroup-shuffle or generic implementation
*/
gmx_opencl_inline
-void reduce_force_i_generic(__local float *f_buf, __global float *fout,
- float *fshift_buf, bool bCalcFshift,
- int tidxi, int tidxj, int aidx)
+void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+#if REDUCE_SHUFFLE
+ reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
+#else
+ reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
+#endif
+}
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
+ bool bCalcFshift, int tidxi, int tidxj,
+ int sci, int shift, __global float *fshift)
{
- /* Split the reduction between the first 3 line threads
- Threads with line id 0 will do the reduction for (float3).x components
- Threads with line id 1 will do the reduction for (float3).y components
- Threads with line id 2 will do the reduction for (float3).z components. */
- if (tidxj < 3)
+    /* Only performs the reduction over 4 elements in the cluster (2 per warp).
+     * Needs to be changed for CL_SIZE>4. */
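+    /* Each subgroup covers two tidxj rows; the CL_SIZE-stride shuffles below
+     * sum over those two rows, and the other subgroup adds the remaining rows
+     * through the same atomic adds, so no inter-warp reduction is needed. */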
+ float2 fshift_buf = 0;
+ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
{
- float f = 0.0f;
- for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
+ int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ float3 fin = fci_buf[ci_offset];
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
+ fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, CL_SIZE);
+ fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
+
+ if (tidxj & 1)
{
- f += f_buf[tidxj * FBUF_STRIDE + j];
+ fin.x = fin.y;
}
-
- atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
-
+        /* In each warp, the row with even tidxj adds x and the row with odd tidxj adds y */
+ atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
if (bCalcFshift)
{
- (*fshift_buf) += f;
+ fshift_buf[0] += fin.x;
+ }
+        /* In each warp, the row with even tidxj also adds z */
+ if ((tidxj & 1) == 0)
+ {
+ atomicAdd_g_f(&fout[3*aidx+2], fin.z);
+ if (bCalcFshift)
+ {
+ fshift_buf[1] += fin.z;
+ }
+ }
+ }
+ /* add up local shift forces into global mem */
+ if (bCalcFshift)
+ {
+        // In each warp, the row with even tidxj updates fshift x and the odd row updates y
+ atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
+        // In each warp, the row with even tidxj also updates fshift z
+ if ((tidxj & 1) == 0)
+ {
+ atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
}
}
- barrier(CLK_LOCAL_MEM_FENCE);
}
+#endif
/*! Final i-force and shift-force reduction; this implementation works only with
 * power of two array sizes.
*/
gmx_opencl_inline
-void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
- float *fshift_buf, bool bCalcFshift,
- int tidxi, int tidxj, int aidx)
+void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
+ __global float *fout,
+ bool bCalcFshift,
+ int tidxi, int tidxj,
+ int sci, int shift, __global float *fshift)
{
- int i, j;
- /* Reduce the initial CL_SIZE values for each i atom to half
- * every step by using CL_SIZE * i threads.
- * Can't just use i as loop variable because than nvcc refuses to unroll.
- */
- i = CL_SIZE/2;
- for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+ float fshift_buf = 0;
+ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
{
- if (tidxj < i)
+ int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int tidx = tidxi + tidxj*CL_SIZE;
+ /* store i forces in shmem */
+ f_buf[ tidx] = fci_buf[ci_offset].x;
+ f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int i, j;
+ /* Reduce the initial CL_SIZE values for each i atom to half
+ * every step by using CL_SIZE * i threads.
+         * Can't just use i as loop variable because then nvcc refuses to unroll.
+ */
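+        /* E.g. for CL_SIZE = 8 this loop runs with i = 4 and 2; the final
+         * i == 1 step is folded into the global-memory write below. */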
+ i = CL_SIZE/2;
+ for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
{
+ if (tidxj < i)
+ {
+
+ f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ }
+ i >>= 1;
+ }
+        /* This barrier is needed because
+         * a) for CL_SIZE<8: id 2 (doing z in the next block) is in the 2nd warp
+         * b) for all CL_SIZE a barrier is needed before f_buf is reused in the next loop iteration
+         * TODO: Test on Nvidia for a performance difference between having the barrier here or after the atomicAdd
+         */
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* i == 1, last reduction step, writing to global mem */
+ /* Split the reduction between the first 3 line threads
+ Threads with line id 0 will do the reduction for (float3).x components
+ Threads with line id 1 will do the reduction for (float3).y components
+ Threads with line id 2 will do the reduction for (float3).z components. */
+ if (tidxj < 3)
+ {
+ float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
- f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
- f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
- f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+ if (bCalcFshift)
+ {
+ fshift_buf += f;
+ }
}
- i >>= 1;
}
- /* needed because
- * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
- * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
- */
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* i == 1, last reduction step, writing to global mem */
- /* Split the reduction between the first 3 line threads
- Threads with line id 0 will do the reduction for (float3).x components
- Threads with line id 1 will do the reduction for (float3).y components
- Threads with line id 2 will do the reduction for (float3).z components. */
- if (tidxj < 3)
+ /* add up local shift forces into global mem */
+ if (bCalcFshift)
{
- float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
-
- atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
-
- if (bCalcFshift)
+ /* Only threads with tidxj < 3 will update fshift.
+           The threads performing the update must be the same as the threads
+ storing the reduction result above.
+ */
+ if (tidxj < 3)
{
- (*fshift_buf) += f;
+ atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
}
}
}
-/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
- * on whether the size of the array to be reduced is power of two or not.
+/*! Final i-force and shift-force reduction; dispatches to the subgroup-shuffle or pow2 implementation
*/
gmx_opencl_inline
-void reduce_force_i(__local float *f_buf, __global float *f,
- float *fshift_buf, bool bCalcFshift,
- int tidxi, int tidxj, int ai)
+void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f,
+ bool bCalcFshift, int tidxi, int tidxj, int sci,
+ int shift, __global float *fshift)
{
- if ((CL_SIZE & (CL_SIZE - 1)))
- {
- reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
- }
- else
+#if REDUCE_SHUFFLE
+ reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
+ sci, shift, fshift);
+#else
+ reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
+ sci, shift, fshift);
+#endif
+}
+
+
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_energy_shfl(float E_lj, float E_el,
+ __global float *e_lj,
+ __global float *e_el,
+ unsigned int tidx)
+{
+ E_lj = sub_group_reduce_add(E_lj);
+ E_el = sub_group_reduce_add(E_el);
+    /* This should be get_sub_group_local_id() == 0, but that doesn't work with the
+     * Intel Classic driver.
+     * To make it possible to use REDUCE_SHUFFLE with a single subgroup per i-j pair
+     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
+     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
+     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
+ if (tidx == 0 || tidx == WARP_SIZE)
{
- reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ atomicAdd_g_f(e_lj, E_lj);
+ atomicAdd_g_f(e_el, E_el);
}
}
+#endif
/*! Energy reduction; this implementation works only with power of two
* array sizes.
}
}
+gmx_opencl_inline
+void reduce_energy(volatile __local float *buf,
+ float E_lj, float E_el,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+#if REDUCE_SHUFFLE
+ reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
+#else
+ /* flush the energies to shmem and reduce them */
+ buf[ tidx] = E_lj;
+ buf[FBUF_STRIDE + tidx] = E_el;
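+    /* WARP_SIZE is a power of two, so tidx & WARP_SIZE selects each warp's
+     * half of the buffer and tidx & ~WARP_SIZE is the lane index within the
+     * warp; each warp reduces its own energies independently. */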
+ reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+#endif
+}
+
#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */