-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
- * Note that this should precede the kernel_utils include.
- */
-#if defined _AMD_SOURCE_ || defined _NVIDIA_SOURCE_
-#define USE_CJ_PREFETCH 1
-#else
-#define USE_CJ_PREFETCH 0
-#endif
-
#include "nbnxn_ocl_kernel_utils.clh"
/////////////////////////////////////////////////////////////////////////////////////////////////
Thus, if more strings need to be appended, a new macro must be written or it must be appended directly here.
*/
__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
-#if REDUCE_SHUFFLE
-__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl
+#ifdef cl_intel_required_subgroup_size
+__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE)))
#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
- __local int *cjs;
+ CjType cjs;
#if USE_CJ_PREFETCH
/* shmem buffer for cj, for both warps separately */
cjs = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
#define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
-#endif
+#endif //USE_CJ_PREFETCH
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
#else
__local float *f_buf = 0;
#endif
+#if !USE_SUBGROUP_ANY
/* Local buffer used to implement __any warp vote function from CUDA.
volatile is used to avoid compiler optimizations for AMD builds. */
volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#else
+ __local uint *warp_any = 0;
+#endif
#undef LOCAL_OFFSET
nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */
#endif
#endif
}
+#if !USE_SUBGROUP_ANY
/* Initialise the warp vote; the 8x8 thread block maps to 2 warps on NVIDIA. */
if (tidx == 0 || tidx == WARP_SIZE)
{
warp_any[widx] = 0;
}
-
+#endif
barrier(CLK_LOCAL_MEM_FENCE);
for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
imask = pl_cj4[j4].imei[widx].imask;
wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
- preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
+ preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
#ifndef PRUNE_NBL
if (imask)
#if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_
#pragma unroll 4
#endif
-
for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
r2 = norm2(rv);
#ifdef PRUNE_NBL
- /* vote.. should code shmem serialisation, wonder what the hit will be */
- if (r2 < rlist_sq)
- {
- warp_any[widx] = 1;
- }
-
- /* If _none_ of the atoms pairs are in cutoff range,
- the bit corresponding to the current
- cluster-pair in imask gets set to 0. */
- if (!warp_any[widx])
+                    /* If _none_ of the atom pairs are in cutoff range, the bit
+                       corresponding to the current cluster-pair in imask gets set to 0. */
+                    if (!gmx_sub_group_any(warp_any, widx, r2 < rlist_sq))
{
imask &= ~mask_ji;
}
-
- warp_any[widx] = 0;
#endif
int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
* \ingroup module_mdlib
*/
-#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
- * Note that this should precede the kernel_utils include.
- */
-#define USE_CJ_PREFETCH 1
-#else
-#define USE_CJ_PREFETCH 0
-#endif
-
#include "nbnxn_ocl_kernel_utils.clh"
/* Note: the AMD compiler testing was done with (fglrx 15.12) performs best with wg
#define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize)
/* shmem buffer for i cj pre-loading */
- __local int *cjs;
+ CjType cjs;
#if USE_CJ_PREFETCH
cjs = (((__local int *)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize);
#undef LOCAL_OFFSET
/* Offset calculated using xib because cjs depends on tidxz! */
#define LOCAL_OFFSET (((__local int *)(xib + c_numClPerSupercl * c_clSize)) + (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize))
#endif
-
+#if !USE_SUBGROUP_ANY
/* Local buffer used to implement __any warp vote function from CUDA.
volatile is used to avoid compiler optimizations for AMD builds. */
- volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
- #undef LOCAL_OFFSET
-
- unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx;
+ volatile __local uint *const warp_any = (__local uint*)(LOCAL_OFFSET);
+ const unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx;
/* Initialise warp vote.*/
if (tidx == 0 || tidx == 32)
{
warp_any[warpVoteSlot] = 0;
}
+#else
+ __local uint *const warp_any = 0;
+ const unsigned int warpVoteSlot = 0;
+#endif
+ #undef LOCAL_OFFSET
nbnxn_sci_t nb_sci = pl_sci[bidx*numParts + part]; /* my i super-cluster's index = sciOffset + current bidx * numParts + part */
int sci = nb_sci.sci; /* super-cluster */
imaskCheck = (imaskNew ^ imaskFull);
}
- preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
+ preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
if (imaskCheck)
{
            /* If _none_ of the atom pairs are in cutoff range,
               the bit corresponding to the current
               cluster-pair in imask gets set to 0. */
- if (r2 < rlistOuter_sq)
- {
- warp_any[warpVoteSlot] = 1;
- }
- if (!warp_any[warpVoteSlot])
+ if (!gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistOuter_sq))
{
imaskFull &= ~mask_ji;
}
- warp_any[warpVoteSlot] = 0;
}
/* If any atom pair is within range, set the bit
corresponding to the current cluster-pair. */
- if (r2 < rlistInner_sq)
- {
- warp_any[warpVoteSlot] = 1;
- }
- if (warp_any[warpVoteSlot])
+ if (gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistInner_sq))
{
imaskNew |= mask_ji;
}
- warp_any[warpVoteSlot] = 0;
}
/* shift the mask bit by 1 */
#define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
-/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups).
- Code doesn't support CL_SIZE=8.
- cl_intel_required_subgroup_size required for intel_reqd_sub_group_size.
- cl_intel_subgroups required for intel_sub_group_shuffle_up/down.
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors. */
-#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4
-#define REDUCE_SHUFFLE 1
+#define USE_CJ_PREFETCH 1
#else
-#define REDUCE_SHUFFLE 0
+#define USE_CJ_PREFETCH 0
#endif
+#if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210)
+#define HAVE_SUBGROUP 1
+#else
+#define HAVE_SUBGROUP 0
+#endif
+
+#ifdef cl_intel_subgroups
+#define HAVE_INTEL_SUBGROUP 1
+#else
+#define HAVE_INTEL_SUBGROUP 0
+#endif
+
+#if defined _INTEL_SOURCE_
+#define SUBGROUP_SIZE 8
+#elif defined _AMD_SOURCE_
+#define SUBGROUP_SIZE 64
+#else
+#define SUBGROUP_SIZE 32
+#endif
+
+#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
+
/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756f
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+gmx_opencl_inline
+void preloadCj4Generic(__local int *sm_cjPreload,
+ const __global int *gm_cj,
+ int tidxi,
+ int tidxj,
+ bool iMaskCond)
+{
+ /* Pre-load cj into shared memory */
+#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+    if ((tidxj == 0) & (tidxi < NBNXN_GPU_JGROUP_SIZE))
+ {
+ sm_cjPreload[tidxi] = gm_cj[tidxi];
+ }
+#else
+ const int c_clSize = CL_SIZE;
+ const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
+ const int c_nbnxnGpuClusterpairSplit = 2;
+ const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+
+ if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
+ {
+ sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
+ }
+#endif
+}
+
+
+#if USE_SUBGROUP_PRELOAD
+gmx_opencl_inline
+int preloadCj4Subgroup(const __global int *gm_cj)
+{
+    // Loads a subgroup-size number of elements (8) instead of the 4 required;
+    // effectively equivalent to *cjs = *gm_cj.
+ return intel_sub_group_block_read((const __global uint *)gm_cj);
+}
+#endif //USE_SUBGROUP_PRELOAD
+
+#if USE_SUBGROUP_PRELOAD
+typedef int CjType;
+#else
+typedef __local int* CjType;
+#endif
+
-/*! \brief Preload cj4 into local memory.
+/*! \brief Preload cj4
*
* - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
* - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
- * - Same as AMD in the nowarp kernel; we do not assume execution width and therefore
- * the caller needs to sync.
+ * - For Intel (USE_SUBGROUP_PRELOAD) we load into private memory (a register)
+ *   instead of local memory
*
* It is the caller's responsibility to make sure that data is consumed only when
* it's ready. This function does not call a barrier.
*/
gmx_opencl_inline
-void preloadCj4(__local int *sm_cjPreload,
+void preloadCj4(CjType *cjs,
const __global int *gm_cj,
int tidxi,
int tidxj,
bool iMaskCond)
-
{
-#if !USE_CJ_PREFETCH
- return;
+#if USE_SUBGROUP_PRELOAD
+ *cjs = preloadCj4Subgroup(gm_cj);
+#elif USE_CJ_PREFETCH
+ preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
+#else
+ //nothing to do
#endif
+}
+
+//! Reads a preloaded cj from the local-memory buffer of the current warp/half-warp
+gmx_opencl_inline
+int loadCjPreload(__local int* sm_cjPreload,
+ int jm,
+ int tidxi,
+ int tidxj)
+{
+#if defined _AMD_SOURCE_
+ int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+#else
const int c_clSize = CL_SIZE;
const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
- /* Pre-load cj into shared memory */
-#if defined _NVIDIA_SOURCE_
- /* on both warps separately for NVIDIA */
- if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
- {
- sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
- }
-#else // AMD or nowarp
- /* Note that with "nowarp" / on hardware with wavefronts <64 a barrier is needed after preload. */
- if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
- {
- sm_cjPreload[tidxi] = gm_cj[tidxi];
- }
+ int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
#endif
+ return sm_cjPreload[jm + warpLoadOffset];
}
/* \brief Load a cj given a jm index.
* If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
*/
gmx_opencl_inline
-int loadCj(__local int *sm_cjPreload,
- const __global int *gm_cj,
- int jm,
- int tidxi,
- int tidxj)
+int loadCj(CjType cjs, const __global int *gm_cj,
+ int jm, int tidxi, int tidxj)
{
- const int c_clSize = CL_SIZE;
- const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE;
- const int c_nbnxnGpuClusterpairSplit = 2;
- const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
-
-#if USE_CJ_PREFETCH
-#if defined _NVIDIA_SOURCE_
- int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
-#elif defined _AMD_SOURCE_
- int warpLoadOffset = 0;
-#else
-#error Not supported
-#endif
- return sm_cjPreload[jm + warpLoadOffset];
+#if USE_SUBGROUP_PRELOAD
+ return sub_group_broadcast(cjs, jm);
+#elif USE_CJ_PREFETCH
+ return loadCjPreload(cjs, jm, tidxi, tidxj);
#else
return gm_cj[jm];
#endif
}
-
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
gmx_opencl_inline
void convert_sigma_epsilon_to_c6_c12(const float sigma,
fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1);
fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
- if (tidxi & 1 == 1)
+ if ((tidxi & 1) == 1)
{
fin.x = fin.y;
}
#if REDUCE_SHUFFLE
gmx_opencl_inline
void reduce_energy_shfl(float E_lj, float E_el,
- __global float *e_lj,
- __global float *e_el,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
unsigned int tidx)
{
E_lj = sub_group_reduce_add(E_lj);
#endif
}
+//! Local-memory emulation of sub_group_any(); relies on lockstep execution within a warp
+gmx_opencl_inline
+bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
+{
+ if (pred)
+ {
+ warp_any[widx] = 1;
+ }
+
+ bool ret = warp_any[widx];
+
+ warp_any[widx] = 0;
+
+ return ret;
+}
+
+//! Returns true if the predicate is true for any work item in the warp
+bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred)
+{
+#if USE_SUBGROUP_ANY
+ return sub_group_any(pred);
+#else
+ return gmx_sub_group_any_localmem(warp_any, widx, pred);
+#endif
+}
+
#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */