From: Roland Schulz Date: Sun, 29 Apr 2018 21:19:26 +0000 (-0700) Subject: Use subgroup for warp_any and CJ4 prefetch X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=1df76148809fed6687dfc66ce887e442c882b767;p=alexxy%2Fgromacs.git Use subgroup for warp_any and CJ4 prefetch Change-Id: I548d669d0125084d3b6533ad072e758ff9fe5cc1 --- diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh index eb7e43fc60..e58efc6121 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh @@ -46,12 +46,6 @@ /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel * Note that this should precede the kernel_utils include. */ -#if defined _AMD_SOURCE_ || defined _NVIDIA_SOURCE_ -#define USE_CJ_PREFETCH 1 -#else -#define USE_CJ_PREFETCH 0 -#endif - #include "nbnxn_ocl_kernel_utils.clh" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -98,8 +92,8 @@ Thus if more strings need to be appended a new macro must be written or it must be directly appended here. */ __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1))) -#if REDUCE_SHUFFLE -__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl +#ifdef cl_intel_required_subgroup_size +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) #endif #ifdef PRUNE_NBL #ifdef CALC_ENERGIES @@ -220,13 +214,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U); #define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE) - __local int *cjs; + CjType cjs; #if USE_CJ_PREFETCH /* shmem buffer for cj, for both warps separately */ cjs = (__local int *)(LOCAL_OFFSET); #undef LOCAL_OFFSET #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE -#endif +#endif //USE_CJ_PREFETCH #ifdef IATYPE_SHMEM #ifndef LJ_COMB @@ -249,9 +243,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #else __local float *f_buf = 0; #endif +#if !USE_SUBGROUP_ANY /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. */ volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET); +#else + __local uint *warp_any = 0; +#endif #undef LOCAL_OFFSET nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */ @@ -277,12 +275,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif #endif } +#if !USE_SUBGROUP_ANY /* Initialise warp vote. (8x8 block) 2 warps for nvidia */ if (tidx == 0 || tidx == WARP_SIZE) { warp_any[widx] = 0; } - +#endif barrier(CLK_LOCAL_MEM_FENCE); for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) @@ -347,7 +346,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) imask = pl_cj4[j4].imei[widx].imask; wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)]; - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); + preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); #ifndef PRUNE_NBL if (imask) @@ -363,7 +362,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_ #pragma unroll 4 #endif - for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++) { if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL))) @@ -404,21 +402,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) r2 = norm2(rv); #ifdef PRUNE_NBL - /* vote.. 
should code shmem serialisation, wonder what the hit will be */ - if (r2 < rlist_sq) - { - warp_any[widx] = 1; - } - - /* If _none_ of the atoms pairs are in cutoff range, - the bit corresponding to the current - cluster-pair in imask gets set to 0. */ - if (!warp_any[widx]) + if (!gmx_sub_group_any(warp_any, widx, r2 < rlist_sq)) { imask &= ~mask_ji; } - - warp_any[widx] = 0; #endif int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f; diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh index 7d31a58e53..4c253f7992 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh @@ -42,15 +42,6 @@ * \ingroup module_mdlib */ -#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_ -/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors - * Note that this should precede the kernel_utils include. - */ -#define USE_CJ_PREFETCH 1 -#else -#define USE_CJ_PREFETCH 0 -#endif - #include "nbnxn_ocl_kernel_utils.clh" /* Note: the AMD compiler testing was done with (fglrx 15.12) performs best with wg @@ -123,25 +114,28 @@ __kernel void nbnxn_kernel_prune_rolling_opencl #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize) /* shmem buffer for i cj pre-loading */ - __local int *cjs; + CjType cjs; #if USE_CJ_PREFETCH cjs = (((__local int *)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize); #undef LOCAL_OFFSET /* Offset calculated using xib because cjs depends on on tidxz! */ #define LOCAL_OFFSET (((__local int *)(xib + c_numClPerSupercl * c_clSize)) + (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize)) #endif - +#if !USE_SUBGROUP_ANY /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. */ - volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET); - #undef LOCAL_OFFSET - - unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx; + volatile __local uint *const warp_any = (__local uint*)(LOCAL_OFFSET); + const unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx; /* Initialise warp vote.*/ if (tidx == 0 || tidx == 32) { warp_any[warpVoteSlot] = 0; } +#else + __local uint *const warp_any = 0; + const unsigned int warpVoteSlot = 0; +#endif + #undef LOCAL_OFFSET nbnxn_sci_t nb_sci = pl_sci[bidx*numParts + part]; /* my i super-cluster's index = sciOffset + current bidx * numParts + part */ int sci = nb_sci.sci; /* super-cluster */ @@ -188,7 +182,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl imaskCheck = (imaskNew ^ imaskFull); } - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); + preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); if (imaskCheck) { @@ -223,27 +217,17 @@ __kernel void nbnxn_kernel_prune_rolling_opencl /* If _none_ of the atoms pairs are in cutoff range, the bit corresponding to the current cluster-pair in imask gets set to 0. */ - if (r2 < rlistOuter_sq) - { - warp_any[warpVoteSlot] = 1; - } - if (!warp_any[warpVoteSlot]) + if (!gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistOuter_sq)) { imaskFull &= ~mask_ji; } - warp_any[warpVoteSlot] = 0; } /* If any atom pair is within range, set the bit corresponding to the current cluster-pair. 
*/ - if (r2 < rlistInner_sq) - { - warp_any[warpVoteSlot] = 1; - } - if (warp_any[warpVoteSlot]) + if (gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistInner_sq)) { imaskNew |= mask_ji; } - warp_any[warpVoteSlot] = 0; } /* shift the mask bit by 1 */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh index 88aa20bfc1..4d5e7c9ba2 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh @@ -41,17 +41,39 @@ #define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported -/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups). - Code doesn't support CL_SIZE=8. - cl_intel_required_subgroup_size required for intel_reqd_sub_group_size. - cl_intel_subgroups required for intel_sub_group_shuffle_up/down. +#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_ +/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors + * Note that this should precede the kernel_utils include. */ -#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4 -#define REDUCE_SHUFFLE 1 +#define USE_CJ_PREFETCH 1 #else -#define REDUCE_SHUFFLE 0 +#define USE_CJ_PREFETCH 0 #endif +#if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210) +#define HAVE_SUBGROUP 1 +#else +#define HAVE_SUBGROUP 0 +#endif + +#ifdef cl_intel_subgroups +#define HAVE_INTEL_SUBGROUP 1 +#else +#define HAVE_INTEL_SUBGROUP 0 +#endif + +#if _INTEL_SOURCE_ +#define SUBGROUP_SIZE 8 +#elif _AMD_SOURCE_ +#define SUBGROUP_SIZE 64 +#else +#define SUBGROUP_SIZE 32 +#endif + +#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE) +#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE) +#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP + /* 1.0 / sqrt(M_PI) */ #define M_FLOAT_1_SQRTPI 0.564189583547756f @@ -157,48 +179,92 @@ typedef struct { /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */ __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U); +gmx_opencl_inline +void preloadCj4Generic(__local int *sm_cjPreload, + const __global int *gm_cj, + int tidxi, + int tidxj, + bool iMaskCond) -/*! \brief Preload cj4 into local memory. +{ + /* Pre-load cj into shared memory */ +#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly + if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE) + { + sm_cjPreload[tidxi] = gm_cj[tidxi]; + } +#else + const int c_clSize = CL_SIZE; + const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; + const int c_nbnxnGpuClusterpairSplit = 2; + const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; + + if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize)) + { + sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi]; + } +#endif +} + + +#if USE_SUBGROUP_PRELOAD +gmx_opencl_inline +int preloadCj4Subgroup(const __global int *gm_cj) +{ + //loads subgroup-size # of elements (8) instead of the 4 required + //equivalent to *cjs = *gm_cj + return intel_sub_group_block_read((const __global uint *)gm_cj); +} +#endif //USE_SUBGROUP_PRELOAD + +#if USE_SUBGROUP_PRELOAD +typedef int CjType; +#else +typedef __local int* CjType; +#endif + +/*! 
\brief Preload cj4 * * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z) * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z) - * - Same as AMD in the nowarp kernel; we do not assume execution width and therefore - * the caller needs to sync. + * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory * * It is the caller's responsibility to make sure that data is consumed only when * it's ready. This function does not call a barrier. */ gmx_opencl_inline -void preloadCj4(__local int *sm_cjPreload, +void preloadCj4(CjType *cjs, const __global int *gm_cj, int tidxi, int tidxj, bool iMaskCond) - { -#if !USE_CJ_PREFETCH - return; +#if USE_SUBGROUP_PRELOAD + *cjs = preloadCj4Subgroup(gm_cj); +#elif USE_CJ_PREFETCH + preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond); +#else + //nothing to do #endif +} +gmx_opencl_inline +int loadCjPreload(__local int* sm_cjPreload, + int jm, + int tidxi, + int tidxj) +{ +#if defined _AMD_SOURCE_ + int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly +#else const int c_clSize = CL_SIZE; const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; const int c_nbnxnGpuClusterpairSplit = 2; const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; - /* Pre-load cj into shared memory */ -#if defined _NVIDIA_SOURCE_ - /* on both warps separately for NVIDIA */ - if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize)) - { - sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi]; - } -#else // AMD or nowarp - /* Note that with "nowarp" / on hardware with wavefronts <64 a barrier is needed after preload. */ - if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize) - { - sm_cjPreload[tidxi] = gm_cj[tidxi]; - } + int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize; #endif + return sm_cjPreload[jm + warpLoadOffset]; } /* \brief Load a cj given a jm index. @@ -206,32 +272,18 @@ void preloadCj4(__local int *sm_cjPreload, * If cj4 preloading is enabled, it loads from the local memory, otherwise from global. */ gmx_opencl_inline -int loadCj(__local int *sm_cjPreload, - const __global int *gm_cj, - int jm, - int tidxi, - int tidxj) +int loadCj(CjType cjs, const __global int *gm_cj, + int jm, int tidxi, int tidxj) { - const int c_clSize = CL_SIZE; - const int c_nbnxnGpuJgroupSize = NBNXN_GPU_JGROUP_SIZE; - const int c_nbnxnGpuClusterpairSplit = 2; - const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit; - -#if USE_CJ_PREFETCH -#if defined _NVIDIA_SOURCE_ - int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize; -#elif defined _AMD_SOURCE_ - int warpLoadOffset = 0; -#else -#error Not supported -#endif - return sm_cjPreload[jm + warpLoadOffset]; +#if USE_SUBGROUP_PRELOAD + return sub_group_broadcast(cjs, jm); +#elif USE_CJ_PREFETCH + return loadCjPreload(cjs, jm, tidxi, tidxj); #else return gm_cj[jm]; #endif } - /*! Convert LJ sigma,epsilon parameters to C6,C12. 
*/ gmx_opencl_inline void convert_sigma_epsilon_to_c6_c12(const float sigma, @@ -549,7 +601,7 @@ void reduce_force_j_shfl(float3 fin, __global float *fout, fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1); fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1); fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1); - if (tidxi & 1 == 1) + if ((tidxi & 1) == 1) { fin.x = fin.y; } @@ -753,8 +805,8 @@ void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global fl #if REDUCE_SHUFFLE gmx_opencl_inline void reduce_energy_shfl(float E_lj, float E_el, - __global float *e_lj, - __global float *e_el, + volatile __global float *e_lj, + volatile __global float *e_el, unsigned int tidx) { E_lj = sub_group_reduce_add(E_lj); @@ -825,4 +877,28 @@ void reduce_energy(volatile __local float *buf, #endif } +bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred) +{ + if (pred) + { + warp_any[widx] = 1; + } + + bool ret = warp_any[widx]; + + warp_any[widx] = 0; + + return ret; +} + +//! Returns a true if predicate is true for any work item in warp +bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred) +{ +#if USE_SUBGROUP_ANY + return sub_group_any(pred); +#else + return gmx_sub_group_any_localmem(warp_any, widx, pred); +#endif +} + #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
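For reference, the two sub-group patterns this change adopts can be shown in a minimal standalone sketch. Everything below is illustrative and not part of the patch: the kernel names, buffer names, output layout, and the fixed loop bound of 4 are made up; the built-ins themselves (sub_group_any, intel_sub_group_block_read, sub_group_broadcast) are the cl_khr_subgroups / cl_intel_subgroups functions the kernels rely on when USE_SUBGROUP_ANY and USE_SUBGROUP_PRELOAD evaluate to 1.

/* Illustrative sketch only -- assumes a device exposing cl_khr_subgroups
 * and cl_intel_subgroups; all names here are hypothetical. */
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_intel_subgroups : enable

/* Pattern 1: the warp vote. Where the old code staged a flag through __local
 * memory (set, test, reset per warp slot), sub_group_any() returns a non-zero
 * value on every work item of the sub-group if the predicate holds for at
 * least one of them, with no local memory and no explicit reset. */
__kernel void subgroup_any_sketch(__global const float *r2,
                                  const float           rlist_sq,
                                  __global int          *anyWithinCutoff)
{
    const size_t gid  = get_global_id(0);
    const int    pred = (r2[gid] < rlist_sq);

    if (sub_group_any(pred))
    {
        /* All work items of a sub-group that saw at least one pair in range
         * take this branch, mirroring the gmx_sub_group_any() pruning test. */
        anyWithinCutoff[get_group_id(0)] = 1;
    }
}

/* Pattern 2: the cj prefetch. intel_sub_group_block_read() pulls
 * sub-group-size consecutive ints into registers, one per work item, and
 * sub_group_broadcast() later hands any element to the whole sub-group,
 * replacing the __local staging buffer of preloadCj4Generic(). */
__kernel void subgroup_preload_sketch(__global const int *gm_cj,
                                      __global int       *out)
{
    /* Each work item holds one element of the packed j-cluster group. */
    const int cjPacked = as_int(intel_sub_group_block_read((const __global uint *)gm_cj));

    for (int jm = 0; jm < 4; jm++) /* 4 stands in for NBNXN_GPU_JGROUP_SIZE */
    {
        /* jm is uniform across the sub-group, so the broadcast is valid;
         * out needs room for 4 ints per work item in this sketch. */
        out[get_global_id(0)*4 + jm] = sub_group_broadcast(cjPacked, jm);
    }
}

Note that the sub-group built-ins only substitute for the per-warp logic when the sub-group width matches it, which is why USE_SUBGROUP_ANY and REDUCE_SHUFFLE above additionally require SUBGROUP_SIZE == WARP_SIZE; where that check fails, the kernels keep the original local-memory fall-backs (gmx_sub_group_any_localmem, preloadCj4Generic).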