Use subgroup for warp_any and CJ4 prefetch

author Roland Schulz <roland.schulz@intel.com>

Sun, 29 Apr 2018 21:19:26 +0000 (14:19 -0700)

committer Szilárd Páll <pall.szilard@gmail.com>

Thu, 4 Oct 2018 23:38:09 +0000 (01:38 +0200)
author Roland Schulz <roland.schulz@intel.com>
Sun, 29 Apr 2018 21:19:26 +0000 (14:19 -0700)
committer Szilárd Páll <pall.szilard@gmail.com>
Thu, 4 Oct 2018 23:38:09 +0000 (01:38 +0200)
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh

index eb7e43fc60ccd14c509bc1ac2ac263fa69b2a0d5..e58efc6121d42185da3e34c855d74ad23a03ddef 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -46,12 +46,6 @@
  /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
   * Note that this should precede the kernel_utils include.
   */
-#if defined _AMD_SOURCE_ || defined _NVIDIA_SOURCE_
-#define USE_CJ_PREFETCH 1
-#else
-#define USE_CJ_PREFETCH 0
-#endif
-
  #include "nbnxn_ocl_kernel_utils.clh"
  
  /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -98,8 +92,8 @@
     Thus if more strings need to be appended a new macro must be written or it must be directly appended here.
   */
  __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
-#if REDUCE_SHUFFLE
-__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl
+#ifdef cl_intel_required_subgroup_size
+__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE)))
  #endif
  #ifdef PRUNE_NBL
      #ifdef CALC_ENERGIES
@@ -220,13 +214,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
      const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
  
  #define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
-    __local int *cjs;
+    CjType cjs;
  #if USE_CJ_PREFETCH
      /* shmem buffer for cj, for both warps separately */
      cjs = (__local int *)(LOCAL_OFFSET);
      #undef LOCAL_OFFSET
      #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
-#endif
+#endif //USE_CJ_PREFETCH
  
  #ifdef IATYPE_SHMEM
  #ifndef LJ_COMB
@@ -249,9 +243,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #else
      __local float *f_buf  = 0;
  #endif
+#if !USE_SUBGROUP_ANY
      /* Local buffer used to implement __any warp vote function from CUDA.
         volatile is used to avoid compiler optimizations for AMD builds. */
      volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#else
+    __local uint          *warp_any = 0;
+#endif
  #undef LOCAL_OFFSET
  
      nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
@@ -277,12 +275,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #endif
  #endif
      }
+#if !USE_SUBGROUP_ANY
      /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
      if (tidx == 0 || tidx == WARP_SIZE)
      {
          warp_any[widx] = 0;
      }
-
+#endif
      barrier(CLK_LOCAL_MEM_FENCE);
  
      for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
@@ -347,7 +346,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
          imask       = pl_cj4[j4].imei[widx].imask;
          wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
  
-        preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
+        preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
  
  #ifndef PRUNE_NBL
          if (imask)
@@ -363,7 +362,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #if !defined PRUNE_NBL && !defined _NVIDIA_SOURCE_
  #pragma unroll 4
  #endif
-
              for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
              {
                  if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
@@ -404,21 +402,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
                              r2      = norm2(rv);
  
  #ifdef PRUNE_NBL
-                            /* vote.. should code shmem serialisation, wonder what the hit will be */
-                            if (r2 < rlist_sq)
-                            {
-                                warp_any[widx] = 1;
-                            }
-
-                            /* If _none_ of the atoms pairs are in cutoff range,
-                               the bit corresponding to the current
-                               cluster-pair in imask gets set to 0. */
-                            if (!warp_any[widx])
+                            if (!gmx_sub_group_any(warp_any, widx, r2 < rlist_sq))
                              {
                                  imask &= ~mask_ji;
                              }
-
-                            warp_any[widx] = 0;
  #endif
  
                              int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh

index 7d31a58e538d7d872dcefca42ac66496465ced7e..4c253f79921c4fabea79b8583341ea57945cab70 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -42,15 +42,6 @@
   *  \ingroup module_mdlib
   */
  
-#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
- * Note that this should precede the kernel_utils include.
- */
-#define USE_CJ_PREFETCH 1
-#else
-#define USE_CJ_PREFETCH 0
-#endif
-
  #include "nbnxn_ocl_kernel_utils.clh"
  
  /* Note: the AMD compiler testing was done with (fglrx 15.12) performs best with wg
@@ -123,25 +114,28 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
  
      #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize)
      /* shmem buffer for i cj pre-loading */
-    __local int *cjs;
+    CjType cjs;
  #if USE_CJ_PREFETCH
      cjs = (((__local int *)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize);
      #undef LOCAL_OFFSET
      /* Offset calculated using xib because cjs depends on on tidxz! */
      #define LOCAL_OFFSET (((__local int *)(xib + c_numClPerSupercl * c_clSize)) + (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize))
  #endif
-
+#if !USE_SUBGROUP_ANY
      /* Local buffer used to implement __any warp vote function from CUDA.
         volatile is used to avoid compiler optimizations for AMD builds. */
-    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
-    #undef LOCAL_OFFSET
-
-    unsigned int warpVoteSlot = NTHREAD_Z*tidxz + widx;
+    volatile __local uint *const warp_any     = (__local uint*)(LOCAL_OFFSET);
+    const unsigned int           warpVoteSlot = NTHREAD_Z*tidxz + widx;
      /* Initialise warp vote.*/
      if (tidx == 0 || tidx == 32)
      {
          warp_any[warpVoteSlot] = 0;
      }
+#else
+    __local uint *const warp_any     = 0;
+    const unsigned int  warpVoteSlot = 0;
+#endif
+    #undef LOCAL_OFFSET
  
      nbnxn_sci_t nb_sci      = pl_sci[bidx*numParts + part]; /* my i super-cluster's index = sciOffset + current bidx * numParts + part */
      int         sci         = nb_sci.sci;                   /* super-cluster */
@@ -188,7 +182,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
              imaskCheck = (imaskNew ^ imaskFull);
          }
  
-        preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
+        preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
  
          if (imaskCheck)
          {
@@ -223,27 +217,17 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
                                  /* If _none_ of the atoms pairs are in cutoff range,
                                     the bit corresponding to the current
                                     cluster-pair in imask gets set to 0. */
-                                if (r2 < rlistOuter_sq)
-                                {
-                                    warp_any[warpVoteSlot] = 1;
-                                }
-                                if (!warp_any[warpVoteSlot])
+                                if (!gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistOuter_sq))
                                  {
                                      imaskFull &= ~mask_ji;
                                  }
-                                warp_any[warpVoteSlot] = 0;
                              }
                              /* If any atom pair is within range, set the bit
                                 corresponding to the current cluster-pair. */
-                            if (r2 < rlistInner_sq)
-                            {
-                                warp_any[warpVoteSlot] = 1;
-                            }
-                            if (warp_any[warpVoteSlot])
+                            if (gmx_sub_group_any(warp_any, warpVoteSlot, r2 < rlistInner_sq))
                              {
                                  imaskNew |= mask_ji;
                              }
-                            warp_any[warpVoteSlot] = 0;
                          }
  
                          /* shift the mask bit by 1 */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

index 88aa20bfc1a9de42e347e08d79254b4fd82897ff..4d5e7c9ba2d2c400df1e9084e5cf247445deab1e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -41,17 +41,39 @@
  
  #define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
  
-/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups).
-   Code doesn't support CL_SIZE=8.
-   cl_intel_required_subgroup_size required for intel_reqd_sub_group_size.
-   cl_intel_subgroups required for intel_sub_group_shuffle_up/down.
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
+ * Note that this must be defined before preloadCj4() and loadCj() below, which test it.
   */
-#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4
-#define REDUCE_SHUFFLE 1
+#define USE_CJ_PREFETCH 1
  #else
-#define REDUCE_SHUFFLE 0
+#define USE_CJ_PREFETCH 0
  #endif
  
+#if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210)
+#define HAVE_SUBGROUP 1
+#else
+#define HAVE_SUBGROUP 0
+#endif
+
+#ifdef cl_intel_subgroups
+#define HAVE_INTEL_SUBGROUP 1
+#else
+#define HAVE_INTEL_SUBGROUP 0
+#endif
+
+#if _INTEL_SOURCE_
+#define SUBGROUP_SIZE 8
+#elif _AMD_SOURCE_
+#define SUBGROUP_SIZE 64
+#else
+#define SUBGROUP_SIZE 32
+#endif
+
+#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
+
  /* 1.0 / sqrt(M_PI) */
  #define M_FLOAT_1_SQRTPI 0.564189583547756f
  
@@ -157,48 +179,92 @@ typedef struct {
  /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  
+gmx_opencl_inline
+void preloadCj4Generic(__local int        *sm_cjPreload,
+                       const __global int *gm_cj,
+                       int                 tidxi,
+                       int                 tidxj,
+                       bool                iMaskCond)
  
-/*! \brief Preload cj4 into local memory.
+{
+    /* Pre-load cj into shared memory */
+#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+    if (tidxj == 0 & tidxi < NBNXN_GPU_JGROUP_SIZE)
+    {
+        sm_cjPreload[tidxi] = gm_cj[tidxi];
+    }
+#else
+    const int c_clSize                   = CL_SIZE;
+    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
+    const int c_nbnxnGpuClusterpairSplit = 2;
+    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
+
+    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
+    {
+        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
+    }
+#endif
+}
+
+
+#if USE_SUBGROUP_PRELOAD
+gmx_opencl_inline
+int  preloadCj4Subgroup(const __global int *gm_cj)
+{
+    //loads subgroup-size # of elements (8) instead of the 4 required
+    //equivalent to *cjs = *gm_cj
+    return intel_sub_group_block_read((const __global uint *)gm_cj);
+}
+#endif //USE_SUBGROUP_PRELOAD
+
+#if USE_SUBGROUP_PRELOAD
+typedef int CjType;
+#else
+typedef __local int* CjType;
+#endif
+
+/*! \brief Preload cj4
   *
   * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
   * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
- * - Same as AMD in the nowarp kernel; we do not assume execution width and therefore
- *   the caller needs to sync.
+ * - For Intel (USE_SUBGROUP_PRELOAD) cj4 is loaded into private memory (registers) instead of local memory
   *
   * It is the caller's responsibility to make sure that data is consumed only when
   * it's ready. This function does not call a barrier.
   */
  gmx_opencl_inline
-void preloadCj4(__local int        *sm_cjPreload,
+void preloadCj4(CjType             *cjs,
                  const __global int *gm_cj,
                  int                 tidxi,
                  int                 tidxj,
                  bool                iMaskCond)
-
  {
-#if !USE_CJ_PREFETCH
-    return;
+#if USE_SUBGROUP_PRELOAD
+    *cjs = preloadCj4Subgroup(gm_cj);
+#elif USE_CJ_PREFETCH
+    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
+#else
+    //nothing to do
  #endif
+}
  
+gmx_opencl_inline
+int loadCjPreload(__local int*        sm_cjPreload,
+                  int                 jm,
+                  int                 tidxi,
+                  int                 tidxj)
+{
+#if defined _AMD_SOURCE_
+    int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+#else
      const int c_clSize                   = CL_SIZE;
      const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
      const int c_nbnxnGpuClusterpairSplit = 2;
      const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
  
-    /* Pre-load cj into shared memory */
-#if defined _NVIDIA_SOURCE_
-    /* on both warps separately for NVIDIA */
-    if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
-    {
-        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
-    }
-#else // AMD or nowarp
-      /* Note that with "nowarp" / on hardware with wavefronts <64 a barrier is needed after preload. */
-    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
-    {
-        sm_cjPreload[tidxi] = gm_cj[tidxi];
-    }
+    int       warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
  #endif
+    return sm_cjPreload[jm + warpLoadOffset];
  }
  
  /* \brief Load a cj given a jm index.
@@ -206,32 +272,18 @@ void preloadCj4(__local int        *sm_cjPreload,
   * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
   */
  gmx_opencl_inline
-int loadCj(__local int        *sm_cjPreload,
-           const __global int *gm_cj,
-           int                 jm,
-           int                 tidxi,
-           int                 tidxj)
+int loadCj(CjType cjs, const __global int *gm_cj,
+           int jm, int tidxi, int tidxj)
  {
-    const int c_clSize                   = CL_SIZE;
-    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
-    const int c_nbnxnGpuClusterpairSplit = 2;
-    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
-
-#if USE_CJ_PREFETCH
-#if defined _NVIDIA_SOURCE_
-    int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
-#elif defined _AMD_SOURCE_
-    int warpLoadOffset = 0;
-#else
-#error Not supported
-#endif
-    return sm_cjPreload[jm + warpLoadOffset];
+#if USE_SUBGROUP_PRELOAD
+    return sub_group_broadcast(cjs, jm);
+#elif USE_CJ_PREFETCH
+    return loadCjPreload(cjs, jm, tidxi, tidxj);
  #else
      return gm_cj[jm];
  #endif
  }
  
-
  /*! Convert LJ sigma,epsilon parameters to C6,C12. */
  gmx_opencl_inline
  void convert_sigma_epsilon_to_c6_c12(const float  sigma,
@@ -549,7 +601,7 @@ void reduce_force_j_shfl(float3 fin, __global float *fout,
      fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
      fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
      fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
-    if (tidxi & 1 == 1)
+    if ((tidxi & 1) == 1)
      {
          fin.x = fin.y;
      }
@@ -753,8 +805,8 @@ void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global fl
  #if REDUCE_SHUFFLE
  gmx_opencl_inline
  void reduce_energy_shfl(float E_lj, float E_el,
-                        __global float *e_lj,
-                        __global float *e_el,
+                        volatile __global float *e_lj,
+                        volatile __global float *e_el,
                          unsigned int    tidx)
  {
      E_lj = sub_group_reduce_add(E_lj);
@@ -825,4 +877,28 @@ void reduce_energy(volatile __local float  *buf,
  #endif
  }
  
+bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
+{
+    if (pred)
+    {
+        warp_any[widx] = 1;
+    }
+
+    bool ret = warp_any[widx];
+
+    warp_any[widx] = 0;
+
+    return ret;
+}
+
+//! Returns true if the predicate is true for any work item in the warp
+bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred)
+{
+#if USE_SUBGROUP_ANY
+    return sub_group_any(pred);
+#else
+    return gmx_sub_group_any_localmem(warp_any, widx, pred);
+#endif
+}
+
  #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
author	Roland Schulz <roland.schulz@intel.com>
	Sun, 29 Apr 2018 21:19:26 +0000 (14:19 -0700)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Thu, 4 Oct 2018 23:38:09 +0000 (01:38 +0200)
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh		patch \| blob \| history