SYCL: remove (un)flatten

author Roland Schulz <roland.schulz@intel.com>

Fri, 1 Oct 2021 20:28:24 +0000 (20:28 +0000)

committer Andrey Alekseenko <al42and@gmail.com>

Fri, 1 Oct 2021 20:28:24 +0000 (20:28 +0000)
author Roland Schulz <roland.schulz@intel.com>
Fri, 1 Oct 2021 20:28:24 +0000 (20:28 +0000)
committer Andrey Alekseenko <al42and@gmail.com>
Fri, 1 Oct 2021 20:28:24 +0000 (20:28 +0000)
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp

index 16be5bc7df8ec5f8f38ce0ef64a92e19c2e1dadf..39811fd11d26f9f4f2086f1c382a3c13998535b4 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -307,7 +307,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::r
   * reduced in log2(cl_Size) steps using shift and atomically accumulate them into \p a_f.
   */
  static inline void reduceForceJShuffle(Float3                                   f,
-                                       const cl::sycl::nd_item<1>               itemIdx,
+                                       const cl::sycl::nd_item<3>               itemIdx,
                                         const int                                tidxi,
                                         const int                                aidx,
                                         DeviceAccessor<Float3, mode::read_write> a_f)
@@ -356,7 +356,7 @@ static inline void reduceForceJShuffle(Float3
   * \return For thread with \p tidxi 0: sum of all \p valueToReduce. Other threads: unspecified.
   */
  template<int subGroupSize, int groupSize>
-static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx,
+static inline float groupReduce(const cl::sycl::nd_item<3> itemIdx,
                                  const unsigned int         tidxi,
                                  cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
                                  float valueToReduce)
@@ -390,7 +390,7 @@ static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx,
   */
  static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
                                         Float3                                   f,
-                                       const cl::sycl::nd_item<1>               itemIdx,
+                                       const cl::sycl::nd_item<3>               itemIdx,
                                         const int                                tidxi,
                                         const int                                tidxj,
                                         const int                                aidx,
@@ -424,7 +424,7 @@ static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_w
   */
  static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
                                  Float3                                                        f,
-                                const cl::sycl::nd_item<1>               itemIdx,
+                                const cl::sycl::nd_item<3>               itemIdx,
                                  const int                                tidxi,
                                  const int                                tidxj,
                                  const int                                aidx,
@@ -452,7 +452,7 @@ static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, t
  static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
                                           const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
                                           const bool   calcFShift,
-                                         const cl::sycl::nd_item<1>               itemIdx,
+                                         const cl::sycl::nd_item<3>               itemIdx,
                                           const int                                tidxi,
                                           const int                                tidxj,
                                           const int                                sci,
@@ -662,16 +662,14 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
      gmx_unused constexpr int subGroupSize = prunedClusterPairSize;
  #endif
  
-    return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+    return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
      {
          /* thread/block/warp id-s */
-        const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
-        const unsigned        tidxi   = localId[0];
-        const unsigned        tidxj   = localId[1];
-        const unsigned        tidx    = tidxj * c_clSize + tidxi;
-        const unsigned        tidxz   = 0;
+        const unsigned tidxi = itemIdx.get_local_id(2);
+        const unsigned tidxj = itemIdx.get_local_id(1);
+        const unsigned tidx  = tidxj * c_clSize + tidxi;
+        const unsigned tidxz = 0;
  
-        // Group indexing was flat originally, no need to unflatten it.
          const unsigned bidx = itemIdx.get_group(0);
  
          const sycl_2020::sub_group sg = itemIdx.get_sub_group();
@@ -1072,7 +1070,7 @@ cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int nu
       * - The 1D block-grid contains as many blocks as super-clusters.
       */
      const int                   numBlocks = numSci;
-    const cl::sycl::range<3>    blockSize{ c_clSize, c_clSize, 1 };
+    const cl::sycl::range<3>    blockSize{ 1, c_clSize, c_clSize };
      const cl::sycl::range<3>    globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
      const cl::sycl::nd_range<3> range{ globalSize, blockSize };
  
@@ -1081,7 +1079,7 @@ cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int nu
      cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
          auto kernel = nbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>(
                  cgh, std::forward<Args>(args)...);
-        cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+        cgh.parallel_for<kernelNameType>(range, kernel);
      });
  
      return e;
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp

index 1562cbc6fddbf0bcb549e16580de564e97458c29..b779ad79e3cc58d403293759c68ba773d753ab21 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
@@ -98,16 +98,15 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
      constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
  
      /* Requirements:
-     * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
+     * Work group (block) must have range (c_clSize, c_clSize, ...) (for itemIdx calculation, easy
       * to change). */
-    return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
+    return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
      {
-        const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
          // thread/block/warp id-s
-        const unsigned tidxi = localId[0];
-        const unsigned tidxj = localId[1];
+        const unsigned tidxi = itemIdx.get_local_id(2);
+        const unsigned tidxj = itemIdx.get_local_id(1);
          const int      tidx  = tidxj * c_clSize + tidxi;
-        const unsigned tidxz = localId[2];
+        const unsigned tidxz = itemIdx.get_local_id(0);
          const unsigned bidx  = itemIdx.get_group(0);
  
          const sycl_2020::sub_group sg   = itemIdx.get_sub_group();
@@ -234,7 +233,7 @@ cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
       * - The 1D block-grid contains as many blocks as super-clusters.
       */
      const unsigned long         numBlocks = numSciInPart;
-    const cl::sycl::range<3>    blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency };
+    const cl::sycl::range<3>    blockSize{ c_syclPruneKernelJ4Concurrency, c_clSize, c_clSize };
      const cl::sycl::range<3>    globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
      const cl::sycl::nd_range<3> range{ globalSize, blockSize };
  
@@ -242,7 +241,7 @@ cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
  
      cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
          auto kernel = nbnxmKernelPruneOnly<haveFreshList>(cgh, std::forward<Args>(args)...);
-        cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+        cgh.parallel_for<kernelNameType>(range, kernel);
      });
  
      return e;
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h

index 53633da4bedc1da7edae8cf5db099b7d7f02c7c0..f1d08db642d7c4bf238b69c18a477df397d2ddbb 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -78,32 +78,6 @@ static constexpr float c_oneSixth = 0.16666667F;
  static constexpr float c_oneTwelfth = 0.08333333F;
  /*! \endcond */
  
-/* The following functions are necessary because on some versions of Intel OpenCL RT, subgroups
- * do not properly work (segfault or create subgroups of size 1) if used in kernels
- * with non-1-dimensional workgroup. */
-//! \brief Convert 3D range to 1D
-static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d)
-{
-    return { range3d.size() };
-}
-
-//! \brief Convert 3D nd_range to 1D
-static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d)
-{
-    return { flattenRange(nd_range3d.get_global_range()), flattenRange(nd_range3d.get_local_range()) };
-}
-
-//! \brief Convert flattened 1D index to 3D
-template<int rangeX, int rangeY>
-static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d)
-{
-    constexpr unsigned rangeXY = rangeX * rangeY;
-    const unsigned     id      = id1d[0];
-    const unsigned     z       = id / rangeXY;
-    const unsigned     xy      = id % rangeXY;
-    return { xy % rangeX, xy / rangeX, z };
-}
-
  } // namespace Nbnxm
  
  #endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
author	Roland Schulz <roland.schulz@intel.com>
	Fri, 1 Oct 2021 20:28:24 +0000 (20:28 +0000)
committer	Andrey Alekseenko <al42and@gmail.com>
	Fri, 1 Oct 2021 20:28:24 +0000 (20:28 +0000)
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h		patch \| blob \| history