From b340c38071557618f336c630c752bed415a9d522 Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Fri, 1 Oct 2021 20:28:24 +0000 Subject: [PATCH] SYCL: remove (un)flatten --- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 26 +++++++++---------- .../sycl/nbnxm_sycl_kernel_pruneonly.cpp | 15 +++++------ .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h | 26 ------------------- 3 files changed, 19 insertions(+), 48 deletions(-) diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 16be5bc7df..39811fd11d 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -307,7 +307,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor itemIdx, + const cl::sycl::nd_item<3> itemIdx, const int tidxi, const int aidx, DeviceAccessor a_f) @@ -356,7 +356,7 @@ static inline void reduceForceJShuffle(Float3 * \return For thread with \p tidxi 0: sum of all \p valueToReduce. Other threads: unspecified. */ template -static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx, +static inline float groupReduce(const cl::sycl::nd_item<3> itemIdx, const unsigned int tidxi, cl::sycl::accessor sm_buf, float valueToReduce) @@ -390,7 +390,7 @@ static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx, */ static inline void reduceForceJGeneric(cl::sycl::accessor sm_buf, Float3 f, - const cl::sycl::nd_item<1> itemIdx, + const cl::sycl::nd_item<3> itemIdx, const int tidxi, const int tidxj, const int aidx, @@ -424,7 +424,7 @@ static inline void reduceForceJGeneric(cl::sycl::accessor sm_buf, Float3 f, - const cl::sycl::nd_item<1> itemIdx, + const cl::sycl::nd_item<3> itemIdx, const int tidxi, const int tidxj, const int aidx, @@ -452,7 +452,7 @@ static inline void reduceForceJ(cl::sycl::accessor sm_buf, const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster], const bool calcFShift, - const cl::sycl::nd_item<1> itemIdx, + const cl::sycl::nd_item<3> itemIdx, const int tidxi, const int tidxj, const int sci, @@ -662,16 +662,14 @@ auto nbnxmKernel(cl::sycl::handler& cgh, gmx_unused constexpr int subGroupSize = prunedClusterPairSize; #endif - return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]] + return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]] { /* thread/block/warp id-s */ - const cl::sycl::id<3> localId = unflattenId(itemIdx.get_local_id()); - const unsigned tidxi = localId[0]; - const unsigned tidxj = localId[1]; - const unsigned tidx = tidxj * c_clSize + tidxi; - const unsigned tidxz = 0; + const unsigned tidxi = itemIdx.get_local_id(2); + const unsigned tidxj = itemIdx.get_local_id(1); + const unsigned tidx = tidxj * c_clSize + tidxi; + const unsigned tidxz = 0; - // Group indexing was flat originally, no need to unflatten it. const unsigned bidx = itemIdx.get_group(0); const sycl_2020::sub_group sg = itemIdx.get_sub_group(); @@ -1072,7 +1070,7 @@ cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int nu * - The 1D block-grid contains as many blocks as super-clusters. */ const int numBlocks = numSci; - const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, 1 }; + const cl::sycl::range<3> blockSize{ 1, c_clSize, c_clSize }; const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] }; const cl::sycl::nd_range<3> range{ globalSize, blockSize }; @@ -1081,7 +1079,7 @@ cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int nu cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) { auto kernel = nbnxmKernel( cgh, std::forward(args)...); - cgh.parallel_for(flattenNDRange(range), kernel); + cgh.parallel_for(range, kernel); }); return e; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index 1562cbc6fd..b779ad79e3 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -98,16 +98,15 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize; /* Requirements: - * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy + * Work group (block) must have range (c_clSize, c_clSize, ...) (for itemIdx calculation, easy * to change). */ - return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]] + return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]] { - const cl::sycl::id<3> localId = unflattenId(itemIdx.get_local_id()); // thread/block/warp id-s - const unsigned tidxi = localId[0]; - const unsigned tidxj = localId[1]; + const unsigned tidxi = itemIdx.get_local_id(2); + const unsigned tidxj = itemIdx.get_local_id(1); const int tidx = tidxj * c_clSize + tidxi; - const unsigned tidxz = localId[2]; + const unsigned tidxz = itemIdx.get_local_id(0); const unsigned bidx = itemIdx.get_group(0); const sycl_2020::sub_group sg = itemIdx.get_sub_group(); @@ -234,7 +233,7 @@ cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream, * - The 1D block-grid contains as many blocks as super-clusters. */ const unsigned long numBlocks = numSciInPart; - const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency }; + const cl::sycl::range<3> blockSize{ c_syclPruneKernelJ4Concurrency, c_clSize, c_clSize }; const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] }; const cl::sycl::nd_range<3> range{ globalSize, blockSize }; @@ -242,7 +241,7 @@ cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream, cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) { auto kernel = nbnxmKernelPruneOnly(cgh, std::forward(args)...); - cgh.parallel_for(flattenNDRange(range), kernel); + cgh.parallel_for(range, kernel); }); return e; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h index 53633da4be..f1d08db642 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h @@ -78,32 +78,6 @@ static constexpr float c_oneSixth = 0.16666667F; static constexpr float c_oneTwelfth = 0.08333333F; /*! \endcond */ -/* The following functions are necessary because on some versions of Intel OpenCL RT, subgroups - * do not properly work (segfault or create subgroups of size 1) if used in kernels - * with non-1-dimensional workgroup. */ -//! \brief Convert 3D range to 1D -static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d) -{ - return { range3d.size() }; -} - -//! \brief Convert 3D nd_range to 1D -static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d) -{ - return { flattenRange(nd_range3d.get_global_range()), flattenRange(nd_range3d.get_local_range()) }; -} - -//! \brief Convert flattened 1D index to 3D -template -static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d) -{ - constexpr unsigned rangeXY = rangeX * rangeY; - const unsigned id = id1d[0]; - const unsigned z = id / rangeXY; - const unsigned xy = id % rangeXY; - return { xy % rangeX, xy / rangeX, z }; -} - } // namespace Nbnxm #endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H -- 2.22.0