* reduced in log2(c_clSize) steps using shift and atomically accumulate them into \p a_f.
*/
static inline void reduceForceJShuffle(Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
+ const cl::sycl::nd_item<3> itemIdx,
const int tidxi,
const int aidx,
DeviceAccessor<Float3, mode::read_write> a_f)
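// For reference, a minimal sketch of the log2-step shift reduction this
// function performs, written against standard SYCL 2020 naming rather than
// the cl::sycl interim API used in this file. It assumes a sub-group exactly
// clSize lanes wide; clSize and the helper name are illustrative only.
#include <sycl/sycl.hpp>

template<int clSize>
static inline float reduceOverClusterSketch(const sycl::sub_group sg, float x)
{
    // log2(clSize) steps: each step folds the upper half of the slice onto
    // the lower half, so lane 0 accumulates the full sum.
    for (int offset = clSize / 2; offset > 0; offset /= 2)
    {
        x += sycl::shift_group_left(sg, x, offset);
    }
    return x; // valid on lane 0; other lanes hold partial sums
}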
* \return For thread with \p tidxi 0: sum of all \p valueToReduce. Other threads: unspecified.
*/
template<int subGroupSize, int groupSize>
-static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx,
+static inline float groupReduce(const cl::sycl::nd_item<3> itemIdx,
const unsigned int tidxi,
cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
float valueToReduce)
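// A sketch of the two-level shape such a group reduction typically takes,
// using SYCL 2020 group algorithms in place of this file's accessor-based
// local memory. It assumes subGroupSize divides groupSize, sub-groups exactly
// subGroupSize wide, and sm_buf holding groupSize/subGroupSize floats; the
// helper name is illustrative only.
#include <sycl/sycl.hpp>

template<int subGroupSize, int groupSize>
static inline float groupReduceSketch(const sycl::nd_item<3> itemIdx,
                                      const unsigned         tidxi,
                                      float*                 sm_buf, // local memory
                                      float                  valueToReduce)
{
    constexpr int         numSubGroups = groupSize / subGroupSize;
    const sycl::sub_group sg           = itemIdx.get_sub_group();
    // Level 1: reduce within each sub-group in hardware.
    const float partial = sycl::reduce_over_group(sg, valueToReduce, sycl::plus<float>{});
    if (sg.get_local_linear_id() == 0)
    {
        sm_buf[sg.get_group_linear_id()] = partial; // one partial per sub-group
    }
    sycl::group_barrier(itemIdx.get_group());
    // Level 2: serially combine the per-sub-group partials.
    if (tidxi == 0)
    {
        float sum = 0.0F;
        for (int i = 0; i < numSubGroups; i++)
        {
            sum += sm_buf[i];
        }
        return sum;
    }
    return 0.0F; // other threads: unspecified, matching the contract above
}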
*/
static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
+ const cl::sycl::nd_item<3> itemIdx,
const int tidxi,
const int tidxj,
const int aidx,
*/
static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
+ const cl::sycl::nd_item<3> itemIdx,
const int tidxi,
const int tidxj,
const int aidx,
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool calcFShift,
- const cl::sycl::nd_item<1> itemIdx,
+ const cl::sycl::nd_item<3> itemIdx,
const int tidxi,
const int tidxj,
const int sci,
gmx_unused constexpr int subGroupSize = prunedClusterPairSize;
#endif
- return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+ return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
{
/* thread/block/warp IDs */
- const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
- const unsigned tidxi = localId[0];
- const unsigned tidxj = localId[1];
+ const unsigned tidxi = itemIdx.get_local_id(2);
+ const unsigned tidxj = itemIdx.get_local_id(1);
const unsigned tidx = tidxj * c_clSize + tidxi;
const unsigned tidxz = 0;
- // Group indexing was flat originally, no need to unflatten it.
const unsigned bidx = itemIdx.get_group(0);
const sycl_2020::sub_group sg = itemIdx.get_sub_group();
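// Why get_local_id(2)/get_local_id(1) can replace the old unflattened id:
// with a local range of {1, c_clSize, c_clSize}, SYCL linearizes local ids as
// id(0)*c_clSize*c_clSize + id(1)*c_clSize + id(2), so dimension 2 is the
// fastest-varying one, exactly the x component that unflattenId used to
// recover from the flat index. A host-side sanity check (illustrative helper,
// not part of this file):
#include <cassert>

static inline void checkIndexingEquivalence(const unsigned flatId, const unsigned clSize)
{
    // Old path: unflattenId<clSize, clSize>(flatId)
    const unsigned tidxiOld = (flatId % (clSize * clSize)) % clSize;
    const unsigned tidxjOld = (flatId % (clSize * clSize)) / clSize;
    // New path: itemIdx.get_local_id(2) and itemIdx.get_local_id(1)
    const unsigned tidxiNew = flatId % clSize;
    const unsigned tidxjNew = (flatId / clSize) % clSize;
    assert(tidxiOld == tidxiNew && tidxjOld == tidxjNew);
}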
* - The 1D block-grid contains as many blocks as super-clusters.
*/
const int numBlocks = numSci;
- const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, 1 };
+ const cl::sycl::range<3> blockSize{ 1, c_clSize, c_clSize };
const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
const cl::sycl::nd_range<3> range{ globalSize, blockSize };
cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
auto kernel = nbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>(
cgh, std::forward<Args>(args)...);
- cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+ cgh.parallel_for<kernelNameType>(range, kernel);
});
return e;
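// A self-contained sketch of this launch geometry with hypothetical sizes
// (clSize = 8, numBlocks = 4) and SYCL 2020 naming; the kernel body is a
// placeholder, not the NBNXM kernel.
#include <sycl/sycl.hpp>

int main()
{
    constexpr int clSize    = 8;
    constexpr int numBlocks = 4; // one block per super-cluster
    sycl::queue   q;
    const sycl::range<3>    blockSize{ 1, clSize, clSize };
    const sycl::range<3>    globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
    const sycl::nd_range<3> range{ globalSize, blockSize };
    q.submit([&](sycl::handler& cgh) {
         cgh.parallel_for(range, [=](sycl::nd_item<3> itemIdx) {
             // Same convention as above: dimension 0 selects the block,
             // dimensions 1 and 2 address the clSize x clSize thread tile.
             (void)itemIdx.get_group(0);    // bidx
             (void)itemIdx.get_local_id(1); // tidxj
             (void)itemIdx.get_local_id(2); // tidxi
         });
     }).wait();
    return 0;
}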
constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
/* Requirements:
- * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
+ * Work group (block) must have range (..., c_clSize, c_clSize) (for tidxi/tidxj calculation, easy
* to change). */
- return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
+ return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
{
- const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
// thread/block/warp IDs
- const unsigned tidxi = localId[0];
- const unsigned tidxj = localId[1];
+ const unsigned tidxi = itemIdx.get_local_id(2);
+ const unsigned tidxj = itemIdx.get_local_id(1);
const int tidx = tidxj * c_clSize + tidxi;
- const unsigned tidxz = localId[2];
+ const unsigned tidxz = itemIdx.get_local_id(0);
const unsigned bidx = itemIdx.get_group(0);
const sycl_2020::sub_group sg = itemIdx.get_sub_group();
* - The 1D block-grid contains as many blocks as super-clusters.
*/
const unsigned long numBlocks = numSciInPart;
- const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency };
+ const cl::sycl::range<3> blockSize{ c_syclPruneKernelJ4Concurrency, c_clSize, c_clSize };
const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
const cl::sycl::nd_range<3> range{ globalSize, blockSize };
cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
auto kernel = nbnxmKernelPruneOnly<haveFreshList>(cgh, std::forward<Args>(args)...);
- cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+ cgh.parallel_for<kernelNameType>(range, kernel);
});
return e;
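// Why the J4-concurrency count moved to dimension 0: implementations form
// sub-groups from consecutive linear local ids, computed as
// id(0)*r1*r2 + id(1)*r2 + id(2). Keeping the c_clSize x c_clSize tile in the
// two fastest dimensions keeps each tidxz slice contiguous, so a tile never
// straddles a sub-group boundary. A compile-time check of that arithmetic,
// with a hypothetical 8x8 tile:
static constexpr unsigned linearLocalId(unsigned z, unsigned j, unsigned i, unsigned r1, unsigned r2)
{
    return z * r1 * r2 + j * r2 + i;
}
static_assert(linearLocalId(0, 7, 7, 8, 8) == 63, "first z-slice ends at linear id 63");
static_assert(linearLocalId(1, 0, 0, 8, 8) == 64, "next z-slice starts a new aligned 64-lane span");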
static constexpr float c_oneTwelfth = 0.08333333F;
/*! \endcond */
-/* The following functions are necessary because on some versions of Intel OpenCL RT, subgroups
- * do not properly work (segfault or create subgroups of size 1) if used in kernels
- * with non-1-dimensional workgroup. */
-//! \brief Convert 3D range to 1D
-static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d)
-{
- return { range3d.size() };
-}
-
-//! \brief Convert 3D nd_range to 1D
-static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d)
-{
- return { flattenRange(nd_range3d.get_global_range()), flattenRange(nd_range3d.get_local_range()) };
-}
-
-//! \brief Convert flattened 1D index to 3D
-template<int rangeX, int rangeY>
-static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d)
-{
- constexpr unsigned rangeXY = rangeX * rangeY;
- const unsigned id = id1d[0];
- const unsigned z = id / rangeXY;
- const unsigned xy = id % rangeXY;
- return { xy % rangeX, xy / rangeX, z };
-}
-
} // namespace Nbnxm
#endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H