From: Szilárd Páll Date: Fri, 5 Mar 2021 16:18:45 +0000 (+0000) Subject: Clarify the SYCL subgroup size on different targets X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=931c14c048c3b7691a0706c5f4c5d1f8fc29410c;p=alexxy%2Fgromacs.git Clarify the SYCL subgroup size on different targets Also added some doxygen. Refs #3934 --- diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 504557d563..44fad58ac3 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -298,6 +298,11 @@ static inline float interpolateCoulombForceR(const DeviceAccessor itemIdx, const int tidxi, @@ -335,6 +340,10 @@ static inline void reduceForceJShuffle(Float3 f, /*! \brief Final i-force reduction. + * + * Reduce c_nbnxnGpuNumClusterPerSupercluster i-force componets stored in \p fCiBuf[] + * accumulating atomically into \p a_f. + * If \p calcFShift is true, further reduce shift forces and atomically accumulate into \p a_fShift. * * This implementation works only with power of two array sizes. */ @@ -349,6 +358,9 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor a_f, DeviceAccessor a_fShift) { + // must have power of two elements in fCiBuf + static_assert(gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster)); + static constexpr int bufStride = c_clSize * c_clSize; static constexpr int clSizeLog2 = gmx::StaticLog2::value; const int tidx = tidxi + tidxj * c_clSize; @@ -409,7 +421,6 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor prunedClusterPairSize on AMD GCN / CDNA. + // Hence, the two are decoupled. + constexpr int prunedClusterPairSize = c_clSize * c_splitClSize; +#if defined(HIPSYCL_PLATFORM_ROCM) // SYCL-TODO AMD RDNA/RDNA2 has 32-wide exec; how can we check for that? + constexpr int subGroupSize = c_clSize * c_clSize; +#else + constexpr int subGroupSize = prunedClusterPairSize; +#endif return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]] { @@ -533,8 +553,9 @@ auto nbnxmKernel(cl::sycl::handler& cgh, const unsigned bidx = itemIdx.get_group(0); const sycl_2020::sub_group sg = itemIdx.get_sub_group(); - // Better use sg.get_group_range, but too much of the logic relies on it anyway - const unsigned widx = tidx / subGroupSize; + // Could use sg.get_group_range to compute the imask & exclusion Idx, but too much of the logic relies on it anyway + // and in cases where prunedClusterPairSize != subGroupSize we can't use it anyway + const unsigned imeiIdx = tidx / prunedClusterPairSize; Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++) @@ -636,12 +657,12 @@ auto nbnxmKernel(cl::sycl::handler& cgh, // loop over the j clusters = seen by any of the atoms in the current super-cluster for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += 1) { - unsigned imask = a_plistCJ4[j4].imei[widx].imask; + unsigned imask = a_plistCJ4[j4].imei[imeiIdx].imask; if (!doPruneNBL && !imask) { continue; } - const int wexclIdx = a_plistCJ4[j4].imei[widx].excl_ind; + const int wexclIdx = a_plistCJ4[j4].imei[imeiIdx].excl_ind; const unsigned wexcl = a_plistExcl[wexclIdx].pair[tidx & (subGroupSize - 1)]; // sg.get_local_linear_id() for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) { @@ -889,7 +910,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh, { /* Update the imask with the new one which does not contain the * out of range clusters anymore. */ - a_plistCJ4[j4].imei[widx].imask = imask; + a_plistCJ4[j4].imei[imeiIdx].imask = imask; } } // for (int j4 = cij4Start; j4 < cij4End; j4 += 1)