From 2c750f8e218bf8ace3e3395668102259e7337886 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Tue, 30 Mar 2021 15:24:08 +0200
Subject: [PATCH] Implement generic j-reduction in the nbnxm SYCL kernels

This commit implements a generic j-reduction identical to the one in
the OpenCL kernels.

Also adds a subGroupBarrier() helper, which is needed for correctness
on the CUDA backend when targeting NVIDIA architectures.

Refs #3934
---
 src/gromacs/gpu_utils/sycl_kernel_utils.h     | 23 +++++--
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp  | 65 ++++++++++++++++++-
 .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h      |  2 +
 3 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h
index 6c80300adc..18670c92db 100644
--- a/src/gromacs/gpu_utils/sycl_kernel_utils.h
+++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h
@@ -58,6 +58,9 @@
 static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
                                                      /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
 
+//! \brief Full warp active thread mask used in CUDA warp-level primitives.
+static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
+
 /*! \brief Convenience wrapper to do atomic addition to a global buffer.
  *
  * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
@@ -84,6 +87,20 @@ static inline void atomicFetchAdd(DeviceAccessor acc, const
 #endif
 }
 
+/*! \brief Issue an intra sub-group barrier.
+ *
+ * Equivalent to CUDA's __syncwarp(c_cudaFullWarpMask).
+ *
+ */
+static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
+{
+#if GMX_SYCL_HIPSYCL
+    cl::sycl::group_barrier(itemIdx.get_sub_group(), cl::sycl::memory_scope::sub_group);
+#else
+    itemIdx.get_sub_group().barrier();
+#endif
+}
+
 namespace sycl_2020
 {
 #if GMX_SYCL_HIPSYCL
@@ -94,8 +111,7 @@ __device__ __host__ static inline float shift_left(sycl_2020::sub_group,
     // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
 #    ifdef SYCL_DEVICE_ONLY
 #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_down_sync(c_cudaFullWarpMask, var, delta);
 #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
     // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
     return __shfl_down(var, delta);
@@ -125,8 +141,7 @@ __device__ __host__ static inline float shift_right(sycl_2020::sub_group,
     // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
 #    ifdef SYCL_DEVICE_ONLY
 #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_up_sync(c_cudaFullWarpMask, var, delta);
 #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
     // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
     return __shfl_up(var, delta);
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
index 8b3b81b7f6..63be54335b 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -298,7 +298,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor
+/*! \brief Reduce c_clSize j-force components using local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                       Float3                             f,
+                                       const cl::sycl::nd_item<1>         itemIdx,
+                                       const int                          tidxi,
+                                       const int                          tidxj,
+                                       const int                          aidx,
+                                       DeviceAccessor<float, mode_atomic> a_f)
+{
+    static constexpr int sc_fBufferStride = c_clSizeSq;
+    int                  tidx             = tidxi + tidxj * c_clSize;
+    sm_buf[0 * sc_fBufferStride + tidx]   = f[0];
+    sm_buf[1 * sc_fBufferStride + tidx]   = f[1];
+    sm_buf[2 * sc_fBufferStride + tidx]   = f[2];
+
+    subGroupBarrier(itemIdx);
+
+    // Reduce the data 8-by-8: the first three threads (tidxi < 3) of each row
+    // each sum one force component over the row stored above.
+    assert(itemIdx.get_sub_group().get_local_range().size() >= c_clSize);
+
+    if (tidxi < 3)
+    {
+        float fSum = 0.0F;
+        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+        {
+            fSum += sm_buf[sc_fBufferStride * tidxi + j];
+        }
+
+        atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum);
+    }
+}
+
+
+/*! \brief Reduce c_clSize j-force components using either shifts or local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                Float3                             f,
+                                const cl::sycl::nd_item<1>         itemIdx,
+                                const int                          tidxi,
+                                const int                          tidxj,
+                                const int                          aidx,
+                                DeviceAccessor<float, mode_atomic> a_f)
+{
+    if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster))
+    {
+        reduceForceJGeneric(sm_buf, f, itemIdx, tidxi, tidxj, aidx, a_f);
+    }
+    else
+    {
+        reduceForceJShuffle(f, itemIdx, tidxi, aidx, a_f);
+    }
+}
+
 /*! \brief Final i-force reduction.
  *
@@ -364,7 +423,7 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor
     const int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
     const int tidx       = tidxi + tidxj * c_clSize;
-    float     fShiftBuf  = 0;
+    float     fShiftBuf  = 0.0F;
     for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
     {
         const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
@@ -906,7 +965,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                         maskJI += maskJI;
                     } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                     /* reduce j forces */
-                    reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+                    reduceForceJ(sm_reductionBuffer, fCjBuf, itemIdx, tidxi, tidxj, aj, a_f);
                 } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
                 if constexpr (doPruneNBL)
                 {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
index 46c9e38afe..c316555d0f 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -61,6 +61,8 @@ static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_
 /*! \cond */
 // cluster size = number of atoms per cluster.
 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
+// Square of cluster size.
+static constexpr int c_clSizeSq = c_clSize * c_clSize;
 // j-cluster size after split (4 in the current implementation).
 static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
 // i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set.
-- 
2.22.0
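
Reviewer note (not part of the patch): as a minimal standalone sketch of
the indexing scheme that reduceForceJGeneric() introduces above, the C++
program below simulates the c_clSize x c_clSize thread block with plain
loops and replaces atomicFetchAdd() with a plain +=. The value
c_clSize = 8 and the dummy force values are assumptions made purely for
illustration.

#include <cassert>
#include <cstdio>

static constexpr int c_clSize   = 8;                   // cluster size (assumed)
static constexpr int c_clSizeSq = c_clSize * c_clSize; // reduction buffer stride

int main()
{
    float smBuf[3 * c_clSizeSq]; // stands in for the local-memory buffer sm_buf
    float aF[c_clSize][3] = {};  // per-j-atom accumulators (one atom per row tidxj)

    // Store phase: each "thread" (tidxi, tidxj) writes its three force
    // components at stride c_clSizeSq, as the kernel does before
    // subGroupBarrier().
    for (int tidxj = 0; tidxj < c_clSize; tidxj++)
    {
        for (int tidxi = 0; tidxi < c_clSize; tidxi++)
        {
            const int tidx = tidxi + tidxj * c_clSize;
            for (int d = 0; d < 3; d++)
            {
                smBuf[d * c_clSizeSq + tidx] = 1.0F; // dummy force component
            }
        }
    }

    // Reduce phase: in each row, the threads with tidxi < 3 each sum one
    // component over the row's c_clSize values; the kernel then does
    // atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum), replaced here by +=.
    for (int tidxj = 0; tidxj < c_clSize; tidxj++)
    {
        for (int tidxi = 0; tidxi < 3; tidxi++)
        {
            float fSum = 0.0F;
            for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
            {
                fSum += smBuf[c_clSizeSq * tidxi + j];
            }
            aF[tidxj][tidxi] += fSum;
        }
    }

    // Each component of each j-atom accumulated c_clSize values of 1.0.
    for (int tidxj = 0; tidxj < c_clSize; tidxj++)
    {
        assert(aF[tidxj][0] == static_cast<float>(c_clSize));
    }
    printf("f(atom 0) = (%g, %g, %g)\n", aF[0][0], aF[0][1], aF[0][2]);
    return 0;
}

Running this prints f(atom 0) = (8, 8, 8): the three leader threads of
each row each accumulate the c_clSize components their row stored, which
is the same arithmetic the new kernel reduction performs per j-atom.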