From 568bccb3acf898ff331aac2eaddbb8c96da0e2ff Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Sat, 28 Aug 2021 06:26:10 +0000 Subject: [PATCH] SYCL: Fully switch to atomic_ref --- src/gromacs/gpu_utils/gmxsycl.h | 2 +- src/gromacs/gpu_utils/sycl_kernel_utils.h | 55 ++++---------- .../mdlib/settle_gpu_internal_sycl.cpp | 4 +- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 74 +++++++++---------- 4 files changed, 56 insertions(+), 79 deletions(-) diff --git a/src/gromacs/gpu_utils/gmxsycl.h b/src/gromacs/gpu_utils/gmxsycl.h index 353ca45116..2b07b927a7 100644 --- a/src/gromacs/gpu_utils/gmxsycl.h +++ b/src/gromacs/gpu_utils/gmxsycl.h @@ -132,7 +132,7 @@ auto group_reduce(Args&&... args) -> decltype(detail::origin::reduce(std::forwar return detail::origin::reduce(std::forward(args)...); } #elif GMX_SYCL_HIPSYCL -// No atomic_ref in hipSYCL yet (2021-02-22) +using detail::origin::atomic_ref; using detail::origin::group_any_of; using detail::origin::group_reduce; #else diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h index 8a2d6bace1..9a3c041ef8 100644 --- a/src/gromacs/gpu_utils/sycl_kernel_utils.h +++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h @@ -44,53 +44,30 @@ * \author Andrey Alekseenko */ -/*! \brief Access mode to use for atomic accessors. - * - * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats. - * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use - * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref. - * - * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which - * requires using atomic accessors. Thus, we use \c mode::atomic accessors. - * - * The \ref atomicFetchAdd function could be used for doing operations on such accessors. - */ -static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write : - /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic; - //! \brief Full warp active thread mask used in CUDA warp-level primitives. static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff; /*! \brief Convenience wrapper to do atomic addition to a global buffer. - * - * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic. */ -template -static inline void atomicFetchAdd(DeviceAccessor acc, const IndexType idx, const float val) +template +static inline void atomicFetchAdd(T& val, const T delta) { -#if GMX_SYCL_DPCPP - sycl_2020::atomic_ref - fout_atomic(acc[idx]); - fout_atomic.fetch_add(val); -#elif GMX_SYCL_HIPSYCL -# ifdef SYCL_DEVICE_ONLY - /* While there is support for float atomics on device, the host implementation uses - * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support - * floats. Luckily, we don't want to run on host. */ - // The pragmas below can be removed once we switch to sycl::atomic -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdeprecated-declarations" - acc[idx].fetch_add(val); -# pragma clang diagnostic push -# else - GMX_ASSERT(false, "hipSYCL host codepath not supported"); - GMX_UNUSED_VALUE(val); - GMX_UNUSED_VALUE(acc); - GMX_UNUSED_VALUE(idx); -# endif -#endif + sycl_2020::atomic_ref ref( + val); + ref.fetch_add(delta); } +/*! \brief Convenience wrapper to do atomic loads from a global buffer. + */ +template +static inline T atomicLoad(T& val) +{ + sycl_2020::atomic_ref ref( + val); + return ref.load(); +} + + /*! \brief Issue an intra sub-group barrier. * * Equivalent with CUDA's \c syncwarp(c_cudaFullWarpMask). diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp index 3e5b56fdf3..6a32e856c0 100644 --- a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp @@ -69,7 +69,7 @@ auto settleKernel(cl::sycl::handler& c DeviceAccessor a_xp, float invdt, OptionalAccessor a_v, - OptionalAccessor a_virialScaled, + OptionalAccessor a_virialScaled, PbcAiuc pbcAiuc) { cgh.require(a_settles); @@ -340,7 +340,7 @@ auto settleKernel(cl::sycl::handler& c // First 6 threads in the block add the 6 components of virial to the global memory address if (threadIdx < 6) { - atomicFetchAdd(a_virialScaled, threadIdx, sm_threadVirial[threadIdx * blockSize]); + atomicFetchAdd(a_virialScaled[threadIdx], sm_threadVirial[threadIdx * blockSize]); } } }; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 7304922c09..00957bb1b9 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -306,11 +306,11 @@ static inline float interpolateCoulombForceR(const DeviceAccessor itemIdx, - const int tidxi, - const int aidx, - DeviceAccessor a_f) +static inline void reduceForceJShuffle(Float3 f, + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int aidx, + DeviceAccessor a_f) { static_assert(c_clSize == 8 || c_clSize == 4); sycl_2020::sub_group sg = itemIdx.get_sub_group(); @@ -337,7 +337,7 @@ static inline void reduceForceJShuffle(Float3 f, if (tidxi < 3) { - atomicFetchAdd(a_f, 3 * aidx + tidxi, f[0]); + atomicFetchAdd(a_f[3 * aidx + tidxi], f[0]); } } @@ -389,12 +389,12 @@ static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx, * TODO: implement binary reduction flavor for the case where cl_Size is power of two. */ static inline void reduceForceJGeneric(cl::sycl::accessor sm_buf, - Float3 f, - const cl::sycl::nd_item<1> itemIdx, - const int tidxi, - const int tidxj, - const int aidx, - DeviceAccessor a_f) + Float3 f, + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int tidxj, + const int aidx, + DeviceAccessor a_f) { static constexpr int sc_fBufferStride = c_clSizeSq; int tidx = tidxi + tidxj * c_clSize; @@ -415,7 +415,7 @@ static inline void reduceForceJGeneric(cl::sycl::accessor sm_buf, Float3 f, - const cl::sycl::nd_item<1> itemIdx, - const int tidxi, - const int tidxj, - const int aidx, - DeviceAccessor a_f) + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int tidxj, + const int aidx, + DeviceAccessor a_f) { if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster)) { @@ -452,13 +452,13 @@ static inline void reduceForceJ(cl::sycl::accessor sm_buf, const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster], const bool calcFShift, - const cl::sycl::nd_item<1> itemIdx, - const int tidxi, - const int tidxj, - const int sci, - const int shift, - DeviceAccessor a_f, - DeviceAccessor a_fShift) + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int tidxj, + const int sci, + const int shift, + DeviceAccessor a_f, + DeviceAccessor a_fShift) { // must have power of two elements in fCiBuf static_assert(gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster)); @@ -502,7 +502,7 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor -auto nbnxmKernel(cl::sycl::handler& cgh, - DeviceAccessor a_xq, - DeviceAccessor a_f, - DeviceAccessor a_shiftVec, - DeviceAccessor a_fShift, - OptionalAccessor a_energyElec, - OptionalAccessor a_energyVdw, +auto nbnxmKernel(cl::sycl::handler& cgh, + DeviceAccessor a_xq, + DeviceAccessor a_f, + DeviceAccessor a_shiftVec, + DeviceAccessor a_fShift, + OptionalAccessor a_energyElec, + OptionalAccessor a_energyVdw, DeviceAccessor a_plistCJ4, DeviceAccessor a_plistSci, DeviceAccessor a_plistExcl, @@ -1053,8 +1053,8 @@ auto nbnxmKernel(cl::sycl::handler& cgh, if (tidx == 0) { - atomicFetchAdd(a_energyVdw, 0, energyVdwGroup); - atomicFetchAdd(a_energyElec, 0, energyElecGroup); + atomicFetchAdd(a_energyVdw[0], energyVdwGroup); + atomicFetchAdd(a_energyElec[0], energyElecGroup); } } }; -- 2.22.0