Implement generic j-reduction in the nbnxm SYCL kernels
[alexxy/gromacs.git] / src / gromacs / gpu_utils / sycl_kernel_utils.h
index 6c80300adcbaf3c90af6beaf313c8881bbf3683d..18670c92db53e58e2c4a9e5b079ecb40d173153d 100644 (file)
@@ -58,6 +58,9 @@
 static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
                                                    /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
 
+// \brief Full warp active thread mask used in CUDA warp-level primitives.
+static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
+
 /*! \brief Convenience wrapper to do atomic addition to a global buffer.
  *
  * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
@@ -84,6 +87,20 @@ static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const
 #endif
 }
 
+/* \brief Issue an intra sub-group barrier.
+ *
+ * Equivalent with CUDA syncwarp(c_cudaFullWarpMask).
+ *
+ */
+static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
+{
+#if GMX_SYCL_HIPSYCL
+    cl::sycl::group_barrier(itemIdx.get_sub_group(), cl::sycl::memory_scope::sub_group);
+#else
+    itemIdx.get_sub_group().barrier();
+#endif
+}
+
 namespace sycl_2020
 {
 #if GMX_SYCL_HIPSYCL
@@ -94,8 +111,7 @@ __device__ __host__ static inline float shift_left(sycl_2020::sub_group,
     // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
 #    ifdef SYCL_DEVICE_ONLY
 #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_down_sync(c_cudaFullWarpMask, var, delta);
 #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
     // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
     return __shfl_down(var, delta);
@@ -125,8 +141,7 @@ __device__ __host__ static inline float shift_right(sycl_2020::sub_group,
     // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
 #    ifdef SYCL_DEVICE_ONLY
 #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_up_sync(c_cudaFullWarpMask, var, delta);
 #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
     // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
     return __shfl_up(var, delta);