From 2c750f8e218bf8ace3e3395668102259e7337886 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Tue, 30 Mar 2021 15:24:08 +0200
Subject: [PATCH] Implement generic j-reduction in the nbnxm SYCL kernels

This commit implements a generic j-reduction identical to the one in
the OpenCL kernels.

Also adds a subGroupBarrier() helper, which is needed for correctness
on the CUDA backend when targeting NVIDIA architectures.

Refs #3934
---
 src/gromacs/gpu_utils/sycl_kernel_utils.h     | 23 +++++--
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp  | 65 ++++++++++++++++++-
 .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h      |  2 +
 3 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h
index 6c80300adc..18670c92db 100644
--- a/src/gromacs/gpu_utils/sycl_kernel_utils.h
+++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h
@@ -58,6 +58,9 @@
 static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
                                                      /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
 
+//! \brief Full warp active thread mask used in CUDA warp-level primitives.
+static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
+
 /*! \brief Convenience wrapper to do atomic addition to a global buffer.
  *
  * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
@@ -84,6 +87,20 @@ static inline void atomicFetchAdd(DeviceAccessor acc, const
 #endif
 }
 
+/*! \brief Issue an intra sub-group barrier.
+ *
+ * Equivalent to CUDA's __syncwarp(c_cudaFullWarpMask).
+ *
+ */
+static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
+{
+#if GMX_SYCL_HIPSYCL
+    cl::sycl::group_barrier(itemIdx.get_sub_group(), cl::sycl::memory_scope::sub_group);
+#else
+    itemIdx.get_sub_group().barrier();
+#endif
+}
+
 namespace sycl_2020
 {
 #if GMX_SYCL_HIPSYCL
@@ -94,8 +111,7 @@ __device__ __host__ static inline float shift_left(sycl_2020::sub_group,
     // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
 #    ifdef SYCL_DEVICE_ONLY
 #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_down_sync(c_cudaFullWarpMask, var, delta);
 #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
     // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
     return __shfl_down(var, delta);
@@ -125,8 +141,7 @@ __device__ __host__ static inline float shift_right(sycl_2020::sub_group,
     // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
 #    ifdef SYCL_DEVICE_ONLY
 #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_up_sync(c_cudaFullWarpMask, var, delta);
 #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
     // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
     return __shfl_up(var, delta);
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
index 8b3b81b7f6..63be54335b 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -298,7 +298,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor
+/*! \brief Reduce c_clSize j-force components using local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                       Float3                             f,
+                                       const cl::sycl::nd_item<1>         itemIdx,
+                                       const int                          tidxi,
+                                       const int                          tidxj,
+                                       const int                          aidx,
+                                       DeviceAccessor<float, mode_atomic> a_f)
+{
+    static constexpr int sc_fBufferStride = c_clSizeSq;
+    int                  tidx             = tidxi + tidxj * c_clSize;
+    sm_buf[0 * sc_fBufferStride + tidx]   = f[0];
+    sm_buf[1 * sc_fBufferStride + tidx]   = f[1];
+    sm_buf[2 * sc_fBufferStride + tidx]   = f[2];
+
+    subGroupBarrier(itemIdx);
+
+    // Reduce the data 8-by-8: the first three threads (tidxi < 3) of each row
+    // each sum one force component over the row stored above.
+    assert(itemIdx.get_sub_group().get_local_range().size() >= c_clSize);
+
+    if (tidxi < 3)
+    {
+        float fSum = 0.0F;
+        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+        {
+            fSum += sm_buf[sc_fBufferStride * tidxi + j];
+        }
+
+        atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum);
+    }
+}
+
+
+/*! \brief Reduce c_clSize j-force components using either shifts or local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                Float3                             f,
+                                const cl::sycl::nd_item<1>         itemIdx,
+                                const int                          tidxi,
+                                const int                          tidxj,
+                                const int                          aidx,
+                                DeviceAccessor<float, mode_atomic> a_f)
+{
+    if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster))
+    {
+        reduceForceJGeneric(sm_buf, f, itemIdx, tidxi, tidxj, aidx, a_f);
+    }
+    else
+    {
+        reduceForceJShuffle(f, itemIdx, tidxi, aidx, a_f);
+    }
+}
+
 /*! \brief Final i-force reduction.
  *
@@ -364,7 +423,7 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor
     const int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
     const int tidx       = tidxi + tidxj * c_clSize;
-    float     fShiftBuf  = 0;
+    float     fShiftBuf  = 0.0F;
     for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
     {
         const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
@@ -906,7 +965,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                         maskJI += maskJI;
                     } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                     /* reduce j forces */
-                    reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+                    reduceForceJ(sm_reductionBuffer, fCjBuf, itemIdx, tidxi, tidxj, aj, a_f);
                 } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
                 if constexpr (doPruneNBL)
                 {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
index 46c9e38afe..c316555d0f 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -61,6 +61,8 @@ static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_
 /*! \cond */
 // cluster size = number of atoms per cluster.
 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
+// Square of cluster size.
+static constexpr int c_clSizeSq = c_clSize * c_clSize;
 // j-cluster size after split (4 in the current implementation).
 static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
 // i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set.
-- 
2.22.0
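
Reviewer note (not part of the patch): as a minimal standalone sketch of
the indexing scheme that reduceForceJGeneric() introduces above, the C++
program below simulates the c_clSize x c_clSize thread block with plain
loops and replaces atomicFetchAdd() with a plain +=. The value
c_clSize = 8 and the dummy force values are assumptions made purely for
illustration.

#include <cassert>
#include <cstdio>

static constexpr int c_clSize   = 8;                   // cluster size (assumed)
static constexpr int c_clSizeSq = c_clSize * c_clSize; // reduction buffer stride

int main()
{
    float smBuf[3 * c_clSizeSq]; // stands in for the local-memory buffer sm_buf
    float aF[c_clSize][3] = {};  // per-j-atom accumulators (one atom per row tidxj)

    // Store phase: each "thread" (tidxi, tidxj) writes its three force
    // components at stride c_clSizeSq, as the kernel does before
    // subGroupBarrier().
    for (int tidxj = 0; tidxj < c_clSize; tidxj++)
    {
        for (int tidxi = 0; tidxi < c_clSize; tidxi++)
        {
            const int tidx = tidxi + tidxj * c_clSize;
            for (int d = 0; d < 3; d++)
            {
                smBuf[d * c_clSizeSq + tidx] = 1.0F; // dummy force component
            }
        }
    }

    // Reduce phase: in each row, the threads with tidxi < 3 each sum one
    // component over the row's c_clSize values; the kernel then does
    // atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum), replaced here by +=.
    for (int tidxj = 0; tidxj < c_clSize; tidxj++)
    {
        for (int tidxi = 0; tidxi < 3; tidxi++)
        {
            float fSum = 0.0F;
            for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
            {
                fSum += smBuf[c_clSizeSq * tidxi + j];
            }
            aF[tidxj][tidxi] += fSum;
        }
    }

    // Each component of each j-atom accumulated c_clSize values of 1.0.
    for (int tidxj = 0; tidxj < c_clSize; tidxj++)
    {
        assert(aF[tidxj][0] == static_cast<float>(c_clSize));
    }
    printf("f(atom 0) = (%g, %g, %g)\n", aF[0][0], aF[0][1], aF[0][2]);
    return 0;
}

Running this prints f(atom 0) = (8, 8, 8): the three leader threads of
each row each accumulate the c_clSize components their row stored, which
is the same arithmetic the new kernel reduction performs per j-atom.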