Implement generic j-reduction in the nbnxm SYCL kernels

author Szilárd Páll <pall.szilard@gmail.com>

Tue, 30 Mar 2021 13:24:08 +0000 (15:24 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 26 May 2021 08:08:26 +0000 (08:08 +0000)
author Szilárd Páll <pall.szilard@gmail.com>
Tue, 30 Mar 2021 13:24:08 +0000 (15:24 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 26 May 2021 08:08:26 +0000 (08:08 +0000)
diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h

index 6c80300adcbaf3c90af6beaf313c8881bbf3683d..18670c92db53e58e2c4a9e5b079ecb40d173153d 100644 (file)
--- a/src/gromacs/gpu_utils/sycl_kernel_utils.h
+++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h
@@ -58,6 +58,9 @@
  static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
                                                     /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
  
  static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
                                                     /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
  
+// \brief Full warp active thread mask used in CUDA warp-level primitives.
+static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
+
  /*! \brief Convenience wrapper to do atomic addition to a global buffer.
   *
   * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
  /*! \brief Convenience wrapper to do atomic addition to a global buffer.
   *
   * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
@@ -84,6 +87,20 @@ static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const
  #endif
  }
  
  #endif
  }
  
+/* \brief Issue an intra sub-group barrier.
+ *
+ * Equivalent with CUDA syncwarp(c_cudaFullWarpMask).
+ *
+ */
+static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
+{
+#if GMX_SYCL_HIPSYCL
+    cl::sycl::group_barrier(itemIdx.get_sub_group(), cl::sycl::memory_scope::sub_group);
+#else
+    itemIdx.get_sub_group().barrier();
+#endif
+}
+
  namespace sycl_2020
  {
  #if GMX_SYCL_HIPSYCL
  namespace sycl_2020
  {
  #if GMX_SYCL_HIPSYCL
@@ -94,8 +111,7 @@ __device__ __host__ static inline float shift_left(sycl_2020::sub_group,
      // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
  #    ifdef SYCL_DEVICE_ONLY
  #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
      // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
  #    ifdef SYCL_DEVICE_ONLY
  #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_down_sync(c_cudaFullWarpMask, var, delta);
  #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
      // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
      return __shfl_down(var, delta);
  #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
      // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
      return __shfl_down(var, delta);
@@ -125,8 +141,7 @@ __device__ __host__ static inline float shift_right(sycl_2020::sub_group,
      // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
  #    ifdef SYCL_DEVICE_ONLY
  #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
      // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
  #    ifdef SYCL_DEVICE_ONLY
  #        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
-    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
-    return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+    return __shfl_up_sync(c_cudaFullWarpMask, var, delta);
  #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
      // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
      return __shfl_up(var, delta);
  #        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
      // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
      return __shfl_up(var, delta);
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp

index 8b3b81b7f6aa22460852955e42136cbf3537b0c9..63be54335b0ccd6d9ea4e96976a8fb7dd9ecd85c 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -298,7 +298,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::r
      return lerp(left, right, fraction); // TODO: cl::sycl::mix
  }
  
      return lerp(left, right, fraction); // TODO: cl::sycl::mix
  }
  
-/*! \brief Reduce c_clSize j-force components and atomically accumulate into a_f.
+/*! \brief Reduce c_clSize j-force components using shifts and atomically accumulate into a_f.
   *
   * c_clSize consecutive threads hold the force components of a j-atom which we
   * reduced in log2(cl_Size) steps using shift and atomically accumulate them into \p a_f.
   *
   * c_clSize consecutive threads hold the force components of a j-atom which we
   * reduced in log2(cl_Size) steps using shift and atomically accumulate them into \p a_f.
@@ -338,6 +338,65 @@ static inline void reduceForceJShuffle(Float3                             f,
      }
  }
  
      }
  }
  
+/*! \brief Reduce c_clSize j-force components using local memory and atomically accumulate into a_f.
+ *
+ * c_clSize consecutive threads hold the force components of a j-atom which we
+ * reduced in cl_Size steps using shift and atomically accumulate them into \p a_f.
+ *
+ * TODO: implement binary reduction flavor for the case where cl_Size is power of two.
+ */
+static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                       Float3                             f,
+                                       const cl::sycl::nd_item<1>         itemIdx,
+                                       const int                          tidxi,
+                                       const int                          tidxj,
+                                       const int                          aidx,
+                                       DeviceAccessor<float, mode_atomic> a_f)
+{
+    static constexpr int sc_fBufferStride = c_clSizeSq;
+    int                  tidx            = tidxi + tidxj * c_clSize;
+    sm_buf[0 * sc_fBufferStride + tidx]  = f[0];
+    sm_buf[1 * sc_fBufferStride + tidx]  = f[1];
+    sm_buf[2 * sc_fBufferStride + tidx]  = f[2];
+
+    subGroupBarrier(itemIdx);
+
+    // reducing data 8-by-by elements on the leader of same threads as those storing above
+    assert(itemIdx.get_sub_group().get_local_range().size() >= c_clSize);
+
+    if (tidxi < 3)
+    {
+        float fSum = 0.0F;
+        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+        {
+            fSum += sm_buf[sc_fBufferStride * tidxi + j];
+        }
+
+        atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum);
+    }
+}
+
+
+/*! \brief Reduce c_clSize j-force components using either shifts or local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                Float3                                                        f,
+                                const cl::sycl::nd_item<1>         itemIdx,
+                                const int                          tidxi,
+                                const int                          tidxj,
+                                const int                          aidx,
+                                DeviceAccessor<float, mode_atomic> a_f)
+{
+    if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster))
+    {
+        reduceForceJGeneric(sm_buf, f, itemIdx, tidxi, tidxj, aidx, a_f);
+    }
+    else
+    {
+        reduceForceJShuffle(f, itemIdx, tidxi, aidx, a_f);
+    }
+}
+
  
  /*! \brief Final i-force reduction.
   *
  
  /*! \brief Final i-force reduction.
   *
@@ -364,7 +423,7 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read
      static constexpr int bufStride  = c_clSize * c_clSize;
      static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
      const int            tidx       = tidxi + tidxj * c_clSize;
      static constexpr int bufStride  = c_clSize * c_clSize;
      static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
      const int            tidx       = tidxi + tidxj * c_clSize;
-    float                fShiftBuf  = 0;
+    float                fShiftBuf  = 0.0F;
      for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
      {
          const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
      for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
      {
          const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
@@ -906,7 +965,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                      maskJI += maskJI;
                  } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                  /* reduce j forces */
                      maskJI += maskJI;
                  } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                  /* reduce j forces */
-                reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+                reduceForceJ(sm_reductionBuffer, fCjBuf, itemIdx, tidxi, tidxj, aj, a_f);
              } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
              if constexpr (doPruneNBL)
              {
              } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
              if constexpr (doPruneNBL)
              {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h

index 46c9e38afe28bdb05bb5c1ad322fb16e331ec014..c316555d0fd1bb637ded9db9853cd3f6ec32c598 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -61,6 +61,8 @@ static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_
  /*! \cond */
  // cluster size = number of atoms per cluster.
  static constexpr int c_clSize = c_nbnxnGpuClusterSize;
  /*! \cond */
  // cluster size = number of atoms per cluster.
  static constexpr int c_clSize = c_nbnxnGpuClusterSize;
+// Square of cluster size.
+static constexpr int c_clSizeSq = c_clSize * c_clSize;
  // j-cluster size after split (4 in the current implementation).
  static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
  // i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set.
  // j-cluster size after split (4 in the current implementation).
  static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
  // i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set.
author	Szilárd Páll <pall.szilard@gmail.com>
	Tue, 30 Mar 2021 13:24:08 +0000 (15:24 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 26 May 2021 08:08:26 +0000 (08:08 +0000)
src/gromacs/gpu_utils/sycl_kernel_utils.h		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h		patch \| blob \| history