static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
/* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
+//! \brief Full-warp active-thread mask used in CUDA warp-level primitives.
+static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
+
/*! \brief Convenience wrapper to do atomic addition to a global buffer.
*
* The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
#endif
}
+/*! \brief Issue an intra sub-group barrier.
+ *
+ * Equivalent to CUDA's __syncwarp(c_cudaFullWarpMask).
+ */
+static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
+{
+#if GMX_SYCL_HIPSYCL
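+    // hipSYCL supports the SYCL 2020 group_barrier free function; with DPC++ we use the sub_group member barrier() instead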
+ cl::sycl::group_barrier(itemIdx.get_sub_group(), cl::sycl::memory_scope::sub_group);
+#else
+ itemIdx.get_sub_group().barrier();
+#endif
+}
+
namespace sycl_2020
{
#if GMX_SYCL_HIPSYCL
// No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
# ifdef SYCL_DEVICE_ONLY
# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
- static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
- return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+ return __shfl_down_sync(c_cudaFullWarpMask, var, delta);
# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
// Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
return __shfl_down(var, delta);
// No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
# ifdef SYCL_DEVICE_ONLY
# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
- static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
- return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+ return __shfl_up_sync(c_cudaFullWarpMask, var, delta);
# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
// Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
return __shfl_up(var, delta);
return lerp(left, right, fraction); // TODO: cl::sycl::mix
}
-/*! \brief Reduce c_clSize j-force components and atomically accumulate into a_f.
+/*! \brief Reduce c_clSize j-force components using shifts and atomically accumulate into a_f.
*
 * c_clSize consecutive threads hold the force components of a j-atom, which we
 * reduce in log2(c_clSize) steps using shifts and atomically accumulate into \p a_f.
}
}
+/*! \brief Reduce c_clSize j-force components using local memory and atomically accumulate into a_f.
+ *
+ * c_clSize consecutive threads hold the force components of a j-atom, which we
+ * reduce in c_clSize sequential steps via local memory and atomically accumulate into \p a_f.
+ *
+ * TODO: implement a binary-reduction flavor for the case where c_clSize is a power of two.
+ */
+static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+ Float3 f,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int aidx,
+ DeviceAccessor<float, mode_atomic> a_f)
+{
+ static constexpr int sc_fBufferStride = c_clSizeSq;
+ int tidx = tidxi + tidxj * c_clSize;
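+    // Store the three force components at stride sc_fBufferStride, so each component occupies a contiguous c_clSizeSq chunk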
+ sm_buf[0 * sc_fBufferStride + tidx] = f[0];
+ sm_buf[1 * sc_fBufferStride + tidx] = f[1];
+ sm_buf[2 * sc_fBufferStride + tidx] = f[2];
+
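+    // The c_clSize threads of each row belong to the same sub-group (see the assert below), so a sub-group barrier suffices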
+ subGroupBarrier(itemIdx);
+
+    // Reduce the data just stored, using the same sub-group threads that wrote it
+ assert(itemIdx.get_sub_group().get_local_range().size() >= c_clSize);
+
+ if (tidxi < 3)
+ {
+ float fSum = 0.0F;
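+        // Sum force component tidxi over the buffer slots written by the c_clSize threads of row tidxj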
+ for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+ {
+ fSum += sm_buf[sc_fBufferStride * tidxi + j];
+ }
+
+ atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum);
+ }
+}
+
+
+/*! \brief Reduce c_clSize j-force components using either shifts or local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+ Float3 f,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int aidx,
+ DeviceAccessor<float, mode_atomic> a_f)
+{
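+    // The shuffle-based flavor assumes a power-of-two cluster count per supercluster; otherwise fall back to the generic local-memory reduction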
+ if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster))
+ {
+ reduceForceJGeneric(sm_buf, f, itemIdx, tidxi, tidxj, aidx, a_f);
+ }
+ else
+ {
+ reduceForceJShuffle(f, itemIdx, tidxi, aidx, a_f);
+ }
+}
+
/*! \brief Final i-force reduction.
*
static constexpr int bufStride = c_clSize * c_clSize;
static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
const int tidx = tidxi + tidxj * c_clSize;
- float fShiftBuf = 0;
+ float fShiftBuf = 0.0F;
for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
{
const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
maskJI += maskJI;
} // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
/* reduce j forces */
- reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+ reduceForceJ(sm_reductionBuffer, fCjBuf, itemIdx, tidxi, tidxj, aj, a_f);
} // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
if constexpr (doPruneNBL)
{