* \author Andrey Alekseenko <al42and@gmail.com>
*/
-/*! \brief Access mode to use for atomic accessors.
- *
- * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats.
- * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use
- * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref.
- *
- * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which
- * requires using atomic accessors. Thus, we use \c mode::atomic accessors.
- *
- * The \ref atomicFetchAdd function could be used for doing operations on such accessors.
- */
-static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
- /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
-
//! \brief Full warp active thread mask used in CUDA warp-level primitives.
static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
/*! \brief Convenience wrapper to do atomic addition to a global buffer.
- *
- * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
+ *
+ * \tparam T           Type of the value to update (float in current callers).
+ * \tparam MemoryScope Memory scope of the atomic operation (device-wide by default).
+ *
+ * \param[in,out] val   Reference to the value in global memory to add to.
+ * \param[in]     delta Value to add atomically, with relaxed memory order.
 */
-template<class IndexType>
-static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const IndexType idx, const float val)
+template<typename T, sycl_2020::memory_scope MemoryScope = sycl_2020::memory_scope::device>
+static inline void atomicFetchAdd(T& val, const T delta)
{
-#if GMX_SYCL_DPCPP
-    sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
-            fout_atomic(acc[idx]);
-    fout_atomic.fetch_add(val);
-#elif GMX_SYCL_HIPSYCL
-# ifdef SYCL_DEVICE_ONLY
-    /* While there is support for float atomics on device, the host implementation uses
-     * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support
-     * floats. Luckily, we don't want to run on host. */
-    // The pragmas below can be removed once we switch to sycl::atomic
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wdeprecated-declarations"
-    acc[idx].fetch_add(val);
-# pragma clang diagnostic push
-# else
-    GMX_ASSERT(false, "hipSYCL host codepath not supported");
-    GMX_UNUSED_VALUE(val);
-    GMX_UNUSED_VALUE(acc);
-    GMX_UNUSED_VALUE(idx);
-# endif
-#endif
+    // NOTE(review): the address space is hard-coded to global_space, so \p val
+    // must refer to an element of a global buffer.
+    sycl_2020::atomic_ref<T, sycl_2020::memory_order::relaxed, MemoryScope, cl::sycl::access::address_space::global_space> ref(
+            val);
+    ref.fetch_add(delta);
}
+/*! \brief Convenience wrapper to do atomic loads from a global buffer.
+ *
+ * \tparam T           Type of the value to load.
+ * \tparam MemoryScope Memory scope of the atomic operation (device-wide by default).
+ *
+ * \param[in] val Reference to the value in global memory; non-const because
+ *                \c atomic_ref can only be constructed from a mutable reference.
+ *
+ * \returns The value, loaded atomically with relaxed memory order.
+ */
+template<typename T, sycl_2020::memory_scope MemoryScope = sycl_2020::memory_scope::device>
+static inline T atomicLoad(T& val)
+{
+    sycl_2020::atomic_ref<T, sycl_2020::memory_order::relaxed, MemoryScope, cl::sycl::access::address_space::global_space> ref(
+            val);
+    return ref.load();
+}
+
+
/*! \brief Issue an intra sub-group barrier.
*
* Equivalent with CUDA's \c syncwarp(c_cudaFullWarpMask).
* c_clSize consecutive threads hold the force components of a j-atom which we
* reduced in log2(cl_Size) steps using shift and atomically accumulate them into \p a_f.
*/
-static inline void reduceForceJShuffle(Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int aidx,
- DeviceAccessor<float, mode_atomic> a_f)
+static inline void reduceForceJShuffle(Float3 f,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int aidx,
+ DeviceAccessor<float, mode::read_write> a_f)
{
static_assert(c_clSize == 8 || c_clSize == 4);
sycl_2020::sub_group sg = itemIdx.get_sub_group();
if (tidxi < 3)
{
- atomicFetchAdd(a_f, 3 * aidx + tidxi, f[0]);
+ atomicFetchAdd(a_f[3 * aidx + tidxi], f[0]);
}
}
* TODO: implement binary reduction flavor for the case where cl_Size is power of two.
*/
static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
- Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int tidxj,
- const int aidx,
- DeviceAccessor<float, mode_atomic> a_f)
+ Float3 f,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int aidx,
+ DeviceAccessor<float, mode::read_write> a_f)
{
static constexpr int sc_fBufferStride = c_clSizeSq;
int tidx = tidxi + tidxj * c_clSize;
fSum += sm_buf[sc_fBufferStride * tidxi + j];
}
- atomicFetchAdd(a_f, 3 * aidx + tidxi, fSum);
+ atomicFetchAdd(a_f[3 * aidx + tidxi], fSum);
}
}
*/
static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int tidxj,
- const int aidx,
- DeviceAccessor<float, mode_atomic> a_f)
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int aidx,
+ DeviceAccessor<float, mode::read_write> a_f)
{
if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster))
{
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool calcFShift,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int tidxj,
- const int sci,
- const int shift,
- DeviceAccessor<float, mode_atomic> a_f,
- DeviceAccessor<float, mode_atomic> a_fShift)
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int sci,
+ const int shift,
+ DeviceAccessor<float, mode::read_write> a_f,
+ DeviceAccessor<float, mode::read_write> a_fShift)
{
// must have power of two elements in fCiBuf
static_assert(gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster));
{
const float f =
sm_buf[tidxj * bufStride + tidxi] + sm_buf[tidxj * bufStride + c_clSize + tidxi];
- atomicFetchAdd(a_f, 3 * aidx + tidxj, f);
+ atomicFetchAdd(a_f[3 * aidx + tidxj], f);
if (calcFShift)
{
fShiftBuf += f;
fShiftBuf += sycl_2020::shift_left(sg, fShiftBuf, 2);
if (tidxi == 0)
{
- atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+ atomicFetchAdd(a_fShift[3 * shift + tidxj], fShiftBuf);
}
}
else
{
- atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+ atomicFetchAdd(a_fShift[3 * shift + tidxj], fShiftBuf);
}
}
}
*
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
-auto nbnxmKernel(cl::sycl::handler& cgh,
- DeviceAccessor<Float4, mode::read> a_xq,
- DeviceAccessor<float, mode_atomic> a_f,
- DeviceAccessor<Float3, mode::read> a_shiftVec,
- DeviceAccessor<float, mode_atomic> a_fShift,
- OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyElec,
- OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyVdw,
+auto nbnxmKernel(cl::sycl::handler& cgh,
+ DeviceAccessor<Float4, mode::read> a_xq,
+ DeviceAccessor<float, mode::read_write> a_f,
+ DeviceAccessor<Float3, mode::read> a_shiftVec,
+ DeviceAccessor<float, mode::read_write> a_fShift,
+ OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
+ OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
if (tidx == 0)
{
- atomicFetchAdd(a_energyVdw, 0, energyVdwGroup);
- atomicFetchAdd(a_energyElec, 0, energyElecGroup);
+ atomicFetchAdd(a_energyVdw[0], energyVdwGroup);
+ atomicFetchAdd(a_energyElec[0], energyElecGroup);
}
}
};