* This implementation works only with power of two array sizes.
*/
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
* This implementation works only with power of two array sizes.
*/
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto nbnxmKernel(cl::sycl::handler& cgh,
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto nbnxmKernel(cl::sycl::handler& cgh,
DeviceAccessor<float, mode::read_write> a_fShift,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
DeviceAccessor<float, mode::read_write> a_fShift,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
- OptionalAccessor<float2, mode::read, ljComb<vdwType>> a_ljComb,
+ OptionalAccessor<Float2, mode::read, ljComb<vdwType>> a_ljComb,
OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
- cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
// shmem buffer for force reduction
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
// shmem buffer for force reduction
- return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+ return cl::sycl::accessor<Float2, 2, mode::read_write, target::local>(
// Better use sg.get_group_range, but too much of the logic relies on it anyway
const unsigned widx = tidx / subGroupSize;
// Better use sg.get_group_range, but too much of the logic relies on it anyway
const unsigned widx = tidx / subGroupSize;
const int ai = ci * c_clSize + tidxi;
const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
const int ai = ci * c_clSize + tidxi;
const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
- const float3 shift = a_shiftVec[nbSci.shift];
- float4 xqi = a_xq[ai];
- xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+ const Float3 shift = a_shiftVec[nbSci.shift];
+ Float4 xqi = a_xq[ai];
+ xqi += Float4(shift[0], shift[1], shift[2], 0.0F);
// i cluster index
const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
// all threads load an atom from i cluster ci into shmem!
// i cluster index
const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
// all threads load an atom from i cluster ci into shmem!
- const float4 xqi = sm_xq[i][tidxi];
- const float3 xi(xqi[0], xqi[1], xqi[2]);
+ const Float4 xqi = sm_xq[i][tidxi];
+ const Float3 xi(xqi[0], xqi[1], xqi[2]);
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
// Casting to float simplifies using atomic ops in the kernel
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
// Casting to float simplifies using atomic ops in the kernel
auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,