return lerp(left, right, fraction); // TODO: cl::sycl::mix
}
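(A note on the TODO above: mix(x, y, a) is the standard SYCL/OpenCL blend x + (y - x) * a, i.e. the same operation lerp implements, so the intended change is presumably just return cl::sycl::mix(left, right, fraction); once that is usable on all targets.)

The remainder of this patch replaces the SYCL vector aliases float2/float3/float4 with GROMACS's own Float2/Float3/Float4. Purely as an illustrative sketch, not the real GROMACS definitions (those live in the GPU traits headers), the kernel code below only needs the types to offer per-component access, construction from components, and element-wise arithmetic, roughly along these lines:

// Hypothetical minimal stand-in showing the interface the kernel relies on;
// not the actual GROMACS type.
struct Float3
{
    float c[3];
    Float3() = default;
    Float3(float x, float y, float z) : c{ x, y, z } {}
    float&       operator[](int i) { return c[i]; }
    const float& operator[](int i) const { return c[i]; }
    Float3  operator-(const Float3& o) const { return { c[0] - o.c[0], c[1] - o.c[1], c[2] - o.c[2] }; }
    Float3  operator*(float s) const { return { c[0] * s, c[1] * s, c[2] * s }; }
    Float3& operator-=(const Float3& o)
    {
        c[0] -= o.c[0];
        c[1] -= o.c[1];
        c[2] -= o.c[2];
        return *this;
    }
};
// Squared length, used in the cutoff checks below; Float2 and Float4 are the
// analogous 2- and 4-component aggregates.
static inline float norm2(const Float3& v)
{
    return v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
}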
-static inline void reduceForceJShuffle(float3 f,
+static inline void reduceForceJShuffle(Float3 f,
const cl::sycl::nd_item<1> itemIdx,
const int tidxi,
const int aidx,
* This implementation works only with power-of-two array sizes.
*/
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
- const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
+ const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool calcFShift,
const cl::sycl::nd_item<1> itemIdx,
const int tidxi,
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto nbnxmKernel(cl::sycl::handler& cgh,
- DeviceAccessor<float4, mode::read> a_xq,
+ DeviceAccessor<Float4, mode::read> a_xq,
DeviceAccessor<float, mode::read_write> a_f,
- DeviceAccessor<float3, mode::read> a_shiftVec,
+ DeviceAccessor<Float3, mode::read> a_shiftVec,
DeviceAccessor<float, mode::read_write> a_fShift,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
- OptionalAccessor<float2, mode::read, ljComb<vdwType>> a_ljComb,
+ OptionalAccessor<Float2, mode::read, ljComb<vdwType>> a_ljComb,
OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
}
// shmem buffer for i x+q pre-loading
- cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
// shmem buffer for force reduction
auto sm_ljCombI = [&]() {
if constexpr (props.vdwComb)
{
- return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+ return cl::sycl::accessor<Float2, 2, mode::read_write, target::local>(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
}
else
// It would be better to use sg.get_group_range() here, but too much of the logic relies on this value anyway
const unsigned widx = tidx / subGroupSize;
- float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
+ Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
- fCiBuf[i] = float3(0.0F, 0.0F, 0.0F);
+ fCiBuf[i] = Float3(0.0F, 0.0F, 0.0F);
}
const nbnxn_sci_t nbSci = a_plistSci[bidx];
const int ai = ci * c_clSize + tidxi;
const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
- const float3 shift = a_shiftVec[nbSci.shift];
- float4 xqi = a_xq[ai];
- xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+ const Float3 shift = a_shiftVec[nbSci.shift];
+ Float4 xqi = a_xq[ai];
+ xqi += Float4(shift[0], shift[1], shift[2], 0.0F);
xqi[3] *= epsFac;
sm_xq[cacheIdx] = xqi;
const int aj = cj * c_clSize + tidxj;
// load j atom data
- const float4 xqj = a_xq[aj];
+ const Float4 xqj = a_xq[aj];
- const float3 xj(xqj[0], xqj[1], xqj[2]);
+ const Float3 xj(xqj[0], xqj[1], xqj[2]);
const float qj = xqj[3];
int atomTypeJ; // Only needed if (!props.vdwComb)
- float2 ljCombJ; // Only needed if (props.vdwComb)
+ Float2 ljCombJ; // Only needed if (props.vdwComb)
if constexpr (props.vdwComb)
{
ljCombJ = a_ljComb[aj];
}
else
{
atomTypeJ = a_atomTypes[aj];
}
- float3 fCjBuf(0.0F, 0.0F, 0.0F);
+ Float3 fCjBuf(0.0F, 0.0F, 0.0F);
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
// i cluster index
const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
// all threads load an atom from i cluster ci into shmem!
- const float4 xqi = sm_xq[i][tidxi];
- const float3 xi(xqi[0], xqi[1], xqi[2]);
+ const Float4 xqi = sm_xq[i][tidxi];
+ const Float3 xi(xqi[0], xqi[1], xqi[2]);
// distance between i and j atoms
- const float3 rv = xi - xj;
+ const Float3 rv = xi - xj;
float r2 = norm2(rv);
if constexpr (doPruneNBL)
}
else
{
- const float2 ljCombI = sm_ljCombI[i][tidxi];
+ const Float2 ljCombI = sm_ljCombI[i][tidxi];
if constexpr (props.vdwCombGeom)
{
c6 = ljCombI[0] * ljCombJ[0];
}
}
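This plain product is all the geometric combination rule needs, assuming (as the layout implies) that a_ljComb stores the square roots of the per-atom parameters: with ljComb[0] = sqrt(C6_i), the pair value becomes sqrt(C6_i) * sqrt(C6_j) = sqrt(C6_i * C6_j) = C6_ij, and likewise for C12 from ljComb[1].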
- const float3 forceIJ = rv * fInvR;
+ const Float3 forceIJ = rv * fInvR;
/* accumulate j forces in registers */
fCjBuf -= forceIJ;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
// Casting to float simplifies using atomic ops in the kernel
- cl::sycl::buffer<float3, 1> f(*adat->f.buffer_);
+ cl::sycl::buffer<Float3, 1> f(*adat->f.buffer_);
auto fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
- cl::sycl::buffer<float3, 1> fShift(*adat->fShift.buffer_);
+ cl::sycl::buffer<Float3, 1> fShift(*adat->fShift.buffer_);
auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
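Reinterpreting the Float3 buffers as flat float arrays lets the kernel update each Cartesian component with a plain scalar atomic instead of needing atomics on a 3-wide type. Purely as a sketch of what such a per-component update can look like on the device side (the helper name is made up and a SYCL 2020 style sycl::atomic_ref is assumed; the kernel in this file may use a compatibility wrapper instead):

// Hypothetical helper, not part of this patch: accumulate a Float3 into the
// flat float force array (3 consecutive floats per atom, as produced by the
// reinterpret above) using scalar atomic adds.
template<class FloatAccessor>
static inline void atomicAddForce(FloatAccessor a_f, const int atomIndex, const Float3& f)
{
    for (int d = 0; d < 3; d++) // 3 == DIM
    {
        sycl::atomic_ref<float,
                         sycl::memory_order::relaxed,
                         sycl::memory_scope::device,
                         sycl::access::address_space::global_space>
                fRef(a_f[3 * atomIndex + d]);
        fRef.fetch_add(f[d]);
    }
}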
cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
*/
template<bool haveFreshList>
auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh,
- DeviceAccessor<float4, mode::read> a_xq,
- DeviceAccessor<float3, mode::read> a_shiftVec,
+ DeviceAccessor<Float4, mode::read> a_xq,
+ DeviceAccessor<Float3, mode::read> a_shiftVec,
DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
cgh.require(a_plistIMask);
/* shmem buffer for i x+q pre-loading */
- cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
constexpr int warpSize = c_clSize * c_clSize / 2;
/* We don't need q, but loading the full 4-wide value into shmem avoids bank conflicts
(at the cost of some extra L2 bandwidth). */
- const float4 xq = a_xq[ai];
- const float3 shift = a_shiftVec[nbSci.shift];
- const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
+ const Float4 xq = a_xq[ai];
+ const Float3 shift = a_shiftVec[nbSci.shift];
+ const Float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
sm_xq[tidxj + i][tidxi] = xi;
}
}
const int aj = cj * c_clSize + tidxj;
/* load j atom data */
- const float4 tmp = a_xq[aj];
- const float3 xj(tmp[0], tmp[1], tmp[2]);
+ const Float4 tmp = a_xq[aj];
+ const Float3 xj(tmp[0], tmp[1], tmp[2]);
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imaskCheck & mask_ji)
{
// load i-cluster coordinates from shmem
- const float4 xi = sm_xq[i][tidxi];
+ const Float4 xi = sm_xq[i][tidxi];
// distance between i and j atoms
- float3 rv(xi[0], xi[1], xi[2]);
+ Float3 rv(xi[0], xi[1], xi[2]);
rv -= xj;
const float r2 = norm2(rv);