#include "nbnxm_sycl_kernel_utils.h"
#include "nbnxm_sycl_types.h"
+//! \brief Class name for NBNXM kernel
+template<bool doPruneNBL, bool doCalcEnergies, enum Nbnxm::ElecType elecType, enum Nbnxm::VdwType vdwType>
+class NbnxmKernel;
+
namespace Nbnxm
{
static constexpr bool elecEwaldTab =
(elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB
static constexpr bool elecEwaldTwin =
- (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin);
- static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
- static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB);
+ (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin); ///< Use twin cut-off.
+ static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
+ static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB); ///< LJ_COMB && !LJ_COMB_GEOM
static constexpr bool vdwCombGeom = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM
static constexpr bool vdwComb = (vdwCombLB || vdwCombGeom); ///< LJ_COMB
static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM
template<enum VdwType vdwType>
constexpr bool ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb;
-template<enum ElecType elecType> // Yes, ElecType
-constexpr bool vdwCutoffCheck = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTwin;
-
template<enum ElecType elecType>
constexpr bool elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald;
using cl::sycl::access::mode;
using cl::sycl::access::target;
-static inline void convertSigmaEpsilonToC6C12(const float sigma,
- const float epsilon,
- cl::sycl::private_ptr<float> c6,
- cl::sycl::private_ptr<float> c12)
+//! \brief Convert \p sigma and \p epsilon VdW parameters to \c c6,c12 pair.
+static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon)
{
const float sigma2 = sigma * sigma;
const float sigma6 = sigma2 * sigma2 * sigma2;
- *c6 = epsilon * sigma6;
- *c12 = (*c6) * sigma6;
+ const float c6 = epsilon * sigma6;
+ const float c12 = c6 * sigma6;
+
+ return { c6, c12 };
}
+//! \brief Calculate force and energy for a pair of atoms, VdW force-switch flavor.
template<bool doCalcEnergies>
static inline void ljForceSwitch(const shift_consts_t dispersionShift,
const shift_consts_t repulsionShift,
//! \brief Fetch C6 grid contribution coefficients and return the product of these.
template<enum VdwType vdwType>
-static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::read> a_nbfpComb,
- const int typeI,
- const int typeJ)
+static inline float calculateLJEwaldC6Grid(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+ const int typeI,
+ const int typeJ)
{
if constexpr (vdwType == VdwType::EwaldGeom)
{
- return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ];
+ return a_nbfpComb[typeI][0] * a_nbfpComb[typeJ][0];
}
else
{
static_assert(vdwType == VdwType::EwaldLB);
/* sigma and epsilon are scaled to give 6*C6 */
- const float c6_i = a_nbfpComb[2 * typeI];
- const float c12_i = a_nbfpComb[2 * typeI + 1];
- const float c6_j = a_nbfpComb[2 * typeJ];
- const float c12_j = a_nbfpComb[2 * typeJ + 1];
+ const Float2 c6c12_i = a_nbfpComb[typeI];
+ const Float2 c6c12_j = a_nbfpComb[typeJ];
- const float sigma = c6_i + c6_j;
- const float epsilon = c12_i * c12_j;
+ const float sigma = c6c12_i[0] + c6c12_j[0];
+ const float epsilon = c6c12_i[1] * c6c12_j[1];
const float sigma2 = sigma * sigma;
return epsilon * sigma2 * sigma2 * sigma2;
//! Calculate LJ-PME grid force contribution with geometric or LB combination rule.
template<bool doCalcEnergies, enum VdwType vdwType>
-static inline void ljEwaldComb(const DeviceAccessor<float, mode::read> a_nbfpComb,
- const float sh_lj_ewald,
- const int typeI,
- const int typeJ,
- const float r2,
- const float r2Inv,
- const float lje_coeff2,
- const float lje_coeff6_6,
- const float int_bit,
- cl::sycl::private_ptr<float> fInvR,
- cl::sycl::private_ptr<float> eLJ)
+static inline void ljEwaldComb(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+ const float sh_lj_ewald,
+ const int typeI,
+ const int typeJ,
+ const float r2,
+ const float r2Inv,
+ const float lje_coeff2,
+ const float lje_coeff6_6,
+ const float int_bit,
+ cl::sycl::private_ptr<float> fInvR,
+ cl::sycl::private_ptr<float> eLJ)
{
const float c6grid = calculateLJEwaldC6Grid<vdwType>(a_nbfpComb, typeI, typeJ);
return lerp(left, right, fraction); // TODO: cl::sycl::mix
}
-static inline void reduceForceJShuffle(Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int aidx,
- DeviceAccessor<float, mode::read_write> a_f)
+/*! \brief Reduce c_clSize j-force components using shifts and atomically accumulate into a_f.
+ *
+ * c_clSize consecutive threads hold the force components of a j-atom, which we
+ * reduce in log2(c_clSize) steps using sub-group shifts and atomically accumulate into \p a_f.
+ */
+static inline void reduceForceJShuffle(Float3 f,
+ const cl::sycl::nd_item<3> itemIdx,
+ const int tidxi,
+ const int aidx,
+ DeviceAccessor<Float3, mode::read_write> a_f)
{
static_assert(c_clSize == 8 || c_clSize == 4);
sycl_2020::sub_group sg = itemIdx.get_sub_group();
- f[0] += shuffleDown(f[0], 1, sg);
- f[1] += shuffleUp(f[1], 1, sg);
- f[2] += shuffleDown(f[2], 1, sg);
+ f[0] += sycl_2020::shift_left(sg, f[0], 1);
+ f[1] += sycl_2020::shift_right(sg, f[1], 1);
+ f[2] += sycl_2020::shift_left(sg, f[2], 1);
if (tidxi & 1)
{
f[0] = f[1];
}
- f[0] += shuffleDown(f[0], 2, sg);
- f[2] += shuffleUp(f[2], 2, sg);
+ f[0] += sycl_2020::shift_left(sg, f[0], 2);
+ f[2] += sycl_2020::shift_right(sg, f[2], 2);
if (tidxi & 2)
{
f[0] = f[2];
if constexpr (c_clSize == 8)
{
- f[0] += shuffleDown(f[0], 4, sg);
+ f[0] += sycl_2020::shift_left(sg, f[0], 4);
+ }
+
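+    // Following the shift/swap pattern above, the threads with tidxi == 0, 1 and 2 in each
+    // c_clSize-wide row now hold the fully reduced x, y and z force components, respectively.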
+ if (tidxi < 3)
+ {
+ atomicFetchAdd(a_f[aidx][tidxi], f[0]);
+ }
+}
+
+/*!
+ * \brief Do workgroup-level reduction of a single \c float.
+ *
+ * While SYCL has \c sycl::reduce_over_group, it currently (oneAPI 2021.3.0) uses a very large
+ * shared memory buffer, which leads to a reduced occupancy.
+ *
+ * \note The caller must make sure there are no races when reusing the \p sm_buf.
+ *
+ * \tparam subGroupSize Size of a sub-group.
+ * \tparam groupSize Size of a work-group.
+ * \param itemIdx Current thread's \c sycl::nd_item.
+ * \param tidxi Current thread's linearized local index.
+ * \param sm_buf Accessor for the local reduction buffer; must have a size of at least 1.
+ * \param valueToReduce Current thread's value.
+ * \return For thread with \p tidxi 0: sum of all \p valueToReduce. Other threads: unspecified.
+ */
+template<int subGroupSize, int groupSize>
+static inline float groupReduce(const cl::sycl::nd_item<3> itemIdx,
+ const unsigned int tidxi,
+ cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+ float valueToReduce)
+{
+ constexpr int numSubGroupsInGroup = groupSize / subGroupSize;
+ static_assert(numSubGroupsInGroup == 1 || numSubGroupsInGroup == 2);
+ sycl_2020::sub_group sg = itemIdx.get_sub_group();
+ valueToReduce = sycl_2020::group_reduce(sg, valueToReduce, sycl_2020::plus<float>());
+ // If we have two sub-groups, we should reduce across them.
+ if constexpr (numSubGroupsInGroup == 2)
+ {
+ if (tidxi == subGroupSize)
+ {
+ sm_buf[0] = valueToReduce;
+ }
+ itemIdx.barrier(fence_space::local_space);
+ if (tidxi == 0)
+ {
+ valueToReduce += sm_buf[0];
+ }
}
+ return valueToReduce;
+}
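+// Example of intended use, mirroring the energy reduction at the end of the NBNXM kernel below:
+//   energyVdwGroup = groupReduce<subGroupSize, c_clSizeSq>(itemIdx, tidx, sm_reductionBuffer, energyVdw);
+// After the call, the thread with tidx == 0 holds the work-group sum.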
+
+/*! \brief Reduce c_clSize j-force components using local memory and atomically accumulate into a_f.
+ *
+ * c_clSize consecutive threads hold the force components of a j-atom, which we
+ * reduce via a local-memory buffer and atomically accumulate into \p a_f.
+ *
+ * TODO: implement a binary-reduction flavor for the case where c_clSize is a power of two.
+ */
+static inline void reduceForceJGeneric(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+ Float3 f,
+ const cl::sycl::nd_item<3> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int aidx,
+ DeviceAccessor<Float3, mode::read_write> a_f)
+{
+ static constexpr int sc_fBufferStride = c_clSizeSq;
+ int tidx = tidxi + tidxj * c_clSize;
+ sm_buf[0 * sc_fBufferStride + tidx] = f[0];
+ sm_buf[1 * sc_fBufferStride + tidx] = f[1];
+ sm_buf[2 * sc_fBufferStride + tidx] = f[2];
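+    // sm_buf layout: three consecutive blocks of sc_fBufferStride (== c_clSizeSq) floats,
+    // one block per force component (x, y, z).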
+
+ subGroupBarrier(itemIdx);
+
+    // Reduce the data, c_clSize elements per force component, on the first three threads of
+    // each row (the same sub-group as the threads that stored the data above).
+ assert(itemIdx.get_sub_group().get_local_range().size() >= c_clSize);
if (tidxi < 3)
{
- atomicFetchAdd(a_f, 3 * aidx + tidxi, f[0]);
+ float fSum = 0.0F;
+ for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+ {
+ fSum += sm_buf[sc_fBufferStride * tidxi + j];
+ }
+
+ atomicFetchAdd(a_f[aidx][tidxi], fSum);
+ }
+}
+
+
+/*! \brief Reduce c_clSize j-force components using either shifts or local memory and atomically accumulate into a_f.
+ */
+static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+ Float3 f,
+ const cl::sycl::nd_item<3> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int aidx,
+ DeviceAccessor<Float3, mode::read_write> a_f)
+{
+ if constexpr (!gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster))
+ {
+ reduceForceJGeneric(sm_buf, f, itemIdx, tidxi, tidxj, aidx, a_f);
+ }
+ else
+ {
+ reduceForceJShuffle(f, itemIdx, tidxi, aidx, a_f);
}
}
/*! \brief Final i-force reduction.
+ *
+ * Reduce c_nbnxnGpuNumClusterPerSupercluster i-force components stored in \p fCiBuf[],
+ * accumulating them atomically into \p a_f.
+ * If \p calcFShift is true, further reduce shift forces and atomically accumulate into \p a_fShift.
*
* This implementation works only with power of two array sizes.
*/
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool calcFShift,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int tidxj,
- const int sci,
- const int shift,
- DeviceAccessor<float, mode::read_write> a_f,
- DeviceAccessor<float, mode::read_write> a_fShift)
+ const cl::sycl::nd_item<3> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int sci,
+ const int shift,
+ DeviceAccessor<Float3, mode::read_write> a_f,
+ DeviceAccessor<Float3, mode::read_write> a_fShift)
{
+    // fCiBuf must have a power-of-two number of elements
+ static_assert(gmx::isPowerOfTwo(c_nbnxnGpuNumClusterPerSupercluster));
+
static constexpr int bufStride = c_clSize * c_clSize;
static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
const int tidx = tidxi + tidxj * c_clSize;
- float fShiftBuf = 0;
+ float fShiftBuf = 0.0F;
for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
{
const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
{
const float f =
sm_buf[tidxj * bufStride + tidxi] + sm_buf[tidxj * bufStride + c_clSize + tidxi];
- atomicFetchAdd(a_f, 3 * aidx + tidxj, f);
+ atomicFetchAdd(a_f[aidx][tidxj], f);
if (calcFShift)
{
fShiftBuf += f;
storing the reduction result above. */
if (tidxj < 3)
{
- atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+ if constexpr (c_clSize == 4)
+ {
+ /* Intel Xe (Gen12LP) and earlier GPUs implement floating-point atomics via
+                 * a compare-and-swap (CAS) loop, which performs particularly poorly when the same
+                 * memory location is updated from multiple threads of the same work-group. Hence,
+                 * for c_clSize == 4 we first reduce within the sub-group, so that only one thread
+                 * per row issues the atomic update.
+                 * Such an optimization might be slightly beneficial for NVIDIA and AMD as well,
+ * but it is unlikely to make a big difference and thus was not evaluated.
+ */
+ auto sg = itemIdx.get_sub_group();
+ fShiftBuf += sycl_2020::shift_left(sg, fShiftBuf, 1);
+ fShiftBuf += sycl_2020::shift_left(sg, fShiftBuf, 2);
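+                    // After the two shift_left steps (with c_clSize == 4), the thread with
+                    // tidxi == 0 holds the sum of fShiftBuf over its four-thread row, so only
+                    // it performs the atomic update below.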
+ if (tidxi == 0)
+ {
+ atomicFetchAdd(a_fShift[shift][tidxj], fShiftBuf);
+ }
+ }
+ else
+ {
+ atomicFetchAdd(a_fShift[shift][tidxj], fShiftBuf);
+ }
}
}
}
-
/*! \brief Main kernel for NBNXM.
*
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto nbnxmKernel(cl::sycl::handler& cgh,
DeviceAccessor<Float4, mode::read> a_xq,
- DeviceAccessor<float, mode::read_write> a_f,
+ DeviceAccessor<Float3, mode::read_write> a_f,
DeviceAccessor<Float3, mode::read> a_shiftVec,
- DeviceAccessor<float, mode::read_write> a_fShift,
+ DeviceAccessor<Float3, mode::read_write> a_fShift,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
OptionalAccessor<Float2, mode::read, ljComb<vdwType>> a_ljComb,
OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
- OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
- OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
+ OptionalAccessor<Float2, mode::read, !ljComb<vdwType>> a_nbfp,
+ OptionalAccessor<Float2, mode::read, ljEwald<vdwType>> a_nbfpComb,
OptionalAccessor<float, mode::read, elecEwaldTab<elecType>> a_coulombTab,
const int numTypes,
const float rCoulombSq,
{
static constexpr EnergyFunctionProperties<elecType, vdwType> props;
- cgh.require(a_xq);
- cgh.require(a_f);
- cgh.require(a_shiftVec);
- cgh.require(a_fShift);
+ a_xq.bind(cgh);
+ a_f.bind(cgh);
+ a_shiftVec.bind(cgh);
+ a_fShift.bind(cgh);
if constexpr (doCalcEnergies)
{
- cgh.require(a_energyElec);
- cgh.require(a_energyVdw);
+ a_energyElec.bind(cgh);
+ a_energyVdw.bind(cgh);
}
- cgh.require(a_plistCJ4);
- cgh.require(a_plistSci);
- cgh.require(a_plistExcl);
+ a_plistCJ4.bind(cgh);
+ a_plistSci.bind(cgh);
+ a_plistExcl.bind(cgh);
if constexpr (!props.vdwComb)
{
- cgh.require(a_atomTypes);
- cgh.require(a_nbfp);
+ a_atomTypes.bind(cgh);
+ a_nbfp.bind(cgh);
}
else
{
- cgh.require(a_ljComb);
+ a_ljComb.bind(cgh);
}
if constexpr (props.vdwEwald)
{
- cgh.require(a_nbfpComb);
+ a_nbfpComb.bind(cgh);
}
if constexpr (props.elecEwaldTab)
{
- cgh.require(a_coulombTab);
+ a_coulombTab.bind(cgh);
}
// shmem buffer for i x+q pre-loading
constexpr bool doExclusionForces =
(props.elecEwald || props.elecRF || props.vdwEwald || (props.elecCutoff && doCalcEnergies));
- constexpr int subGroupSize = c_clSize * c_clSize / 2;
-
- return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+ // The post-prune j-i cluster-pair organization is linked to how exclusion and interaction mask data is stored.
+ // Currently, this is ideally suited for 32-wide subgroup size but slightly less so for others,
+ // e.g. subGroupSize > prunedClusterPairSize on AMD GCN / CDNA.
+ // Hence, the two are decoupled.
+ // When changing this code, please update requiredSubGroupSizeForNbnxm in src/gromacs/hardware/device_management_sycl.cpp.
+ constexpr int prunedClusterPairSize = c_clSize * c_splitClSize;
+#if defined(HIPSYCL_PLATFORM_ROCM) // SYCL-TODO AMD RDNA/RDNA2 has 32-wide exec; how can we check for that?
+ gmx_unused constexpr int subGroupSize = c_clSize * c_clSize;
+#else
+ gmx_unused constexpr int subGroupSize = prunedClusterPairSize;
+#endif
+
+ return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
{
/* thread/block/warp id-s */
- const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
- const unsigned tidxi = localId[0];
- const unsigned tidxj = localId[1];
- const unsigned tidx = tidxj * c_clSize + tidxi;
- const unsigned tidxz = 0;
+ const unsigned tidxi = itemIdx.get_local_id(2);
+ const unsigned tidxj = itemIdx.get_local_id(1);
+ const unsigned tidx = tidxj * c_clSize + tidxi;
+ const unsigned tidxz = 0;
- // Group indexing was flat originally, no need to unflatten it.
const unsigned bidx = itemIdx.get_group(0);
const sycl_2020::sub_group sg = itemIdx.get_sub_group();
- // Better use sg.get_group_range, but too much of the logic relies on it anyway
- const unsigned widx = tidx / subGroupSize;
+        // Could use sg.get_group_range to compute the imask & exclusion index, but too much of
+        // the logic relies on the explicit calculation below; besides, it could not be used when
+        // prunedClusterPairSize != subGroupSize.
+ const unsigned imeiIdx = tidx / prunedClusterPairSize;
Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
}
if constexpr (doCalcEnergies && doExclusionForces)
{
- if (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
+ if (nbSci.shift == gmx::c_centralShiftIndex
+ && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
// we have the diagonal: add the charge and LJ self interaction energy term
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
energyVdw +=
a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
- * (numTypes + 1) * 2];
+ * (numTypes + 1)][0];
}
}
/* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
energyElec /= epsFac * c_clSize;
energyElec *= -ewaldBeta * c_OneOverSqrtPi; /* last factor 1/sqrt(pi) */
}
- } // (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
+ } // (nbSci.shift == gmx::c_centralShiftIndex && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
} // (doCalcEnergies && doExclusionForces)
// Only needed if (doExclusionForces)
- const bool nonSelfInteraction = !(nbSci.shift == CENTRAL & tidxj <= tidxi);
+ const bool nonSelfInteraction = !(nbSci.shift == gmx::c_centralShiftIndex & tidxj <= tidxi);
// loop over the j clusters = seen by any of the atoms in the current super-cluster
for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += 1)
{
- unsigned imask = a_plistCJ4[j4].imei[widx].imask;
+ unsigned imask = a_plistCJ4[j4].imei[imeiIdx].imask;
if (!doPruneNBL && !imask)
{
continue;
}
- const int wexclIdx = a_plistCJ4[j4].imei[widx].excl_ind;
- const unsigned wexcl = a_plistExcl[wexclIdx].pair[tidx & (subGroupSize - 1)]; // sg.get_local_linear_id()
+ const int wexclIdx = a_plistCJ4[j4].imei[imeiIdx].excl_ind;
+ static_assert(gmx::isPowerOfTwo(prunedClusterPairSize));
+ const unsigned wexcl = a_plistExcl[wexclIdx].pair[tidx & (prunedClusterPairSize - 1)];
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
const bool maskSet =
{
const float qi = xqi[3];
int atomTypeI; // Only needed if (!props.vdwComb)
- float c6, c12, sigma, epsilon;
+ float sigma, epsilon;
+ Float2 c6c12;
if constexpr (!props.vdwComb)
{
/* LJ 6*C6 and 12*C12 */
- atomTypeI = sm_atomTypeI[i][tidxi];
- const int idx = (numTypes * atomTypeI + atomTypeJ) * 2;
- c6 = a_nbfp[idx]; // TODO: Make a_nbfm into float2
- c12 = a_nbfp[idx + 1];
+ atomTypeI = sm_atomTypeI[i][tidxi];
+ c6c12 = a_nbfp[numTypes * atomTypeI + atomTypeJ];
}
else
{
const Float2 ljCombI = sm_ljCombI[i][tidxi];
if constexpr (props.vdwCombGeom)
{
- c6 = ljCombI[0] * ljCombJ[0];
- c12 = ljCombI[1] * ljCombJ[1];
+ c6c12 = Float2(ljCombI[0] * ljCombJ[0], ljCombI[1] * ljCombJ[1]);
}
else
{
epsilon = ljCombI[1] * ljCombJ[1];
if constexpr (doCalcEnergies)
{
- convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12);
+ c6c12 = convertSigmaEpsilonToC6C12(sigma, epsilon);
}
} // props.vdwCombGeom
} // !props.vdwComb
+ // c6 and c12 are unused and garbage iff props.vdwCombLB && !doCalcEnergies
+ const float c6 = c6c12[0];
+ const float c12 = c6c12[1];
+
                        // Ensure the distance does not become so small that r^-12 overflows
r2 = std::max(r2, c_nbnxnMinDistanceSquared);
+#if GMX_SYCL_HIPSYCL
+ // No fast/native functions in some compilation passes
+ const float rInv = cl::sycl::rsqrt(r2);
+#else
// SYCL-TODO: sycl::half_precision::rsqrt?
- const float rInv = cl::sycl::native::rsqrt(r2);
+ const float rInv = cl::sycl::native::rsqrt(r2);
+#endif
const float r2Inv = rInv * rInv;
float r6Inv, fInvR, energyLJPair;
if constexpr (!props.vdwCombLB || doCalcEnergies)
fInvR += qi * qj
* (pairExclMask * r2Inv
- interpolateCoulombForceR(
- a_coulombTab, coulombTabScale, r2 * rInv))
+ a_coulombTab, coulombTabScale, r2 * rInv))
* rInv;
}
if constexpr (props.elecRF)
{
energyElec +=
- qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF);
+ qi * qj * (pairExclMask * rInv + 0.5F * twoKRf * r2 - cRF);
}
if constexpr (props.elecEwald)
{
maskJI += maskJI;
} // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
/* reduce j forces */
- reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+ reduceForceJ(sm_reductionBuffer, fCjBuf, itemIdx, tidxi, tidxj, aj, a_f);
} // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
if constexpr (doPruneNBL)
{
/* Update the imask with the new one which does not contain the
* out of range clusters anymore. */
- a_plistCJ4[j4].imei[widx].imask = imask;
+ a_plistCJ4[j4].imei[imeiIdx].imask = imask;
}
} // for (int j4 = cij4Start; j4 < cij4End; j4 += 1)
/* skip central shifts when summing shift forces */
- const bool doCalcShift = (calcShift && !(nbSci.shift == CENTRAL));
+ const bool doCalcShift = (calcShift && nbSci.shift != gmx::c_centralShiftIndex);
reduceForceIAndFShift(
sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift);
if constexpr (doCalcEnergies)
{
- const float energyVdwGroup = sycl_2020::group_reduce(
- itemIdx.get_group(), energyVdw, 0.0F, sycl_2020::plus<float>());
- const float energyElecGroup = sycl_2020::group_reduce(
- itemIdx.get_group(), energyElec, 0.0F, sycl_2020::plus<float>());
+ const float energyVdwGroup =
+ groupReduce<subGroupSize, c_clSizeSq>(itemIdx, tidx, sm_reductionBuffer, energyVdw);
+ itemIdx.barrier(fence_space::local_space); // Prevent the race on sm_reductionBuffer.
+ const float energyElecGroup = groupReduce<subGroupSize, c_clSizeSq>(
+ itemIdx, tidx, sm_reductionBuffer, energyElec);
if (tidx == 0)
{
- atomicFetchAdd(a_energyVdw, 0, energyVdwGroup);
- atomicFetchAdd(a_energyElec, 0, energyElecGroup);
+ atomicFetchAdd(a_energyVdw[0], energyVdwGroup);
+ atomicFetchAdd(a_energyElec[0], energyElecGroup);
}
}
};
}
-// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
-template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
-class NbnxmKernelName;
-
+//! \brief NBNXM kernel launch code.
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args)
{
- // Should not be needed for SYCL2020.
- using kernelNameType = NbnxmKernelName<doPruneNBL, doCalcEnergies, elecType, vdwType>;
+ using kernelNameType = NbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>;
/* Kernel launch config:
     * - The thread block dimensions match the size of i-clusters and j-clusters.
* - The 1D block-grid contains as many blocks as super-clusters.
*/
const int numBlocks = numSci;
- const cl::sycl::range<3> blockSize{ c_clSize, c_clSize, 1 };
+ const cl::sycl::range<3> blockSize{ 1, c_clSize, c_clSize };
const cl::sycl::range<3> globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
const cl::sycl::nd_range<3> range{ globalSize, blockSize };
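+    // For example, with c_clSize == 8 this yields work-groups of 1 x 8 x 8 = 64 work-items,
+    // with numSci work-groups along dimension 0 of the grid.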
cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
auto kernel = nbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>(
cgh, std::forward<Args>(args)...);
- cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+ cgh.parallel_for<kernelNameType>(range, kernel);
});
return e;
}
+//! \brief Select templated kernel and launch it.
template<class... Args>
cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL,
bool doCalcEnergies,
void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
{
- NBAtomData* adat = nb->atdat;
+ NBAtomDataGpu* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
const bool doPruneNBL = (plist->haveFreshList && !nb->didPrune[iloc]);
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
- // Casting to float simplifies using atomic ops in the kernel
- cl::sycl::buffer<Float3, 1> f(*adat->f.buffer_);
- auto fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
- cl::sycl::buffer<Float3, 1> fShift(*adat->fShift.buffer_);
- auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
-
cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
stepWork.computeEnergy,
nbp->elecType,
deviceStream,
plist->nsci,
adat->xq,
- fAsFloat,
+ adat->f,
adat->shiftVec,
- fShiftAsFloat,
+ adat->fShift,
adat->eElec,
adat->eLJ,
plist->cj4,