Sometimes it was enough to convert existing comments to Doxygen.
Now there are no SYCL-related warnings when building the webpage.
Also fixed a few trivial clang-tidy complaints.
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
}
+// Doxygen does not like recursive templates.
+//! \cond
template<std::size_t N, class F, typename std::enable_if<(N > 1)>::type* = nullptr>
static auto mp_with_index(std::size_t i, F&& f)
{
return mp_with_index<N - 1>(i, std::forward<F>(f));
}
}
-
+//! \endcond
} // namespace compat
} // namespace gmx
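For reference, a minimal self-contained sketch of how such a shim is used, assuming it follows Boost.Mp11's mp_with_index semantics (the N == 1 base case and the i == N - 1 branch are elided in the hunks above):

#include <cstddef>
#include <cstdio>
#include <type_traits>
#include <utility>

// Illustrative stand-alone version, not the GROMACS compat implementation.
template<std::size_t N, class F, typename std::enable_if<(N == 1)>::type* = nullptr>
static auto mp_with_index(std::size_t /*i*/, F&& f)
{
    return std::forward<F>(f)(std::integral_constant<std::size_t, 0>());
}

template<std::size_t N, class F, typename std::enable_if<(N > 1)>::type* = nullptr>
static auto mp_with_index(std::size_t i, F&& f)
{
    if (i == N - 1)
    {
        return std::forward<F>(f)(std::integral_constant<std::size_t, N - 1>());
    }
    return mp_with_index<N - 1>(i, std::forward<F>(f));
}

int main()
{
    const std::size_t runtimeIndex = 2; // known only at run time
    mp_with_index<4>(runtimeIndex, [](auto idx) {
        constexpr std::size_t value = decltype(idx)::value; // compile-time constant
        std::printf("dispatched to compile-time index %zu\n", value);
    });
    return 0;
}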
namespace gmx
{
+//! Reinterpret-cast any pointer \p in to \c Float3, checking the type compatibility.
template<typename T>
static inline Float3* asGenericFloat3Pointer(T* in)
{
return reinterpret_cast<Float3*>(in);
}
+//! Reinterpret-cast any const pointer \p in to \c Float3, checking the type compatibility.
template<typename T>
static inline const Float3* asGenericFloat3Pointer(const T* in)
{
return reinterpret_cast<const Float3*>(in);
}
+//! Reinterpret-cast any container \p in to \c Float3, checking the type compatibility.
template<typename C>
static inline Float3* asGenericFloat3Pointer(C& in)
{
return reinterpret_cast<Float3*>(in.data());
}
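The type-compatibility check these comments promise is elided in the hunks above; a plausible self-contained sketch of such a guard (the size-based static_assert and the RVecLike stand-in are assumptions, not GROMACS code):

#include <cstdio>
#include <vector>

struct Float3 // stand-in for the type from gromacs/math/vectypes.h
{
    float x, y, z;
};

template<typename T>
static inline Float3* asGenericFloat3Pointer(T* in)
{
    // The compatibility check: reject types whose layout cannot alias Float3.
    static_assert(sizeof(T) == sizeof(Float3),
                  "Type size must match Float3 for a generic-pointer cast");
    return reinterpret_cast<Float3*>(in);
}

int main()
{
    struct RVecLike // stand-in for a 3-float host vector type
    {
        float v[3];
    };
    std::vector<RVecLike> coords(4, RVecLike{ { 1.0F, 2.0F, 3.0F } });
    Float3* view = asGenericFloat3Pointer(coords.data());
    std::printf("%g %g %g\n", view[0].x, view[0].y, view[0].z);
    return 0;
}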
+//! Reinterpret-cast any const container \p in to \c Float3, checking the type compatibility.
template<typename C>
static inline const Float3* asGenericFloat3Pointer(const C& in)
{
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/math/vectypes.h"
+//! Type of device texture object. In SYCL, that would be \c sycl::image, but it's not used.
using DeviceTexture = void*;
//! \brief Single GPU call timing event, not used with SYCL
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
compilerOptions += " -cl-opt-disable";
}
- /* Fastmath imprves performance on all supported arch */
+ /* Fastmath improves performance on all supported arch */
if (getenv("GMX_OCL_DISABLE_FASTMATH") == nullptr)
{
compilerOptions += " -cl-fast-relaxed-math";
static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
/* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
-// \brief Full warp active thread mask used in CUDA warp-level primitives.
+//! \brief Full warp active thread mask used in CUDA warp-level primitives.
static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
/*! \brief Convenience wrapper to do atomic addition to a global buffer.
#endif
}
-/* \brief Issue an intra sub-group barrier.
+/*! \brief Issue an intra sub-group barrier.
*
- * Equivalent with CUDA syncwarp(c_cudaFullWarpMask).
+ * Equivalent to CUDA's \c __syncwarp(c_cudaFullWarpMask).
*
*/
static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
//! Constant used to help minimize preprocessed code
static constexpr bool c_binarySupportsGpus = (GMX_GPU != 0);
+//! Whether \ref DeviceInformation can be serialized for sending via MPI.
static constexpr bool c_canSerializeDeviceInformation =
(!GMX_GPU_OPENCL && !GMX_GPU_SYCL); /*NOLINT(misc-redundant-expression)*/
using cl::sycl::access::mode;
+//! \brief Function returning the force reduction kernel lambda.
template<bool addRvecForce, bool accumulateForce>
static auto reduceKernel(cl::sycl::handler& cgh,
DeviceAccessor<Float3, mode::read> a_nbnxmForce,
};
}
+//! \brief Force reduction SYCL kernel launch code.
template<bool addRvecForce, bool accumulateForce>
static void launchReductionKernel_(const int numAtoms,
const int atomStart,
});
}
-/*! \brief Select templated kernel and launch it. */
+/*! \brief Select templated force reduction kernel and launch it. */
void launchForceReductionKernel(int numAtoms,
int atomStart,
bool addRvecForce,
copyToDeviceBuffer(
&d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
- // Temperature scale group map only used if there are more then one group
+ // Temperature scale group map is only used if there is more than one group
if (numTempScaleValues_ > 1)
{
reallocateDeviceBuffer(
namespace gmx
{
+/*! \brief Backend-specific function to launch GPU Leap Frog kernel.
+ *
+ * \param numAtoms Total number of atoms.
+ * \param[in,out] d_x Buffer containing initial coordinates, and where the updated ones will be written.
+ * \param[out] d_xp Buffer where a copy of the initial coordinates will be written.
+ * \param[in,out] d_v Buffer containing initial velocities, and where the updated ones will be written.
+ * \param[in] d_f Buffer containing forces.
+ * \param[in] d_inverseMasses Buffer containing atoms' reciprocal masses.
+ * \param dt Timestep.
+ * \param doTemperatureScaling Whether temperature scaling is needed.
+ * \param numTempScaleValues Number of different T-couple values.
+ * \param d_tempScaleGroups Mapping of atoms into temperature scaling groups.
+ * \param d_lambdas Temperature scaling factors (one per group).
+ * \param prVelocityScalingType Type of Parrinello-Rahman velocity rescaling.
+ * \param prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix.
+ * \param deviceStream Device stream for kernel launch.
+ */
void launchLeapFrogKernel(int numAtoms,
DeviceBuffer<Float3> d_x,
DeviceBuffer<Float3> d_xp,
* \param[in] dt Timestep.
* \param[in] a_lambdas Temperature scaling factors (one per group).
* \param[in] a_tempScaleGroups Mapping of atoms into groups.
- * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
+ * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix.
*/
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
auto leapFrogKernel(
};
}
+//! \brief Leap Frog SYCL kernel launch code.
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling, class... Args>
static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args)
{
return e;
}
+//! Convert \p doTemperatureScaling and \p numTempScaleValues to \ref NumTempScaleValues.
static NumTempScaleValues getTempScalingType(bool doTemperatureScaling, int numTempScaleValues)
{
if (!doTemperatureScaling)
//! Number of threads in a GPU block
constexpr static int c_threadsPerBlock = 256;
+/*! \brief Backend-specific function to launch LINCS kernel.
+ *
+ * \param kernelParams LINCS parameters.
+ * \param d_x Initial coordinates before the integration.
+ * \param d_xp Coordinates after the integration, which will be updated in place.
+ * \param updateVelocities Whether to also update velocities.
+ * \param d_v Velocities to update (ignored if \p updateVelocities is \c false).
+ * \param invdt Reciprocal of timestep.
+ * \param computeVirial Whether to compute the virial.
+ * \param deviceStream Device stream for kernel launch.
+ */
void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
const DeviceBuffer<Float3>& d_x,
DeviceBuffer<Float3> d_xp,
//! Number of work-items in a work-group
constexpr static int sc_workGroupSize = 256;
+//! \brief Function returning the SETTLE kernel lambda.
template<bool updateVelocities, bool computeVirial>
auto settleKernel(cl::sycl::handler& cgh,
const int numSettles,
template<bool updateVelocities, bool computeVirial>
class SettleKernelName;
+//! \brief SETTLE SYCL kernel launch code.
template<bool updateVelocities, bool computeVirial, class... Args>
static cl::sycl::event launchSettleKernel(const DeviceStream& deviceStream, int numSettles, Args&&... args)
{
namespace gmx
{
+//! \brief Function returning the scaling kernel lambda.
static auto scaleKernel(cl::sycl::handler& cgh,
DeviceAccessor<Float3, cl::sycl::access::mode::read_write> a_x,
const ScalingMatrix scalingMatrix)
* Currently the random seeds for SD and BD are missing.
*/
-/* \brief Enum for all entries in \p t_state
+/*! \brief Enum for all entries in \p t_state
*
* These enums are used in flags as (1<<est...).
* The order of these enums should not be changed,
return static_cast<real>(numAtoms) / (size[XX] * size[YY] * size[ZZ]);
}
-// Get approximate dimensions of each cell. Returns the length along X and Y.
+//! \brief Get approximate dimensions of each cell. Returns the length along X and Y.
static std::array<real, DIM - 1> getTargetCellLength(const Grid::Geometry& geometry, const real atomDensity)
{
if (geometry.isSimple)
static constexpr bool elecEwaldTab =
(elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB
static constexpr bool elecEwaldTwin =
- (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin);
- static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
- static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB);
+ (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin); ///< Use twin cut-off.
+ static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
+ static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB); ///< LJ_COMB && !LJ_COMB_GEOM
static constexpr bool vdwCombGeom = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM
static constexpr bool vdwComb = (vdwCombLB || vdwCombGeom); ///< LJ_COMB
static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM
template<enum VdwType vdwType>
constexpr bool ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb;
-template<enum ElecType elecType> // Yes, ElecType
-constexpr bool vdwCutoffCheck = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTwin;
-
template<enum ElecType elecType>
constexpr bool elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald;
using cl::sycl::access::mode;
using cl::sycl::access::target;
+//! \brief Convert \p sigma and \p epsilon VdW parameters to \c c6,c12 pair.
static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon)
{
const float sigma2 = sigma * sigma;
const float c6 = epsilon * sigma6;
const float c12 = c6 * sigma6;
- return Float2(c6, c12);
+ return { c6, c12 };
}
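The algebra above inverts cleanly regardless of the prefactor convention carried by epsilon; a minimal self-contained sanity sketch:

#include <cassert>
#include <cmath>

int main()
{
    const float sigma  = 0.3F, epsilon = 2.0F;
    const float sigma2 = sigma * sigma;
    const float sigma6 = sigma2 * sigma2 * sigma2;
    const float c6     = epsilon * sigma6;
    const float c12    = c6 * sigma6;
    // With c6 = eps*sigma^6 and c12 = c6*sigma^6, the inputs are recoverable:
    assert(std::fabs(c12 / c6 - sigma6) < 1e-6F * sigma6);
    assert(std::fabs(c6 * c6 / c12 - epsilon) < 1e-5F * epsilon);
    return 0;
}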
+//! \brief Calculate force and energy for a pair of atoms, VdW force-switch flavor.
template<bool doCalcEnergies>
static inline void ljForceSwitch(const shift_consts_t dispersionShift,
const shift_consts_t repulsionShift,
}
}
-// This function also requires sm_buf to have a length of at least 1.
-// The function returns:
-// - for thread #0 in the group: sum of all valueToReduce in a group
-// - for other threads: unspecified
+/*!
+ * \brief Do workgroup-level reduction of a single \c float.
+ *
+ * While SYCL has \c sycl::reduce_over_group, it currently (oneAPI 2021.3.0) uses a very large
+ * shared memory buffer, which leads to reduced occupancy.
+ *
+ * \tparam subGroupSize Size of a sub-group.
+ * \tparam groupSize Size of a work-group.
+ * \param itemIdx Current thread's \c sycl::nd_item.
+ * \param tidxi Current thread's linearized local index.
+ * \param sm_buf Accessor for the local reduction buffer. Must have a length of at least 1.
+ * \param valueToReduce Current thread's value.
+ * \return For the thread with \p tidxi == 0: the sum of all \p valueToReduce values. For other threads: unspecified.
+ */
template<int subGroupSize, int groupSize>
static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx,
const unsigned int tidxi,
/*! \brief Final i-force reduction.
*
- * Reduce c_nbnxnGpuNumClusterPerSupercluster i-force componets stored in \p fCiBuf[]
+ * Reduce c_nbnxnGpuNumClusterPerSupercluster i-force components stored in \p fCiBuf[]
* accumulating atomically into \p a_f.
* If \p calcFShift is true, further reduce shift forces and atomically accumulate into \p a_fShift.
*
(props.elecEwald || props.elecRF || props.vdwEwald || (props.elecCutoff && doCalcEnergies));
// The post-prune j-i cluster-pair organization is linked to how exclusion and interaction mask data is stored.
- // Currently this is ideally suited for 32-wide subgroup size but slightly less so for others,
+ // Currently, this is ideally suited for 32-wide subgroup size but slightly less so for others,
// e.g. subGroupSize > prunedClusterPairSize on AMD GCN / CDNA.
// Hence, the two are decoupled.
// When changing this code, please update requiredSubGroupSizeForNbnxm in src/gromacs/hardware/device_management_sycl.cpp.
if constexpr (props.elecRF)
{
energyElec +=
- qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF);
+ qi * qj * (pairExclMask * rInv + 0.5F * twoKRf * r2 - cRF);
}
if constexpr (props.elecEwald)
{
} // for (int j4 = cij4Start; j4 < cij4End; j4 += 1)
/* skip central shifts when summing shift forces */
- const bool doCalcShift = (calcShift && !(nbSci.shift == gmx::c_centralShiftIndex));
+ const bool doCalcShift = (calcShift && nbSci.shift != gmx::c_centralShiftIndex);
reduceForceIAndFShift(
sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift);
};
}
+//! \brief NBNXM kernel launch code.
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args)
{
return e;
}
+//! \brief Select templated kernel and launch it.
template<class... Args>
cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL,
bool doCalcEnergies,
{
using gmx::InteractionLocality;
+/*! \brief Launch SYCL NBNXM kernel.
+ *
+ * \param nb Non-bonded parameters.
+ * \param stepWork Workload flags for the current step.
+ * \param iloc Interaction locality.
+ */
void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc);
} // namespace Nbnxm
};
}
+//! \brief NBNXM SYCL prune-only kernel launch code.
template<bool haveFreshList, class... Args>
cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
const int numSciInPart,
return e;
}
+//! \brief Select templated kernel and launch it.
template<class... Args>
cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args)
{
{
using gmx::InteractionLocality;
+/*! \brief Launch SYCL NBNXM prune-only kernel.
+ *
+ * \param nb Non-bonded parameters.
+ * \param iloc Interaction locality.
+ * \param numParts Total number of rolling-prune parts.
+ * \param part Index of the part to prune.
+ * \param numSciInPart Number of superclusters in \p part.
+ */
void launchNbnxmKernelPruneOnly(NbnxmGpu* nb,
const InteractionLocality iloc,
const int numParts,
{
#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
+//! \brief Default for the prune kernel's j4 processing concurrency.
# define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4
#endif
-/*! \brief Macro defining default for the prune kernel's j4 processing concurrency.
+
+/*! \brief Prune kernel's j4 processing concurrency.
*
- * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
+ * The \c GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
*/
static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
//! \brief Convert 3D range to 1D
static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d)
{
- return cl::sycl::range<1>(range3d.size());
+ return { range3d.size() };
}
//! \brief Convert 3D nd_range to 1D
static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d)
{
- return cl::sycl::nd_range<1>(flattenRange(nd_range3d.get_global_range()),
- flattenRange(nd_range3d.get_local_range()));
+ return { flattenRange(nd_range3d.get_global_range()), flattenRange(nd_range3d.get_local_range()) };
}
//! \brief Convert flattened 1D index to 3D
const unsigned id = id1d[0];
const unsigned z = id / rangeXY;
const unsigned xy = id % rangeXY;
- return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
+ return { xy % rangeX, xy / rangeX, z };
}
} // namespace Nbnxm
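As a sanity check on the index arithmetic, a SYCL-free sketch of the same flatten/unflatten round trip (names here are illustrative):

#include <array>
#include <cassert>
#include <cstddef>

// x varies fastest, then y, then z -- matching the unflattening above.
static std::size_t flattenId(std::array<std::size_t, 3> id3d, std::size_t rangeX, std::size_t rangeY)
{
    return (id3d[2] * rangeY + id3d[1]) * rangeX + id3d[0];
}

static std::array<std::size_t, 3> unflattenId(std::size_t id1d, std::size_t rangeX, std::size_t rangeY)
{
    const std::size_t rangeXY = rangeX * rangeY;
    const std::size_t z       = id1d / rangeXY;
    const std::size_t xy      = id1d % rangeXY;
    return { xy % rangeX, xy / rangeX, z };
}

int main()
{
    const std::size_t rangeX = 4, rangeY = 3, rangeZ = 2;
    for (std::size_t z = 0; z < rangeZ; z++)
    {
        for (std::size_t y = 0; y < rangeY; y++)
        {
            for (std::size_t x = 0; x < rangeX; x++)
            {
                const std::array<std::size_t, 3> id3d = { x, y, z };
                assert(unflattenId(flattenId(id3d, rangeX, rangeY), rangeX, rangeY) == id3d);
            }
        }
    }
    return 0;
}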
namespace gmx
{
-template<class Function>
-auto dispatchTemplatedFunction(Function&& f)
-{
- return std::forward<Function>(f)();
-}
-
/*! \internal \brief
* Helper function to select appropriate template based on runtime values.
*
p0, p1, p2);
}
* \endcode
- */
+ *
+ * \tparam Function Type of \p f.
+ * \param f Function to call.
+ * \return The result of calling \c f().
+ */
+template<class Function>
+auto dispatchTemplatedFunction(Function&& f)
+{
+ return std::forward<Function>(f)();
+}
+
+// Recursive templates confuse Doxygen
+//! \cond
template<class Function, class Enum, class... Enums>
auto dispatchTemplatedFunction(Function&& f, Enum e, Enums... es)
{
},
es...);
}
+//! \endcond
} // namespace gmx
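A usage sketch of the dispatcher; the enums and runKernel below are hypothetical, and the sketch assumes only the documented contract that each enum class ends in Count and that the callable receives the values as std::integral_constant:

#include <cstdio>

enum class ElecType : int
{
    Cutoff,
    ReactionField,
    Count
};

enum class VdwType : int
{
    Cutoff,
    Ewald,
    Count
};

template<ElecType elecType, VdwType vdwType>
static void runKernel()
{
    std::printf("elec=%d vdw=%d\n", static_cast<int>(elecType), static_cast<int>(vdwType));
}

// Turns two runtime enum values into template arguments via the dispatcher above.
static void dispatch(ElecType elec, VdwType vdw)
{
    gmx::dispatchTemplatedFunction(
            [](auto elec_, auto vdw_) { runKernel<decltype(elec_)::value, decltype(vdw_)::value>(); },
            elec,
            vdw);
}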