From 6ff8982624c43e96f041dc00d0483ff37bb6a4d6 Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Tue, 24 Aug 2021 18:13:46 +0300
Subject: [PATCH] Add missing Doxygen for SYCL functions and some others

Sometimes it was enough to convert existing comments to Doxygen.

Now there are no SYCL-related warnings when building webpage.

Also fixed a bit of trivial clang-tidy complaints.
---
 src/gromacs/compat/mp11.h                     |  6 ++-
 src/gromacs/gpu_utils/gputraits.h             |  4 ++
 src/gromacs/gpu_utils/gputraits_sycl.h        |  1 +
 src/gromacs/gpu_utils/ocl_compiler.cpp        |  4 +-
 src/gromacs/gpu_utils/sycl_kernel_utils.h     |  6 +--
 src/gromacs/hardware/device_information.h     |  1 +
 .../gpuforcereduction_impl_internal_sycl.cpp  |  4 +-
 src/gromacs/mdlib/leapfrog_gpu.cpp            |  2 +-
 src/gromacs/mdlib/leapfrog_gpu_internal.h     | 17 ++++++++
 .../mdlib/leapfrog_gpu_internal_sycl.cpp      |  4 +-
 src/gromacs/mdlib/lincs_gpu_internal.h        | 11 +++++
 .../mdlib/settle_gpu_internal_sycl.cpp        |  2 +
 .../update_constrain_gpu_internal_sycl.cpp    |  1 +
 src/gromacs/mdtypes/state.h                   |  2 +-
 src/gromacs/nbnxm/grid.cpp                    |  2 +-
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp  | 41 ++++++++++++-------
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h    |  6 +++
 .../sycl/nbnxm_sycl_kernel_pruneonly.cpp      |  2 +
 .../nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h  |  8 ++++
 .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h      | 13 +++---
 src/gromacs/utility/template_mp.h             | 21 ++++++----
 21 files changed, 118 insertions(+), 40 deletions(-)
diff --git a/src/gromacs/compat/mp11.h b/src/gromacs/compat/mp11.h
index 439084ed9c..d474bbe9aa 100644
--- a/src/gromacs/compat/mp11.h
+++ b/src/gromacs/compat/mp11.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -89,6 +89,8 @@ static auto mp_with_index(std::size_t i, F&& f)
     }
 }
 
+// Doxygen does not like recursive templates.
+//! \cond
 template<std::size_t N, class F, typename std::enable_if<(N > 1)>::type* = nullptr>
 static auto mp_with_index(std::size_t i, F&& f)
 {
@@ -101,7 +103,7 @@ static auto mp_with_index(std::size_t i, F&& f)
         return mp_with_index<N - 1>(i, std::forward<F>(f));
     }
 }
-
+//! \endcond
 
 } // namespace compat
 } // namespace gmx
diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h
index 344b0427c1..3fb58eb8b4 100644
--- a/src/gromacs/gpu_utils/gputraits.h
+++ b/src/gromacs/gpu_utils/gputraits.h
@@ -81,6 +81,7 @@ struct Float4
 
 namespace gmx
 {
+//! Reinterpret-cast any pointer \p in to \c Float3, checking the type compatibility.
 template<typename T>
 static inline Float3* asGenericFloat3Pointer(T* in)
 {
@@ -90,6 +91,7 @@ static inline Float3* asGenericFloat3Pointer(T* in)
     return reinterpret_cast<Float3*>(in);
 }
 
+//! Reinterpret-cast any const pointer \p in to \c Float3, checking the type compatibility.
 template<typename T>
 static inline const Float3* asGenericFloat3Pointer(const T* in)
 {
@@ -99,6 +101,7 @@ static inline const Float3* asGenericFloat3Pointer(const T* in)
     return reinterpret_cast<const Float3*>(in);
 }
 
+//! Reinterpret-cast any container \p in to \c Float3, checking the type compatibility.
 template<typename C>
 static inline Float3* asGenericFloat3Pointer(C& in)
 {
@@ -108,6 +111,7 @@ static inline Float3* asGenericFloat3Pointer(C& in)
     return reinterpret_cast<Float3*>(in.data());
 }
 
+//! Reinterpret-cast any const container \p in to \c Float3, checking the type compatibility.
 template<typename C>
 static inline const Float3* asGenericFloat3Pointer(const C& in)
 {
diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h
index 9c64d3f303..b6b76c7a44 100644
--- a/src/gromacs/gpu_utils/gputraits_sycl.h
+++ b/src/gromacs/gpu_utils/gputraits_sycl.h
@@ -49,6 +49,7 @@
 #include "gromacs/gpu_utils/gmxsycl.h"
 #include "gromacs/math/vectypes.h"
 
+//! Type of device texture object. In SYCL, that would be \c sycl::image, but it's not used.
 using DeviceTexture = void*;
 
 //! \brief Single GPU call timing event, not used with SYCL
diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp
index c8ed6f19c5..98b02df93a 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gpu_utils/ocl_compiler.cpp
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -168,7 +168,7 @@ static std::string selectCompilerOptions(DeviceVendor deviceVendor)
         compilerOptions += " -cl-opt-disable";
     }
 
-    /* Fastmath imprves performance on all supported arch */
+    /* Fastmath improves performance on all supported arch */
     if (getenv("GMX_OCL_DISABLE_FASTMATH") == nullptr)
     {
         compilerOptions += " -cl-fast-relaxed-math";
diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h
index 42f3437bfe..8a2d6bace1 100644
--- a/src/gromacs/gpu_utils/sycl_kernel_utils.h
+++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h
@@ -58,7 +58,7 @@
 static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
                                                    /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
 
-// \brief Full warp active thread mask used in CUDA warp-level primitives.
+//! \brief Full warp active thread mask used in CUDA warp-level primitives.
 static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
 
 /*! \brief Convenience wrapper to do atomic addition to a global buffer.
@@ -91,9 +91,9 @@ static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const
 #endif
 }
 
-/* \brief Issue an intra sub-group barrier.
+/*! \brief Issue an intra sub-group barrier.
  *
- * Equivalent with CUDA syncwarp(c_cudaFullWarpMask).
+ * Equivalent with CUDA's \c syncwarp(c_cudaFullWarpMask).
  *
  */
 static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
diff --git a/src/gromacs/hardware/device_information.h b/src/gromacs/hardware/device_information.h
index 079be8be45..a03e915972 100644
--- a/src/gromacs/hardware/device_information.h
+++ b/src/gromacs/hardware/device_information.h
@@ -64,6 +64,7 @@
 
 //! Constant used to help minimize preprocessed code
 static constexpr bool c_binarySupportsGpus = (GMX_GPU != 0);
+//! Whether \ref DeviceInformation can be serialized for sending via MPI.
 static constexpr bool c_canSerializeDeviceInformation =
         (!GMX_GPU_OPENCL && !GMX_GPU_SYCL); /*NOLINT(misc-redundant-expression)*/
 
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp
index 0ccbc98e09..bd76fbf7d6 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp
@@ -63,6 +63,7 @@ namespace gmx
 
 using cl::sycl::access::mode;
 
+//! \brief Function returning the force reduction kernel lambda.
 template<bool addRvecForce, bool accumulateForce>
 static auto reduceKernel(cl::sycl::handler&                                 cgh,
                          DeviceAccessor<Float3, mode::read>                 a_nbnxmForce,
@@ -97,6 +98,7 @@ static auto reduceKernel(cl::sycl::handler&                                 cgh,
     };
 }
 
+//! \brief Force reduction SYCL kernel launch code.
 template<bool addRvecForce, bool accumulateForce>
 static void launchReductionKernel_(const int                   numAtoms,
                                    const int                   atomStart,
@@ -119,7 +121,7 @@ static void launchReductionKernel_(const int                   numAtoms,
     });
 }
 
-/*! \brief Select templated kernel and launch it. */
+/*! \brief Select templated Force reduction kernel and launch it. */
 void launchForceReductionKernel(int                  numAtoms,
                                 int                  atomStart,
                                 bool                 addRvecForce,
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cpp b/src/gromacs/mdlib/leapfrog_gpu.cpp
index b07c167c7c..1d9167ec79 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu.cpp
@@ -162,7 +162,7 @@ void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsig
     copyToDeviceBuffer(
             &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 
-    // Temperature scale group map only used if there are more then one group
+    // Temperature scale group map only used if there are more than one group
     if (numTempScaleValues_ > 1)
     {
         reallocateDeviceBuffer(
diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal.h b/src/gromacs/mdlib/leapfrog_gpu_internal.h
index 76330fe180..44c9ca6804 100644
--- a/src/gromacs/mdlib/leapfrog_gpu_internal.h
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.h
@@ -50,6 +50,23 @@
 namespace gmx
 {
 
+/*! \brief Backend-specific function to launch GPU Leap Frog kernel.
+ *
+ * \param numAtoms Total number of atoms.
+ * \param[in,out] d_x Buffer containing initial coordinates, and where the updated ones will be written.
+ * \param[out] d_xp Buffer where a copy of the initial coordinates will be written.
+ * \param[in,out] d_v Buffer containing initial velocities, and where the updated ones will be written.
+ * \param[in]  d_f Buffer containing forces.
+ * \param[in] d_inverseMasses Buffer containing atoms' reciprocal masses.
+ * \param dt Timestep.
+ * \param doTemperatureScaling Whether temperature scaling is needed.
+ * \param numTempScaleValues Number of different T-couple values.
+ * \param d_tempScaleGroups Mapping of atoms into temperature scaling groups.
+ * \param d_lambdas Temperature scaling factors (one per group).
+ * \param prVelocityScalingType Type of Parrinello-Rahman velocity rescaling.
+ * \param prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix.
+ * \param deviceStream Device stream for kernel launch.
+ */
 void launchLeapFrogKernel(int                          numAtoms,
                           DeviceBuffer<Float3>         d_x,
                           DeviceBuffer<Float3>         d_xp,
diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
index d0d019c25e..b5572dcfe1 100644
--- a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
@@ -83,7 +83,7 @@ using cl::sycl::access::mode;
  * \param[in]     dt                               Timestep.
  * \param[in]     a_lambdas                        Temperature scaling factors (one per group).
  * \param[in]     a_tempScaleGroups                Mapping of atoms into groups.
- * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
+ * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix.
  */
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 auto leapFrogKernel(
@@ -160,6 +160,7 @@ auto leapFrogKernel(
     };
 }
 
+//! \brief Leap Frog SYCL kernel launch code.
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling, class... Args>
 static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args)
 {
@@ -178,6 +179,7 @@ static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, in
     return e;
 }
 
+//! Convert \p doTemperatureScaling and \p numTempScaleValues to \ref NumTempScaleValues.
 static NumTempScaleValues getTempScalingType(bool doTemperatureScaling, int numTempScaleValues)
 {
     if (!doTemperatureScaling)
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.h b/src/gromacs/mdlib/lincs_gpu_internal.h
index cd421c198c..db44205fb3 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.h
+++ b/src/gromacs/mdlib/lincs_gpu_internal.h
@@ -57,6 +57,17 @@ struct LincsGpuKernelParameters;
 //! Number of threads in a GPU block
 constexpr static int c_threadsPerBlock = 256;
 
+/*! \brief Backend-specific function to launch LINCS kernel.
+ *
+ * \param kernelParams LINCS parameters.
+ * \param d_x Initial coordinates before the integration.
+ * \param d_xp Coordinates after the integration which will be updated.
+ * \param updateVelocities Whether to also update velocities.
+ * \param d_v Velocities to update (ignored if \p updateVelocities is \c false).
+ * \param invdt Reciprocal of timestep.
+ * \param computeVirial Whether to compute the virial.
+ * \param deviceStream Device stream for kernel launch.
+ */
 void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
                           const DeviceBuffer<Float3>&     d_x,
                           DeviceBuffer<Float3>            d_xp,
diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
index 340b39bb73..3e5b56fdf3 100644
--- a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
@@ -59,6 +59,7 @@ using cl::sycl::access::target;
 //! Number of work-items in a work-group
 constexpr static int sc_workGroupSize = 256;
 
+//! \brief Function returning the SETTLE kernel lambda.
 template<bool updateVelocities, bool computeVirial>
 auto settleKernel(cl::sycl::handler&                                           cgh,
                   const int                                                    numSettles,
@@ -349,6 +350,7 @@ auto settleKernel(cl::sycl::handler&                                           c
 template<bool updateVelocities, bool computeVirial>
 class SettleKernelName;
 
+//! \brief SETTLE SYCL kernel launch code.
 template<bool updateVelocities, bool computeVirial, class... Args>
 static cl::sycl::event launchSettleKernel(const DeviceStream& deviceStream, int numSettles, Args&&... args)
 {
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
index 1843b2f75b..f7113a203f 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
@@ -55,6 +55,7 @@ class ScaleKernel;
 namespace gmx
 {
 
+//! \brief Function returning the scaling kernel lambda.
 static auto scaleKernel(cl::sycl::handler&                                         cgh,
                         DeviceAccessor<Float3, cl::sycl::access::mode::read_write> a_x,
                         const ScalingMatrix                                        scalingMatrix)
diff --git a/src/gromacs/mdtypes/state.h b/src/gromacs/mdtypes/state.h
index df3691adfe..81c9d13a6b 100644
--- a/src/gromacs/mdtypes/state.h
+++ b/src/gromacs/mdtypes/state.h
@@ -88,7 +88,7 @@ using PaddedHostVector = gmx::PaddedHostVector<T>;
  * Currently the random seeds for SD and BD are missing.
  */
 
-/* \brief Enum for all entries in \p t_state
+/*! \brief Enum for all entries in \p t_state
  *
  * These enums are used in flags as (1<<est...).
  * The order of these enums should not be changed,
diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp
index 75b44dfdcb..9c4b7d4108 100644
--- a/src/gromacs/nbnxm/grid.cpp
+++ b/src/gromacs/nbnxm/grid.cpp
@@ -99,7 +99,7 @@ static real gridAtomDensity(int numAtoms, const rvec lowerCorner, const rvec upp
     return static_cast<real>(numAtoms) / (size[XX] * size[YY] * size[ZZ]);
 }
 
-// Get approximate dimensions of each cell. Returns the length along X and Y.
+//! \brief Get approximate dimensions of each cell. Returns the length along X and Y.
 static std::array<real, DIM - 1> getTargetCellLength(const Grid::Geometry& geometry, const real atomDensity)
 {
     if (geometry.isSimple)
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
index ae703bdb16..7304922c09 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -70,9 +70,9 @@ struct EnergyFunctionProperties {
     static constexpr bool elecEwaldTab =
             (elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB
     static constexpr bool elecEwaldTwin =
-            (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin);
-    static constexpr bool elecEwald        = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
-    static constexpr bool vdwCombLB        = (vdwType == VdwType::CutCombLB);
+            (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin); ///< Use twin cut-off.
+    static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab);  ///< EL_EWALD_ANY
+    static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB); ///< LJ_COMB && !LJ_COMB_GEOM
     static constexpr bool vdwCombGeom      = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM
     static constexpr bool vdwComb          = (vdwCombLB || vdwCombGeom);        ///< LJ_COMB
     static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM
@@ -87,9 +87,6 @@ struct EnergyFunctionProperties {
 template<enum VdwType vdwType>
 constexpr bool ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb;
 
-template<enum ElecType elecType> // Yes, ElecType
-constexpr bool vdwCutoffCheck = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTwin;
-
 template<enum ElecType elecType>
 constexpr bool elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald;
 
@@ -104,6 +101,7 @@ using cl::sycl::access::fence_space;
 using cl::sycl::access::mode;
 using cl::sycl::access::target;
 
+//! \brief Convert \p sigma and \p epsilon VdW parameters to \c c6,c12 pair.
 static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon)
 {
     const float sigma2 = sigma * sigma;
@@ -111,9 +109,10 @@ static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float e
     const float c6     = epsilon * sigma6;
     const float c12    = c6 * sigma6;
 
-    return Float2(c6, c12);
+    return { c6, c12 };
 }
 
+//! \brief Calculate force and energy for a pair of atoms, VdW force-switch flavor.
 template<bool doCalcEnergies>
 static inline void ljForceSwitch(const shift_consts_t         dispersionShift,
                                  const shift_consts_t         repulsionShift,
@@ -342,10 +341,20 @@ static inline void reduceForceJShuffle(Float3                             f,
     }
 }
 
-// This function also requires sm_buf to have a length of at least 1.
-// The function returns:
-//     - for thread #0 in the group: sum of all valueToReduce in a group
-//     - for other threads: unspecified
+/*!
+ * \brief Do workgroup-level reduction of a single \c float.
+ *
+ * While SYCL has \c sycl::reduce_over_group, it currently (oneAPI 2021.3.0) uses a very large
+ * shared memory buffer, which leads to a reduced occupancy.
+ *
+ * \tparam subGroupSize Size of a sub-group.
+ * \tparam groupSize Size of a work-group.
+ * \param itemIdx Current thread's \c sycl::nd_item.
+ * \param tidxi Current thread's linearized local index.
+ * \param sm_buf Accessor for local reduction buffer.
+ * \param valueToReduce Current thread's value. Must have length of at least 1.
+ * \return For thread with \p tidxi 0: sum of all \p valueToReduce. Other threads: unspecified.
+ */
 template<int subGroupSize, int groupSize>
 static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx,
                                 const unsigned int         tidxi,
@@ -434,7 +443,7 @@ static inline void reduceForceJ(cl::sycl::accessor<float, 1, mode::read_write, t
 
 /*! \brief Final i-force reduction.
  *
- * Reduce c_nbnxnGpuNumClusterPerSupercluster i-force componets stored in \p fCiBuf[]
+ * Reduce c_nbnxnGpuNumClusterPerSupercluster i-force components stored in \p fCiBuf[]
  * accumulating atomically into \p a_f.
  * If \p calcFShift is true, further reduce shift forces and atomically accumulate into \p a_fShift.
  *
@@ -642,7 +651,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
             (props.elecEwald || props.elecRF || props.vdwEwald || (props.elecCutoff && doCalcEnergies));
 
     // The post-prune j-i cluster-pair organization is linked to how exclusion and interaction mask data is stored.
-    // Currently this is ideally suited for 32-wide subgroup size but slightly less so for others,
+    // Currently, this is ideally suited for 32-wide subgroup size but slightly less so for others,
     // e.g. subGroupSize > prunedClusterPairSize on AMD GCN / CDNA.
     // Hence, the two are decoupled.
     // When changing this code, please update requiredSubGroupSizeForNbnxm in src/gromacs/hardware/device_management_sycl.cpp.
@@ -996,7 +1005,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                                 if constexpr (props.elecRF)
                                 {
                                     energyElec +=
-                                            qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF);
+                                            qi * qj * (pairExclMask * rInv + 0.5F * twoKRf * r2 - cRF);
                                 }
                                 if constexpr (props.elecEwald)
                                 {
@@ -1030,7 +1039,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
         } // for (int j4 = cij4Start; j4 < cij4End; j4 += 1)
 
         /* skip central shifts when summing shift forces */
-        const bool doCalcShift = (calcShift && !(nbSci.shift == gmx::c_centralShiftIndex));
+        const bool doCalcShift = (calcShift && nbSci.shift != gmx::c_centralShiftIndex);
 
         reduceForceIAndFShift(
                 sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift);
@@ -1051,6 +1060,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
     };
 }
 
+//! \brief NBNXM kernel launch code.
 template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
 cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args)
 {
@@ -1077,6 +1087,7 @@ cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int nu
     return e;
 }
 
+//! \brief Select templated kernel and launch it.
 template<class... Args>
 cl::sycl::event chooseAndLaunchNbnxmKernel(bool          doPruneNBL,
                                            bool          doCalcEnergies,
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h
index dd7f52138d..bb256a8482 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h
@@ -53,6 +53,12 @@ namespace Nbnxm
 {
 using gmx::InteractionLocality;
 
+/*! Launch SYCL NBNXM kernel.
+ *
+ * \param nb Non-bonded parameters.
+ * \param stepWork Workload flags for the current step.
+ * \param iloc Interaction locality.
+ */
 void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc);
 
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
index 7770915c4d..1562cbc6fd 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
@@ -220,6 +220,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
     };
 }
 
+//! \brief Leap Frog SYCL prune-only kernel launch code.
 template<bool haveFreshList, class... Args>
 cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
                                            const int           numSciInPart,
@@ -247,6 +248,7 @@ cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
     return e;
 }
 
+//! \brief Select templated kernel and launch it.
 template<class... Args>
 cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args)
 {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h
index 6af97b1d1e..da2e96b996 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h
@@ -52,6 +52,14 @@ namespace Nbnxm
 {
 using gmx::InteractionLocality;
 
+/*! Launch SYCL NBNXM prune-only kernel.
+ *
+ * \param nb Non-bonded parameters.
+ * \param iloc Interaction locality.
+ * \param numParts Total number of rolling-prune parts.
+ * \param part Number of the part to prune.
+ * \param numSciInPart Number of superclusters in \p part.
+ */
 void launchNbnxmKernelPruneOnly(NbnxmGpu*                 nb,
                                 const InteractionLocality iloc,
                                 const int                 numParts,
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
index c316555d0f..53633da4be 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -49,11 +49,13 @@ namespace Nbnxm
 {
 
 #ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
+//! \brief Default for the prune kernel's j4 processing concurrency.
 #    define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4
 #endif
-/*! \brief Macro defining default for the prune kernel's j4 processing concurrency.
+
+/*! \brief Prune kernel's j4 processing concurrency.
  *
- *  The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
+ *  The \c GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
  */
 static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
 
@@ -82,14 +84,13 @@ static constexpr float c_oneTwelfth = 0.08333333F;
 //! \brief Convert 3D range to 1D
 static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d)
 {
-    return cl::sycl::range<1>(range3d.size());
+    return { range3d.size() };
 }
 
 //! \brief Convert 3D nd_range to 1D
 static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d)
 {
-    return cl::sycl::nd_range<1>(flattenRange(nd_range3d.get_global_range()),
-                                 flattenRange(nd_range3d.get_local_range()));
+    return { flattenRange(nd_range3d.get_global_range()), flattenRange(nd_range3d.get_local_range()) };
 }
 
 //! \brief Convert flattened 1D index to 3D
@@ -100,7 +101,7 @@ static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d)
     const unsigned     id      = id1d[0];
     const unsigned     z       = id / rangeXY;
     const unsigned     xy      = id % rangeXY;
-    return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
+    return { xy % rangeX, xy / rangeX, z };
 }
 
 } // namespace Nbnxm
diff --git a/src/gromacs/utility/template_mp.h b/src/gromacs/utility/template_mp.h
index acd3744f02..902e1deef3 100644
--- a/src/gromacs/utility/template_mp.h
+++ b/src/gromacs/utility/template_mp.h
@@ -54,12 +54,6 @@
 namespace gmx
 {
 
-template<class Function>
-auto dispatchTemplatedFunction(Function&& f)
-{
-    return std::forward<Function>(f)();
-}
-
 /*! \internal \brief
  * Helper function to select appropriate template based on runtime values.
  *
@@ -85,7 +79,19 @@ auto dispatchTemplatedFunction(Function&& f)
             p0, p1, p2);
     }
  * \endcode
- */
+ *
+ * \tparam Function Type of \p f.
+ * \param f Function to call.
+ * \return The result of calling \c f().
+*/
+template<class Function>
+auto dispatchTemplatedFunction(Function&& f)
+{
+    return std::forward<Function>(f)();
+}
+
+// Recursive templates confuse Doxygen
+//! \cond
 template<class Function, class Enum, class... Enums>
 auto dispatchTemplatedFunction(Function&& f, Enum e, Enums... es)
 {
@@ -110,6 +116,7 @@ auto dispatchTemplatedFunction(Function&& f, bool e, Enums... es)
             },
             es...);
 }
+//! \endcond
 
 } // namespace gmx
 
-- 
2.22.0