From 6ff8982624c43e96f041dc00d0483ff37bb6a4d6 Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Tue, 24 Aug 2021 18:13:46 +0300 Subject: [PATCH] Add missing Doxygen for SYCL functions and some others Sometimes it was enough to convert existing comments to Doxygen. Now there are no SYCL-related warnings when building webpage. Also fixed a bit of trivial clang-tidy complaints. --- src/gromacs/compat/mp11.h | 6 ++- src/gromacs/gpu_utils/gputraits.h | 4 ++ src/gromacs/gpu_utils/gputraits_sycl.h | 1 + src/gromacs/gpu_utils/ocl_compiler.cpp | 4 +- src/gromacs/gpu_utils/sycl_kernel_utils.h | 6 +-- src/gromacs/hardware/device_information.h | 1 + .../gpuforcereduction_impl_internal_sycl.cpp | 4 +- src/gromacs/mdlib/leapfrog_gpu.cpp | 2 +- src/gromacs/mdlib/leapfrog_gpu_internal.h | 17 ++++++++ .../mdlib/leapfrog_gpu_internal_sycl.cpp | 4 +- src/gromacs/mdlib/lincs_gpu_internal.h | 11 +++++ .../mdlib/settle_gpu_internal_sycl.cpp | 2 + .../update_constrain_gpu_internal_sycl.cpp | 1 + src/gromacs/mdtypes/state.h | 2 +- src/gromacs/nbnxm/grid.cpp | 2 +- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 41 ++++++++++++------- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h | 6 +++ .../sycl/nbnxm_sycl_kernel_pruneonly.cpp | 2 + .../nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h | 8 ++++ .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h | 13 +++--- src/gromacs/utility/template_mp.h | 21 ++++++---- 21 files changed, 118 insertions(+), 40 deletions(-) diff --git a/src/gromacs/compat/mp11.h b/src/gromacs/compat/mp11.h index 439084ed9c..d474bbe9aa 100644 --- a/src/gromacs/compat/mp11.h +++ b/src/gromacs/compat/mp11.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -89,6 +89,8 @@ static auto mp_with_index(std::size_t i, F&& f) } } +// Doxygen does not like recursive templates. +//! \cond template 1)>::type* = nullptr> static auto mp_with_index(std::size_t i, F&& f) { @@ -101,7 +103,7 @@ static auto mp_with_index(std::size_t i, F&& f) return mp_with_index(i, std::forward(f)); } } - +//! \endcond } // namespace compat } // namespace gmx diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index 344b0427c1..3fb58eb8b4 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -81,6 +81,7 @@ struct Float4 namespace gmx { +//! Reinterpret-cast any pointer \p in to \c Float3, checking the type compatibility. template static inline Float3* asGenericFloat3Pointer(T* in) { @@ -90,6 +91,7 @@ static inline Float3* asGenericFloat3Pointer(T* in) return reinterpret_cast(in); } +//! Reinterpret-cast any const pointer \p in to \c Float3, checking the type compatibility. template static inline const Float3* asGenericFloat3Pointer(const T* in) { @@ -99,6 +101,7 @@ static inline const Float3* asGenericFloat3Pointer(const T* in) return reinterpret_cast(in); } +//! Reinterpret-cast any container \p in to \c Float3, checking the type compatibility. template static inline Float3* asGenericFloat3Pointer(C& in) { @@ -108,6 +111,7 @@ static inline Float3* asGenericFloat3Pointer(C& in) return reinterpret_cast(in.data()); } +//! Reinterpret-cast any const container \p in to \c Float3, checking the type compatibility. template static inline const Float3* asGenericFloat3Pointer(const C& in) { diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h index 9c64d3f303..b6b76c7a44 100644 --- a/src/gromacs/gpu_utils/gputraits_sycl.h +++ b/src/gromacs/gpu_utils/gputraits_sycl.h @@ -49,6 +49,7 @@ #include "gromacs/gpu_utils/gmxsycl.h" #include "gromacs/math/vectypes.h" +//! Type of device texture object. In SYCL, that would be \c sycl::image, but it's not used. using DeviceTexture = void*; //! \brief Single GPU call timing event, not used with SYCL diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp index c8ed6f19c5..98b02df93a 100644 --- a/src/gromacs/gpu_utils/ocl_compiler.cpp +++ b/src/gromacs/gpu_utils/ocl_compiler.cpp @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -168,7 +168,7 @@ static std::string selectCompilerOptions(DeviceVendor deviceVendor) compilerOptions += " -cl-opt-disable"; } - /* Fastmath imprves performance on all supported arch */ + /* Fastmath improves performance on all supported arch */ if (getenv("GMX_OCL_DISABLE_FASTMATH") == nullptr) { compilerOptions += " -cl-fast-relaxed-math"; diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h index 42f3437bfe..8a2d6bace1 100644 --- a/src/gromacs/gpu_utils/sycl_kernel_utils.h +++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h @@ -58,7 +58,7 @@ static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write : /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic; -// \brief Full warp active thread mask used in CUDA warp-level primitives. +//! \brief Full warp active thread mask used in CUDA warp-level primitives. static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff; /*! \brief Convenience wrapper to do atomic addition to a global buffer. @@ -91,9 +91,9 @@ static inline void atomicFetchAdd(DeviceAccessor acc, const #endif } -/* \brief Issue an intra sub-group barrier. +/*! \brief Issue an intra sub-group barrier. * - * Equivalent with CUDA syncwarp(c_cudaFullWarpMask). + * Equivalent with CUDA's \c syncwarp(c_cudaFullWarpMask). * */ static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx) diff --git a/src/gromacs/hardware/device_information.h b/src/gromacs/hardware/device_information.h index 079be8be45..a03e915972 100644 --- a/src/gromacs/hardware/device_information.h +++ b/src/gromacs/hardware/device_information.h @@ -64,6 +64,7 @@ //! Constant used to help minimize preprocessed code static constexpr bool c_binarySupportsGpus = (GMX_GPU != 0); +//! Whether \ref DeviceInformation can be serialized for sending via MPI. static constexpr bool c_canSerializeDeviceInformation = (!GMX_GPU_OPENCL && !GMX_GPU_SYCL); /*NOLINT(misc-redundant-expression)*/ diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp index 0ccbc98e09..bd76fbf7d6 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp @@ -63,6 +63,7 @@ namespace gmx using cl::sycl::access::mode; +//! \brief Function returning the force reduction kernel lambda. template static auto reduceKernel(cl::sycl::handler& cgh, DeviceAccessor a_nbnxmForce, @@ -97,6 +98,7 @@ static auto reduceKernel(cl::sycl::handler& cgh, }; } +//! \brief Force reduction SYCL kernel launch code. template static void launchReductionKernel_(const int numAtoms, const int atomStart, @@ -119,7 +121,7 @@ static void launchReductionKernel_(const int numAtoms, }); } -/*! \brief Select templated kernel and launch it. */ +/*! \brief Select templated Force reduction kernel and launch it. */ void launchForceReductionKernel(int numAtoms, int atomStart, bool addRvecForce, diff --git a/src/gromacs/mdlib/leapfrog_gpu.cpp b/src/gromacs/mdlib/leapfrog_gpu.cpp index b07c167c7c..1d9167ec79 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cpp +++ b/src/gromacs/mdlib/leapfrog_gpu.cpp @@ -162,7 +162,7 @@ void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsig copyToDeviceBuffer( &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr); - // Temperature scale group map only used if there are more then one group + // Temperature scale group map only used if there are more than one group if (numTempScaleValues_ > 1) { reallocateDeviceBuffer( diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal.h b/src/gromacs/mdlib/leapfrog_gpu_internal.h index 76330fe180..44c9ca6804 100644 --- a/src/gromacs/mdlib/leapfrog_gpu_internal.h +++ b/src/gromacs/mdlib/leapfrog_gpu_internal.h @@ -50,6 +50,23 @@ namespace gmx { +/*! \brief Backend-specific function to launch GPU Leap Frog kernel. + * + * \param numAtoms Total number of atoms. + * \param[in,out] d_x Buffer containing initial coordinates, and where the updated ones will be written. + * \param[out] d_xp Buffer where a copy of the initial coordinates will be written. + * \param[in,out] d_v Buffer containing initial velocities, and where the updated ones will be written. + * \param[in] d_f Buffer containing forces. + * \param[in] d_inverseMasses Buffer containing atoms' reciprocal masses. + * \param dt Timestep. + * \param doTemperatureScaling Whether temperature scaling is needed. + * \param numTempScaleValues Number of different T-couple values. + * \param d_tempScaleGroups Mapping of atoms into temperature scaling groups. + * \param d_lambdas Temperature scaling factors (one per group). + * \param prVelocityScalingType Type of Parrinello-Rahman velocity rescaling. + * \param prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix. + * \param deviceStream Device stream for kernel launch. + */ void launchLeapFrogKernel(int numAtoms, DeviceBuffer d_x, DeviceBuffer d_xp, diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp index d0d019c25e..b5572dcfe1 100644 --- a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp @@ -83,7 +83,7 @@ using cl::sycl::access::mode; * \param[in] dt Timestep. * \param[in] a_lambdas Temperature scaling factors (one per group). * \param[in] a_tempScaleGroups Mapping of atoms into groups. - * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix + * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix. */ template auto leapFrogKernel( @@ -160,6 +160,7 @@ auto leapFrogKernel( }; } +//! \brief Leap Frog SYCL kernel launch code. template static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args) { @@ -178,6 +179,7 @@ static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, in return e; } +//! Convert \p doTemperatureScaling and \p numTempScaleValues to \ref NumTempScaleValues. static NumTempScaleValues getTempScalingType(bool doTemperatureScaling, int numTempScaleValues) { if (!doTemperatureScaling) diff --git a/src/gromacs/mdlib/lincs_gpu_internal.h b/src/gromacs/mdlib/lincs_gpu_internal.h index cd421c198c..db44205fb3 100644 --- a/src/gromacs/mdlib/lincs_gpu_internal.h +++ b/src/gromacs/mdlib/lincs_gpu_internal.h @@ -57,6 +57,17 @@ struct LincsGpuKernelParameters; //! Number of threads in a GPU block constexpr static int c_threadsPerBlock = 256; +/*! \brief Backend-specific function to launch LINCS kernel. + * + * \param kernelParams LINCS parameters. + * \param d_x Initial coordinates before the integration. + * \param d_xp Coordinates after the integration which will be updated. + * \param updateVelocities Whether to also update velocities. + * \param d_v Velocities to update (ignored if \p updateVelocities is \c false). + * \param invdt Reciprocal of timestep. + * \param computeVirial Whether to compute the virial. + * \param deviceStream Device stream for kernel launch. + */ void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams, const DeviceBuffer& d_x, DeviceBuffer d_xp, diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp index 340b39bb73..3e5b56fdf3 100644 --- a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp @@ -59,6 +59,7 @@ using cl::sycl::access::target; //! Number of work-items in a work-group constexpr static int sc_workGroupSize = 256; +//! \brief Function returning the SETTLE kernel lambda. template auto settleKernel(cl::sycl::handler& cgh, const int numSettles, @@ -349,6 +350,7 @@ auto settleKernel(cl::sycl::handler& c template class SettleKernelName; +//! \brief SETTLE SYCL kernel launch code. template static cl::sycl::event launchSettleKernel(const DeviceStream& deviceStream, int numSettles, Args&&... args) { diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp index 1843b2f75b..f7113a203f 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp @@ -55,6 +55,7 @@ class ScaleKernel; namespace gmx { +//! \brief Function returning the scaling kernel lambda. static auto scaleKernel(cl::sycl::handler& cgh, DeviceAccessor a_x, const ScalingMatrix scalingMatrix) diff --git a/src/gromacs/mdtypes/state.h b/src/gromacs/mdtypes/state.h index df3691adfe..81c9d13a6b 100644 --- a/src/gromacs/mdtypes/state.h +++ b/src/gromacs/mdtypes/state.h @@ -88,7 +88,7 @@ using PaddedHostVector = gmx::PaddedHostVector; * Currently the random seeds for SD and BD are missing. */ -/* \brief Enum for all entries in \p t_state +/*! \brief Enum for all entries in \p t_state * * These enums are used in flags as (1<(numAtoms) / (size[XX] * size[YY] * size[ZZ]); } -// Get approximate dimensions of each cell. Returns the length along X and Y. +//! \brief Get approximate dimensions of each cell. Returns the length along X and Y. static std::array getTargetCellLength(const Grid::Geometry& geometry, const real atomDensity) { if (geometry.isSimple) diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index ae703bdb16..7304922c09 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -70,9 +70,9 @@ struct EnergyFunctionProperties { static constexpr bool elecEwaldTab = (elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB static constexpr bool elecEwaldTwin = - (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin); - static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY - static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB); + (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin); ///< Use twin cut-off. + static constexpr bool elecEwald = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY + static constexpr bool vdwCombLB = (vdwType == VdwType::CutCombLB); ///< LJ_COMB && !LJ_COMB_GEOM static constexpr bool vdwCombGeom = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM static constexpr bool vdwComb = (vdwCombLB || vdwCombGeom); ///< LJ_COMB static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM @@ -87,9 +87,6 @@ struct EnergyFunctionProperties { template constexpr bool ljComb = EnergyFunctionProperties().vdwComb; -template // Yes, ElecType -constexpr bool vdwCutoffCheck = EnergyFunctionProperties().elecEwaldTwin; - template constexpr bool elecEwald = EnergyFunctionProperties().elecEwald; @@ -104,6 +101,7 @@ using cl::sycl::access::fence_space; using cl::sycl::access::mode; using cl::sycl::access::target; +//! \brief Convert \p sigma and \p epsilon VdW parameters to \c c6,c12 pair. static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon) { const float sigma2 = sigma * sigma; @@ -111,9 +109,10 @@ static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float e const float c6 = epsilon * sigma6; const float c12 = c6 * sigma6; - return Float2(c6, c12); + return { c6, c12 }; } +//! \brief Calculate force and energy for a pair of atoms, VdW force-switch flavor. template static inline void ljForceSwitch(const shift_consts_t dispersionShift, const shift_consts_t repulsionShift, @@ -342,10 +341,20 @@ static inline void reduceForceJShuffle(Float3 f, } } -// This function also requires sm_buf to have a length of at least 1. -// The function returns: -// - for thread #0 in the group: sum of all valueToReduce in a group -// - for other threads: unspecified +/*! + * \brief Do workgroup-level reduction of a single \c float. + * + * While SYCL has \c sycl::reduce_over_group, it currently (oneAPI 2021.3.0) uses a very large + * shared memory buffer, which leads to a reduced occupancy. + * + * \tparam subGroupSize Size of a sub-group. + * \tparam groupSize Size of a work-group. + * \param itemIdx Current thread's \c sycl::nd_item. + * \param tidxi Current thread's linearized local index. + * \param sm_buf Accessor for local reduction buffer. + * \param valueToReduce Current thread's value. Must have length of at least 1. + * \return For thread with \p tidxi 0: sum of all \p valueToReduce. Other threads: unspecified. + */ template static inline float groupReduce(const cl::sycl::nd_item<1> itemIdx, const unsigned int tidxi, @@ -434,7 +443,7 @@ static inline void reduceForceJ(cl::sycl::accessor prunedClusterPairSize on AMD GCN / CDNA. // Hence, the two are decoupled. // When changing this code, please update requiredSubGroupSizeForNbnxm in src/gromacs/hardware/device_management_sycl.cpp. @@ -996,7 +1005,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh, if constexpr (props.elecRF) { energyElec += - qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF); + qi * qj * (pairExclMask * rInv + 0.5F * twoKRf * r2 - cRF); } if constexpr (props.elecEwald) { @@ -1030,7 +1039,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh, } // for (int j4 = cij4Start; j4 < cij4End; j4 += 1) /* skip central shifts when summing shift forces */ - const bool doCalcShift = (calcShift && !(nbSci.shift == gmx::c_centralShiftIndex)); + const bool doCalcShift = (calcShift && nbSci.shift != gmx::c_centralShiftIndex); reduceForceIAndFShift( sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift); @@ -1051,6 +1060,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh, }; } +//! \brief NBNXM kernel launch code. template cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args) { @@ -1077,6 +1087,7 @@ cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int nu return e; } +//! \brief Select templated kernel and launch it. template cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL, bool doCalcEnergies, diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h index dd7f52138d..bb256a8482 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h @@ -53,6 +53,12 @@ namespace Nbnxm { using gmx::InteractionLocality; +/*! Launch SYCL NBNXM kernel. + * + * \param nb Non-bonded parameters. + * \param stepWork Workload flags for the current step. + * \param iloc Interaction locality. + */ void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc); } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index 7770915c4d..1562cbc6fd 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -220,6 +220,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, }; } +//! \brief Leap Frog SYCL prune-only kernel launch code. template cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream, const int numSciInPart, @@ -247,6 +248,7 @@ cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream, return e; } +//! \brief Select templated kernel and launch it. template cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args) { diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h index 6af97b1d1e..da2e96b996 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h @@ -52,6 +52,14 @@ namespace Nbnxm { using gmx::InteractionLocality; +/*! Launch SYCL NBNXM prune-only kernel. + * + * \param nb Non-bonded parameters. + * \param iloc Interaction locality. + * \param numParts Total number of rolling-prune parts. + * \param part Number of the part to prune. + * \param numSciInPart Number of superclusters in \p part. + */ void launchNbnxmKernelPruneOnly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts, diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h index c316555d0f..53633da4be 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h @@ -49,11 +49,13 @@ namespace Nbnxm { #ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY +//! \brief Default for the prune kernel's j4 processing concurrency. # define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4 #endif -/*! \brief Macro defining default for the prune kernel's j4 processing concurrency. + +/*! \brief Prune kernel's j4 processing concurrency. * - * The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. + * The \c GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override. */ static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY; @@ -82,14 +84,13 @@ static constexpr float c_oneTwelfth = 0.08333333F; //! \brief Convert 3D range to 1D static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d) { - return cl::sycl::range<1>(range3d.size()); + return { range3d.size() }; } //! \brief Convert 3D nd_range to 1D static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d) { - return cl::sycl::nd_range<1>(flattenRange(nd_range3d.get_global_range()), - flattenRange(nd_range3d.get_local_range())); + return { flattenRange(nd_range3d.get_global_range()), flattenRange(nd_range3d.get_local_range()) }; } //! \brief Convert flattened 1D index to 3D @@ -100,7 +101,7 @@ static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d) const unsigned id = id1d[0]; const unsigned z = id / rangeXY; const unsigned xy = id % rangeXY; - return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z); + return { xy % rangeX, xy / rangeX, z }; } } // namespace Nbnxm diff --git a/src/gromacs/utility/template_mp.h b/src/gromacs/utility/template_mp.h index acd3744f02..902e1deef3 100644 --- a/src/gromacs/utility/template_mp.h +++ b/src/gromacs/utility/template_mp.h @@ -54,12 +54,6 @@ namespace gmx { -template -auto dispatchTemplatedFunction(Function&& f) -{ - return std::forward(f)(); -} - /*! \internal \brief * Helper function to select appropriate template based on runtime values. * @@ -85,7 +79,19 @@ auto dispatchTemplatedFunction(Function&& f) p0, p1, p2); } * \endcode - */ + * + * \tparam Function Type of \p f. + * \param f Function to call. + * \return The result of calling \c f(). +*/ +template +auto dispatchTemplatedFunction(Function&& f) +{ + return std::forward(f)(); +} + +// Recursive templates confuse Doxygen +//! \cond template auto dispatchTemplatedFunction(Function&& f, Enum e, Enums... es) { @@ -110,6 +116,7 @@ auto dispatchTemplatedFunction(Function&& f, bool e, Enums... es) }, es...); } +//! \endcond } // namespace gmx -- 2.22.0