From 00bbe9d48d731f828b19f85b5c706f4114325a88 Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Tue, 14 Sep 2021 20:50:47 +0300 Subject: [PATCH] Minor fixes to comments and Doxygen - Fix issue number in cmake/gmxManageNvccConfig.cmake. - Fix use of \tparam in CUDA code. --- cmake/gmxManageNvccConfig.cmake | 2 +- src/gromacs/ewald/pme_gather.cu | 26 +++++++++---------- .../ewald/pme_gpu_calculate_splines.cuh | 18 ++++++------- src/gromacs/ewald/pme_solve.cu | 6 ++--- src/gromacs/ewald/pme_spread.cu | 26 +++++++++---------- src/gromacs/gpu_utils/cuda_kernel_utils.cuh | 4 +-- 6 files changed, 41 insertions(+), 41 deletions(-) diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake index 5a83a80f4a..e72fdaca8c 100644 --- a/cmake/gmxManageNvccConfig.cmake +++ b/cmake/gmxManageNvccConfig.cmake @@ -51,7 +51,7 @@ # glibc source shows that _FORCE_INLINES is only used in this string.h # feature and performance of memcpy variants is unimportant for CUDA # code in GROMACS. So this workaround is good enough to keep problems -# away from users installing GROMACS. See Issue #1942. +# away from users installing GROMACS. See Issue #1982. function(work_around_glibc_2_23) try_compile(IS_GLIBC_2_23_OR_HIGHER ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/cmake/TestGlibcVersion.cpp) if(IS_GLIBC_2_23_OR_HIGHER) diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index ec1d9ecbcf..28de3817e5 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -68,10 +68,10 @@ __device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, con /*! \brief Reduce the partial force contributions. * - * \tparam[in] order The PME order (must be 4). - * \tparam[in] atomDataSize The number of partial force contributions for each atom (currently + * \tparam order The PME order (must be 4). + * \tparam atomDataSize The number of partial force contributions for each atom (currently * order^2 == 16) - * \tparam[in] blockSize The CUDA block size + * \tparam blockSize The CUDA block size * * \param[out] sm_forces Shared memory array with the output forces (number of elements * is number of atoms per block) @@ -220,10 +220,10 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force /*! \brief Calculate the sum of the force partial components (in X, Y and Z) * - * \tparam[in] order The PME order (must be 4). - * \tparam[in] atomsPerWarp The number of atoms per GPU warp. - * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension. - * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension. + * \tparam order The PME order (must be 4). + * \tparam atomsPerWarp The number of atoms per GPU warp. + * \tparam wrapX Tells if the grid is wrapped in the X dimension. + * \tparam wrapY Tells if the grid is wrapped in the Y dimension. * \param[out] fx The force partial component in the X dimension. * \param[out] fy The force partial component in the Y dimension. * \param[out] fz The force partial component in the Z dimension. @@ -335,12 +335,12 @@ __device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ * A CUDA kernel which gathers the atom forces from the grid. * The grid is assumed to be wrapped in dimension Z. * - * \tparam[in] order The PME order (must be 4 currently). - * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension. - * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension. - * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2. 
- * \tparam[in] readGlobal Tells if we should read spline values from global memory - * \tparam[in] threadsPerAtom How many threads work on each atom + * \tparam order The PME order (must be 4 currently). + * \tparam wrapX Tells if the grid is wrapped in the X dimension. + * \tparam wrapY Tells if the grid is wrapped in the Y dimension. + * \tparam numGrids The number of grids to use in the kernel. Can be 1 or 2. + * \tparam readGlobal Tells if we should read spline values from global memory + * \tparam threadsPerAtom How many threads work on each atom * * \param[in] kernelParams All the PME GPU data. */ diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh index 1ff60f7ed1..d8eefd43f0 100644 --- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh +++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh @@ -140,10 +140,10 @@ __device__ inline void assertIsFinite(T gmx_unused arg) /*! \brief * General purpose function for loading atom-related data from global to shared memory. * - * \tparam[in] T Data type (float/int/...) - * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in + * \tparam T Data type (float/int/...) + * \tparam atomsPerBlock Number of atoms processed by a block - should be accounted for in * the size of the shared memory array. - * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec + * \tparam dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec * coordinates array). * \param[out] sm_destination Shared memory array for output. * \param[in] gm_source Global memory array for input. @@ -169,16 +169,16 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_desti * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines(). * First stage of the whole kernel. * - * \tparam[in] order PME interpolation order. - * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for + * \tparam order PME interpolation order. + * \tparam atomsPerBlock Number of atoms processed by a block - should be accounted for * in the sizes of the shared memory arrays. - * \tparam[in] atomsPerWarp Number of atoms processed by a warp - * \tparam[in] writeSmDtheta Bool controlling if the theta derivative should be written to + * \tparam atomsPerWarp Number of atoms processed by a warp + * \tparam writeSmDtheta Bool controlling if the theta derivative should be written to * shared memory. Enables calculation of dtheta if set. - * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should + * \tparam writeGlobal A boolean which tells if the theta values and gridlines should * be written to global memory. Enables calculation of dtheta if * set. - * \tparam[in] numGrids The number of grids using the splines. + * \tparam numGrids The number of grids using the splines. * \param[in] kernelParams Input PME CUDA data in constant memory. * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory. * \param[in] atomX Atom coordinate of atom processed by thread. diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu index 83e21b1f11..42dbf9c0ec 100644 --- a/src/gromacs/ewald/pme_solve.cu +++ b/src/gromacs/ewald/pme_solve.cu @@ -52,9 +52,9 @@ /*! \brief * PME complex grid solver kernel function. * - * \tparam[in] gridOrdering Specifies the dimension ordering of the complex grid. 
- * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be computed. - * \tparam[in] gridIndex The index of the grid to use in the kernel. + * \tparam gridOrdering Specifies the dimension ordering of the complex grid. + * \tparam computeEnergyAndVirial Tells if the reciprocal energy and virial should be computed. + * \tparam gridIndex The index of the grid to use in the kernel. * \param[in] kernelParams Input PME CUDA data in constant memory. */ template diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index 4765c9c4c8..d0856602a9 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -69,11 +69,11 @@ * This corresponds to the CPU function spread_coefficients_bsplines_thread(). * Optional second stage of the spline_and_spread_kernel. * - * \tparam[in] order PME interpolation order. - * \tparam[in] wrapX Whether the grid overlap in dimension X should be wrapped. - * \tparam[in] wrapY Whether the grid overlap in dimension Y should be wrapped. - * \tparam[in] gridIndex The index of the grid to use in the kernel. - * \tparam[in] threadsPerAtom How many threads work on each atom + * \tparam order PME interpolation order. + * \tparam wrapX Whether the grid overlap in dimension X should be wrapped. + * \tparam wrapY Whether the grid overlap in dimension Y should be wrapped. + * \tparam gridIndex The index of the grid to use in the kernel. + * \tparam threadsPerAtom How many threads work on each atom * * \param[in] kernelParams Input PME CUDA data in constant memory. * \param[in] atomCharge Atom charge/coefficient of atom processed by thread. @@ -169,15 +169,15 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kern * writeGlobal should be used removing the need to recalculate the theta values in the gather kernel. * Similarly for useOrderThreads large systems order threads per atom gives higher performance than order*order threads * - * \tparam[in] order PME interpolation order. - * \tparam[in] computeSplines A boolean which tells if the spline parameter and + * \tparam order PME interpolation order. + * \tparam computeSplines A boolean which tells if the spline parameter and * gridline indices' computation should be performed. - * \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed. - * \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped. - * \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped. - * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2. - * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory. - * \tparam[in] threadsPerAtom How many threads work on each atom + * \tparam spreadCharges A boolean which tells if the charge spreading should be performed. + * \tparam wrapX A boolean which tells if the grid overlap in dimension X should be wrapped. + * \tparam wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped. + * \tparam numGrids The number of grids to use in the kernel. Can be 1 or 2. + * \tparam writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory. + * \tparam threadsPerAtom How many threads work on each atom * \param[in] kernelParams Input PME CUDA data in constant memory. 
*/ template diff --git a/src/gromacs/gpu_utils/cuda_kernel_utils.cuh b/src/gromacs/gpu_utils/cuda_kernel_utils.cuh index 0333d84ae8..b8497c68f1 100644 --- a/src/gromacs/gpu_utils/cuda_kernel_utils.cuh +++ b/src/gromacs/gpu_utils/cuda_kernel_utils.cuh @@ -58,7 +58,7 @@ __device__ __forceinline__ T LDG(const T* ptr) /*! \brief Fetch the value by \p index from the texture object. * - * \tparam[in] T Raw data type + * \tparam T Raw data type * \param[in] texObj Table texture object * \param[in] index Non-negative element index * \returns The value from the table at \p index @@ -77,7 +77,7 @@ static __forceinline__ __device__ T fetchFromTexture(const cudaTextureObject_t t * Depending on what is supported, it fetches parameters either * using direct load or texture objects. * - * \tparam[in] T Raw data type + * \tparam T Raw data type * \param[in] d_ptr Device pointer to the raw table memory * \param[in] texObj Table texture object * \param[in] index Non-negative element index -- 2.22.0
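For reference beyond the patch itself: the Doxygen convention being enforced here is that direction attributes such as [in]/[out] apply only to \param; \tparam names a template parameter and accepts no direction specifier. A minimal illustrative sketch of the corrected style follows (the fetchElement function is hypothetical and not part of GROMACS):

/*! \brief Fetch one element from a device table (illustrative only).
 *
 * \tparam     T         Element type. Note: no [in] attribute, since \tparam
 *                       does not take direction specifiers.
 * \param[in]  gm_table  Device pointer to the table memory.
 * \param[in]  index     Non-negative element index.
 * \returns    The value stored at \p index.
 */
template<typename T>
static __forceinline__ __device__ T fetchElement(const T* gm_table, int index)
{
    return gm_table[index];
}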