From 04c763fb1d0293afd87065abed3b4fd6036bc72a Mon Sep 17 00:00:00 2001 From: Paul Bauer Date: Mon, 24 May 2021 12:58:44 +0000 Subject: [PATCH] Apply clang-tidy-11 fixes to CUDA files --- src/.clang-tidy | 10 ++- src/gromacs/domdec/gpuhaloexchange_impl.cu | 16 +--- .../ewald/pme_coordinate_receiver_gpu_impl.cu | 1 + .../ewald/pme_force_sender_gpu_impl.cu | 1 + src/gromacs/ewald/pme_force_sender_gpu_impl.h | 1 + src/gromacs/ewald/pme_gather.cu | 32 ++++--- src/gromacs/ewald/pme_gpu_3dfft.cu | 6 +- .../ewald/pme_gpu_calculate_splines.cuh | 30 +++---- src/gromacs/ewald/pme_gpu_internal.cpp | 4 +- src/gromacs/ewald/pme_gpu_program_impl.cu | 6 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.cu | 3 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.h | 1 - src/gromacs/ewald/pme_solve.cu | 34 ++++---- src/gromacs/gpu_utils/cuda_kernel_utils.cuh | 3 +- src/gromacs/gpu_utils/cudautils.cuh | 24 +++--- src/gromacs/gpu_utils/devicebuffer.cuh | 29 +++++-- src/gromacs/gpu_utils/devicebuffer_sycl.h | 2 +- src/gromacs/gpu_utils/gpu_utils.cu | 9 +- src/gromacs/gpu_utils/gpuregiontimer.cuh | 4 +- src/gromacs/gpu_utils/pmalloc.cu | 2 +- src/gromacs/gpu_utils/tests/hostallocator.cpp | 2 +- .../gpu_utils/tests/typecasts_runner.cpp | 12 +-- .../gpu_utils/tests/typecasts_runner.cu | 13 +-- .../gpu_utils/tests/typecasts_runner.h | 13 +-- src/gromacs/gpu_utils/vectype_ops.cuh | 27 +++--- src/gromacs/hardware/device_management.cu | 8 +- src/gromacs/listed_forces/gpubonded_impl.cu | 2 +- src/gromacs/listed_forces/gpubonded_impl.h | 4 +- src/gromacs/listed_forces/gpubondedkernels.cu | 76 ++++++++-------- src/gromacs/mdlib/gpuforcereduction_impl.cpp | 8 +- src/gromacs/mdlib/gpuforcereduction_impl.h | 12 +-- .../mdlib/gpuforcereduction_impl_internal.cu | 1 - src/gromacs/mdlib/leapfrog_gpu.h | 12 +-- src/gromacs/mdlib/lincs_gpu.cpp | 26 +++--- src/gromacs/mdlib/lincs_gpu.h | 16 ++-- src/gromacs/mdlib/lincs_gpu_internal.cu | 46 +++++----- src/gromacs/mdlib/lincs_gpu_internal.h | 16 ++-- src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp | 4 +- src/gromacs/mdlib/settle_gpu.cpp | 18 ++-- src/gromacs/mdlib/settle_gpu.h | 16 ++-- src/gromacs/mdlib/settle_gpu_internal.cu | 36 ++++---- src/gromacs/mdlib/settle_gpu_internal.h | 24 +++--- .../mdlib/settle_gpu_internal_sycl.cpp | 8 +- .../mdlib/update_constrain_gpu_impl.cpp | 4 +- src/gromacs/mdlib/update_constrain_gpu_impl.h | 2 +- .../mdlib/update_constrain_gpu_internal.cu | 2 +- .../mdlib/update_constrain_gpu_internal.h | 2 +- .../update_constrain_gpu_internal_sycl.cpp | 2 +- .../mdtypes/state_propagator_data_gpu.h | 6 +- .../state_propagator_data_gpu_impl.cpp | 6 +- .../mdtypes/state_propagator_data_gpu_impl.h | 8 +- .../state_propagator_data_gpu_impl_gpu.cpp | 14 +-- src/gromacs/nbnxm/cuda/.clang-tidy | 8 ++ .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 2 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh | 44 +++++----- .../cuda/nbnxm_cuda_kernel_pruneonly.cuh | 14 +-- .../nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh | 86 ++++++++++--------- .../cuda/nbnxm_gpu_buffer_ops_internal.cu | 2 +- .../nbnxm/nbnxm_gpu_buffer_ops_internal.h | 3 +- src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 8 +- src/gromacs/pbcutil/pbc_aiuc.h | 26 +++--- src/gromacs/pbcutil/pbc_aiuc_cuda.cuh | 1 + 62 files changed, 448 insertions(+), 410 deletions(-) create mode 100644 src/gromacs/nbnxm/cuda/.clang-tidy diff --git a/src/.clang-tidy b/src/.clang-tidy index 4cfa2bddc8..252ccd00dd 100644 --- a/src/.clang-tidy +++ b/src/.clang-tidy @@ -35,6 +35,12 @@ # # -misc-no-recursion # We have way too many functions and methods relying on recursion 
+#
+# -readability-static-accessed-through-instance
+# The GPU code in particular uses this in many places.
+#
+# -performance-type-promotion-in-math-fn
+# Those warnings are in the GPU code and we shouldn't need to worry about them.
 Checks:  clang-diagnostic-*,-clang-analyzer-*,-clang-analyzer-security.insecureAPI.strcpy,
          bugprone-*,misc-*,readability-*,performance-*,mpi-*,
          -readability-inconsistent-declaration-parameter-name,
@@ -60,7 +66,9 @@ Checks:  clang-diagnostic-*,-clang-analyzer-*,-clang-analyzer-security.insecureA
          -bugprone-narrowing-conversions,
          -google-readability-avoid-underscore-in-googletest-name,
          -cppcoreguidelines-init-variables,
-         -misc-no-recursion
+         -misc-no-recursion,
+         -readability-static-accessed-through-instance,
+         -performance-type-promotion-in-math-fn
 HeaderFilterRegex: .*
 CheckOptions:
   - key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index f32a95bae7..3a245e21e4 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -97,8 +97,6 @@ __global__ void packSendBufKernel(float3* __restrict__ dataPacked,
             *gm_dataDest = *gm_dataSrc;
         }
     }
-
-    return;
 }
 
 /*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index
@@ -128,8 +126,6 @@ __global__ void unpackRecvBufKernel(float3* __restrict__ data,
             *gm_dataDest = *gm_dataSrc;
         }
     }
-
-    return;
 }
 
 void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
@@ -251,8 +247,6 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
 
     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
     wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
-
-    return;
 }
 
 void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
@@ -265,12 +259,12 @@ void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynch
 
     // Similarly send event to task that will push data to this task.
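The NOLINT(bugprone-sizeof-expression) annotations below silence a check that flags sizeof applied to a pointer-typed expression, because that usually indicates a mistyped sizeof of the pointee. In these halo-exchange calls the pointer value itself is the MPI payload, so the sizeof is intentional. A minimal sketch of the pattern, with illustrative names only (not GROMACS or MPI API):

```cpp
#include <cstring>

struct GpuEvent; // stand-in for GpuEventSynchronizer; only the pointer is used

// The bytes of the pointer itself are the message payload, so
// sizeof(GpuEvent*) is deliberate and not a mistyped sizeof(GpuEvent).
void packEventHandle(GpuEvent* event, char* wireBuffer)
{
    std::memcpy(wireBuffer, &event, sizeof(GpuEvent*)); // NOLINT(bugprone-sizeof-expression)
}
```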
GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent; MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, recvRankX_, 0, &remoteCoordinatesReadyOnDeviceEvent, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, sendRankX_, 0, @@ -346,8 +340,6 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_); wallcycle_stop(wcycle_, WallCycleCounter::MoveX); - - return; } // The following method should be called after non-local buffer operations, @@ -517,12 +509,12 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr, haloDataTransferLaunched_->markEvent(nonLocalStream_); MPI_Sendrecv(&haloDataTransferLaunched_, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, sendRank, 0, &haloDataTransferRemote, - sizeof(GpuEventSynchronizer*), + sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) MPI_BYTE, recvRank, 0, diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index 0fbede7b09..6991fe77be 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -101,6 +101,7 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir #if GMX_MPI // Receive event from PP task + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]); recvCount_++; #else diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index ecec4d6ac6..75409d0296 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -101,6 +101,7 @@ void PmeForceSenderGpu::Impl::sendFSynchronizerToPpCudaDirect(int ppRank) #if GMX_MPI // TODO Using MPI_Isend would be more efficient, particularly when // sending to multiple PP ranks + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_); #else GMX_UNUSED_VALUE(ppRank); diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index 0e0ad8122c..4a73f56376 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -65,6 +65,7 @@ public: * \param[in] ppRanks List of PP ranks */ Impl(GpuEventSynchronizer* pmeForcesReady, MPI_Comm comm, gmx::ArrayRef ppRanks); + // NOLINTNEXTLINE(performance-trivially-destructible) ~Impl(); /*! \brief diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 095c47acf8..eedee8a67e 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -63,7 +63,7 @@ __device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, con case ZZ: return realGridSizeFP[ZZ]; } assert(false); - return 0.0f; + return 0.0F; } /*! \brief Reduce the partial force contributions. 
@@ -89,9 +89,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force const int splineIndex, const int lineIndex, const float* realGridSizeFP, - float& fx, - float& fy, - float& fz) + float& fx, // NOLINT(google-runtime-references) + float& fy, // NOLINT(google-runtime-references) + float& fz) // NOLINT(google-runtime-references) { if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2 { @@ -135,7 +135,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force if (dimIndex < DIM) { const float n = read_grid_size(realGridSizeFP, dimIndex); - *((float*)(&sm_forces[atomIndexLocal]) + dimIndex) = fx * n; + float* __restrict__ sm_forcesAtomIndexOffset = + reinterpret_cast(&sm_forces[atomIndexLocal]); + sm_forcesAtomIndexOffset[dimIndex] = fx * n; } } else @@ -207,7 +209,9 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force if (sourceIndex == minStride * atomIndex) { - *((float*)(&sm_forces[atomIndex]) + dimIndex) = + float* __restrict__ sm_forcesAtomIndexOffset = + reinterpret_cast(&sm_forces[atomIndex]); + sm_forcesAtomIndexOffset[dimIndex] = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n; } } @@ -465,9 +469,9 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices); __syncwarp(); } - float fx = 0.0f; - float fy = 0.0f; - float fz = 0.0f; + float fx = 0.0F; + float fy = 0.0F; + float fz = 0.0F; const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]); @@ -545,7 +549,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ { int outputIndexLocal = i * iterThreads + threadLocalId; int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + float outputForceComponent = (reinterpret_cast(sm_forces)[outputIndexLocal]); gm_forces[outputIndexGlobal] = outputForceComponent; } } @@ -554,9 +558,9 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ { /* We must sync here since the same shared memory is used as above. */ __syncthreads(); - fx = 0.0f; - fy = 0.0f; - fz = 0.0f; + fx = 0.0F; + fy = 0.0F; + fz = 0.0F; const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]); if (chargeCheck) { @@ -605,7 +609,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ { int outputIndexLocal = i * iterThreads + threadLocalId; int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + float outputForceComponent = (reinterpret_cast(sm_forces)[outputIndexLocal]); gm_forces[outputIndexGlobal] += outputForceComponent; } } diff --git a/src/gromacs/ewald/pme_gpu_3dfft.cu b/src/gromacs/ewald/pme_gpu_3dfft.cu index 1a8e9c577b..80daa42020 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.cu +++ b/src/gromacs/ewald/pme_gpu_3dfft.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
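The NOLINT(google-runtime-references) markers above keep clang-tidy from demanding pointer out-parameters for the per-thread force accumulators. A hedged sketch of the two forms (names are illustrative, not the GROMACS kernels):

```cpp
// Google style prefers the pointer form for mutable arguments; the PME
// kernels keep references so accumulator updates read without dereferences.
__device__ void accumulateForce(float& fx, float delta) // NOLINT(google-runtime-references)
{
    fx += delta;
}

__device__ void accumulateForceViaPtr(float* fx, float delta) // style-preferred alternative
{
    *fx += delta;
}
```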
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -79,9 +79,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex)
     const int realGridSizePaddedTotal =
             realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ];
 
-    realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex];
+    realGrid_ = reinterpret_cast<cufftReal*>(kernelParamsPtr->grid.d_realGrid[gridIndex]);
     GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid");
-    complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex];
+    complexGrid_ = reinterpret_cast<cufftComplex*>(kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
     GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid");
 
     cufftResult_t result;
diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
index 8e8496da8f..6c85fff0ff 100644
--- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -104,10 +104,10 @@ int __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIn
  *
  * This is called from the spline_and_spread and gather PME kernels.
  */
-int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+bool __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
 {
     assert(isfinite(coefficient));
-    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+    return c_skipNeutralAtoms ? (coefficient != 0.0F) : true;
 }
 
 //! Controls if the atom and charge data is prefetched into shared memory or loaded per thread from global
@@ -126,15 +126,15 @@ __device__ inline void assertIsFinite(T arg);
 template<>
 __device__ inline void assertIsFinite(float3 gmx_unused arg)
 {
-    assert(isfinite(float(arg.x)));
-    assert(isfinite(float(arg.y)));
-    assert(isfinite(float(arg.z)));
+    assert(isfinite(static_cast<float>(arg.x)));
+    assert(isfinite(static_cast<float>(arg.y)));
+    assert(isfinite(static_cast<float>(arg.z)));
 }
 
 template<typename T>
 __device__ inline void assertIsFinite(T gmx_unused arg)
 {
-    assert(isfinite(float(arg)));
+    assert(isfinite(static_cast<float>(arg)));
 }
 
 /*!
\brief @@ -268,7 +268,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k const float shift = c_pmeMaxUnitcellShift; /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */ t = (t + shift) * n; - tInt = (int)t; + tInt = static_cast(t); assert(sharedMemoryIndex < atomsPerBlock * DIM); sm_fractCoords[sharedMemoryIndex] = t - tInt; tableIndex += tInt; @@ -302,14 +302,14 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k assert(isfinite(dr)); /* dr is relative offset from lower cell limit */ - splineData[order - 1] = 0.0f; + splineData[order - 1] = 0.0F; splineData[1] = dr; - splineData[0] = 1.0f - dr; + splineData[0] = 1.0F - dr; #pragma unroll for (int k = 3; k < order; k++) { - div = 1.0f / (k - 1.0f); + div = 1.0F / (k - 1.0F); splineData[k - 1] = div * dr * splineData[k - 2]; #pragma unroll for (int l = 1; l < (k - 1); l++) @@ -317,7 +317,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k splineData[k - l - 1] = div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]); } - splineData[0] = div * (1.0f - dr) * splineData[0]; + splineData[0] = div * (1.0F - dr) * splineData[0]; } const int thetaIndexBase = @@ -333,7 +333,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); - const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o]; + const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0F) - splineData[o]; assert(isfinite(dtheta)); assert(thetaIndex < order * DIM * atomsPerBlock); if (writeSmDtheta) @@ -348,7 +348,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k } } - div = 1.0f / (order - 1.0f); + div = 1.0F / (order - 1.0F); splineData[order - 1] = div * dr * splineData[order - 2]; #pragma unroll for (int k = 1; k < (order - 1); k++) @@ -357,7 +357,7 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams k * ((dr + k) * splineData[order - k - 2] + (order - k - dr) * splineData[order - k - 1]); } - splineData[0] = div * (1.0f - dr) * splineData[0]; + splineData[0] = div * (1.0F - dr) * splineData[0]; /* Storing the spline values (theta) */ #pragma unroll diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index d0e54ea101..328e0c11f8 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -463,9 +463,9 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu) auto* kernelParamsPtr = pmeGpu->kernelParams.get(); #if GMX_GPU_CUDA destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable, - kernelParamsPtr->fractShiftsTableTexture); + &kernelParamsPtr->fractShiftsTableTexture); destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable, - kernelParamsPtr->gridlineIndicesTableTexture); + &kernelParamsPtr->gridlineIndicesTableTexture); #elif GMX_GPU_OPENCL || GMX_GPU_SYCL freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable); freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable); diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu index b7830fa60b..ea0494ec8b 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.cu +++ b/src/gromacs/ewald/pme_gpu_program_impl.cu @@ -60,7 +60,7 @@ constexpr int c_stateB = 1; //! PME CUDA kernels forward declarations. 
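The splineData recursion in calculate_splines above raises the B-spline order one step per pass. A host-side sketch for the common order-4 (cubic) case, assuming the same recursion as the device loop:

```cpp
#include <array>

// Builds the four order-4 B-spline weights from the fractional offset
// dr in [0, 1), mirroring the device-side splineData recursion.
std::array<float, 4> cubicSplineWeights(float dr)
{
    std::array<float, 4> w = { 1.0F - dr, dr, 0.0F, 0.0F };
    for (int k = 3; k <= 4; k++) // raise the order from 2 to 4
    {
        const float div = 1.0F / (k - 1.0F);
        w[k - 1] = div * dr * w[k - 2];
        for (int l = 1; l < (k - 1); l++)
        {
            w[k - l - 1] = div * ((dr + l) * w[k - l - 2] + (k - l - dr) * w[k - l - 1]);
        }
        w[0] = div * (1.0F - dr) * w[0];
    }
    return w; // the four weights sum to 1 for any dr
}
```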
Kernels are documented in their respective files. template -__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_spline_and_spread_kernel(PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. @@ -99,7 +99,7 @@ extern template __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); template /* It is significantly slower to pass gridIndex as a kernel parameter */ -__global__ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_solve_kernel(PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. @@ -115,7 +115,7 @@ extern template __global__ void pme_solve_kernel -__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); +__global__ void pme_gather_kernel(PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 2e242a074f..39068544b4 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -89,7 +89,6 @@ void PmePpCommGpu::Impl::reinit(int size) // Reallocate buffer used for staging PME force on GPU reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_); - return; } void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu) @@ -98,6 +97,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int // Receive event from PME task and add to stream, to ensure pull of data doesn't // occur before PME force calc is completed GpuEventSynchronizer* pmeSync; + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE); pmeSync->enqueueWaitEvent(pmePpCommStream_); #endif @@ -166,6 +166,7 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(float3* se // Record and send event to allow PME task to sync to above transfer before commencing force calculations pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_); GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_; + // NOLINTNEXTLINE(bugprone-sizeof-expression) MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_); #endif } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index d4ee85872e..fbd9b4f8a4 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -152,7 +152,6 @@ private: int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); -private: //! GPU context handle (not used in CUDA) const DeviceContext& deviceContext_; //! Handle for CUDA stream used for the communication operations in this class diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu index 3f5d2d06f4..83e21b1f11 100644 --- a/src/gromacs/ewald/pme_solve.cu +++ b/src/gromacs/ewald/pme_solve.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
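Dropping const from the by-value kernelParams arguments above is behavior-neutral: top-level const on a value parameter is not part of the function type, which is why clang-tidy treats it as noise in declarations. A compile-time check of that language rule (illustrative types, not the PME kernels):

```cpp
#include <type_traits>

struct Params
{
    int n;
};

void kernelDecl(Params p);
void kernelDeclConst(const Params p); // declares the same function type

static_assert(std::is_same_v<decltype(kernelDecl), decltype(kernelDeclConst)>,
              "top-level const on a by-value parameter does not change the signature");
```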
* - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -88,7 +88,7 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex] + kernelParams.grid.splineValuesOffset[minorDim]; float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex]; - float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex]; + float2* __restrict__ gm_grid = reinterpret_cast(kernelParams.grid.d_fourierGrid[gridIndex]); /* Various grid sizes and indices */ const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused @@ -119,13 +119,13 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT const int indexMajor = blockIdx.z; /* Optional outputs */ - float energy = 0.0f; - float virxx = 0.0f; - float virxy = 0.0f; - float virxz = 0.0f; - float viryy = 0.0f; - float viryz = 0.0f; - float virzz = 0.0f; + float energy = 0.0F; + float virxx = 0.0F; + float virxy = 0.0F; + float virxz = 0.0F; + float viryy = 0.0F; + float viryz = 0.0F; + float virzz = 0.0F; assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]); if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor) @@ -176,20 +176,20 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT } /* 0.5 correction factor for the first and last components of a Z dimension */ - float corner_fac = 1.0f; + float corner_fac = 1.0F; switch (gridOrdering) { case GridOrdering::YZX: if ((kMiddle == 0) | (kMiddle == maxkMiddle)) { - corner_fac = 0.5f; + corner_fac = 0.5F; } break; case GridOrdering::XYZ: if ((kMinor == 0) | (kMinor == maxkMinor)) { - corner_fac = 0.5f; + corner_fac = 0.5F; } break; @@ -206,13 +206,13 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT + mZ * kernelParams.current.recipBox[ZZ][ZZ]; const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk; - assert(m2k != 0.0f); + assert(m2k != 0.0F); // TODO: use LDG/textures for gm_splineValue float denom = m2k * float(CUDART_PI_F) * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor]; assert(isfinite(denom)); - assert(denom != 0.0f); + assert(denom != 0.0F); const float tmp1 = expf(-kernelParams.grid.ewaldFactor * m2k); const float etermk = kernelParams.constants.elFactor * tmp1 / denom; @@ -226,9 +226,9 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT if (computeEnergyAndVirial) { const float tmp1k = - 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y); + 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y); - float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f; + float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F; float ets2 = corner_fac * tmp1k; energy = ets2; @@ -303,7 +303,7 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT /* Reduce 7 outputs per warp in the shared memory */ const int stride = 8; // this is c_virialAndEnergyCount==7 rounded up to power of 2 for convenience, hence the assert - 
assert(c_virialAndEnergyCount == 7); + static_assert(c_virialAndEnergyCount == 7); const int reductionBufferSize = (c_solveMaxThreadsPerBlock / warp_size) * stride; __shared__ float sm_virialAndEnergy[reductionBufferSize]; diff --git a/src/gromacs/gpu_utils/cuda_kernel_utils.cuh b/src/gromacs/gpu_utils/cuda_kernel_utils.cuh index 1346c6218e..0333d84ae8 100644 --- a/src/gromacs/gpu_utils/cuda_kernel_utils.cuh +++ b/src/gromacs/gpu_utils/cuda_kernel_utils.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -67,6 +67,7 @@ template static __forceinline__ __device__ T fetchFromTexture(const cudaTextureObject_t texObj, int index) { assert(index >= 0); + // NOLINTNEXTLINE(misc-static-assert) assert(!c_disableCudaTextures); return tex1Dfetch(texObj, index); } diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index e5d3c3aad6..daf05a019d 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2014,2015,2016,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -61,7 +61,7 @@ namespace * * \returns A description of the API error. Returns '(CUDA error #0 (cudaSuccess): no error)' in case deviceError is cudaSuccess. */ -static inline std::string getDeviceErrorString(const cudaError_t deviceError) +inline std::string getDeviceErrorString(const cudaError_t deviceError) { return formatString("CUDA error #%d (%s): %s.", deviceError, @@ -76,7 +76,7 @@ static inline std::string getDeviceErrorString(const cudaError_t deviceError) * * \throws InternalError if deviceError is not a success. */ -static inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage) +inline void checkDeviceError(const cudaError_t deviceError, const std::string& errorMessage) { if (deviceError != cudaSuccess) { @@ -92,7 +92,7 @@ static inline void checkDeviceError(const cudaError_t deviceError, const std::st * * \param[in] errorMessage Undecorated error message. */ -static inline void ensureNoPendingDeviceError(const std::string& errorMessage) +inline void ensureNoPendingDeviceError(const std::string& errorMessage) { // Ensure there is no pending error that would otherwise affect // the behaviour of future error handling. @@ -135,13 +135,13 @@ enum class GpuApiCallBehavior; #ifdef CHECK_CUDA_ERRORS /*! Check for CUDA error on the return status of a CUDA RT API call. 
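The CU_RET_ERR rewrite just below parenthesizes deviceError and msg; bugprone-macro-parentheses guards against operator precedence splitting an argument expression apart. A self-contained sketch of the failure mode (the macros are illustrative, not from GROMACS):

```cpp
#define TWICE_UNSAFE(x) x * 2
#define TWICE_SAFE(x) ((x) * 2)

static_assert(TWICE_UNSAFE(1 + 2) == 5, "expands to 1 + 2 * 2");
static_assert(TWICE_SAFE(1 + 2) == 6, "expands to ((1 + 2) * 2)");
```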
*/ -# define CU_RET_ERR(deviceError, msg) \ - do \ - { \ - if (deviceError != cudaSuccess) \ - { \ - gmx_fatal(FARGS, "%s\n", (msg + gmx::getDeviceErrorString(deviceError)).c_str()); \ - } \ +# define CU_RET_ERR(deviceError, msg) \ + do \ + { \ + if ((deviceError) != cudaSuccess) \ + { \ + gmx_fatal(FARGS, "%s\n", ((msg) + gmx::getDeviceErrorString(deviceError)).c_str()); \ + } \ } while (0) #else /* CHECK_CUDA_ERRORS */ @@ -239,6 +239,7 @@ void prepareGpuKernelArgument(KernelPtr kernel, const CurrentArg* argPtr, const RemainingArgs*... otherArgsPtrs) { + // NOLINTNEXTLINE(google-readability-casting) (*kernelArgsPtr)[argIndex] = (void*)argPtr; prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs...); } @@ -283,6 +284,7 @@ void launchGpuKernel(void (*kernel)(Args...), { dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]); dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]); + // NOLINTNEXTLINE(google-readability-casting) cudaLaunchKernel((void*)kernel, gridSize, blockSize, diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh index 2b83752c2a..97e9d525b2 100644 --- a/src/gromacs/gpu_utils/devicebuffer.cuh +++ b/src/gromacs/gpu_utils/devicebuffer.cuh @@ -68,7 +68,8 @@ template void allocateDeviceBuffer(DeviceBuffer* buffer, size_t numValues, const DeviceContext& /* deviceContext */) { GMX_ASSERT(buffer, "needs a buffer pointer"); - cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType)); + // NOLINTNEXTLINE(google-readability-casting) + cudaError_t stat = cudaMalloc((void**)(buffer), numValues * sizeof(ValueType)); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Allocation of the device buffer failed. " + gmx::getDeviceErrorString(stat)).c_str()); @@ -130,6 +131,7 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, { case GpuApiCallBehavior::Async: GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA"); + // NOLINTNEXTLINE(google-readability-casting) stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes, @@ -142,7 +144,11 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, case GpuApiCallBehavior::Sync: stat = cudaMemcpy( - *((ValueType**)buffer) + startingOffset, hostBuffer, bytes, cudaMemcpyHostToDevice); + // NOLINTNEXTLINE(google-readability-casting) + *((ValueType**)buffer) + startingOffset, + hostBuffer, + bytes, + cudaMemcpyHostToDevice); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Synchronous H2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str()); @@ -189,6 +195,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Destination host buffer was not pinned for CUDA"); stat = cudaMemcpyAsync(hostBuffer, + // NOLINTNEXTLINE(google-readability-casting) *((ValueType**)buffer) + startingOffset, bytes, cudaMemcpyDeviceToHost, @@ -199,8 +206,11 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, break; case GpuApiCallBehavior::Sync: - stat = cudaMemcpy( - hostBuffer, *((ValueType**)buffer) + startingOffset, bytes, cudaMemcpyDeviceToHost); + stat = cudaMemcpy(hostBuffer, + // NOLINTNEXTLINE(google-readability-casting) + *((ValueType**)buffer) + startingOffset, + bytes, + cudaMemcpyDeviceToHost); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Synchronous D2H copy failed. 
" + gmx::getDeviceErrorString(stat)).c_str()); @@ -287,7 +297,11 @@ void clearDeviceBufferAsync(DeviceBuffer* buffer, const char pattern = 0; cudaError_t stat = cudaMemsetAsync( - *((ValueType**)buffer) + startingOffset, pattern, bytes, deviceStream.stream()); + // NOLINTNEXTLINE(google-readability-casting) + *((ValueType**)buffer) + startingOffset, + pattern, + bytes, + deviceStream.stream()); GMX_RELEASE_ASSERT(stat == cudaSuccess, ("Couldn't clear the device buffer. " + gmx::getDeviceErrorString(stat)).c_str()); } @@ -345,6 +359,7 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, const size_t sizeInBytes = numValues * sizeof(ValueType); cudaError_t stat = + // NOLINTNEXTLINE(google-readability-casting) cudaMemcpy(*((ValueType**)deviceBuffer), hostBuffer, sizeInBytes, cudaMemcpyHostToDevice); GMX_RELEASE_ASSERT(stat == cudaSuccess, @@ -378,11 +393,11 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, * \param[in,out] deviceTexture Device texture object to unbind. */ template -void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture& deviceTexture) +void destroyParamLookupTable(DeviceBuffer* deviceBuffer, const DeviceTexture* deviceTexture) { if (!c_disableCudaTextures && deviceTexture && deviceBuffer) { - cudaError_t stat = cudaDestroyTextureObject(deviceTexture); + cudaError_t stat = cudaDestroyTextureObject(*deviceTexture); GMX_RELEASE_ASSERT( stat == cudaSuccess, ("Destruction of the texture object failed. " + gmx::getDeviceErrorString(stat)).c_str()); diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h index 3ae9b615da..743b9ea785 100644 --- a/src/gromacs/gpu_utils/devicebuffer_sycl.h +++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h @@ -535,7 +535,7 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, * \param[in,out] deviceBuffer Device buffer to store data in. */ template -void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture& /* deviceTexture */) +void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture* /* deviceTexture */) { deviceBuffer->buffer_.reset(nullptr); } diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index b35fcabd4a..b72dbaeff6 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -65,6 +65,7 @@ #include "gromacs/utility/snprintf.h" #include "gromacs/utility/stringutil.h" +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) static bool cudaProfilerRun = ((getenv("NVPROF_ID") != nullptr)); bool isHostMemoryPinned(const void* h_ptr) @@ -102,7 +103,7 @@ bool isHostMemoryPinned(const void* h_ptr) return isPinned; } -void startGpuProfiler(void) +void startGpuProfiler() { /* The NVPROF_ID environment variable is set by nvprof and indicates that mdrun is executed in the CUDA profiler. @@ -118,7 +119,7 @@ void startGpuProfiler(void) } } -void stopGpuProfiler(void) +void stopGpuProfiler() { /* Stopping the nvidia here allows us to eliminate the subsequent API calls from the trace, e.g. uninitialization and cleanup. 
*/ @@ -130,7 +131,7 @@ void stopGpuProfiler(void) } } -void resetGpuProfiler(void) +void resetGpuProfiler() { /* With CUDA <=7.5 the profiler can't be properly reset; we can only start * the profiling here (can't stop it) which will achieve the desired effect if @@ -180,7 +181,7 @@ static void peerAccessCheckStat(const cudaError_t stat, { std::string errorString = gmx::formatString("%s from GPU %d to GPU %d failed", cudaCallName, gpuA, gpuB); - CU_RET_ERR(stat, errorString.c_str()); + CU_RET_ERR(stat, errorString); } if (stat != cudaSuccess) { diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh index c56d60da61..59793095db 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.cuh +++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -109,7 +109,7 @@ public: * for passing into individual GPU API calls. * This is just a dummy in CUDA. */ - inline CommandEvent* fetchNextEvent() { return nullptr; } + static inline CommandEvent* fetchNextEvent() { return nullptr; } }; //! Short-hand for external use diff --git a/src/gromacs/gpu_utils/pmalloc.cu b/src/gromacs/gpu_utils/pmalloc.cu index 3a8f1058fe..cef8b97d73 100644 --- a/src/gromacs/gpu_utils/pmalloc.cu +++ b/src/gromacs/gpu_utils/pmalloc.cu @@ -66,7 +66,7 @@ void pmalloc(void** h_ptr, size_t nbytes) gmx::ensureNoPendingDeviceError("Could not allocate page-locked memory."); stat = cudaMallocHost(h_ptr, nbytes, flag); - sprintf(strbuf, "cudaMallocHost of size %d bytes failed", (int)nbytes); + sprintf(strbuf, "cudaMallocHost of size %d bytes failed", static_cast(nbytes)); CU_RET_ERR(stat, strbuf); } diff --git a/src/gromacs/gpu_utils/tests/hostallocator.cpp b/src/gromacs/gpu_utils/tests/hostallocator.cpp index 2bf3903214..21ed3e85ac 100644 --- a/src/gromacs/gpu_utils/tests/hostallocator.cpp +++ b/src/gromacs/gpu_utils/tests/hostallocator.cpp @@ -331,7 +331,7 @@ TYPED_TEST(HostAllocatorTestCopyable, ManualPinningOperationsWorkWithCuda) EXPECT_TRUE(input.empty()); resizeAndFillInput(&input, 3, 1); // realloc and copy). - auto oldInputData = input.data(); + auto* oldInputData = input.data(); changePinningPolicy(&input, PinningPolicy::CannotBePinned); EXPECT_FALSE(isPinned(input)); // These cannot be equal as both had to be allocated at the same diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cpp b/src/gromacs/gpu_utils/tests/typecasts_runner.cpp index 023f14ae43..0256c463d9 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cpp +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
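The auto* change in the host-allocator test above follows readability-qualified-auto: when the deduced type is a pointer, spelling auto* keeps the pointer-ness visible at the call site. A minimal sketch:

```cpp
#include <type_traits>
#include <vector>

void inspectBuffer(std::vector<float>& input)
{
    auto* data = input.data(); // deduces float*; the '*' makes that explicit
    static_assert(std::is_same_v<decltype(data), float*>);
    (void)data;
}
```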
@@ -46,6 +46,8 @@ #include +#include "gromacs/utility/arrayref.h" + #include "testutils/testasserts.h" #if !GMX_GPU_CUDA @@ -56,14 +58,14 @@ namespace gmx namespace test { -void convertRVecToFloat3OnHost(std::vector& /* rVecOutput */, - const std::vector& /* rVecInput */) +void convertRVecToFloat3OnHost(ArrayRef /* rVecOutput */, + ArrayRef /* rVecInput */) { FAIL() << "Can't test float3 and RVec compatibility without CUDA."; } -void convertRVecToFloat3OnDevice(std::vector& /* rVecOutput */, - const std::vector& /* rVecInput */, +void convertRVecToFloat3OnDevice(ArrayRef /* rVecOutput */, + ArrayRef /* rVecInput */, const TestDevice* /* testDevice */) { FAIL() << "Can't test float3 and RVec compatibility without CUDA."; diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index dde856d93f..4353f9a30b 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -48,6 +48,7 @@ #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/hardware/device_information.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/stringutil.h" @@ -65,7 +66,7 @@ namespace test * \param[in] float3Output Output data in float3 format. * \param[in] numElements Size of the data buffers. */ -void inline saveFloat3InRVecFormat(std::vector& rVecOutput, const float3* float3Output, int numElements) +void inline saveFloat3InRVecFormat(ArrayRef rVecOutput, const float3* float3Output, int numElements) { for (int i = 0; i < numElements; i++) { @@ -75,7 +76,7 @@ void inline saveFloat3InRVecFormat(std::vector& rVecOutput, const flo } } -void convertRVecToFloat3OnHost(std::vector& rVecOutput, const std::vector& rVecInput) +void convertRVecToFloat3OnHost(ArrayRef rVecOutput, ArrayRef rVecInput) { const int numElements = rVecInput.size(); @@ -105,9 +106,9 @@ static __global__ void convertRVecToFloat3OnDevice_kernel(DeviceBuffer g } } -void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, - const std::vector& h_rVecInput, - const TestDevice* testDevice) +void convertRVecToFloat3OnDevice(ArrayRef h_rVecOutput, + ArrayRef h_rVecInput, + const TestDevice* testDevice) { const DeviceContext& deviceContext = testDevice->deviceContext(); const DeviceStream& deviceStream = testDevice->deviceStream(); diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.h b/src/gromacs/gpu_utils/tests/typecasts_runner.h index 44a3247a45..f1c60517e5 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.h +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
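The test-runner signatures above move from std::vector references to gmx::ArrayRef views, so the helpers no longer care how the caller stores the data. A rough analogue using std::span as a stand-in for ArrayRef (the doubling transform is a placeholder for the RVec-to-float3 conversion):

```cpp
#include <cstddef>
#include <span>
#include <vector>

void convert(std::span<float> output, std::span<const float> input)
{
    for (std::size_t i = 0; i < input.size(); i++)
    {
        output[i] = 2.0F * input[i]; // placeholder transform
    }
}

void caller()
{
    std::vector<float> in(3, 1.0F);
    std::vector<float> out(3);
    convert(out, in); // vectors bind to the views implicitly
}
```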
@@ -54,6 +54,9 @@
 
 namespace gmx
 {
 
+template<typename T>
+class ArrayRef;
+
 namespace test
 {
 
@@ -62,7 +65,7 @@ namespace test
  * \param[out] rVecOutput Data in RVec format for the output.
  * \param[in] rVecInput Data in RVec format with the input.
  */
-void convertRVecToFloat3OnHost(std::vector<gmx::RVec>& rVecOutput, const std::vector<gmx::RVec>& rVecInput);
+void convertRVecToFloat3OnHost(ArrayRef<gmx::RVec> rVecOutput, ArrayRef<const gmx::RVec> rVecInput);
 
 /*! \brief Tests the compatibility of RVec and float3 using the conversion on device.
  *
  * \param[in] rVecInput Data in RVec format with the input.
  * \param[in] testDevice Test hardware environment to get DeviceContext and DeviceStream from.
  */
-void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& rVecOutput,
-                                 const std::vector<gmx::RVec>& rVecInput,
-                                 const TestDevice* testDevice);
+void convertRVecToFloat3OnDevice(ArrayRef<gmx::RVec> rVecOutput,
+                                 ArrayRef<const gmx::RVec> rVecInput,
+                                 const TestDevice* testDevice);
 
 } // namespace test
 
diff --git a/src/gromacs/gpu_utils/vectype_ops.cuh b/src/gromacs/gpu_utils/vectype_ops.cuh
index cce3fc9008..e358e0c0e6 100644
--- a/src/gromacs/gpu_utils/vectype_ops.cuh
+++ b/src/gromacs/gpu_utils/vectype_ops.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2015,2016,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2012,2015,2016,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,7 +45,7 @@ __forceinline__ __host__ __device__ float3 make_float3(float4 a)
 {
     return make_float3(a.x, a.y, a.z);
 }
-__forceinline__ __host__ __device__ float3 operator-(float3& a)
+__forceinline__ __host__ __device__ float3 operator-(const float3& a)
 {
     return make_float3(-a.x, -a.y, -a.z);
 }
@@ -65,18 +65,21 @@ __forceinline__ __host__ __device__ float3 operator*(float k, float3 a)
 {
     return make_float3(k * a.x, k * a.y, k * a.z);
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator+=(float3& a, float3 b)
 {
     a.x += b.x;
     a.y += b.y;
     a.z += b.z;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator+=(float3& a, float4 b)
 {
     a.x += b.x;
     a.y += b.y;
     a.z += b.z;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator-=(float3& a, float3 b)
 {
     a.x -= b.x;
@@ -99,12 +102,14 @@ __forceinline__ __host__ __device__ float3 operator*(float3 a, float3 b)
 {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator*=(float3& a, float3 b)
 {
     a.x *= b.x;
     a.y *= b.y;
     a.z *= b.z;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator*=(float3& a, float b)
 {
     a.x *= b;
@@ -126,7 +131,7 @@ __forceinline__ __host__ __device__ float4 make_float4(float s)
 }
 __forceinline__ __host__ __device__ float4 make_float4(float3 a)
 {
-    return make_float4(a.x, a.y, a.z, 0.0f);
+    return make_float4(a.x, a.y, a.z, 0.0F);
 }
 __forceinline__ __host__ __device__ float4 operator+(float4 a, float4 b)
 {
@@ -151,12 +156,14 @@ __forceinline__ __host__ __device__ void operator+=(float4& a, float4 b)
     a.z += b.z;
     a.w += b.w;
 }
+// NOLINTNEXTLINE(google-runtime-references)
 __forceinline__ __host__ __device__ void operator+=(float4& a, float3 b)
 {
     a.x += b.x;
a.y += b.y; a.z += b.z; } +// NOLINTNEXTLINE(google-runtime-references) __forceinline__ __host__ __device__ void operator-=(float4& a, float3 b) { a.x -= b.x; @@ -223,21 +230,21 @@ __forceinline__ __device__ float cos_angle(const float3 a, const float3 b) float ipb = norm2(b); float ip = iprod(a, b); float ipab = ipa * ipb; - if (ipab > 0.0f) + if (ipab > 0.0F) { cosval = ip * rsqrt(ipab); } else { - cosval = 1.0f; + cosval = 1.0F; } - if (cosval > 1.0f) + if (cosval > 1.0F) { - return 1.0f; + return 1.0F; } - if (cosval < -1.0f) + if (cosval < -1.0F) { - return -1.0f; + return -1.0F; } return cosval; @@ -267,8 +274,8 @@ __forceinline__ __device__ float gmx_angle(const float3 a, const float3 b) * * \param[in] a First vector. * \param[in] b Second vector. - * \returns Angle between vectors. */ +// NOLINTNEXTLINE(google-runtime-references) __forceinline__ __device__ void atomicAdd(float3& a, const float3 b) { atomicAdd(&a.x, b.x); diff --git a/src/gromacs/hardware/device_management.cu b/src/gromacs/hardware/device_management.cu index 3df63f8131..c1d669c4e2 100644 --- a/src/gromacs/hardware/device_management.cu +++ b/src/gromacs/hardware/device_management.cu @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team. - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -66,10 +66,10 @@ * * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side. */ -static int c_cudaMaxDeviceCount = 32; +static const int c_cudaMaxDeviceCount = 32; /** Dummy kernel used for sanity checking. */ -static __global__ void dummy_kernel(void) {} +static __global__ void dummy_kernel() {} static cudaError_t checkCompiledTargetCompatibility(int deviceId, const cudaDeviceProp& deviceProp) { @@ -338,7 +338,7 @@ void setActiveDevice(const DeviceInformation& deviceInfo) if (stat != cudaSuccess) { auto message = gmx::formatString("Failed to initialize GPU #%d", deviceId); - CU_RET_ERR(stat, message.c_str()); + CU_RET_ERR(stat, message); } if (debug) diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index 48bd3f8360..90c5449da9 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -156,7 +156,7 @@ static void convertIlistToNbnxnOrder(const InteractionList& src, int numAtomsPerInteraction, ArrayRef nbnxnAtomOrder) { - GMX_ASSERT(src.size() == 0 || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order"); + GMX_ASSERT(src.empty() || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order"); dest->iatoms.resize(src.size()); diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h index dacb612308..5e66c52eea 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.h +++ b/src/gromacs/listed_forces/gpubonded_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
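cos_angle above clamps its result to [-1, 1]: round-off in the rsqrt-based cosine can push the value marginally outside that range, and a downstream acosf would then return NaN. A host-side sketch of the same guard:

```cpp
#include <algorithm>
#include <cmath>

float cosAngleHost(const float a[3], const float b[3])
{
    const float ipa  = a[0] * a[0] + a[1] * a[1] + a[2] * a[2];
    const float ipb  = b[0] * b[0] + b[1] * b[1] + b[2] * b[2];
    const float ip   = a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
    const float ipab = ipa * ipb;
    // Degenerate (zero-length) input falls back to cos = 1, as in the kernel.
    const float cosval = (ipab > 0.0F) ? ip / std::sqrt(ipab) : 1.0F;
    return std::clamp(cosval, -1.0F, 1.0F); // guard acosf against round-off
}
```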
* - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -127,7 +127,7 @@ class GpuBonded::Impl public: //! Constructor Impl(const gmx_ffparams_t& ffparams, - const float electrostaticsScaleFactor, + float electrostaticsScaleFactor, const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle); diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu index 8ab52bf5b4..407e447bdc 100644 --- a/src/gromacs/listed_forces/gpubondedkernels.cu +++ b/src/gromacs/listed_forces/gpubondedkernels.cu @@ -73,7 +73,7 @@ /*-------------------------------- CUDA kernels-------------------------------- */ /*------------------------------------------------------------------------------*/ -#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f) +#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0F) /*---------------- BONDED CUDA kernels--------------*/ @@ -81,7 +81,7 @@ __device__ __forceinline__ static void harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F) { - constexpr float half = 0.5f; + constexpr float half = 0.5F; float dx, dx2; dx = x - xA; @@ -104,10 +104,10 @@ __device__ void bonds_gpu(const int i, { if (i < numBonds) { - int3 bondData = *(int3*)(d_forceatoms + 3 * i); - int type = bondData.x; - int ai = bondData.y; - int aj = bondData.z; + const int3 bondData = *(reinterpret_cast(d_forceatoms + 3 * i)); + int type = bondData.x; + int ai = bondData.y; + int aj = bondData.z; /* dx = xi - xj, corrected for periodic boundary conditions. 
*/ float3 dx; @@ -125,7 +125,7 @@ __device__ void bonds_gpu(const int i, *vtot_loc += vbond; } - if (dr2 != 0.0f) + if (dr2 != 0.0F) { fbond *= rsqrtf(dr2); @@ -175,11 +175,11 @@ __device__ void angles_gpu(const int i, { if (i < numBonds) { - int4 angleData = *(int4*)(d_forceatoms + 4 * i); - int type = angleData.x; - int ai = angleData.y; - int aj = angleData.z; - int ak = angleData.w; + const int4 angleData = *(reinterpret_cast(d_forceatoms + 4 * i)); + int type = angleData.x; + int ai = angleData.y; + int aj = angleData.z; + int ak = angleData.w; float3 r_ij; float3 r_kj; @@ -203,9 +203,9 @@ __device__ void angles_gpu(const int i, } float cos_theta2 = cos_theta * cos_theta; - if (cos_theta2 < 1.0f) + if (cos_theta2 < 1.0F) { - float st = dVdt * rsqrtf(1.0f - cos_theta2); + float st = dVdt * rsqrtf(1.0F - cos_theta2); float sth = st * cos_theta; float nrij2 = norm2(r_ij); float nrkj2 = norm2(r_kj); @@ -248,11 +248,11 @@ __device__ void urey_bradley_gpu(const int i, { if (i < numBonds) { - int4 ubData = *(int4*)(d_forceatoms + 4 * i); - int type = ubData.x; - int ai = ubData.y; - int aj = ubData.z; - int ak = ubData.w; + const int4 ubData = *(reinterpret_cast(d_forceatoms + 4 * i)); + int type = ubData.x; + int ai = ubData.y; + int aj = ubData.z; + int ak = ubData.w; float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F; float kthA = d_forceparams[type].u_b.kthetaA; @@ -287,9 +287,9 @@ __device__ void urey_bradley_gpu(const int i, harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond); float cos_theta2 = cos_theta * cos_theta; - if (cos_theta2 < 1.0f) + if (cos_theta2 < 1.0F) { - float st = dVdt * rsqrtf(1.0f - cos_theta2); + float st = dVdt * rsqrtf(1.0F - cos_theta2); float sth = st * cos_theta; float nrkj2 = norm2(r_kj); @@ -316,7 +316,7 @@ __device__ void urey_bradley_gpu(const int i, } /* Time for the bond calculations */ - if (dr2 != 0.0f) + if (dr2 != 0.0F) { if (calcEner) { @@ -361,7 +361,7 @@ __device__ __forceinline__ static float dih_angle_gpu(const T xi, *n = cprod(*r_kj, *r_kl); float phi = gmx_angle(*m, *n); float ipr = iprod(*r_ij, *n); - float sign = (ipr < 0.0f) ? -1.0f : 1.0f; + float sign = (ipr < 0.0F) ? 
-1.0F : 1.0F; phi = sign * phi; return phi; @@ -375,7 +375,7 @@ dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, mdphi = mult * phi - phiA * CUDA_DEG2RAD_F; sdphi = sinf(mdphi); - *v = cpA * (1.0f + cosf(mdphi)); + *v = cpA * (1.0F + cosf(mdphi)); *f = -cpA * mult * sdphi; } @@ -499,7 +499,7 @@ __device__ void rbdihs_gpu(const int i, float3 sm_fShiftLoc[], const PbcAiuc pbcAiuc) { - constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f; + constexpr float c0 = 0.0F, c1 = 1.0F, c2 = 2.0F, c3 = 3.0F, c4 = 4.0F, c5 = 5.0F; if (i < numBonds) { @@ -597,11 +597,11 @@ __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp) /* dp cannot be outside (-pi,pi) */ if (*dp >= CUDART_PI_F) { - *dp -= 2.0f * CUDART_PI_F; + *dp -= 2.0F * CUDART_PI_F; } else if (*dp < -CUDART_PI_F) { - *dp += 2.0f * CUDART_PI_F; + *dp += 2.0F * CUDART_PI_F; } } @@ -658,7 +658,7 @@ __device__ void idihs_gpu(const int i, if (calcEner) { - *vtot_loc += -0.5f * ddphi * dp; + *vtot_loc += -0.5F * ddphi * dp; } } } @@ -679,10 +679,10 @@ __device__ void pairs_gpu(const int i, if (i < numBonds) { // TODO this should be made into a separate type, the GPU and CPU sizes should be compared - int3 pairData = *(int3*)(d_forceatoms + 3 * i); - int type = pairData.x; - int ai = pairData.y; - int aj = pairData.z; + const int3 pairData = *(reinterpret_cast(d_forceatoms + 3 * i)); + int type = pairData.x; + int ai = pairData.y; + int aj = pairData.z; float qq = gm_xq[ai].w * gm_xq[aj].w; float c6 = iparams[type].lj14.c6A; @@ -701,7 +701,7 @@ __device__ void pairs_gpu(const int i, float velec = scale_factor * qq * rinv; /* Calculate the LJ force * r and add it to the Coulomb part */ - float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec; + float fr = (12.0F * c12 * rinv6 - 6.0F * c6) * rinv6 + velec; float finvr = fr * rinv2; float3 f = finvr * dr; @@ -737,14 +737,14 @@ __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams) extern __shared__ char sm_dynamicShmem[]; char* sm_nextSlotPtr = sm_dynamicShmem; - float3* sm_fShiftLoc = (float3*)sm_nextSlotPtr; + float3* sm_fShiftLoc = reinterpret_cast(sm_nextSlotPtr); sm_nextSlotPtr += c_numShiftVectors * sizeof(float3); if (calcVir) { if (threadIdx.x < c_numShiftVectors) { - sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f); + sm_fShiftLoc[threadIdx.x] = make_float3(0.0F, 0.0F, 0.0F); } __syncthreads(); } @@ -865,11 +865,11 @@ __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams) int warpId = threadIdx.x / warpSize; // Shared memory variables to hold block-local partial sum - float* sm_vTot = (float*)sm_nextSlotPtr; + float* sm_vTot = reinterpret_cast(sm_nextSlotPtr); sm_nextSlotPtr += numWarps * sizeof(float); - float* sm_vTotVdw = (float*)sm_nextSlotPtr; + float* sm_vTotVdw = reinterpret_cast(sm_nextSlotPtr); sm_nextSlotPtr += numWarps * sizeof(float); - float* sm_vTotElec = (float*)sm_nextSlotPtr; + float* sm_vTotElec = reinterpret_cast(sm_nextSlotPtr); if (threadIdx.x % warpSize == 0) { diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index 93772853d5..73972c3f66 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -81,7 +81,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer baseForcePtr, baseForce_ = baseForcePtr; numAtoms_ = numAtoms; atomStart_ = atomStart; - accumulate_ = static_cast(accumulate); + accumulate_ = accumulate; completionMarker_ = 
completionMarker; cellInfo_.cell = cell.data(); @@ -112,7 +112,7 @@ void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer forcePtr) rvecForceToAdd_ = forcePtr; }; -void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency) +void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* dependency) { dependencyList_.push_back(dependency); } @@ -157,8 +157,6 @@ void GpuForceReduction::Impl::execute() wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } -GpuForceReduction::Impl::~Impl() = default; - GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle) : @@ -176,7 +174,7 @@ void GpuForceReduction::registerRvecForce(DeviceBuffer forcePtr) impl_->registerRvecForce(forcePtr); } -void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) +void GpuForceReduction::addDependency(GpuEventSynchronizer* dependency) { impl_->addDependency(dependency); } diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.h b/src/gromacs/mdlib/gpuforcereduction_impl.h index 316f4cca31..f4b5d05e8a 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.h +++ b/src/gromacs/mdlib/gpuforcereduction_impl.h @@ -78,8 +78,8 @@ public: * \param [in] wcycle The wallclock counter */ Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle); - ~Impl(); + ~Impl() = default; /*! \brief Register a nbnxm-format force to be reduced * * \param [in] forcePtr Pointer to force to be reduced @@ -96,7 +96,7 @@ public: * * \param [in] dependency Dependency for this reduction */ - void addDependency(GpuEventSynchronizer* const dependency); + void addDependency(GpuEventSynchronizer* dependency); /*! \brief Reinitialize the GPU force reduction * @@ -108,10 +108,10 @@ public: * \param [in] completionMarker Event to be marked when launch of reduction is complete */ void reinit(DeviceBuffer baseForcePtr, - const int numAtoms, + int numAtoms, ArrayRef cell, - const int atomStart, - const bool accumulate, + int atomStart, + bool accumulate, GpuEventSynchronizer* completionMarker = nullptr); /*! \brief Execute the force reduction */ @@ -125,7 +125,7 @@ private: //! number of atoms int numAtoms_ = 0; //! whether reduction is accumulated into base force buffer - int accumulate_ = true; + bool accumulate_ = true; //! cell information for any nbat-format forces struct cellInfo cellInfo_; //! GPU context object diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu b/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu index 4fb187f484..acdaf36c32 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu @@ -91,7 +91,6 @@ static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce, *gm_fDest = temp; } - return; } void launchForceReductionKernel(int numAtoms, diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h index 0f48770178..757aa4a7b0 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.h +++ b/src/gromacs/mdlib/leapfrog_gpu.h @@ -124,12 +124,12 @@ public: void integrate(DeviceBuffer d_x, DeviceBuffer d_xp, DeviceBuffer d_v, - const DeviceBuffer d_f, - const float dt, - const bool doTemperatureScaling, + DeviceBuffer d_f, + float dt, + bool doTemperatureScaling, gmx::ArrayRef tcstat, - const bool doParrinelloRahman, - const float dtPressureCouple, + bool doParrinelloRahman, + float dtPressureCouple, const matrix prVelocityScalingMatrix); /*! 
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index 0f48770178..757aa4a7b0 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -124,12 +124,12 @@ public:
    void integrate(DeviceBuffer<Float3>              d_x,
                   DeviceBuffer<Float3>              d_xp,
                   DeviceBuffer<Float3>              d_v,
-                   const DeviceBuffer<Float3>        d_f,
-                   const float                       dt,
-                   const bool                        doTemperatureScaling,
+                   DeviceBuffer<Float3>              d_f,
+                   float                             dt,
+                   bool                              doTemperatureScaling,
                   gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                   const bool                        doParrinelloRahman,
-                   const float                       dtPressureCouple,
+                   bool                              doParrinelloRahman,
+                   float                             dtPressureCouple,
                   const matrix                      prVelocityScalingMatrix);

    /*! \brief Set the integrator
@@ -142,7 +142,7 @@ public:
     * \param[in] inverseMasses   Inverse masses of atoms.
     * \param[in] tempScaleGroups Maps the atom index to temperature scale value.
     */
-    void set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);
+    void set(int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups);

    /*! \brief Class with hardware-specific interfaces and implementations.*/
    class Impl;
diff --git a/src/gromacs/mdlib/lincs_gpu.cpp b/src/gromacs/mdlib/lincs_gpu.cpp
index 761f38a120..86211b17cd 100644
--- a/src/gromacs/mdlib/lincs_gpu.cpp
+++ b/src/gromacs/mdlib/lincs_gpu.cpp
@@ -67,14 +67,14 @@ namespace gmx
 {

-void LincsGpu::apply(const DeviceBuffer<Float3> d_x,
-                     DeviceBuffer<Float3>       d_xp,
-                     const bool                 updateVelocities,
-                     DeviceBuffer<Float3>       d_v,
-                     const real                 invdt,
-                     const bool                 computeVirial,
-                     tensor                     virialScaled,
-                     const PbcAiuc              pbcAiuc)
+void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
+                     DeviceBuffer<Float3>        d_xp,
+                     const bool                  updateVelocities,
+                     DeviceBuffer<Float3>        d_v,
+                     const real                  invdt,
+                     const bool                  computeVirial,
+                     tensor                      virialScaled,
+                     const PbcAiuc&              pbcAiuc)
 {
     GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");

@@ -120,8 +120,6 @@ void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
         virialScaled[ZZ][YY] += h_virialScaled_[4];
         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
     }
-
-    return;
 }

 LincsGpu::LincsGpu(int numIterations,
@@ -482,8 +480,8 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int center = c1a1;

-                    float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
-                    float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+                    float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+                    float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);

                     massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;

@@ -509,8 +507,8 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const
                     int center = c1a2;

-                    float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
-                    float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
+                    float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
+                    float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);

                     massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;

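lincs_gpu.cpp now calls std::sqrt instead of the unqualified C library sqrt. The qualified call resolves to the float overload for float arguments rather than forcing everything through the double-only ::sqrt (the 1.0 literal in the expression above still promotes the division to double; the change is about the overload set). Illustrative only:

    #include <cmath>

    static float inverseSqrtSum(float invMass1, float invMass2)
    {
        // std::sqrt has float/double/long double overloads; ::sqrt is double-only.
        return 1.0F / std::sqrt(invMass1 + invMass2);
    }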
diff --git a/src/gromacs/mdlib/lincs_gpu.h b/src/gromacs/mdlib/lincs_gpu.h
index 3ed6cd3e59..690d21df82 100644
--- a/src/gromacs/mdlib/lincs_gpu.h
+++ b/src/gromacs/mdlib/lincs_gpu.h
@@ -144,14 +144,14 @@ public:
     * \param[in,out] virialScaled     Scaled virial tensor to be updated.
     * \param[in]     pbcAiuc          PBC data.
     */
-    void apply(const DeviceBuffer<Float3> d_x,
-               DeviceBuffer<Float3>       d_xp,
-               const bool                 updateVelocities,
-               DeviceBuffer<Float3>       d_v,
-               const real                 invdt,
-               const bool                 computeVirial,
-               tensor                     virialScaled,
-               const PbcAiuc              pbcAiuc);
+    void apply(const DeviceBuffer<Float3>& d_x,
+               DeviceBuffer<Float3>        d_xp,
+               bool                        updateVelocities,
+               DeviceBuffer<Float3>        d_v,
+               real                        invdt,
+               bool                        computeVirial,
+               tensor                      virialScaled,
+               const PbcAiuc&              pbcAiuc);

    /*! \brief
     * Update data-structures (e.g. after NB search step).
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu
index 55d4b48bad..15e3a288df 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.cu
+++ b/src/gromacs/mdlib/lincs_gpu_internal.cu
@@ -107,7 +107,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses;
     float* __restrict__ gm_virialScaled        = kernelParams.d_virialScaled;

-    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
@@ -122,7 +122,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     int j = pair.j;

     // Mass-scaled Lagrange multiplier
-    float lagrangeScaled = 0.0f;
+    float lagrangeScaled = 0.0F;

     float targetLength;
     float inverseMassi;
@@ -139,14 +139,14 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     // Everything computed for these dummies will be equal to zero
     if (isDummyThread)
     {
-        targetLength    = 0.0f;
-        inverseMassi    = 0.0f;
-        inverseMassj    = 0.0f;
-        sqrtReducedMass = 0.0f;
-
-        xi = make_float3(0.0f, 0.0f, 0.0f);
-        xj = make_float3(0.0f, 0.0f, 0.0f);
-        rc = make_float3(0.0f, 0.0f, 0.0f);
+        targetLength    = 0.0F;
+        inverseMassi    = 0.0F;
+        inverseMassj    = 0.0F;
+        sqrtReducedMass = 0.0F;
+
+        xi = make_float3(0.0F, 0.0F, 0.0F);
+        xj = make_float3(0.0F, 0.0F, 0.0F);
+        rc = make_float3(0.0F, 0.0F, 0.0F);
     }
     else
     {
@@ -209,7 +209,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     {
         // Making sure that all sm_rhs are saved before they are accessed in a loop below
         __syncthreads();
-        float mvb = 0.0f;
+        float mvb = 0.0F;

         for (int n = 0; n < coupledConstraintsCount; n++)
         {
@@ -256,12 +256,12 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);

         float len2  = targetLength * targetLength;
-        float dlen2 = 2.0f * len2 - norm2(dx);
+        float dlen2 = 2.0F * len2 - norm2(dx);

         // TODO A little bit more effective but slightly less readable version of the below would be:
         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
         float proj;
-        if (dlen2 > 0.0f)
+        if (dlen2 > 0.0F)
         {
             proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
         }
@@ -373,8 +373,6 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
         }
     }
-
-    return;
 }

 /*! \brief Select templated kernel.
@@ -409,14 +407,14 @@ inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVir
     return kernelPtr;
 }

-void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
-                          const DeviceBuffer<Float3> d_x,
-                          DeviceBuffer<Float3>       d_xp,
-                          const bool                 updateVelocities,
-                          DeviceBuffer<Float3>       d_v,
-                          const real                 invdt,
-                          const bool                 computeVirial,
-                          const DeviceStream&        deviceStream)
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+                          const DeviceBuffer<Float3>&     d_x,
+                          DeviceBuffer<Float3>            d_xp,
+                          const bool                      updateVelocities,
+                          DeviceBuffer<Float3>            d_v,
+                          const real                      invdt,
+                          const bool                      computeVirial,
+                          const DeviceStream&             deviceStream)
 {
     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);

@@ -459,8 +457,6 @@ void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
                     nullptr,
                     "lincs_kernel",
                     kernelArgs);
-
-    return;
 }

 } // namespace gmx
diff --git a/src/gromacs/mdlib/lincs_gpu_internal.h b/src/gromacs/mdlib/lincs_gpu_internal.h
index 5c42a784c6..cd421c198c 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal.h
+++ b/src/gromacs/mdlib/lincs_gpu_internal.h
@@ -57,14 +57,14 @@ struct LincsGpuKernelParameters;
 //! Number of threads in a GPU block
 constexpr static int c_threadsPerBlock = 256;

-void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
-                          const DeviceBuffer<Float3> d_x,
-                          DeviceBuffer<Float3>       d_xp,
-                          const bool                 updateVelocities,
-                          DeviceBuffer<Float3>       d_v,
-                          const real                 invdt,
-                          const bool                 computeVirial,
-                          const DeviceStream&        deviceStream);
+void launchLincsGpuKernel(const LincsGpuKernelParameters& kernelParams,
+                          const DeviceBuffer<Float3>&     d_x,
+                          DeviceBuffer<Float3>            d_xp,
+                          bool                            updateVelocities,
+                          DeviceBuffer<Float3>            d_v,
+                          real                            invdt,
+                          bool                            computeVirial,
+                          const DeviceStream&             deviceStream);

 } // namespace gmx
diff --git a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
index b5594c1bc0..1e87f6968f 100644
--- a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
@@ -49,8 +49,8 @@ namespace gmx
 {

-void launchLincsGpuKernel(LincsGpuKernelParameters& /* kernelParams */,
-                          const DeviceBuffer<Float3> /* d_x */,
+void launchLincsGpuKernel(const LincsGpuKernelParameters& /* kernelParams */,
+                          const DeviceBuffer<Float3>& /* d_x */,
                           DeviceBuffer<Float3> /* d_xp */,
                           const bool /* updateVelocities */,
                           DeviceBuffer<Float3> /* d_v */,
diff --git a/src/gromacs/mdlib/settle_gpu.cpp b/src/gromacs/mdlib/settle_gpu.cpp
index 659947ac35..e836ff1779 100644
--- a/src/gromacs/mdlib/settle_gpu.cpp
+++ b/src/gromacs/mdlib/settle_gpu.cpp
@@ -63,14 +63,14 @@ namespace gmx
 {

-void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
-                      DeviceBuffer<Float3>       d_xp,
-                      const bool                 updateVelocities,
-                      DeviceBuffer<Float3>       d_v,
-                      const real                 invdt,
-                      const bool                 computeVirial,
-                      tensor                     virialScaled,
-                      const PbcAiuc              pbcAiuc)
+void SettleGpu::apply(const DeviceBuffer<Float3>& d_x,
+                      DeviceBuffer<Float3>        d_xp,
+                      const bool                  updateVelocities,
+                      DeviceBuffer<Float3>        d_v,
+                      const real                  invdt,
+                      const bool                  computeVirial,
+                      tensor                      virialScaled,
+                      const PbcAiuc&              pbcAiuc)
 {

     // Early exit if no settles
@@ -118,8 +118,6 @@ void SettleGpu::apply(const DeviceBuffer<Float3>& d_x,
         virialScaled[ZZ][YY] += h_virialScaled_[4];
         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
     }
-
-    return;
 }

 SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
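Both the LINCS and SETTLE launchers select a fully templated kernel from two runtime flags through a helper (getLincsKernelPtr above, getSettleKernelPtr below), so the hot loops compile without branches on updateVelocities or computeVirial. The dispatch idiom, sketched with a hypothetical kernel:

    template<bool updateVelocities, bool computeVirial>
    __global__ void constraintsKernel(/* kernel parameters */) {}

    static auto getConstraintsKernelPtr(const bool updateVelocities, const bool computeVirial)
    {
        auto kernelPtr = constraintsKernel<true, true>;
        if (updateVelocities && !computeVirial)
        {
            kernelPtr = constraintsKernel<true, false>;
        }
        else if (!updateVelocities && computeVirial)
        {
            kernelPtr = constraintsKernel<false, true>;
        }
        else if (!updateVelocities && !computeVirial)
        {
            kernelPtr = constraintsKernel<false, false>;
        }
        return kernelPtr; // all four instantiations share one function-pointer type
    }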
diff --git a/src/gromacs/mdlib/settle_gpu.h b/src/gromacs/mdlib/settle_gpu.h
index 0f85649b95..998f545f86 100644
--- a/src/gromacs/mdlib/settle_gpu.h
+++ b/src/gromacs/mdlib/settle_gpu.h
@@ -114,14 +114,14 @@ public:
     * \param[in,out] virialScaled     Scaled virial tensor to be updated.
     * \param[in]     pbcAiuc          PBC data.
     */
-    void apply(const DeviceBuffer<Float3> d_x,
-               DeviceBuffer<Float3>       d_xp,
-               const bool                 updateVelocities,
-               DeviceBuffer<Float3>       d_v,
-               const real                 invdt,
-               const bool                 computeVirial,
-               tensor                     virialScaled,
-               const PbcAiuc              pbcAiuc);
+    void apply(const DeviceBuffer<Float3>& d_x,
+               DeviceBuffer<Float3>        d_xp,
+               bool                        updateVelocities,
+               DeviceBuffer<Float3>        d_v,
+               real                        invdt,
+               bool                        computeVirial,
+               tensor                      virialScaled,
+               const PbcAiuc&              pbcAiuc);

    /*! \brief
     * Update data-structures (e.g. after NB search step).
diff --git a/src/gromacs/mdlib/settle_gpu_internal.cu b/src/gromacs/mdlib/settle_gpu_internal.cu
index 11dd63b035..366df8d3e4 100644
--- a/src/gromacs/mdlib/settle_gpu_internal.cu
+++ b/src/gromacs/mdlib/settle_gpu_internal.cu
@@ -205,7 +205,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__

             float sinphi = a1d_z * rsqrt(pars.ra * pars.ra);
-            float tmp2   = 1.0f - sinphi * sinphi;
+            float tmp2   = 1.0F - sinphi * sinphi;

             if (almost_zero > tmp2)
             {
@@ -215,7 +215,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             float tmp    = rsqrt(tmp2);
             float cosphi = tmp2 * tmp;
             float sinpsi = (b1d.z - c1d.z) * pars.irc2 * tmp;
-            tmp2         = 1.0f - sinpsi * sinpsi;
+            tmp2         = 1.0F - sinpsi * sinpsi;

             float cospsi = tmp2 * rsqrt(tmp2);

@@ -235,7 +235,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             float sinthe = (alpha * gamma - beta * tmp2 * rsqrt(tmp2)) * rsqrt(al2be2 * al2be2);

             /*  --- Step4  A3' --- */
-            tmp2         = 1.0f - sinthe * sinthe;
+            tmp2         = 1.0F - sinthe * sinthe;
             float costhe = tmp2 * rsqrt(tmp2);

             float3 a3d, b3d, c3d;
@@ -318,7 +318,7 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             {
                 for (int d = 0; d < 6; d++)
                 {
-                    sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0f;
+                    sm_threadVirial[d * blockDim.x + threadIdx.x] = 0.0F;
                 }
             }
         }
@@ -358,8 +358,6 @@ __launch_bounds__(sc_maxThreadsPerBlock) __global__
             atomicAdd(&(gm_virialScaled[tib]), sm_threadVirial[tib * blockSize]);
         }
     }
-
-    return;
 }

 /*! \brief Select templated kernel.
@@ -394,18 +392,18 @@ inline auto getSettleKernelPtr(const bool updateVelocities, const bool computeVi
     return kernelPtr;
 }

-void launchSettleGpuKernel(const int                         numSettles,
-                           const DeviceBuffer<WaterMolecule> d_atomIds,
-                           const SettleParameters            settleParameters,
-                           const DeviceBuffer<Float3>        d_x,
-                           DeviceBuffer<Float3>              d_xp,
-                           const bool                        updateVelocities,
-                           DeviceBuffer<Float3>              d_v,
-                           const real                        invdt,
-                           const bool                        computeVirial,
-                           DeviceBuffer<float>               virialScaled,
-                           const PbcAiuc                     pbcAiuc,
-                           const DeviceStream&               deviceStream)
+void launchSettleGpuKernel(const int                          numSettles,
+                           const DeviceBuffer<WaterMolecule>& d_atomIds,
+                           const SettleParameters&            settleParameters,
+                           const DeviceBuffer<Float3>&        d_x,
+                           DeviceBuffer<Float3>               d_xp,
+                           const bool                         updateVelocities,
+                           DeviceBuffer<Float3>               d_v,
+                           const real                         invdt,
+                           const bool                         computeVirial,
+                           DeviceBuffer<float>                virialScaled,
+                           const PbcAiuc&                     pbcAiuc,
+                           const DeviceStream&                deviceStream)
 {
     static_assert(
             gmx::isPowerOfTwo(sc_threadsPerBlock),
@@ -449,8 +447,6 @@ void launchSettleGpuKernel(const int numSettles,
                     nullptr,
                     "settle_kernel",
                     kernelArgs);
-
-    return;
 }

 } // namespace gmx
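The SETTLE kernel context lines above repeatedly evaluate expressions of the form tmp2 * rsqrt(tmp2): for positive tmp2 this equals sqrt(tmp2), but reuses the fast reciprocal-square-root hardware path. The identity in isolation (assumes x > 0; illustrative only):

    __device__ float sqrtViaRsqrt(float x)
    {
        // x * (1 / sqrt(x)) == sqrt(x); rsqrt maps to a fast hardware approximation.
        return x * rsqrt(x);
    }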
diff --git a/src/gromacs/mdlib/settle_gpu_internal.h b/src/gromacs/mdlib/settle_gpu_internal.h
index 9963944e74..f88d046150 100644
--- a/src/gromacs/mdlib/settle_gpu_internal.h
+++ b/src/gromacs/mdlib/settle_gpu_internal.h
@@ -73,18 +73,18 @@ namespace gmx
  * \param[in] pbcAiuc          PBC data.
  * \param[in] deviceStream     Device stream to launch kernel in.
  */
-void launchSettleGpuKernel(int                               numSettles,
-                           const DeviceBuffer<WaterMolecule> d_atomIds,
-                           const SettleParameters            settleParameters,
-                           const DeviceBuffer<Float3>        d_x,
-                           DeviceBuffer<Float3>              d_xp,
-                           const bool                        updateVelocities,
-                           DeviceBuffer<Float3>              d_v,
-                           const real                        invdt,
-                           const bool                        computeVirial,
-                           DeviceBuffer<float>               virialScaled,
-                           const PbcAiuc                     pbcAiuc,
-                           const DeviceStream&               deviceStream);
+void launchSettleGpuKernel(int                                numSettles,
+                           const DeviceBuffer<WaterMolecule>& d_atomIds,
+                           const SettleParameters&            settleParameters,
+                           const DeviceBuffer<Float3>&        d_x,
+                           DeviceBuffer<Float3>               d_xp,
+                           bool                               updateVelocities,
+                           DeviceBuffer<Float3>               d_v,
+                           real                               invdt,
+                           bool                               computeVirial,
+                           DeviceBuffer<float>                virialScaled,
+                           const PbcAiuc&                     pbcAiuc,
+                           const DeviceStream&                deviceStream);

 } // namespace gmx
diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
index ef063c36d5..a6e6e9a40d 100644
--- a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
@@ -49,16 +49,16 @@ namespace gmx
 {

 void launchSettleGpuKernel(const int /* numSettles */,
-                           const DeviceBuffer<WaterMolecule> /* d_atomIds */,
-                           const SettleParameters /* settleParameters */,
-                           const DeviceBuffer<Float3> /* d_x */,
+                           const DeviceBuffer<WaterMolecule>& /* d_atomIds */,
+                           const SettleParameters& /* settleParameters */,
+                           const DeviceBuffer<Float3>& /* d_x */,
                            DeviceBuffer<Float3> /* d_xp */,
                            const bool /* updateVelocities */,
                            DeviceBuffer<Float3> /* d_v */,
                            const real /* invdt */,
                            const bool /* computeVirial */,
                            DeviceBuffer<float> /* virialScaled */,
-                           const PbcAiuc /* pbcAiuc */,
+                           const PbcAiuc& /* pbcAiuc */,
                            const DeviceStream& /* deviceStream */)
 {
     // SYCL_TODO
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
index 6e5c56a867..96d447d36c 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -72,7 +72,7 @@
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/topology/mtop_util.h"

-static constexpr bool sc_haveGpuConstraintSupport = (GMX_GPU_CUDA);
+static constexpr bool sc_haveGpuConstraintSupport = GMX_GPU_CUDA;

 namespace gmx
 {
@@ -125,8 +125,6 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead

     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-
-    return;
 }

 void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 150c944aae..adfbccda85 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -153,7 +153,7 @@ public:
     */
    void set(DeviceBuffer<RVec>            d_x,
             DeviceBuffer<RVec>            d_v,
-             const DeviceBuffer<RVec>      d_f,
+             DeviceBuffer<RVec>            d_f,
             const InteractionDefinitions& idef,
             const t_mdatoms&              md);

diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.cu b/src/gromacs/mdlib/update_constrain_gpu_internal.cu
index 5787bc476e..9d1dea39af 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal.cu
@@ -77,7 +77,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__

 void launchScaleCoordinatesKernel(const int            numAtoms,
                                   DeviceBuffer<Float3> d_coordinates,
-                                  const ScalingMatrix  mu,
+                                  const ScalingMatrix& mu,
                                   const DeviceStream&  deviceStream)
 {
     KernelLaunchConfig kernelLaunchConfig;
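A pattern worth noting in the update_constrain hunks: host-side launchers now take the small ScalingMatrix struct by const reference, while the __global__ kernel keeps receiving it by value, since kernel arguments are copied into the launch anyway. A self-contained sketch with a hypothetical struct and kernel names:

    struct DiagScale
    {
        float xx, yy, zz;
    };

    __global__ void scaleKernel(float3* gm_x, const DiagScale p, int numAtoms) // by value
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numAtoms)
        {
            gm_x[i] = make_float3(gm_x[i].x * p.xx, gm_x[i].y * p.yy, gm_x[i].z * p.zz);
        }
    }

    void launchScale(float3* gm_x, const DiagScale& p, int numAtoms, cudaStream_t stream)
    {
        scaleKernel<<<(numAtoms + 127) / 128, 128, 0, stream>>>(gm_x, p); // const ref on the host
    }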
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.h b/src/gromacs/mdlib/update_constrain_gpu_internal.h
index 893110f15b..7e0a2d1a04 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal.h
@@ -76,7 +76,7 @@ struct ScalingMatrix
  */
 void launchScaleCoordinatesKernel(int                  numAtoms,
                                   DeviceBuffer<Float3> d_coordinates,
-                                  const ScalingMatrix  mu,
+                                  const ScalingMatrix& mu,
                                   const DeviceStream&  deviceStream);

 } // namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
index cf9439bf17..5f2233a7f1 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
@@ -69,7 +69,7 @@ static auto scaleKernel(cl::sycl::handler&

 void launchScaleCoordinatesKernel(const int            numAtoms,
                                   DeviceBuffer<Float3> d_coordinates,
-                                  const ScalingMatrix  mu,
+                                  const ScalingMatrix& mu,
                                   const DeviceStream&  deviceStream)
 {
     const cl::sycl::range<1> rangeAllAtoms(numAtoms);
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index 3f66196509..cd119960f3 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -158,7 +158,7 @@ public:
     *
     *  \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
     */
-    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;

    /*! \brief Get the positions buffer on the GPU.

@@ -323,13 +323,13 @@ public:
     *
     *  \returns The number of local atoms.
     */
-    int numAtomsLocal();
+    int numAtomsLocal() const;

    /*! \brief Getter for the total number of atoms.
     *
     *  \returns The total number of atoms.
     */
-    int numAtomsAll();
+    int numAtomsAll() const;

private:
    class Impl;
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index f112d1b1cb..0ce99a4dd4 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -84,7 +84,7 @@ void StatePropagatorDataGpu::reinit(int /* numAtomsLocal */, int /* numAtomsAll
               "GPU implementation.");
 }

-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */) const
 {
     GMX_ASSERT(!impl_,
               "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -254,7 +254,7 @@ const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
     return nullptr;
 }

-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
 {
     GMX_ASSERT(!impl_,
               "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -262,7 +262,7 @@ int StatePropagatorDataGpu::numAtomsLocal() const
     return 0;
 }

-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
 {
     GMX_ASSERT(!impl_,
               "A CPU stub method from GPU state propagator data was called instead of one from "
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index cc9f92bcfe..0a041db04f 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -153,7 +153,7 @@ public:
     *
     *  \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
     */
-    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality);
+    std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality atomLocality) const;

    /*! \brief Get the positions buffer on the GPU.

@@ -318,13 +318,13 @@ public:
     *
     *  \returns The number of local atoms.
     */
-    int numAtomsLocal();
+    int numAtomsLocal() const;

    /*! \brief Getter for the total number of atoms.
     *
     *  \returns The total number of atoms.
     */
-    int numAtomsAll();
+    int numAtomsAll() const;

private:
    //! GPU PME stream.
@@ -443,7 +443,7 @@ private:
    void clearOnDevice(DeviceBuffer<RVec>  d_data,
                       int                 dataSize,
                       AtomLocality        atomLocality,
-                       const DeviceStream& deviceStream);
+                       const DeviceStream& deviceStream) const;
};

} // namespace gmx
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 69e11d69c0..d4868d20dc 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -178,7 +178,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }

-std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
 {
     int atomsStartAt   = 0;
     int numAtomsToCopy = 0;
@@ -281,7 +281,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<RVec> h_dat
 void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec>  d_data,
                                                  int                 dataSize,
                                                  AtomLocality        atomLocality,
-                                                 const DeviceStream& deviceStream)
+                                                 const DeviceStream& deviceStream) const
 {
     GMX_UNUSED_VALUE(dataSize);

@@ -540,12 +540,12 @@ const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
     return updateStream_;
 }

-int StatePropagatorDataGpu::Impl::numAtomsLocal()
+int StatePropagatorDataGpu::Impl::numAtomsLocal() const
 {
     return numAtomsLocal_;
 }

-int StatePropagatorDataGpu::Impl::numAtomsAll()
+int StatePropagatorDataGpu::Impl::numAtomsAll() const
 {
     return numAtomsAll_;
 }
@@ -580,7 +580,7 @@ void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
     return impl_->reinit(numAtomsLocal, numAtomsAll);
 }

-std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
 {
     return impl_->getAtomRangesFromAtomLocality(atomLocality);
 }
@@ -695,12 +695,12 @@ const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
     return impl_->getUpdateStream();
 }

-int StatePropagatorDataGpu::numAtomsLocal()
+int StatePropagatorDataGpu::numAtomsLocal() const
 {
     return impl_->numAtomsLocal();
 }

-int StatePropagatorDataGpu::numAtomsAll()
+int StatePropagatorDataGpu::numAtomsAll() const
 {
     return impl_->numAtomsAll();
 }
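The new per-directory .clang-tidy that follows disables one check wholesale for the CUDA nbnxm kernels; elsewhere the patch prefers targeted NOLINTNEXTLINE comments, which silence exactly one check on exactly the next line. The mechanism on a hypothetical declaration (not from the patch):

    // Non-const reference parameters are deliberate here (CUDA-style output
    // parameters), so suppress just this one instance of the check.
    // NOLINTNEXTLINE(google-runtime-references)
    static void fetchC6C12(float& c6, float& c12)
    {
        c6  = 0.0F;
        c12 = 0.0F;
    }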
diff --git a/src/gromacs/nbnxm/cuda/.clang-tidy b/src/gromacs/nbnxm/cuda/.clang-tidy
new file mode 100644
index 0000000000..47d31a387a
--- /dev/null
+++ b/src/gromacs/nbnxm/cuda/.clang-tidy
@@ -0,0 +1,8 @@
+# List of rationales for check suppressions (where known).
+# This has to precede the list because inline comments are not
+# supported by clang-tidy.
+#
+# -readability-implicit-bool-conversion
+#   This happens often in the GPU code and should generally be harmless
+Checks: -readability-implicit-bool-conversion
+InheritParentConfig: true
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 1db33caf3e..20ea65b673 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -89,7 +89,7 @@ namespace Nbnxm
 *  there is a bit of fluctuations in the generated block counts, we use
 *  a target of 44 instead of the ideal value of 48.
 */
-static unsigned int gpu_min_ci_balanced_factor = 44;
+static const unsigned int gpu_min_ci_balanced_factor = 44;

 void gpu_init_platform_specific(NbnxmGpu* /* nb */)
 {
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index 7df9231d51..5aaf3b9ead 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -160,7 +160,7 @@ __launch_bounds__(THREADS_PER_BLOCK) __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
#    endif /* CALC_ENERGIES */
#endif     /* PRUNE_NBL */
-        (const NBAtomDataGpu atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+        (NBAtomDataGpu atdat, NBParamGpu nbparam, Nbnxm::gpu_plist plist, bool bCalcFshift)
#ifdef FUNCTION_DECLARATION_ONLY
        ; /* Only do function declaration, omit the function body. */
#else
@@ -257,28 +257,29 @@ __launch_bounds__(THREADS_PER_BLOCK)
     * sm_nextSlotPtr should always be updated to point to the "next slot",
     * that is past the last point where data has been stored.
     */
+    // NOLINTNEXTLINE(readability-redundant-declaration)
    extern __shared__ char sm_dynamicShmem[];
    char*                  sm_nextSlotPtr = sm_dynamicShmem;
    static_assert(sizeof(char) == 1,
                  "The shared memory offset calculation assumes that char is 1 byte");

    /* shmem buffer for i x+q pre-loading */
-    float4* xqib = (float4*)sm_nextSlotPtr;
+    float4* xqib = reinterpret_cast<float4*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));

    /* shmem buffer for cj, for each warp separately */
-    int* cjs = (int*)(sm_nextSlotPtr);
+    int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
    /* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
    cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
    sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));

#    ifndef LJ_COMB
    /* shmem buffer for i atom-type pre-loading */
-    int* atib = (int*)sm_nextSlotPtr;
+    int* atib = reinterpret_cast<int*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
#    else
    /* shmem buffer for i-atom LJ combination rule parameters */
-    float2* ljcpib = (float2*)sm_nextSlotPtr;
+    float2* ljcpib = reinterpret_cast<float2*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
#    endif
    /*********************************************************************/
@@ -294,8 +295,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
            ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
            ai = ci * c_clSize + tidxi;

-            float* shiftptr = (float*)&shift_vec[nb_sci.shift];
-            xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
+            const float* shiftptr = reinterpret_cast<const float*>(&shift_vec[nb_sci.shift]);
+            xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0F);
            xqbuf.w *= nbparam.epsfac;
            xqib[tidxj * c_clSize + tidxi] = xqbuf;

@@ -311,7 +312,7 @@ __launch_bounds__(THREADS_PER_BLOCK)

        for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
        {
-            fci_buf[i] = make_float3(0.0f);
+            fci_buf[i] = make_float3(0.0F);
        }

#    ifdef LJ_EWALD
@@ -322,8 +323,8 @@ __launch_bounds__(THREADS_PER_BLOCK)

#    ifdef CALC_ENERGIES
-    E_lj = 0.0f;
-    E_el = 0.0f;
+    E_lj = 0.0F;
+    E_el = 0.0F;

#        ifdef EXCLUSION_FORCES /* Ewald or RF */
    if (nb_sci.shift == gmx::c_centralShiftIndex
@@ -339,22 +340,23 @@ __launch_bounds__(THREADS_PER_BLOCK)
#        ifdef LJ_EWALD
                // load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
-                E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                                                  * (ntypes + 1)]);
+                E_lj += LDG(reinterpret_cast<float*>(
+                        &nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                                      * (ntypes + 1)]));
#        endif
            }

            /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
#        ifdef LJ_EWALD
            E_lj /= c_clSize * NTHREAD_Z;
-            E_lj *= 0.5f * c_oneSixth * lje_coeff6_6;
+            E_lj *= 0.5F * c_oneSixth * lje_coeff6_6;
#        endif

#        if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
            /* Correct for epsfac^2 due to adding qi^2 */
            E_el /= nbparam.epsfac * c_clSize * NTHREAD_Z;
#        if defined EL_RF || defined EL_CUTOFF
-            E_el *= -0.5f * reactionFieldShift;
+            E_el *= -0.5F * reactionFieldShift;
#        else
            E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
#        endif
@@ -412,7 +414,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
            ljcp_j = lj_comb[aj];
#    endif

-            fcj_buf = make_float3(0.0f);
+            fcj_buf = make_float3(0.0F);

#    if !defined PRUNE_NBL
#        pragma unroll 8
@@ -441,7 +443,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
                }
#    endif

-                int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+                int_bit = (wexcl & mask_ji) ? 1.0F : 0.0F;

                /* cutoff & exclusion check */
#    ifdef EXCLUSION_FORCES
@@ -499,7 +501,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
                    sig_r6 *= int_bit;
#    endif /* EXCLUSION_FORCES */

-                    F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+                    F_invr = epsilon * sig_r6 * (sig_r6 - 1.0F) * inv_r2;
#    endif /* !LJ_COMB_LB || CALC_ENERGIES */

#    ifdef LJ_FORCE_SWITCH
@@ -553,7 +555,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
                    /* Separate VDW cut-off check to enable twin-range cut-offs
                     * (rvdw < rcoulomb <= rlist)
                     */
-                    vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                    vdw_in_range = (r2 < rvdw_sq) ? 1.0F : 0.0F;
                    F_invr *= vdw_in_range;
#    ifdef CALC_ENERGIES
                    E_lj_p *= vdw_in_range;
@@ -591,10 +593,10 @@ __launch_bounds__(THREADS_PER_BLOCK)
#        endif
#        ifdef EL_RF
                    E_el += qi * qj_f
-                            * (int_bit * inv_r + 0.5f * two_k_rf * r2 - reactionFieldShift);
+                            * (int_bit * inv_r + 0.5F * two_k_rf * r2 - reactionFieldShift);
#        endif
#        ifdef EL_EWALD_ANY
-                    /* 1.0f - erff is faster than erfcf */
+                    /* 1.0F - erff is faster than erfcf */
                    E_el += qi * qj_f
                            * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
#        endif /* EL_EWALD_ANY */
@@ -633,7 +635,7 @@ __launch_bounds__(THREADS_PER_BLOCK)
        bCalcFshift = false;
    }

-    float fshift_buf = 0.0f;
+    float fshift_buf = 0.0F;

    /* reduce i forces */
    for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
index bcc3dd1b09..8219ad16a3 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
@@ -104,11 +104,11 @@
 */
template<bool haveFreshList>
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
-        void nbnxn_kernel_prune_cuda(const NBAtomDataGpu    atdat,
-                                     const NBParamGpu       nbparam,
-                                     const Nbnxm::gpu_plist plist,
-                                     int                    numParts,
-                                     int                    part)
+        void nbnxn_kernel_prune_cuda(NBAtomDataGpu    atdat,
+                                     NBParamGpu       nbparam,
+                                     Nbnxm::gpu_plist plist,
+                                     int              numParts,
+                                     int              part)
#ifdef FUNCTION_DECLARATION_ONLY
        ; /* Only do function declaration, omit the function body. */

@@ -152,11 +152,11 @@ nbnxn_kernel_prune_cuda(const NBAtomDataGpu, const NBParamGpu, const Nbnx
                  "The shared memory offset calculation assumes that char is 1 byte");

    /* shmem buffer for i x+q pre-loading */
-    float4* xib = (float4*)sm_nextSlotPtr;
+    float4* xib = reinterpret_cast<float4*>(sm_nextSlotPtr);
    sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));

    /* shmem buffer for cj, for each warp separately */
-    int* cjs = (int*)(sm_nextSlotPtr);
+    int* cjs = reinterpret_cast<int*>(sm_nextSlotPtr);
    /* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
    cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
    sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
index 554748330f..deeb17a8fa 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@ -72,8 +72,8 @@ static const int __device__ c_fbufStride = c_clSizeSq;
static const unsigned __device__ superClInteractionMask =
        ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);

-static const float __device__ c_oneSixth    = 0.16666667f;
-static const float __device__ c_oneTwelveth = 0.08333333f;
+static const float __device__ c_oneSixth    = 0.16666667F;
+static const float __device__ c_oneTwelveth = 0.08333333F;

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
@@ -102,7 +102,7 @@ calculate_force_switch_F(const NBParamGpu nbparam, float c6, float c12, float in
    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
-    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
@@ -132,7 +132,7 @@ static __forceinline__ __device__ void calculate_force_switch_F_E(const NBParamG
    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
-    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
@@ -141,8 +141,11 @@ static __forceinline__ __device__ void calculate_force_switch_F_E(const NBParamG
}

/*! Apply potential switch, force-only version. */
-static __forceinline__ __device__ void
-calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, float* F_invr, float* E_lj)
+static __forceinline__ __device__ void calculate_potential_switch_F(const NBParamGpu& nbparam,
+                                                                    float             inv_r,
+                                                                    float             r2,
+                                                                    float*            F_invr,
+                                                                    const float*      E_lj)
{
    float r, r_switch;
    float sw, dsw;
@@ -159,9 +162,9 @@ calculate_potential_switch_F(const NBParamGpu nbparam, float inv_r, float r2, fl
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
-    if (r_switch > 0.0f)
+    if (r_switch > 0.0F)
    {
-        sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+        sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
@@ -185,10 +188,10 @@ calculate_potential_switch_F_E(const NBParamGpu nbparam, float inv_r, float r2,
    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
-    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    /* Unlike in the F-only kernel, masking is faster here */
-    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
+    sw  = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
@@ -234,7 +237,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F(const NBPa
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
-    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;
+    poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
@@ -263,14 +266,14 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const NB
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = expf(-cr2);
-    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;
+    poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask = nbparam.sh_lj_ewald * int_bit;
-    *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+    *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}

/*! Fetch per-type LJ parameters.
@@ -321,7 +324,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const NBPa
        inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
        cr2       = lje_coeff2 * r2;
        expmcr2   = expf(-cr2);
-        poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;
+        poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;

        /* Subtract the grid force from the total LJ force */
        *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
@@ -332,7 +335,7 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_LB_F_E(const NBPa

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald * int_bit;
-        *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
+        *E_lj += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
    }
}

@@ -376,7 +379,7 @@ __forceinline__ __host__ __device__ T lerp(T d0, T d1, T t)
static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBParamGpu nbparam, float r)
{
    float normalized = nbparam.coulomb_tab_scale * r;
-    int   index      = (int)normalized;
+    int   index      = static_cast<int>(normalized);
    float fraction   = normalized - index;

    float2 d01 = fetch_coulomb_force_r(nbparam, index);
@@ -389,6 +392,7 @@ static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBPara
 *  Depending on what is supported, it fetches parameters either
 *  using direct load, texture objects, or texrefs.
 */
+// NOLINTNEXTLINE(google-runtime-references)
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
{
    float2 c6c12;
@@ -405,19 +409,19 @@ static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12,
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__ float pmecorrF(float z2)
{
-    const float FN6 = -1.7357322914161492954e-8f;
-    const float FN5 = 1.4703624142580877519e-6f;
-    const float FN4 = -0.000053401640219807709149f;
-    const float FN3 = 0.0010054721316683106153f;
-    const float FN2 = -0.019278317264888380590f;
-    const float FN1 = 0.069670166153766424023f;
-    const float FN0 = -0.75225204789749321333f;
-
-    const float FD4 = 0.0011193462567257629232f;
-    const float FD3 = 0.014866955030185295499f;
-    const float FD2 = 0.11583842382862377919f;
-    const float FD1 = 0.50736591960530292870f;
-    const float FD0 = 1.0f;
+    const float FN6 = -1.7357322914161492954e-8F;
+    const float FN5 = 1.4703624142580877519e-6F;
+    const float FN4 = -0.000053401640219807709149F;
+    const float FN3 = 0.0010054721316683106153F;
+    const float FN2 = -0.019278317264888380590F;
+    const float FN1 = 0.069670166153766424023F;
+    const float FN0 = -0.75225204789749321333F;
+
+    const float FD4 = 0.0011193462567257629232F;
+    const float FD3 = 0.014866955030185295499F;
+    const float FD2 = 0.11583842382862377919F;
+    const float FD1 = 0.50736591960530292870F;
+    const float FD0 = 1.0F;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;
@@ -429,7 +433,7 @@ static __forceinline__ __device__ float pmecorrF(float z2)
    polyFD0 = polyFD0 * z4 + FD0;
    polyFD0 = polyFD1 * z2 + polyFD0;

-    polyFD0 = 1.0f / polyFD0;
+    polyFD0 = 1.0F / polyFD0;

    polyFN0 = FN6 * z4 + FN4;
    polyFN1 = FN5 * z4 + FN3;
@@ -445,11 +449,11 @@ static __forceinline__ __device__ float pmecorrF(float z2)
 *  arbitrary array sizes.
 */
static __forceinline__ __device__ void
-reduce_force_j_generic(float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
+reduce_force_j_generic(const float* f_buf, float3* fout, int tidxi, int tidxj, int aidx)
{
    if (tidxi < 3)
    {
-        float f = 0.0f;
+        float f = 0.0F;
        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
        {
            f += f_buf[c_fbufStride * tidxi + j];
        }
@@ -494,17 +498,17 @@ reduce_force_j_warp_shfl(float3 f, float3* fout, int tidxi, int aidx, const unsi
 *  arbitrary array sizes.
 *  TODO: add the tidxi < 3 trick
 */
-static __forceinline__ __device__ void reduce_force_i_generic(float*  f_buf,
-                                                              float3* fout,
-                                                              float*  fshift_buf,
-                                                              bool    bCalcFshift,
-                                                              int     tidxi,
-                                                              int     tidxj,
-                                                              int     aidx)
+static __forceinline__ __device__ void reduce_force_i_generic(const float* f_buf,
+                                                              float3*      fout,
+                                                              float*       fshift_buf,
+                                                              bool         bCalcFshift,
+                                                              int          tidxi,
+                                                              int          tidxj,
+                                                              int          aidx)
{
    if (tidxj < 3)
    {
-        float f = 0.0f;
+        float f = 0.0F;
        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
        {
            f += f_buf[tidxj * c_fbufStride + j];
        }
@@ -533,7 +537,7 @@ static __forceinline__ __device__ void reduce_force_i_pow2(volatile float* f_buf
    int   i, j;
    float f;

-    assert(c_clSize == 1 << c_clSizeLog2);
+    static_assert(c_clSize == 1 << c_clSizeLog2);

    /* Reduce the initial c_clSize values for each i atom to half
     * every step by using c_clSize * i threads.
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu b/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu
index 577b2b4b92..f6533259b5 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_gpu_buffer_ops_internal.cu
@@ -89,7 +89,7 @@ static __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColumns,

    // Destination address where x should be stored in nbnxm layout. We use this cast here to
    // save only x, y and z components, not touching the w (q) component, which is pre-defined.
-    float3* gm_xqDest = (float3*)&gm_xq[threadIndex + offset];
+    float3* gm_xqDest = reinterpret_cast<float3*>(&gm_xq[threadIndex + offset]);

    // Perform layout conversion of each element.
    if (threadIndex < numAtoms)
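The buffer-ops kernel above writes only x, y and z of each float4 through a float3* view, leaving the pre-set charge in w untouched. That partial-write trick in a standalone form (hypothetical kernel, not from the patch):

    __global__ void storeXyzKeepW(float4* gm_xq, const float3* gm_x, int numAtoms)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numAtoms)
        {
            // The float3 view aliases the first 12 bytes of the 16-byte float4
            // element, so the store cannot clobber the w (charge) component.
            float3* gm_xyzDest = reinterpret_cast<float3*>(&gm_xq[i]);
            *gm_xyzDest        = gm_x[i];
        }
    }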
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h
index dc91a7a299..09b2d6e379 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu_buffer_ops_internal.h
@@ -43,12 +43,13 @@
 #include "gromacs/gpu_utils/gputraits.h"

 class DeviceStream;
-class Grid;
 struct NbnxmGpu;

 namespace Nbnxm
 {

+class Grid;
+
 /*! \brief Launch coordinate layout conversion kernel
 *
 * \param[in] grid    Pair-search grid.
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index 41c107deea..6e7a5b799f 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -107,7 +107,7 @@ static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& t
 {
     if (nbp->coulomb_tab)
     {
-        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
+        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
     }

     nbp->coulomb_tab_scale = tables.scale;
@@ -1132,17 +1132,17 @@ void gpu_free(NbnxmGpu* nb)
     /* Free nbparam */
     if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
     {
-        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
     }

     if (!useLjCombRule(nb->nbparam->vdwType))
     {
-        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
     }

     if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
     {
-        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
     }

     /* Free plist */
diff --git a/src/gromacs/pbcutil/pbc_aiuc.h b/src/gromacs/pbcutil/pbc_aiuc.h
index bd45ec5d5c..97717d5267 100644
--- a/src/gromacs/pbcutil/pbc_aiuc.h
+++ b/src/gromacs/pbcutil/pbc_aiuc.h
@@ -1,7 +1,7 @@
 /*
 * This file is part of the GROMACS molecular simulation package.
 *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -99,32 +99,32 @@ struct PbcAiuc
 static inline void setPbcAiuc(int numPbcDim, const matrix box, PbcAiuc* pbcAiuc)
 {
-    pbcAiuc->invBoxDiagZ = 0.0f;
-    pbcAiuc->boxZX       = 0.0f;
-    pbcAiuc->boxZY       = 0.0f;
-    pbcAiuc->boxZZ       = 0.0f;
-    pbcAiuc->invBoxDiagY = 0.0f;
-    pbcAiuc->boxYX       = 0.0f;
-    pbcAiuc->boxYY       = 0.0f;
-    pbcAiuc->invBoxDiagX = 0.0f;
-    pbcAiuc->boxXX       = 0.0f;
+    pbcAiuc->invBoxDiagZ = 0.0F;
+    pbcAiuc->boxZX       = 0.0F;
+    pbcAiuc->boxZY       = 0.0F;
+    pbcAiuc->boxZZ       = 0.0F;
+    pbcAiuc->invBoxDiagY = 0.0F;
+    pbcAiuc->boxYX       = 0.0F;
+    pbcAiuc->boxYY       = 0.0F;
+    pbcAiuc->invBoxDiagX = 0.0F;
+    pbcAiuc->boxXX       = 0.0F;

     if (numPbcDim > ZZ)
     {
-        pbcAiuc->invBoxDiagZ = 1.0f / box[ZZ][ZZ];
+        pbcAiuc->invBoxDiagZ = 1.0F / box[ZZ][ZZ];
         pbcAiuc->boxZX       = box[ZZ][XX];
         pbcAiuc->boxZY       = box[ZZ][YY];
         pbcAiuc->boxZZ       = box[ZZ][ZZ];
     }
     if (numPbcDim > YY)
     {
-        pbcAiuc->invBoxDiagY = 1.0f / box[YY][YY];
+        pbcAiuc->invBoxDiagY = 1.0F / box[YY][YY];
         pbcAiuc->boxYX       = box[YY][XX];
         pbcAiuc->boxYY       = box[YY][YY];
     }
     if (numPbcDim > XX)
     {
-        pbcAiuc->invBoxDiagX = 1.0f / box[XX][XX];
+        pbcAiuc->invBoxDiagX = 1.0F / box[XX][XX];
         pbcAiuc->boxXX       = box[XX][XX];
     }
 }
diff --git a/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh b/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh
index a2784a6d16..eb12862bf4 100644
--- a/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh
+++ b/src/gromacs/pbcutil/pbc_aiuc_cuda.cuh
@@ -94,6 +94,7 @@ static inline __device__ int int3ToShiftIndex(int3 iv)
 */
 template<bool returnShift>
 static __forceinline__ __device__ int
+// NOLINTNEXTLINE(google-runtime-references)
 pbcDxAiuc(const PbcAiuc& pbcAiuc, const float4 r1, const float4 r2, float3& dr)
 {
     dr.x = r1.x - r2.x;
-- 
2.22.0
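For reference, setPbcAiuc above flattens the lower-triangular box into per-row scalars plus inverse diagonal entries, so a kernel can apply the minimum-image shift row by row (z first, since the z row also contributes to y and x). A host-side sketch of that correction under the same layout assumptions; the struct and function names are illustrative, not GROMACS API:

    #include <cmath>

    struct BoxAiuc
    {
        float invBoxDiagZ, boxZX, boxZY, boxZZ;
        float invBoxDiagY, boxYX, boxYY;
        float invBoxDiagX, boxXX;
    };

    static void minimumImage(const BoxAiuc& box, float& dx, float& dy, float& dz)
    {
        const float shiftZ = std::rint(dz * box.invBoxDiagZ); // z row first
        dx -= shiftZ * box.boxZX;
        dy -= shiftZ * box.boxZY;
        dz -= shiftZ * box.boxZZ;
        const float shiftY = std::rint(dy * box.invBoxDiagY);
        dx -= shiftY * box.boxYX;
        dy -= shiftY * box.boxYY;
        const float shiftX = std::rint(dx * box.invBoxDiagX);
        dx -= shiftX * box.boxXX;
    }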