From 2dae5db6f3bf0a64ab68b4fcf4ee6facccdbb43b Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland.schulz@intel.com>
Date: Wed, 12 Sep 2018 02:34:10 -0700
Subject: [PATCH] Fix clang-tidy warnings for OCL

Change-Id: I2f794a2209f355c0b3ee67623360724d96982458
---
 src/gromacs/CMakeLists.txt                    |  2 +-
 src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp       |  6 +--
 src/gromacs/ewald/pme-gpu-internal.cpp        | 27 ++++++-------
 src/gromacs/ewald/pme-gpu-internal.h          | 18 ++++-----
 .../ewald/pme-gpu-program-impl-ocl.cpp        |  2 +-
 src/gromacs/ewald/pme-gpu.cpp                 |  6 +--
 src/gromacs/gpu_utils/devicebuffer_ocl.h      |  2 +-
 src/gromacs/gpu_utils/gpu_utils.h             |  2 +-
 src/gromacs/gpu_utils/gpu_utils_ocl.cpp       |  4 +-
 src/gromacs/gpu_utils/gpuregiontimer_ocl.h    |  4 +-
 src/gromacs/gpu_utils/ocl_caching.cpp         |  4 +-
 src/gromacs/gpu_utils/ocl_caching.h           |  6 +--
 src/gromacs/gpu_utils/ocl_compiler.cpp        | 10 ++---
 src/gromacs/gpu_utils/ocl_compiler.h          |  4 +-
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu    |  2 +-
 src/gromacs/mdlib/nbnxn_gpu_common.h          | 24 +++++++-----
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp     | 38 +++++++++----------
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp   | 35 ++++++++---------
 18 files changed, 97 insertions(+), 99 deletions(-)
diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt
index c2d2bb23b0..7f9093399b 100644
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@ -223,7 +223,7 @@ if (GMX_USE_OPENCL)
         target_sources(libgromacs PRIVATE
             $<TARGET_OBJECTS:clFFT>
         )
-        target_include_directories(libgromacs PRIVATE ${_clFFT_dir}/include)
+        target_include_directories(libgromacs SYSTEM PRIVATE ${_clFFT_dir}/include)
         # Use the magic variable for how to link any library needed for
         # dlopen, etc.  which is -ldl where needed, and empty otherwise
         # (e.g. Windows, BSD, Mac).
diff --git a/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp b/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp
index dcc29cc69e..729a32a07d 100644
--- a/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp
+++ b/src/gromacs/ewald/pme-gpu-3dfft-ocl.cpp
@@ -69,7 +69,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu *pmeGpu)
     std::array<size_t, DIM> realGridSize, realGridSizePadded, complexGridSizePadded;
 
     GMX_RELEASE_ASSERT(!pme_gpu_uses_dd(pmeGpu), "FFT decomposition not implemented");
-    PmeGpuKernelParamsBase *kernelParamsPtr = (PmeGpuKernelParamsBase *)pmeGpu->kernelParams.get();
+    PmeGpuKernelParamsBase *kernelParamsPtr = pmeGpu->kernelParams.get();
     for (int i = 0; i < DIM; i++)
     {
         realGridSize[i]          = kernelParamsPtr->grid.realGridSize[i];
@@ -152,18 +152,14 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction  dir,
             inputGrids  = &realGrid_;
             outputGrids = &complexGrid_;
             break;
-
-            break;
         case GMX_FFT_COMPLEX_TO_REAL:
             plan        = planC2R_;
             direction   = CLFFT_BACKWARD;
             inputGrids  = &complexGrid_;
             outputGrids = &realGrid_;
             break;
-
         default:
             GMX_THROW(gmx::NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
-            break;
     }
     handleClfftError(clfftEnqueueTransform(plan, direction,
                                            commandStreams_.size(), commandStreams_.data(),
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp
index d919b84a66..a030b45a30 100644
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -54,6 +54,7 @@
 #include <list>
 #include <string>
 
+#include "gromacs/compat/make_unique.h"
 #include "gromacs/ewald/ewald-utils.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
@@ -117,11 +118,11 @@ void pme_gpu_synchronize(const PmeGpu *pmeGpu)
     gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
 }
 
-void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
+void pme_gpu_alloc_energy_virial(PmeGpu *pmeGpu)
 {
     const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
     allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, pmeGpu->archSpecific->context);
-    pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
+    pmalloc(reinterpret_cast<void **>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
 }
 
 void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
@@ -137,14 +138,14 @@ void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu)
                            c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
 }
 
-void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
+void pme_gpu_realloc_and_copy_bspline_values(PmeGpu *pmeGpu)
 {
     const int splineValuesOffset[DIM] = {
         0,
         pmeGpu->kernelParams->grid.realGridSize[XX],
         pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY]
     };
-    memcpy((void *)&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
+    memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
 
     const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] +
         pmeGpu->kernelParams->grid.realGridSize[YY] +
@@ -156,7 +157,7 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
     {
         /* Reallocate the host buffer */
         pfree(pmeGpu->staging.h_splineModuli);
-        pmalloc((void **)&pmeGpu->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
+        pmalloc(reinterpret_cast<void **>(&pmeGpu->staging.h_splineModuli), newSplineValuesSize * sizeof(float));
     }
     for (int i = 0; i < DIM; i++)
     {
@@ -271,7 +272,7 @@ void pme_gpu_free_coefficients(const PmeGpu *pmeGpu)
     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
 }
 
-void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
+void pme_gpu_realloc_spline_data(PmeGpu *pmeGpu)
 {
     const int    order             = pmeGpu->common->pme_order;
     const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
@@ -290,9 +291,9 @@ void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
     if (shouldRealloc)
     {
         pfree(pmeGpu->staging.h_theta);
-        pmalloc((void **)&pmeGpu->staging.h_theta, newSplineDataSize * sizeof(float));
+        pmalloc(reinterpret_cast<void **>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
         pfree(pmeGpu->staging.h_dtheta);
-        pmalloc((void **)&pmeGpu->staging.h_dtheta, newSplineDataSize * sizeof(float));
+        pmalloc(reinterpret_cast<void **>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
     }
 }
 
@@ -305,14 +306,14 @@ void pme_gpu_free_spline_data(const PmeGpu *pmeGpu)
     pfree(pmeGpu->staging.h_dtheta);
 }
 
-void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
+void pme_gpu_realloc_grid_indices(PmeGpu *pmeGpu)
 {
     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                            &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
     pfree(pmeGpu->staging.h_gridlineIndices);
-    pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
+    pmalloc(reinterpret_cast<void **>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
 }
 
 void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu)
@@ -580,7 +581,7 @@ void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
         pmeGpu->archSpecific->fftSetup.resize(0);
         for (int i = 0; i < pmeGpu->common->ngrids; i++)
         {
-            pmeGpu->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGpu)));
+            pmeGpu->archSpecific->fftSetup.push_back(gmx::compat::make_unique<GpuParallel3dFft>(pmeGpu));
         }
     }
 }
@@ -680,7 +681,7 @@ static void pme_gpu_reinit_grids(PmeGpu *pmeGpu)
     for (int i = 0; i < DIM; i++)
     {
         kernelParamsPtr->grid.realGridSize[i]       = pmeGpu->common->nk[i];
-        kernelParamsPtr->grid.realGridSizeFP[i]     = (float)kernelParamsPtr->grid.realGridSize[i];
+        kernelParamsPtr->grid.realGridSizeFP[i]     = static_cast<float>(kernelParamsPtr->grid.realGridSize[i]);
         kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
 
         // The complex grid currently uses no padding;
@@ -761,7 +762,7 @@ static void pme_gpu_init(gmx_pme_t          *pme,
     pme->gpu          = new PmeGpu();
     PmeGpu *pmeGpu = pme->gpu;
     changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
-    pmeGpu->common = std::shared_ptr<PmeShared>(new PmeShared());
+    pmeGpu->common = std::make_shared<PmeShared>();
 
     /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
     /* A convenience variable. */
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h
index cb6371f2d3..e48e18a7f5 100644
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -105,9 +105,9 @@ GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *GPU_FUNC_ARGUMENT(pmeG
 /*! \libinternal \brief
  * Allocates the fixed size energy and virial buffer both on GPU and CPU.
  *
- * \param[in] pmeGpu            The PME GPU structure.
+ * \param[in,out] pmeGpu            The PME GPU structure.
  */
-void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu);
+void pme_gpu_alloc_energy_virial(PmeGpu *pmeGpu);
 
 /*! \libinternal \brief
  * Frees the energy and virial memory both on GPU and CPU.
@@ -127,9 +127,9 @@ void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu);
 /*! \libinternal \brief
  * Reallocates and copies the pre-computed B-spline values to the GPU.
  *
- * \param[in] pmeGpu             The PME GPU structure.
+ * \param[in,out] pmeGpu             The PME GPU structure.
  */
-void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu);
+void pme_gpu_realloc_and_copy_bspline_values(PmeGpu *pmeGpu);
 
 /*! \libinternal \brief
  * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
@@ -226,9 +226,9 @@ void pme_gpu_free_coefficients(const PmeGpu *pmeGpu);
 /*! \libinternal \brief
  * Reallocates the buffers on the GPU and the host for the atoms spline data.
  *
- * \param[in] pmeGpu            The PME GPU structure.
+ * \param[in,out] pmeGpu            The PME GPU structure.
  */
-void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu);
+void pme_gpu_realloc_spline_data(PmeGpu *pmeGpu);
 
 /*! \libinternal \brief
  * Frees the buffers on the GPU for the atoms spline data.
@@ -240,9 +240,9 @@ void pme_gpu_free_spline_data(const PmeGpu *pmeGpu);
 /*! \libinternal \brief
  * Reallocates the buffers on the GPU and the host for the particle gridline indices.
  *
- * \param[in] pmeGpu            The PME GPU structure.
+ * \param[in,out] pmeGpu            The PME GPU structure.
  */
-void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu);
+void pme_gpu_realloc_grid_indices(PmeGpu *pmeGpu);
 
 /*! \libinternal \brief
  * Frees the buffer on the GPU for the particle gridline indices.
@@ -654,7 +654,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_F
  * Should be called before the pme_gpu_set_io_ranges.
  */
 GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
-                                             const int         GPU_FUNC_ARGUMENT(nAtoms),
+                                             int         GPU_FUNC_ARGUMENT(nAtoms),
                                              const real       *GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM
 
 /*! \brief \libinternal
diff --git a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
index 1c138d2894..13f7decf54 100644
--- a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
+++ b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
@@ -58,7 +58,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo)
     cl_device_id          deviceId   = deviceInfo->ocl_gpu_id.ocl_device_id;
     cl_context_properties contextProperties[3];
     contextProperties[0] = CL_CONTEXT_PLATFORM;
-    contextProperties[1] = (cl_context_properties) platformId;
+    contextProperties[1] = reinterpret_cast<cl_context_properties>(platformId);
     contextProperties[2] = 0; /* Terminates the list of properties */
 
     cl_int  clError;
diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp
index 29c357d3ca..6f8caf3249 100644
--- a/src/gromacs/ewald/pme-gpu.cpp
+++ b/src/gromacs/ewald/pme-gpu.cpp
@@ -190,8 +190,8 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
                                        gmx_wallcycle  *wcycle)
 {
     PmeGpu            *pmeGpu                 = pme->gpu;
-    const bool         computeEnergyAndVirial = pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
-    const bool         performBackFFT         = pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
+    const bool         computeEnergyAndVirial = (pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR) != 0;
+    const bool         performBackFFT         = (pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT)) != 0;
     const unsigned int gridIndex              = 0;
     t_complex         *cfftgrid               = pme->cfftgrid[gridIndex];
 
@@ -274,7 +274,7 @@ static void pme_gpu_get_staged_results(const gmx_pme_t                *pme,
                                        matrix                          virial,
                                        real                           *energy)
 {
-    const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
+    const bool haveComputedEnergyAndVirial = (pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR) != 0;
     *forces = pme_gpu_get_forces(pme->gpu);
 
     if (haveComputedEnergyAndVirial)
diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h
index 64f969fa9d..52c949fc76 100644
--- a/src/gromacs/gpu_utils/devicebuffer_ocl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h
@@ -61,7 +61,7 @@ class TypedClMemory
         cl_mem data_;
     public:
         //! \brief An assignment operator - the purpose is to make allocation/zeroing work
-        void operator=(cl_mem data){data_ = data; }
+        TypedClMemory &operator=(cl_mem data){data_ = data; return *this; }
         //! \brief Returns underlying cl_mem transparently
         operator cl_mem() {return data_; }
 };
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 6ee8b09a9d..65d8502e39 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -213,7 +213,7 @@ void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
  * \returns                 size in bytes of gpu_dev_info
  */
 GPU_FUNC_QUALIFIER
-size_t sizeof_gpu_dev_info(void) GPU_FUNC_TERM_WITH_RETURN(0)
+size_t sizeof_gpu_dev_info() GPU_FUNC_TERM_WITH_RETURN(0)
 
 /*! \brief Returns a pointer *ptr to page-locked memory of size nbytes.
  *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index c541eb06e7..ea8451fb2e 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -191,7 +191,7 @@ void findGpus(gmx_gpu_info_t *gpu_info)
         req_dev_type = CL_DEVICE_TYPE_CPU;
     }
 
-    while (1)
+    while (true)
     {
         cl_int status = clGetPlatformIDs(0, nullptr, &ocl_platform_count);
         if (CL_SUCCESS != status)
@@ -413,7 +413,7 @@ gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
 }
 
 //! This function is documented in the header file
-size_t sizeof_gpu_dev_info(void)
+size_t sizeof_gpu_dev_info()
 {
     return sizeof(gmx_device_info_t);
 }
diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h
index 00f28c633f..47209b9bc4 100644
--- a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h
+++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h
@@ -83,9 +83,9 @@ class GpuRegionTimerImpl
         GpuRegionTimerImpl(GpuRegionTimerImpl &&)            = delete;
 
         /*! \brief Should be called before the region start. */
-        inline void openTimingRegion(CommandStream){}
+        inline void openTimingRegion(CommandStream /*unused*/){}
         /*! \brief Should be called after the region end. */
-        inline void closeTimingRegion(CommandStream){}
+        inline void closeTimingRegion(CommandStream /*unused*/){}
         /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset(). */
         inline double getLastRangeTime()
         {
diff --git a/src/gromacs/gpu_utils/ocl_caching.cpp b/src/gromacs/gpu_utils/ocl_caching.cpp
index 566885e5b1..ef7fa819ea 100644
--- a/src/gromacs/gpu_utils/ocl_caching.cpp
+++ b/src/gromacs/gpu_utils/ocl_caching.cpp
@@ -179,5 +179,5 @@ writeBinaryToCache(cl_program program, const std::string &filename)
     fwrite(binary, 1, fileSize, f.get());
 }
 
-} // namespace
-} // namespace
+} // namespace ocl
+} // namespace gmx
diff --git a/src/gromacs/gpu_utils/ocl_caching.h b/src/gromacs/gpu_utils/ocl_caching.h
index 47ee8f6225..cb6c9811e0 100644
--- a/src/gromacs/gpu_utils/ocl_caching.h
+++ b/src/gromacs/gpu_utils/ocl_caching.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -99,7 +99,7 @@ makeProgramFromCache(const std::string &filename,
 void
 writeBinaryToCache(cl_program program, const std::string &filename);
 
-} // namespace
-} // namespace
+} // namespace ocl
+} // namespace gmx
 
 #endif
diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp
index c52a6ec74b..fa8c4137e6 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gpu_utils/ocl_compiler.cpp
@@ -75,7 +75,7 @@ namespace ocl
  *
  *  Currently caching is disabled by default unless the env var override
  *  is used until we resolve concurrency issues. */
-static bool useBuildCache = getenv("GMX_OCL_GENCACHE"); // (NULL == getenv("GMX_OCL_NOGENCACHE"));
+static bool useBuildCache = getenv("GMX_OCL_GENCACHE") != nullptr;
 
 /*! \brief Handles writing the OpenCL JIT compilation log to \c fplog.
  *
@@ -257,7 +257,7 @@ size_t getWarpSize(cl_context context, cl_device_id deviceId)
 {
     cl_int      cl_error;
     const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
-    cl_program  program        = clCreateProgramWithSource(context, 1, (const char**)&warpSizeKernel, nullptr, &cl_error);
+    cl_program  program        = clCreateProgramWithSource(context, 1, &warpSizeKernel, nullptr, &cl_error);
     if (cl_error != CL_SUCCESS)
     {
         GMX_THROW(InternalError("Could not create OpenCL program to determine warp size, error was " + ocl_get_error_string(cl_error)));
@@ -374,7 +374,7 @@ removeExtraSpaces(std::string *str)
 {
     GMX_RELEASE_ASSERT(str != nullptr, "A pointer to an actual string must be provided");
     std::string::iterator newEnd =
-        std::unique( str->begin(), str->end(), [ = ](char a, char b){ return isspace(a) && (a == b); } );
+        std::unique( str->begin(), str->end(), [ = ](char a, char b){ return isspace(a) != 0 && (a == b); } );
     str->erase(newEnd, str->end());
 }
 
@@ -551,5 +551,5 @@ compileProgram(FILE              *fplog,
     return program;
 }
 
-} // namespace
-} // namespace
+} // namespace ocl
+} // namespace gmx
diff --git a/src/gromacs/gpu_utils/ocl_compiler.h b/src/gromacs/gpu_utils/ocl_compiler.h
index 54f0332fed..94dbe02ccb 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.h
+++ b/src/gromacs/gpu_utils/ocl_compiler.h
@@ -98,7 +98,7 @@ compileProgram(FILE              *fplog,
                cl_device_id       deviceId,
                ocl_vendor_id_t    deviceVendorId);
 
-} // namespace
-} // namespace
+}  // namespace ocl
+}  // namespace gmx
 
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 960649956a..e9d816e1fe 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -610,7 +610,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
         return;
     }
 
-    getGpuAtomRange(adat, aloc, adat_begin, adat_len);
+    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
 
     /* beginning of timed D2H section */
     if (bDoTime)
diff --git a/src/gromacs/mdlib/nbnxn_gpu_common.h b/src/gromacs/mdlib/nbnxn_gpu_common.h
index 90f4bc6204..7f361182c9 100644
--- a/src/gromacs/mdlib/nbnxn_gpu_common.h
+++ b/src/gromacs/mdlib/nbnxn_gpu_common.h
@@ -57,6 +57,8 @@
 
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/vec.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/nb_verlet.h"
 #include "gromacs/mdlib/nbnxn_gpu_types.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
@@ -117,8 +119,8 @@ static inline int gpuAtomToInteractionLocality(int atomLocality)
 template <typename AtomDataT>
 static inline void getGpuAtomRange(const AtomDataT *atomData,
                                    int              atomLocality,
-                                   int             &atomRangeBegin,
-                                   int             &atomRangeLen)
+                                   int             *atomRangeBegin,
+                                   int             *atomRangeLen)
 {
     assert(atomData);
     validateGpuAtomLocality(atomLocality);
@@ -126,13 +128,13 @@ static inline void getGpuAtomRange(const AtomDataT *atomData,
     /* calculate the atom data index range based on locality */
     if (LOCAL_A(atomLocality))
     {
-        atomRangeBegin  = 0;
-        atomRangeLen    = atomData->natoms_local;
+        *atomRangeBegin  = 0;
+        *atomRangeLen    = atomData->natoms_local;
     }
     else
     {
-        atomRangeBegin  = atomData->natoms_local;
-        atomRangeLen    = atomData->natoms - atomData->natoms_local;
+        *atomRangeBegin  = atomData->natoms_local;
+        *atomRangeLen    = atomData->natoms - atomData->natoms_local;
     }
 }
 
@@ -298,6 +300,8 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin
     }
 }
 
+//TODO: move into shared source file with gmx_compile_cpp_as_cuda
+//NOLINTNEXTLINE(misc-definitions-in-headers)
 bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
                                int               flags,
                                int               aloc,
@@ -328,10 +332,11 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
             gpuStreamSynchronize(nb->stream[iLocality]);
         }
 
-        bool calcEner   = flags & GMX_FORCE_ENERGY;
-        bool calcFshift = flags & GMX_FORCE_VIRIAL;
+        bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
+        bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
 
-        nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime);
+        nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
+                                     nb->bDoTime != 0);
 
         nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
     }
@@ -360,6 +365,7 @@ bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
  * \param[out] fshift Pointer to the shift force buffer to accumulate into
  */
+//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
                                 int              flags,
                                 int              aloc,
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
index c2a055b79c..3ef88eaea1 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
@@ -119,7 +119,7 @@ static inline void validate_global_work_size(const KernelLaunchConfig &config, i
        https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
      */
     device_size_t_size_bits = dinfo->adress_bits;
-    host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
+    host_size_t_size_bits   = static_cast<cl_uint>(sizeof(size_t) * 8);
 
     /* If sizeof(host size_t) <= sizeof(device size_t)
             => global_work_size components will always be valid
@@ -131,7 +131,7 @@ static inline void validate_global_work_size(const KernelLaunchConfig &config, i
     {
         size_t device_limit;
 
-        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
+        device_limit = (1ull << device_size_t_size_bits) - 1;
 
         for (int i = 0; i < work_dim; i++)
         {
@@ -393,9 +393,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
     cl_timers_t         *t       = nb->timers;
     cl_command_queue     stream  = nb->stream[iloc];
 
-    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
+    bool                 bCalcEner   = (flags & GMX_FORCE_ENERGY) != 0;
     int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
-    bool                 bDoTime     = nb->bDoTime;
+    bool                 bDoTime     = (nb->bDoTime) != 0;
 
     cl_nbparam_params_t  nbparams_params;
 
@@ -508,10 +508,10 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
 
     if (debug)
     {
-        fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
-                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
-                (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]),
-                (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl,
+        fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+                "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+                config.blockSize[0], config.blockSize[1], config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
                 c_numClPerSupercl, plist->na_c);
     }
 
@@ -581,7 +581,7 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t       *nb,
     cl_plist_t          *plist   = nb->plist[iloc];
     cl_timers_t         *t       = nb->timers;
     cl_command_queue     stream  = nb->stream[iloc];
-    bool                 bDoTime = nb->bDoTime;
+    bool                 bDoTime = nb->bDoTime == CL_TRUE;
 
     if (plist->haveFreshList)
     {
@@ -658,11 +658,11 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t       *nb,
 
     if (debug)
     {
-        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
-                "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+                "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                 "\tShMem: %zu\n",
-                (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]),
-                (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl,
+                config.blockSize[0], config.blockSize[1], config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
                 c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
     }
 
@@ -711,10 +711,10 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 
     cl_atomdata_t   *adat    = nb->atdat;
     cl_timers_t     *t       = nb->timers;
-    bool             bDoTime = nb->bDoTime;
+    bool             bDoTime = nb->bDoTime == CL_TRUE;
     cl_command_queue stream  = nb->stream[iloc];
 
-    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
+    bool             bCalcEner   = (flags & GMX_FORCE_ENERGY) != 0;
     int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
 
@@ -729,11 +729,11 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
            test case, overall simulation performance was higher with
            the API calls, but this has not been tested on AMD OpenCL,
            so could be worth considering in future. */
-        nb->bNonLocalStreamActive = false;
+        nb->bNonLocalStreamActive = CL_FALSE;
         return;
     }
 
-    getGpuAtomRange(adat, aloc, adat_begin, adat_len);
+    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
 
     /* beginning of timed D2H section */
     if (bDoTime)
@@ -764,7 +764,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
     {
         cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
         assert(CL_SUCCESS == cl_error);
-        nb->bNonLocalStreamActive = true;
+        nb->bNonLocalStreamActive = CL_TRUE;
     }
 
     /* only transfer energies in the local stream */
@@ -819,7 +819,7 @@ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
      *
      */
     //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
-    if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    if (!bForceTabulatedEwald)
     {
         bUseAnalyticalEwald = true;
 
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
index f1bddab95d..0b931f6028 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -149,7 +149,7 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtim
     // TODO: handle errors, check clCreateBuffer flags
     ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error);
     assert(cl_error == CL_SUCCESS);
-    ad->bShiftVecUploaded = false;
+    ad->bShiftVecUploaded = CL_FALSE;
 
     /* An element of the fshift device buffer has the same size as one element
        of the host side fshift buffer. */
@@ -233,7 +233,6 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
                         break;
                     default:
                         gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
-                        break;
                 }
                 break;
             case eintmodFORCESWITCH:
@@ -244,7 +243,6 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
                 break;
             default:
                 gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
-                break;
         }
     }
     else if (ic->vdwtype == evdwPME)
@@ -500,7 +498,7 @@ nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
     device_id        = devInfo->ocl_gpu_id.ocl_device_id;
 
     context_properties[0] = CL_CONTEXT_PLATFORM;
-    context_properties[1] = (cl_context_properties) platform_id;
+    context_properties[1] = reinterpret_cast<cl_context_properties>(platform_id);
     context_properties[2] = 0; /* Terminates the list of properties */
 
     context = clCreateContext(context_properties, 1, &device_id, nullptr, nullptr, &cl_error);
@@ -510,7 +508,6 @@ nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
                   rank,
                   devInfo->device_name,
                   cl_error, ocl_get_error_string(cl_error).c_str());
-        return;
     }
 
     runtimeData->context = context;
@@ -640,7 +637,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
         snew(nb->plist[eintNonlocal], 1);
     }
 
-    nb->bUseTwoStreams = bLocalAndNonlocal;
+    nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
 
     nb->timers = new cl_timers_t();
     snew(nb->timings, 1);
@@ -650,14 +647,14 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
     snew(nb->dev_rundata, 1);
 
     /* init nbst */
-    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
-    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
-    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
 
     init_plist(nb->plist[eintLocal]);
 
     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
-    nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+    nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 
     /* Create queues only after bDoTime has been initialized */
     if (nb->bDoTime)
@@ -679,7 +676,6 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
                   rank,
                   nb->dev_info->device_name,
                   cl_error);
-        return;
     }
 
     if (nb->bUseTwoStreams)
@@ -693,13 +689,12 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
                       rank,
                       nb->dev_info->device_name,
                       cl_error);
-            return;
         }
     }
 
     if (nb->bDoTime)
     {
-        init_timers(nb->timers, nb->bUseTwoStreams);
+        init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE);
         init_timings(nb->timings);
     }
 
@@ -796,7 +791,7 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
     // Timing accumulation should happen only if there was work to do
     // because getLastRangeTime() gets skipped with empty lists later
     // which leads to the counter not being reset.
-    bool             bDoTime    = (nb->bDoTime && h_plist->nsci > 0);
+    bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && h_plist->nsci > 0);
     cl_command_queue stream     = nb->stream[iloc];
     cl_plist_t      *d_plist    = nb->plist[iloc];
 
@@ -865,7 +860,7 @@ void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
     {
         ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0,
                            SHIFTS * adat->shift_vec_elem_size, ls, nullptr);
-        adat->bShiftVecUploaded = true;
+        adat->bShiftVecUploaded = CL_TRUE;
     }
 }
 
@@ -876,7 +871,7 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
     cl_int           cl_error;
     int              nalloc, natoms;
     bool             realloced;
-    bool             bDoTime = nb->bDoTime;
+    bool             bDoTime = nb->bDoTime == CL_TRUE;
     cl_timers_t     *timers  = nb->timers;
     cl_atomdata_t   *d_atdat = nb->atdat;
     cl_command_queue ls      = nb->stream[eintLocal];
@@ -1035,16 +1030,16 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
 
     /* Free kernels */
     int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
-    free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count);
+    free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);
 
     kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
-    free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count);
+    free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);
 
     kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
-    free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count);
+    free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);
 
     kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
-    free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count);
+    free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
 
     free_kernel(&(nb->kernel_memset_f));
     free_kernel(&(nb->kernel_memset_f2));
-- 
2.22.0