From: Szilárd Páll <pall.szilard@gmail.com>
Date: Thu, 22 Aug 2019 00:11:41 +0000 (+0200)
Subject: Add target for offline PME OpenCL compilation
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=4f1c454810fd0efd3266eb6fdcb1f14b14faa911;p=alexxy%2Fgromacs.git

Add target for offline PME OpenCL compilation

Also fix clang-tidy warnings in the PME OpenCL kernels; tested with
clang-tidy 8.0 and 9.0.
Target will be triggered in all clang-tidy + OpenCL builds.

Moved the invocation of the targets in jenkins before the
tests where it makes more sense.

Change-Id: I31d7dd4883e20d0564ab2f18a6de7e0d17e8024a
---

diff --git a/admin/builds/gromacs.py b/admin/builds/gromacs.py
index a0659aae84..d78c29c13e 100644
--- a/admin/builds/gromacs.py
+++ b/admin/builds/gromacs.py
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
+# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team.
 # Copyright (c) 2020, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
@@ -247,6 +247,11 @@ def do_build(context):
         # TODO: Generalize the machinery here such that it can easily be used
         # also for non-release builds.
     else:
+        # run OpenCL offline compile tests on clang tidy builds
+        if (context.opts.tidy and context.opts.opencl):
+            context.build_target(target='ocl_nbnxm_kernels')
+            context.build_target(target='ocl_pme_kernels')
+
         context.build_target(target='tests', keep_going=True)
 
         context.run_ctest(args=['--output-on-failure', '--label-exclude', 'SlowTest'], memcheck=context.opts.asan)
@@ -254,10 +259,6 @@ def do_build(context):
         context.build_target(target='install')
         # TODO: Consider what could be tested about the installed binaries.
 
-        # run OpenCL offline compile tests on clang tidy builds
-        if (context.opts.tidy and context.opts.opencl):
-            context.build_target(target='ocl_nbnxm_kernels')
-
         if not context.opts.mdrun_only:
             context.env.prepend_path_env(os.path.join(context.workspace.build_dir, 'bin'))
             context.chdir(regressiontests_path)
diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt
index 45517cdae1..7583e1db50 100644
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -95,3 +95,55 @@ endif()
 if (BUILD_TESTING)
     add_subdirectory(tests)
 endif()
+
+
+set(PME_OCL_KERNEL_SOURCES
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_gpu_utils.clh"
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_solve.clh"
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_gather.clh"
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_spread.clh")
+
+if(CLANG_TIDY_EXE)
+   set(OCL_COMPILER "${CLANG_TIDY_EXE}")
+   set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo,-clang-diagnostic-padded,-fcomment-block-commands=internal;--;${CMAKE_C_COMPILER}")
+else()
+   set(OCL_COMPILER "${CMAKE_C_COMPILER}")
+endif()
+
+# TODO: test all warp sizes on all vendor targets?
+foreach(VENDOR AMD NVIDIA INTEL)
+    foreach(WARPSIZE 16 32 64)
+        math(EXPR SPREAD_WG_SIZE "8*${WARPSIZE}")
+        math(EXPR SOLVE_WG_SIZE "8*${WARPSIZE}")
+        math(EXPR GATHER_WG_SIZE "4*${WARPSIZE}")
+        set(OBJ_FILE pme_ocl_kernel_warpSize${WARPSIZE}_${VENDOR}.o)
+        add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
+        ${CMAKE_CURRENT_SOURCE_DIR}/pme_program.cl ${CLANG_TIDY_ARGS}
+        -Xclang -finclude-default-header  -D_${VENDOR}_SOURCE_
+        -Dwarp_size=${WARPSIZE}
+        -Dorder=4
+        -DthreadsPerAtom=16
+        -Dc_pmeMaxUnitcellShift=2
+        -Dc_usePadding=true
+        -Dc_skipNeutralAtoms=false
+        -Dc_virialAndEnergyCount=7
+        -Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE}
+        -Dc_solveMaxWorkGroupSize=${SOLVE_WG_SIZE}
+        -Dc_gatherWorkGroupSize=${GATHER_WG_SIZE}
+        -DDIM=3 -DXX=0 -DYY=1 -DZZ=2
+        -DwrapX=true -DwrapY=true
+        -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
+        -Weverything  -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
+        -Wno-cast-align -Wno-incompatible-pointer-types
+        # to avoid  "warning: unknown command tag name" for \internal
+        -Wno-documentation-unknown-command
+        # to avoid pme_gpu_types.h:100:52: warning: padding struct 'struct PmeGpuConstParams' with 4 bytes to align 'd_virialAndEnergy'
+        -Wno-padded
+        -o${OBJ_FILE}
+        )
+        list(APPEND PME_OCL_KERNELS ${OBJ_FILE})
+    endforeach()
+endforeach()
+
+add_custom_target(ocl_pme_kernels DEPENDS ${PME_OCL_KERNELS} )
+gmx_set_custom_target_output(ocl_pme_kernels ${PME_OCL_KERNELS})
diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh
index 3b760e9d02..8ade47919a 100644
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -67,9 +67,10 @@ inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
         case XX: return realGridSizeFP[XX];
         case YY: return realGridSizeFP[YY];
         case ZZ: return realGridSizeFP[ZZ];
+        default: assert(false); break;
     }
     assert(false);
-    return 0.0f;
+    return 0.0F;
 }
 
 /*! \brief Reduce the partial force contributions.
@@ -211,7 +212,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
 {
     /* These are the atom indices - for the shared and global memory */
     const int atomIndexLocal  = get_local_id(ZZ);
-    const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
 
 /* Some sizes which are defines and not consts because they go into the array size */
@@ -230,18 +231,23 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     const int ithy = get_local_id(YY);
     const int ithz = get_local_id(XX);
 
-    const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
-                              + get_local_id(0);
+    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+           <= MAX_INT);
+    const int threadLocalId =
+            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+                  + get_local_id(0));
 
     /* These are the spline contribution indices in shared memory */
-    const int splineIndex = (get_local_id(1) * get_local_size(0)
-                             + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */
-    const int lineIndex   = threadLocalId; /* And to all the block's particles */
+    assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
+    const int splineIndex =
+            (int)(get_local_id(1) * get_local_size(0)
+                  + get_local_id(0));    /* Relative to the current particle , 0..15 for order 4 */
+    const int lineIndex = threadLocalId; /* And to all the block's particles */
 
     /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
     const int localGridlineIndicesIndex = threadLocalId;
     const int globalGridlineIndicesIndex =
-            get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
+            (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
     const int globalCheckIndices =
             pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
     if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
@@ -250,8 +256,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
         assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
     }
     /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
-    const int localSplineParamsIndex  = threadLocalId;
-    const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
+    const int localSplineParamsIndex = threadLocalId;
+    const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
     const int globalCheckSplineParams = pme_gpu_check_atom_data_index(
             globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
     if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
@@ -263,9 +269,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    float fx = 0.0f;
-    float fy = 0.0f;
-    float fz = 0.0f;
+    float fx = 0.0F;
+    float fy = 0.0F;
+    float fz = 0.0F;
 
     const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
     const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
@@ -371,7 +377,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
         for (int i = 0; i < numIter; i++)
         {
             const int outputIndexLocal  = i * iterThreads + threadLocalId;
-            const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
+            const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
             const int globalOutputCheck =
                     pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
             if (globalOutputCheck)
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
index 800f43ea9e..79ef8b6d21 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -142,7 +142,6 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t* deviceInfo)
         const std::string commonDefines = gmx::formatString(
                 "-Dwarp_size=%zd "
                 "-Dorder=%d "
-                "-DatomsPerWarp=%zd "
                 "-DthreadsPerAtom=%d "
                 // forwarding from pme_grid.h, used for spline computation table sizes only
                 "-Dc_pmeMaxUnitcellShift=%f "
@@ -158,10 +157,10 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t* deviceInfo)
                 "-DDIM=%d -DXX=%d -DYY=%d -DZZ=%d "
                 // decomposition parameter placeholders
                 "-DwrapX=true -DwrapY=true ",
-                warpSize, c_pmeGpuOrder, warpSize / c_pmeSpreadGatherThreadsPerAtom,
-                c_pmeSpreadGatherThreadsPerAtom, static_cast<float>(c_pmeMaxUnitcellShift),
-                static_cast<int>(c_usePadding), static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount,
-                spreadWorkGroupSize, solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
+                warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
+                static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_usePadding),
+                static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize,
+                solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
         try
         {
             /* TODO when we have a proper MPI-aware logging module,
diff --git a/src/gromacs/ewald/pme_gpu_utils.clh b/src/gromacs/ewald/pme_gpu_utils.clh
index 5f611e9510..1cec1a1cee 100644
--- a/src/gromacs/ewald/pme_gpu_utils.clh
+++ b/src/gromacs/ewald/pme_gpu_utils.clh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -78,8 +78,6 @@ inline int getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
  * \param[in] paramIndexBase   Must be result of getSplineParamIndexBase().
  * \param[in] dimIndex         Dimension index (from 0 to 2)
  * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
- * \param[in] order            PME order
- * \param[in] atomsPerWarp     Number of atoms processed by a warp
  *
  * \returns Index into theta or dtheta array using GPU layout.
  */
@@ -93,7 +91,6 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex
 /*! \brief
  * A function for checking the global atom data indices against the atom data array sizes.
  *
- * \param[in] atomDataIndexGlobal  The atom data index.
  * \param[in] nAtomData            The atom data array element count.
  * \returns                        Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
  *
@@ -113,7 +110,7 @@ inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_
  */
 inline int pme_gpu_check_atom_charge(const float coefficient)
 {
-    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+    return c_skipNeutralAtoms ? (coefficient != 0.0F) : 1;
 }
 
 #endif
diff --git a/src/gromacs/ewald/pme_program.cl b/src/gromacs/ewald/pme_program.cl
index 493fb5f8f2..091658d489 100644
--- a/src/gromacs/ewald/pme_program.cl
+++ b/src/gromacs/ewald/pme_program.cl
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -50,7 +50,6 @@
  */
 
 // Assert placeholders, to not rip them out from OpenCL implementation - hopefully they come in handy some day with OpenCL 2
-#define static_assert(a, b)
 #define assert(a)
 
 #define PmeOpenCLKernelParams PmeGpuKernelParamsBase
@@ -58,6 +57,7 @@
 /* SPREAD/SPLINE */
 
 #define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom)
+#define atomsPerWarp (warp_size / threadsPerAtom)
 
 // spline/spread fused
 #define computeSplines 1
diff --git a/src/gromacs/ewald/pme_solve.clh b/src/gromacs/ewald/pme_solve.clh
index ac38117d38..2c4dbde609 100644
--- a/src/gromacs/ewald/pme_solve.clh
+++ b/src/gromacs/ewald/pme_solve.clh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -49,9 +49,10 @@
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  */
 
-#include "pme_gpu_types.h"
 #include "gromacs/gpu_utils/vectype_ops.clh"
 
+#include "pme_gpu_types.h"
+
 /*! \brief
  * PME complex grid solver kernel function.
  * Please see the file description for additional defines which this kernel expects.
@@ -68,7 +69,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
                                                        __global float2* __restrict__ gm_grid)
 {
     /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
-    int majorDim, middleDim, minorDim;
+    int majorDim;
+    int middleDim;
+    int minorDim;
     if (gridOrdering == YZX)
     {
         majorDim  = YY;
@@ -90,17 +93,19 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
             gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
 
     /* Various grid sizes and indices */
-    const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
-    const int localSizeMinor   = kernelParams.grid.complexGridSizePadded[minorDim];
-    const int localSizeMiddle  = kernelParams.grid.complexGridSizePadded[middleDim];
-    const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim];
-    const int localCountMinor  = kernelParams.grid.complexGridSize[minorDim];
-    const int nMajor           = kernelParams.grid.realGridSize[majorDim];
-    const int nMiddle          = kernelParams.grid.realGridSize[middleDim];
-    const int nMinor           = kernelParams.grid.realGridSize[minorDim];
-    const int maxkMajor        = (nMajor + 1) / 2;  // X or Y
-    const int maxkMiddle       = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
-    const int maxkMinor        = (nMinor + 1) / 2;  // Z or X => only check for YZX
+    const int localOffsetMinor  = 0; // unused
+    const int localOffsetMajor  = 0; // unused
+    const int localOffsetMiddle = 0; // unused
+    const int localSizeMinor    = kernelParams.grid.complexGridSizePadded[minorDim];
+    const int localSizeMiddle   = kernelParams.grid.complexGridSizePadded[middleDim];
+    const int localCountMiddle  = kernelParams.grid.complexGridSize[middleDim];
+    const int localCountMinor   = kernelParams.grid.complexGridSize[minorDim];
+    const int nMajor            = kernelParams.grid.realGridSize[majorDim];
+    const int nMiddle           = kernelParams.grid.realGridSize[middleDim];
+    const int nMinor            = kernelParams.grid.realGridSize[minorDim];
+    const int maxkMajor         = (nMajor + 1) / 2;  // X or Y
+    const int maxkMiddle        = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
+    const int maxkMinor         = (nMinor + 1) / 2;  // Z or X => only check for YZX
 
     /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
      * Each block handles up to c_solveMaxWorkGroupSize cells -
@@ -112,19 +117,20 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
     const int gridLineIndex     = threadLocalId / gridLineSize;
     const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
     const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1);
-    const int activeWarps       = (get_local_size(XX) / warp_size);
-    const int indexMinor        = get_group_id(XX) * get_local_size(XX) + gridLineCellIndex;
-    const int indexMiddle       = get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
-    const int indexMajor        = get_group_id(ZZ);
+    const int activeWarps       = ((int)get_local_size(XX) / warp_size);
+    assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT);
+    const int indexMinor  = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex;
+    const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
+    const int indexMajor  = (int)get_group_id(ZZ);
 
     /* Optional outputs */
-    float energy = 0.0f;
-    float virxx  = 0.0f;
-    float virxy  = 0.0f;
-    float virxz  = 0.0f;
-    float viryy  = 0.0f;
-    float viryz  = 0.0f;
-    float virzz  = 0.0f;
+    float energy = 0.0F;
+    float virxx  = 0.0F;
+    float virxy  = 0.0F;
+    float virxz  = 0.0F;
+    float viryy  = 0.0F;
+    float viryz  = 0.0F;
+    float virzz  = 0.0F;
 
     assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
     if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
@@ -136,26 +142,28 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
 
         const int kMajor = indexMajor + localOffsetMajor;
         /* Checking either X in XYZ, or Y in YZX cases */
-        const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor);
+        const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor));
 
         const int kMiddle = indexMiddle + localOffsetMiddle;
-        float     mMiddle = kMiddle;
+        float     mMiddle = (float)kMiddle;
         /* Checking Y in XYZ case */
         if (gridOrdering == XYZ)
         {
-            mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
+            mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle));
         }
         const int kMinor = localOffsetMinor + indexMinor;
-        float     mMinor = kMinor;
+        float     mMinor = (float)kMinor;
         /* Checking X in YZX case */
         if (gridOrdering == YZX)
         {
-            mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
+            mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor));
         }
         /* We should skip the k-space point (0,0,0) */
         const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
 
-        float mX, mY, mZ;
+        float mX;
+        float mY;
+        float mZ;
         if (gridOrdering == YZX)
         {
             mX = mMinor;
@@ -170,19 +178,20 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
         }
 
         /* 0.5 correction factor for the first and last components of a Z dimension */
-        float corner_fac = 1.0f;
+        float       corner_fac   = 1.0F;
+        const float z_corner_fac = 0.5F;
         if (gridOrdering == YZX)
         {
             if ((kMiddle == 0) | (kMiddle == maxkMiddle))
             {
-                corner_fac = 0.5f;
+                corner_fac = z_corner_fac;
             }
         }
         if (gridOrdering == XYZ)
         {
             if ((kMinor == 0) | (kMinor == maxkMinor))
             {
-                corner_fac = 0.5f;
+                corner_fac = z_corner_fac;
             }
         }
 
@@ -196,11 +205,11 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
                                + mZ * kernelParams.current.recipBox[ZZ][ZZ];
 
             const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
-            assert(m2k != 0.0f);
+            assert(m2k != 0.0F);
             const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor]
                                 * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
             assert(isfinite(denom));
-            assert(denom != 0.0f);
+            assert(denom != 0.0F);
             const float tmp1   = exp(-kernelParams.grid.ewaldFactor * m2k);
             const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
 
@@ -214,9 +223,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
             if (computeEnergyAndVirial)
             {
                 const float tmp1k =
-                        2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
+                        2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
 
-                const float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f;
+                const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
                 const float ets2    = corner_fac * tmp1k;
                 energy              = ets2;
 
@@ -257,7 +266,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
             sm_virialAndEnergy[2 * warp_size + lane] = virzz;
             sm_virialAndEnergy[3 * warp_size + lane] = virxy;
             sm_virialAndEnergy[4 * warp_size + lane] = virxz;
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
             sm_virialAndEnergy[5 * warp_size + lane] = viryz;
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
             sm_virialAndEnergy[6 * warp_size + lane] = energy;
         }
         barrier(CLK_LOCAL_MEM_FENCE);
@@ -268,7 +279,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
             atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
             atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
             atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
             atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
             atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy);
         }
         barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh
index 8215614ef0..0ea1449c22 100644
--- a/src/gromacs/ewald/pme_spread.clh
+++ b/src/gromacs/ewald/pme_spread.clh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -51,9 +51,10 @@
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  */
 
+#include "gromacs/gpu_utils/vectype_ops.clh"
+
 #include "pme_gpu_types.h"
 #include "pme_gpu_utils.clh"
-#include "gromacs/gpu_utils/vectype_ops.clh"
 
 /*
  * This define affects the spline calculation behaviour in the kernel.
@@ -83,12 +84,15 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar
                                     __global const float* __restrict__ gm_source,
                                     const int dataCountPerAtom)
 {
-    const size_t threadLocalIndex =
-            ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0));
-    const size_t localIndex      = threadLocalIndex;
-    const size_t globalIndexBase = get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
-    const size_t globalIndex     = globalIndexBase + localIndex;
-    const int    globalCheck =
+    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+           < MAX_INT);
+    const int threadLocalIndex =
+            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+                  + get_local_id(0));
+    const int localIndex      = threadLocalIndex;
+    const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
+    const int globalIndex     = globalIndexBase + localIndex;
+    const int globalCheck =
             pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
     if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
     {
@@ -115,22 +119,25 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar
  * \param[in]  gm_fractShiftsTable      Atom fractional coordinates correction table
  * \param[in]  gm_gridlineIndicesTable  Atom fractional coordinates correction table
  */
-inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
-                              const int                          atomIndexOffset,
-                              __local const float* __restrict__ sm_coordinates,
-                              __local const float* __restrict__ sm_coefficients,
-                              __local float* __restrict__ sm_theta,
-                              __local int* __restrict__ sm_gridlineIndices,
-                              __local float* __restrict__ sm_fractCoords,
-                              __global float* __restrict__ gm_theta,
-                              __global float* __restrict__ gm_dtheta,
-                              __global int* __restrict__ gm_gridlineIndices,
-                              __global const float* __restrict__ gm_fractShiftsTable,
-                              __global const int* __restrict__ gm_gridlineIndicesTable)
+gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
+                                         const int                          atomIndexOffset,
+                                         __local const float* __restrict__ sm_coordinates,
+                                         __local const float* __restrict__ sm_coefficients,
+                                         __local float* __restrict__ sm_theta,
+                                         __local int* __restrict__ sm_gridlineIndices,
+                                         __local float* __restrict__ sm_fractCoords,
+                                         __global float* __restrict__ gm_theta,
+                                         __global float* __restrict__ gm_dtheta,
+                                         __global int* __restrict__ gm_gridlineIndices,
+                                         __global const float* __restrict__ gm_fractShiftsTable,
+                                         __global const int* __restrict__ gm_gridlineIndicesTable)
 {
     /* Thread index w.r.t. block */
+    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+           < MAX_INT);
     const int threadLocalIndex =
-            ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0));
+            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+                  + get_local_id(0));
     /* Warp index w.r.t. block - could probably be obtained easier? */
     const int warpIndex = threadLocalIndex / warp_size;
     /* Thread index w.r.t. warp */
@@ -179,8 +186,10 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
         /* Indices interpolation */
         if (orderIndex == 0)
         {
-            int          tableIndex, tInt;
-            float        n, t;
+            int          tableIndex;
+            int          tInt;
+            float        n;
+            float        t;
             const float3 x = vload3(atomIndexLocal, sm_coordinates);
 
             /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
@@ -213,13 +222,17 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
                                 .z
                         * kernelParams.current.recipBox[dimIndex][ZZ];
                     break;
+                default:
+                    assert(false);
+                    return;
+                    break;
             }
             const float shift = c_pmeMaxUnitcellShift;
 
             /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
             t                                 = (t + shift) * n;
             tInt                              = (int)t;
-            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            sm_fractCoords[sharedMemoryIndex] = t - (float)tInt;
             tableIndex += tInt;
             assert(tInt >= 0);
             assert(tInt < c_pmeNeighborUnitcellCount * n);
@@ -241,23 +254,24 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
             assert(isfinite(dr));
 
             /* dr is relative offset from lower cell limit */
-            *SPLINE_DATA_PTR(order - 1) = 0.0f;
+            *SPLINE_DATA_PTR(order - 1) = 0.0F;
             *SPLINE_DATA_PTR(1)         = dr;
-            *SPLINE_DATA_PTR(0)         = 1.0f - dr;
+            *SPLINE_DATA_PTR(0)         = 1.0F - dr;
 
 #    pragma unroll order
             for (int k = 3; k < order; k++)
             {
-                div                     = 1.0f / (k - 1.0f);
+                div                     = 1.0F / ((float)k - 1.0F);
                 *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
 #    pragma unroll
                 for (int l = 1; l < (k - 1); l++)
                 {
-                    *SPLINE_DATA_PTR(k - l - 1) = div
-                                                  * ((dr + l) * SPLINE_DATA(k - l - 2)
-                                                     + (k - l - dr) * SPLINE_DATA(k - l - 1));
+                    *SPLINE_DATA_PTR(k - l - 1) =
+                            div
+                            * ((dr + (float)l) * SPLINE_DATA(k - l - 2)
+                               + ((float)k - (float)l - dr) * SPLINE_DATA(k - l - 1));
                 }
-                *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+                *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0);
             }
 
             const int thetaIndexBase        = getSplineParamIndexBase(warpIndex, atomWarpIndex);
@@ -274,21 +288,21 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
                 const int thetaIndex       = getSplineParamIndex(thetaIndexBase, dimIndex, o);
                 const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
 
-                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
+                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0F) - SPLINE_DATA(o);
                 assert(isfinite(dtheta));
                 gm_dtheta[thetaGlobalIndex] = dtheta;
             }
 
-            div                         = 1.0f / (order - 1.0f);
+            div                         = 1.0F / (order - 1.0F);
             *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
 #    pragma unroll
             for (int k = 1; k < (order - 1); k++)
             {
                 *SPLINE_DATA_PTR(order - k - 1) = div
-                                                  * ((dr + k) * SPLINE_DATA(order - k - 2)
+                                                  * ((dr + (float)k) * SPLINE_DATA(order - k - 2)
                                                      + (order - k - dr) * SPLINE_DATA(order - k - 1));
             }
-            *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+            *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0);
 
             /* Storing the spline values (theta) */
 #    if !PME_GPU_PARALLEL_SPLINE
@@ -319,12 +333,12 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
  * sm_gridlineIndices   Atom gridline indices in the shared memory. \param[in]  sm_theta Atom spline
  * values in the shared memory. \param[out] gm_grid              Global 3D grid for spreading.
  */
-inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
-                           int                                atomIndexOffset,
-                           __local const float* __restrict__ sm_coefficients,
-                           __local const int* __restrict__ sm_gridlineIndices,
-                           __local const float* __restrict__ sm_theta,
-                           __global float* __restrict__ gm_grid)
+gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
+                                      int                                atomIndexOffset,
+                                      __local const float* __restrict__ sm_coefficients,
+                                      __local const int* __restrict__ sm_gridlineIndices,
+                                      __local const float* __restrict__ sm_theta,
+                                      __global float* __restrict__ gm_grid)
 {
     const int nx  = kernelParams.grid.realGridSize[XX];
     const int ny  = kernelParams.grid.realGridSize[YY];
@@ -332,7 +346,9 @@ inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
     const int pny = kernelParams.grid.realGridSizePadded[YY];
     const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
 
-    const int offx = 0, offy = 0, offz = 0; // unused for now
+    const int offx = 0;
+    const int offy = 0;
+    const int offz = 0;
 
     const int atomIndexLocal  = get_local_id(ZZ);
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
@@ -426,7 +442,7 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void
     // Staging coordinates - only for spline computation
     __local float sm_coordinates[DIM * atomsPerBlock];
 
-    const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
 
     /* Staging coefficients/charges for both spline and spread */
     pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);