Add target for offline PME OpenCL compilation

author Szilárd Páll <pall.szilard@gmail.com>

Thu, 22 Aug 2019 00:11:41 +0000 (02:11 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Fri, 24 Jan 2020 19:37:15 +0000 (20:37 +0100)
author Szilárd Páll <pall.szilard@gmail.com>
Thu, 22 Aug 2019 00:11:41 +0000 (02:11 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 24 Jan 2020 19:37:15 +0000 (20:37 +0100)
diff --git a/admin/builds/gromacs.py b/admin/builds/gromacs.py

index a0659aae8432498feacf1a4bc9ab17ac0ba7d2fd..d78c29c13ede1b383ec59ab695a828014847aeca 100644 (file)
--- a/admin/builds/gromacs.py
+++ b/admin/builds/gromacs.py
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
+# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team.
  # Copyright (c) 2020, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
@@ -247,6 +247,11 @@ def do_build(context):
          # TODO: Generalize the machinery here such that it can easily be used
          # also for non-release builds.
      else:
+        # run OpenCL offline compile tests on clang tidy builds
+        if (context.opts.tidy and context.opts.opencl):
+            context.build_target(target='ocl_nbnxm_kernels')
+            context.build_target(target='ocl_pme_kernels')
+
          context.build_target(target='tests', keep_going=True)
  
          context.run_ctest(args=['--output-on-failure', '--label-exclude', 'SlowTest'], memcheck=context.opts.asan)
@@ -254,10 +259,6 @@ def do_build(context):
          context.build_target(target='install')
          # TODO: Consider what could be tested about the installed binaries.
  
-        # run OpenCL offline compile tests on clang tidy builds
-        if (context.opts.tidy and context.opts.opencl):
-            context.build_target(target='ocl_nbnxm_kernels')
-
          if not context.opts.mdrun_only:
              context.env.prepend_path_env(os.path.join(context.workspace.build_dir, 'bin'))
              context.chdir(regressiontests_path)
diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt

index 45517cdae11204d67a801584d3f34d1aca2ddd3b..7583e1db50edab86143cede8f1a6b9aabf3570d3 100644 (file)
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -95,3 +95,55 @@ endif()
  if (BUILD_TESTING)
      add_subdirectory(tests)
  endif()
+
+
+set(PME_OCL_KERNEL_SOURCES
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_gpu_utils.clh"
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_solve.clh"
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_gather.clh"
+    "${CMAKE_CURRENT_SOURCE_DIR}/pme_spread.clh")
+
+if(CLANG_TIDY_EXE)
+   set(OCL_COMPILER "${CLANG_TIDY_EXE}")
+   set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo,-clang-diagnostic-padded,-fcomment-block-commands=internal;--;${CMAKE_C_COMPILER}")
+else()
+   set(OCL_COMPILER "${CMAKE_C_COMPILER}")
+endif()
+
+# TODO: test all warp sizes on all vendor targets?
+foreach(VENDOR AMD NVIDIA INTEL)
+    foreach(WARPSIZE 16 32 64)
+        math(EXPR SPREAD_WG_SIZE "8*${WARPSIZE}")
+        math(EXPR SOLVE_WG_SIZE "8*${WARPSIZE}")
+        math(EXPR GATHER_WG_SIZE "4*${WARPSIZE}")
+        set(OBJ_FILE pme_ocl_kernel_warpSize${WARPSIZE}_${VENDOR}.o)
+        add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
+        ${CMAKE_CURRENT_SOURCE_DIR}/pme_program.cl ${CLANG_TIDY_ARGS}
+        -Xclang -finclude-default-header  -D_${VENDOR}_SOURCE_
+        -Dwarp_size=${WARPSIZE}
+        -Dorder=4
+        -DthreadsPerAtom=16
+        -Dc_pmeMaxUnitcellShift=2
+        -Dc_usePadding=true
+        -Dc_skipNeutralAtoms=false
+        -Dc_virialAndEnergyCount=7
+        -Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE}
+        -Dc_solveMaxWorkGroupSize=${SOLVE_WG_SIZE}
+        -Dc_gatherWorkGroupSize=${GATHER_WG_SIZE}
+        -DDIM=3 -DXX=0 -DYY=1 -DZZ=2
+        -DwrapX=true -DwrapY=true
+        -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
+        -Weverything  -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
+        -Wno-cast-align -Wno-incompatible-pointer-types
+        # to avoid  "warning: unknown command tag name" for \internal
+        -Wno-documentation-unknown-command
+        # to avoid pme_gpu_types.h:100:52: warning: padding struct 'struct PmeGpuConstParams' with 4 bytes to align 'd_virialAndEnergy'
+        -Wno-padded
+        -o${OBJ_FILE}
+        )
+        list(APPEND PME_OCL_KERNELS ${OBJ_FILE})
+    endforeach()
+endforeach()
+
+add_custom_target(ocl_pme_kernels DEPENDS ${PME_OCL_KERNELS} )
+gmx_set_custom_target_output(ocl_pme_kernels ${PME_OCL_KERNELS})
diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh

index 3b760e9d02bba97c7b8d47c1749e0075975790b4..8ade47919adcbff6dd4dfd5c28074b0fe85b5de3 100644 (file)
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -67,9 +67,10 @@ inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
          case XX: return realGridSizeFP[XX];
          case YY: return realGridSizeFP[YY];
          case ZZ: return realGridSizeFP[ZZ];
+        default: assert(false); break;
      }
      assert(false);
-    return 0.0f;
+    return 0.0F;
  }
  
  /*! \brief Reduce the partial force contributions.
@@ -211,7 +212,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
  {
      /* These are the atom indices - for the shared and global memory */
      const int atomIndexLocal  = get_local_id(ZZ);
-    const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
      const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
  
  /* Some sizes which are defines and not consts because they go into the array size */
@@ -230,18 +231,23 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
      const int ithy = get_local_id(YY);
      const int ithz = get_local_id(XX);
  
-    const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
-                              + get_local_id(0);
+    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+           <= MAX_INT);
+    const int threadLocalId =
+            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+                  + get_local_id(0));
  
      /* These are the spline contribution indices in shared memory */
-    const int splineIndex = (get_local_id(1) * get_local_size(0)
-                             + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */
-    const int lineIndex   = threadLocalId; /* And to all the block's particles */
+    assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
+    const int splineIndex =
+            (int)(get_local_id(1) * get_local_size(0)
+                  + get_local_id(0));    /* Relative to the current particle , 0..15 for order 4 */
+    const int lineIndex = threadLocalId; /* And to all the block's particles */
  
      /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
      const int localGridlineIndicesIndex = threadLocalId;
      const int globalGridlineIndicesIndex =
-            get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
+            (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
      const int globalCheckIndices =
              pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
      if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
@@ -250,8 +256,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
          assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
      }
      /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
-    const int localSplineParamsIndex  = threadLocalId;
-    const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
+    const int localSplineParamsIndex = threadLocalId;
+    const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
      const int globalCheckSplineParams = pme_gpu_check_atom_data_index(
              globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
      if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
@@ -263,9 +269,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
      }
      barrier(CLK_LOCAL_MEM_FENCE);
  
-    float fx = 0.0f;
-    float fy = 0.0f;
-    float fz = 0.0f;
+    float fx = 0.0F;
+    float fy = 0.0F;
+    float fz = 0.0F;
  
      const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
      const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
@@ -371,7 +377,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
          for (int i = 0; i < numIter; i++)
          {
              const int outputIndexLocal  = i * iterThreads + threadLocalId;
-            const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
+            const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
              const int globalOutputCheck =
                      pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
              if (globalOutputCheck)
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp

index 800f43ea9ec40732c1a8c0bac21b05276fd3b4d8..79ef8b6d2103fb7678fa753b86ec962a96ec943d 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -142,7 +142,6 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t* deviceInfo)
          const std::string commonDefines = gmx::formatString(
                  "-Dwarp_size=%zd "
                  "-Dorder=%d "
-                "-DatomsPerWarp=%zd "
                  "-DthreadsPerAtom=%d "
                  // forwarding from pme_grid.h, used for spline computation table sizes only
                  "-Dc_pmeMaxUnitcellShift=%f "
@@ -158,10 +157,10 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t* deviceInfo)
                  "-DDIM=%d -DXX=%d -DYY=%d -DZZ=%d "
                  // decomposition parameter placeholders
                  "-DwrapX=true -DwrapY=true ",
-                warpSize, c_pmeGpuOrder, warpSize / c_pmeSpreadGatherThreadsPerAtom,
-                c_pmeSpreadGatherThreadsPerAtom, static_cast<float>(c_pmeMaxUnitcellShift),
-                static_cast<int>(c_usePadding), static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount,
-                spreadWorkGroupSize, solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
+                warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
+                static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_usePadding),
+                static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize,
+                solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
          try
          {
              /* TODO when we have a proper MPI-aware logging module,
diff --git a/src/gromacs/ewald/pme_gpu_utils.clh b/src/gromacs/ewald/pme_gpu_utils.clh

index 5f611e9510b8546dbff90e79f9c3e6d29863001a..1cec1a1cee2215bfdb90b5f227b730caf03db60b 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_utils.clh
+++ b/src/gromacs/ewald/pme_gpu_utils.clh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -78,8 +78,6 @@ inline int getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
   * \param[in] paramIndexBase   Must be result of getSplineParamIndexBase().
   * \param[in] dimIndex         Dimension index (from 0 to 2)
   * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
- * \param[in] order            PME order
- * \param[in] atomsPerWarp     Number of atoms processed by a warp
   *
   * \returns Index into theta or dtheta array using GPU layout.
   */
@@ -93,7 +91,6 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex
  /*! \brief
   * A function for checking the global atom data indices against the atom data array sizes.
   *
- * \param[in] atomDataIndexGlobal  The atom data index.
   * \param[in] nAtomData            The atom data array element count.
   * \returns                        Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
   *
@@ -113,7 +110,7 @@ inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_
   */
  inline int pme_gpu_check_atom_charge(const float coefficient)
  {
-    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+    return c_skipNeutralAtoms ? (coefficient != 0.0F) : 1;
  }
  
  #endif
diff --git a/src/gromacs/ewald/pme_program.cl b/src/gromacs/ewald/pme_program.cl

index 493fb5f8f24db27323a46c484effb4f8c3b11d55..091658d489608019f587f1ee21f9ef8801d4a386 100644 (file)
--- a/src/gromacs/ewald/pme_program.cl
+++ b/src/gromacs/ewald/pme_program.cl
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -50,7 +50,6 @@
   */
  
  // Assert placeholders, to not rip them out from OpenCL implementation - hopefully they come in handy some day with OpenCL 2
-#define static_assert(a, b)
  #define assert(a)
  
  #define PmeOpenCLKernelParams PmeGpuKernelParamsBase
@@ -58,6 +57,7 @@
  /* SPREAD/SPLINE */
  
  #define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom)
+#define atomsPerWarp (warp_size / threadsPerAtom)
  
  // spline/spread fused
  #define computeSplines 1
diff --git a/src/gromacs/ewald/pme_solve.clh b/src/gromacs/ewald/pme_solve.clh

index ac38117d385facc6e7b63d5f3420a11e948fe172..2c4dbde6095f2288248b45584a29f06092c94e2a 100644 (file)
--- a/src/gromacs/ewald/pme_solve.clh
+++ b/src/gromacs/ewald/pme_solve.clh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -49,9 +49,10 @@
   *  \author Aleksei Iupinov <a.yupinov@gmail.com>
   */
  
-#include "pme_gpu_types.h"
  #include "gromacs/gpu_utils/vectype_ops.clh"
  
+#include "pme_gpu_types.h"
+
  /*! \brief
   * PME complex grid solver kernel function.
   * Please see the file description for additional defines which this kernel expects.
@@ -68,7 +69,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
                                                         __global float2* __restrict__ gm_grid)
  {
      /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
-    int majorDim, middleDim, minorDim;
+    int majorDim;
+    int middleDim;
+    int minorDim;
      if (gridOrdering == YZX)
      {
          majorDim  = YY;
@@ -90,17 +93,19 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
              gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
  
      /* Various grid sizes and indices */
-    const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
-    const int localSizeMinor   = kernelParams.grid.complexGridSizePadded[minorDim];
-    const int localSizeMiddle  = kernelParams.grid.complexGridSizePadded[middleDim];
-    const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim];
-    const int localCountMinor  = kernelParams.grid.complexGridSize[minorDim];
-    const int nMajor           = kernelParams.grid.realGridSize[majorDim];
-    const int nMiddle          = kernelParams.grid.realGridSize[middleDim];
-    const int nMinor           = kernelParams.grid.realGridSize[minorDim];
-    const int maxkMajor        = (nMajor + 1) / 2;  // X or Y
-    const int maxkMiddle       = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
-    const int maxkMinor        = (nMinor + 1) / 2;  // Z or X => only check for YZX
+    const int localOffsetMinor  = 0; // unused
+    const int localOffsetMajor  = 0; // unused
+    const int localOffsetMiddle = 0; // unused
+    const int localSizeMinor    = kernelParams.grid.complexGridSizePadded[minorDim];
+    const int localSizeMiddle   = kernelParams.grid.complexGridSizePadded[middleDim];
+    const int localCountMiddle  = kernelParams.grid.complexGridSize[middleDim];
+    const int localCountMinor   = kernelParams.grid.complexGridSize[minorDim];
+    const int nMajor            = kernelParams.grid.realGridSize[majorDim];
+    const int nMiddle           = kernelParams.grid.realGridSize[middleDim];
+    const int nMinor            = kernelParams.grid.realGridSize[minorDim];
+    const int maxkMajor         = (nMajor + 1) / 2;  // X or Y
+    const int maxkMiddle        = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
+    const int maxkMinor         = (nMinor + 1) / 2;  // Z or X => only check for YZX
  
      /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
       * Each block handles up to c_solveMaxWorkGroupSize cells -
@@ -112,19 +117,20 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
      const int gridLineIndex     = threadLocalId / gridLineSize;
      const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
      const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1);
-    const int activeWarps       = (get_local_size(XX) / warp_size);
-    const int indexMinor        = get_group_id(XX) * get_local_size(XX) + gridLineCellIndex;
-    const int indexMiddle       = get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
-    const int indexMajor        = get_group_id(ZZ);
+    const int activeWarps       = ((int)get_local_size(XX) / warp_size);
+    assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT);
+    const int indexMinor  = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex;
+    const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
+    const int indexMajor  = (int)get_group_id(ZZ);
  
      /* Optional outputs */
-    float energy = 0.0f;
-    float virxx  = 0.0f;
-    float virxy  = 0.0f;
-    float virxz  = 0.0f;
-    float viryy  = 0.0f;
-    float viryz  = 0.0f;
-    float virzz  = 0.0f;
+    float energy = 0.0F;
+    float virxx  = 0.0F;
+    float virxy  = 0.0F;
+    float virxz  = 0.0F;
+    float viryy  = 0.0F;
+    float viryz  = 0.0F;
+    float virzz  = 0.0F;
  
      assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
      if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
@@ -136,26 +142,28 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
  
          const int kMajor = indexMajor + localOffsetMajor;
          /* Checking either X in XYZ, or Y in YZX cases */
-        const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor);
+        const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor));
  
          const int kMiddle = indexMiddle + localOffsetMiddle;
-        float     mMiddle = kMiddle;
+        float     mMiddle = (float)kMiddle;
          /* Checking Y in XYZ case */
          if (gridOrdering == XYZ)
          {
-            mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
+            mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle));
          }
          const int kMinor = localOffsetMinor + indexMinor;
-        float     mMinor = kMinor;
+        float     mMinor = (float)kMinor;
          /* Checking X in YZX case */
          if (gridOrdering == YZX)
          {
-            mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
+            mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor));
          }
          /* We should skip the k-space point (0,0,0) */
          const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
  
-        float mX, mY, mZ;
+        float mX;
+        float mY;
+        float mZ;
          if (gridOrdering == YZX)
          {
              mX = mMinor;
@@ -170,19 +178,20 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
          }
  
          /* 0.5 correction factor for the first and last components of a Z dimension */
-        float corner_fac = 1.0f;
+        float       corner_fac   = 1.0F;
+        const float z_corner_fac = 0.5F;
          if (gridOrdering == YZX)
          {
              if ((kMiddle == 0) | (kMiddle == maxkMiddle))
              {
-                corner_fac = 0.5f;
+                corner_fac = z_corner_fac;
              }
          }
          if (gridOrdering == XYZ)
          {
              if ((kMinor == 0) | (kMinor == maxkMinor))
              {
-                corner_fac = 0.5f;
+                corner_fac = z_corner_fac;
              }
          }
  
@@ -196,11 +205,11 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
                                 + mZ * kernelParams.current.recipBox[ZZ][ZZ];
  
              const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
-            assert(m2k != 0.0f);
+            assert(m2k != 0.0F);
              const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor]
                                  * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
              assert(isfinite(denom));
-            assert(denom != 0.0f);
+            assert(denom != 0.0F);
              const float tmp1   = exp(-kernelParams.grid.ewaldFactor * m2k);
              const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
  
@@ -214,9 +223,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
              if (computeEnergyAndVirial)
              {
                  const float tmp1k =
-                        2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
+                        2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
  
-                const float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f;
+                const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
                  const float ets2    = corner_fac * tmp1k;
                  energy              = ets2;
  
@@ -257,7 +266,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
              sm_virialAndEnergy[2 * warp_size + lane] = virzz;
              sm_virialAndEnergy[3 * warp_size + lane] = virxy;
              sm_virialAndEnergy[4 * warp_size + lane] = virxz;
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
              sm_virialAndEnergy[5 * warp_size + lane] = viryz;
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
              sm_virialAndEnergy[6 * warp_size + lane] = energy;
          }
          barrier(CLK_LOCAL_MEM_FENCE);
@@ -268,7 +279,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer
              atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
              atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
              atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
              atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
+            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
              atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy);
          }
          barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh

index 8215614ef04fe6229034494ff396b31a6564213e..0ea1449c22f33e6a81c23215b81bfef7d2af200c 100644 (file)
--- a/src/gromacs/ewald/pme_spread.clh
+++ b/src/gromacs/ewald/pme_spread.clh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -51,9 +51,10 @@
   *  \author Aleksei Iupinov <a.yupinov@gmail.com>
   */
  
+#include "gromacs/gpu_utils/vectype_ops.clh"
+
  #include "pme_gpu_types.h"
  #include "pme_gpu_utils.clh"
-#include "gromacs/gpu_utils/vectype_ops.clh"
  
  /*
   * This define affects the spline calculation behaviour in the kernel.
@@ -83,12 +84,15 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar
                                      __global const float* __restrict__ gm_source,
                                      const int dataCountPerAtom)
  {
-    const size_t threadLocalIndex =
-            ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0));
-    const size_t localIndex      = threadLocalIndex;
-    const size_t globalIndexBase = get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
-    const size_t globalIndex     = globalIndexBase + localIndex;
-    const int    globalCheck =
+    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+           < MAX_INT);
+    const int threadLocalIndex =
+            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+                  + get_local_id(0));
+    const int localIndex      = threadLocalIndex;
+    const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
+    const int globalIndex     = globalIndexBase + localIndex;
+    const int globalCheck =
              pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
      if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
      {
@@ -115,22 +119,25 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar
   * \param[in]  gm_fractShiftsTable      Atom fractional coordinates correction table
   * \param[in]  gm_gridlineIndicesTable  Atom fractional coordinates correction table
   */
-inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
-                              const int                          atomIndexOffset,
-                              __local const float* __restrict__ sm_coordinates,
-                              __local const float* __restrict__ sm_coefficients,
-                              __local float* __restrict__ sm_theta,
-                              __local int* __restrict__ sm_gridlineIndices,
-                              __local float* __restrict__ sm_fractCoords,
-                              __global float* __restrict__ gm_theta,
-                              __global float* __restrict__ gm_dtheta,
-                              __global int* __restrict__ gm_gridlineIndices,
-                              __global const float* __restrict__ gm_fractShiftsTable,
-                              __global const int* __restrict__ gm_gridlineIndicesTable)
+gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
+                                         const int                          atomIndexOffset,
+                                         __local const float* __restrict__ sm_coordinates,
+                                         __local const float* __restrict__ sm_coefficients,
+                                         __local float* __restrict__ sm_theta,
+                                         __local int* __restrict__ sm_gridlineIndices,
+                                         __local float* __restrict__ sm_fractCoords,
+                                         __global float* __restrict__ gm_theta,
+                                         __global float* __restrict__ gm_dtheta,
+                                         __global int* __restrict__ gm_gridlineIndices,
+                                         __global const float* __restrict__ gm_fractShiftsTable,
+                                         __global const int* __restrict__ gm_gridlineIndicesTable)
  {
      /* Thread index w.r.t. block */
+    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+           < MAX_INT);
      const int threadLocalIndex =
-            ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0));
+            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+                  + get_local_id(0));
      /* Warp index w.r.t. block - could probably be obtained easier? */
      const int warpIndex = threadLocalIndex / warp_size;
      /* Thread index w.r.t. warp */
@@ -179,8 +186,10 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
          /* Indices interpolation */
          if (orderIndex == 0)
          {
-            int          tableIndex, tInt;
-            float        n, t;
+            int          tableIndex;
+            int          tInt;
+            float        n;
+            float        t;
              const float3 x = vload3(atomIndexLocal, sm_coordinates);
  
              /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
@@ -213,13 +222,17 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
                                  .z
                          * kernelParams.current.recipBox[dimIndex][ZZ];
                      break;
+                default:
+                    assert(false);
+                    return;
+                    break;
              }
              const float shift = c_pmeMaxUnitcellShift;
  
              /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
              t                                 = (t + shift) * n;
              tInt                              = (int)t;
-            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            sm_fractCoords[sharedMemoryIndex] = t - (float)tInt;
              tableIndex += tInt;
              assert(tInt >= 0);
              assert(tInt < c_pmeNeighborUnitcellCount * n);
@@ -241,23 +254,24 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
              assert(isfinite(dr));
  
              /* dr is relative offset from lower cell limit */
-            *SPLINE_DATA_PTR(order - 1) = 0.0f;
+            *SPLINE_DATA_PTR(order - 1) = 0.0F;
              *SPLINE_DATA_PTR(1)         = dr;
-            *SPLINE_DATA_PTR(0)         = 1.0f - dr;
+            *SPLINE_DATA_PTR(0)         = 1.0F - dr;
  
  #    pragma unroll order
              for (int k = 3; k < order; k++)
              {
-                div                     = 1.0f / (k - 1.0f);
+                div                     = 1.0F / ((float)k - 1.0F);
                  *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
  #    pragma unroll
                  for (int l = 1; l < (k - 1); l++)
                  {
-                    *SPLINE_DATA_PTR(k - l - 1) = div
-                                                  * ((dr + l) * SPLINE_DATA(k - l - 2)
-                                                     + (k - l - dr) * SPLINE_DATA(k - l - 1));
+                    *SPLINE_DATA_PTR(k - l - 1) =
+                            div
+                            * ((dr + (float)l) * SPLINE_DATA(k - l - 2)
+                               + ((float)k - (float)l - dr) * SPLINE_DATA(k - l - 1));
                  }
-                *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+                *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0);
              }
  
              const int thetaIndexBase        = getSplineParamIndexBase(warpIndex, atomWarpIndex);
@@ -274,21 +288,21 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
                  const int thetaIndex       = getSplineParamIndex(thetaIndexBase, dimIndex, o);
                  const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
  
-                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
+                const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0F) - SPLINE_DATA(o);
                  assert(isfinite(dtheta));
                  gm_dtheta[thetaGlobalIndex] = dtheta;
              }
  
-            div                         = 1.0f / (order - 1.0f);
+            div                         = 1.0F / (order - 1.0F);
              *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
  #    pragma unroll
              for (int k = 1; k < (order - 1); k++)
              {
                  *SPLINE_DATA_PTR(order - k - 1) = div
-                                                  * ((dr + k) * SPLINE_DATA(order - k - 2)
+                                                  * ((dr + (float)k) * SPLINE_DATA(order - k - 2)
                                                       + (order - k - dr) * SPLINE_DATA(order - k - 1));
              }
-            *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+            *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0);
  
              /* Storing the spline values (theta) */
  #    if !PME_GPU_PARALLEL_SPLINE
@@ -319,12 +333,12 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
   * sm_gridlineIndices   Atom gridline indices in the shared memory. \param[in]  sm_theta Atom spline
   * values in the shared memory. \param[out] gm_grid              Global 3D grid for spreading.
   */
-inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
-                           int                                atomIndexOffset,
-                           __local const float* __restrict__ sm_coefficients,
-                           __local const int* __restrict__ sm_gridlineIndices,
-                           __local const float* __restrict__ sm_theta,
-                           __global float* __restrict__ gm_grid)
+gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
+                                      int                                atomIndexOffset,
+                                      __local const float* __restrict__ sm_coefficients,
+                                      __local const int* __restrict__ sm_gridlineIndices,
+                                      __local const float* __restrict__ sm_theta,
+                                      __global float* __restrict__ gm_grid)
  {
      const int nx  = kernelParams.grid.realGridSize[XX];
      const int ny  = kernelParams.grid.realGridSize[YY];
@@ -332,7 +346,9 @@ inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
      const int pny = kernelParams.grid.realGridSizePadded[YY];
      const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
  
-    const int offx = 0, offy = 0, offz = 0; // unused for now
+    const int offx = 0;
+    const int offy = 0;
+    const int offz = 0;
  
      const int atomIndexLocal  = get_local_id(ZZ);
      const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
@@ -426,7 +442,7 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void
      // Staging coordinates - only for spline computation
      __local float sm_coordinates[DIM * atomsPerBlock];
  
-    const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
  
      /* Staging coefficients/charges for both spline and spread */
      pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);
author	Szilárd Páll <pall.szilard@gmail.com>
	Thu, 22 Aug 2019 00:11:41 +0000 (02:11 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 24 Jan 2020 19:37:15 +0000 (20:37 +0100)
admin/builds/gromacs.py		patch \| blob \| history
src/gromacs/ewald/CMakeLists.txt		patch \| blob \| history
src/gromacs/ewald/pme_gather.clh		patch \| blob \| history
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_utils.clh		patch \| blob \| history
src/gromacs/ewald/pme_program.cl		patch \| blob \| history
src/gromacs/ewald/pme_solve.clh		patch \| blob \| history
src/gromacs/ewald/pme_spread.clh		patch \| blob \| history