From: Szilárd Páll Date: Thu, 22 Aug 2019 00:11:41 +0000 (+0200) Subject: Add target for offline PME OpenCL compilation X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=4f1c454810fd0efd3266eb6fdcb1f14b14faa911;p=alexxy%2Fgromacs.git Add target for offline PME OpenCL compilation Also fix clang-tidy warnings in the PME OpenCL kernels; tested with clang-tidy 8.0 and 9.0. Target will be triggered in all clang-tidy + OpenCL builds. Moved the invocation of the targets in jenkins before the tests where it makes more sense. Change-Id: I31d7dd4883e20d0564ab2f18a6de7e0d17e8024a --- diff --git a/admin/builds/gromacs.py b/admin/builds/gromacs.py index a0659aae84..d78c29c13e 100644 --- a/admin/builds/gromacs.py +++ b/admin/builds/gromacs.py @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team. +# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team. # Copyright (c) 2020, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the @@ -247,6 +247,11 @@ def do_build(context): # TODO: Generalize the machinery here such that it can easily be used # also for non-release builds. else: + # run OpenCL offline compile tests on clang tidy builds + if (context.opts.tidy and context.opts.opencl): + context.build_target(target='ocl_nbnxm_kernels') + context.build_target(target='ocl_pme_kernels') + context.build_target(target='tests', keep_going=True) context.run_ctest(args=['--output-on-failure', '--label-exclude', 'SlowTest'], memcheck=context.opts.asan) @@ -254,10 +259,6 @@ def do_build(context): context.build_target(target='install') # TODO: Consider what could be tested about the installed binaries. - # run OpenCL offline compile tests on clang tidy builds - if (context.opts.tidy and context.opts.opencl): - context.build_target(target='ocl_nbnxm_kernels') - if not context.opts.mdrun_only: context.env.prepend_path_env(os.path.join(context.workspace.build_dir, 'bin')) context.chdir(regressiontests_path) diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt index 45517cdae1..7583e1db50 100644 --- a/src/gromacs/ewald/CMakeLists.txt +++ b/src/gromacs/ewald/CMakeLists.txt @@ -95,3 +95,55 @@ endif() if (BUILD_TESTING) add_subdirectory(tests) endif() + + +set(PME_OCL_KERNEL_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/pme_gpu_utils.clh" + "${CMAKE_CURRENT_SOURCE_DIR}/pme_solve.clh" + "${CMAKE_CURRENT_SOURCE_DIR}/pme_gather.clh" + "${CMAKE_CURRENT_SOURCE_DIR}/pme_spread.clh") + +if(CLANG_TIDY_EXE) + set(OCL_COMPILER "${CLANG_TIDY_EXE}") + set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo,-clang-diagnostic-padded,-fcomment-block-commands=internal;--;${CMAKE_C_COMPILER}") +else() + set(OCL_COMPILER "${CMAKE_C_COMPILER}") +endif() + +# TODO: test all warp sizes on all vendor targets? +foreach(VENDOR AMD NVIDIA INTEL) + foreach(WARPSIZE 16 32 64) + math(EXPR SPREAD_WG_SIZE "8*${WARPSIZE}") + math(EXPR SOLVE_WG_SIZE "8*${WARPSIZE}") + math(EXPR GATHER_WG_SIZE "4*${WARPSIZE}") + set(OBJ_FILE pme_ocl_kernel_warpSize${WARPSIZE}_${VENDOR}.o) + add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER} + ${CMAKE_CURRENT_SOURCE_DIR}/pme_program.cl ${CLANG_TIDY_ARGS} + -Xclang -finclude-default-header -D_${VENDOR}_SOURCE_ + -Dwarp_size=${WARPSIZE} + -Dorder=4 + -DthreadsPerAtom=16 + -Dc_pmeMaxUnitcellShift=2 + -Dc_usePadding=true + -Dc_skipNeutralAtoms=false + -Dc_virialAndEnergyCount=7 + -Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE} + -Dc_solveMaxWorkGroupSize=${SOLVE_WG_SIZE} + -Dc_gatherWorkGroupSize=${GATHER_WG_SIZE} + -DDIM=3 -DXX=0 -DYY=1 -DZZ=2 + -DwrapX=true -DwrapY=true + -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2 + -Weverything -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused + -Wno-cast-align -Wno-incompatible-pointer-types + # to avoid "warning: unknown command tag name" for \internal + -Wno-documentation-unknown-command + # to avoid pme_gpu_types.h:100:52: warning: padding struct 'struct PmeGpuConstParams' with 4 bytes to align 'd_virialAndEnergy' + -Wno-padded + -o${OBJ_FILE} + ) + list(APPEND PME_OCL_KERNELS ${OBJ_FILE}) + endforeach() +endforeach() + +add_custom_target(ocl_pme_kernels DEPENDS ${PME_OCL_KERNELS} ) +gmx_set_custom_target_output(ocl_pme_kernels ${PME_OCL_KERNELS}) diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh index 3b760e9d02..8ade47919a 100644 --- a/src/gromacs/ewald/pme_gather.clh +++ b/src/gromacs/ewald/pme_gather.clh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -67,9 +67,10 @@ inline float read_grid_size(const float* realGridSizeFP, const int dimIndex) case XX: return realGridSizeFP[XX]; case YY: return realGridSizeFP[YY]; case ZZ: return realGridSizeFP[ZZ]; + default: assert(false); break; } assert(false); - return 0.0f; + return 0.0F; } /*! \brief Reduce the partial force contributions. @@ -211,7 +212,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe { /* These are the atom indices - for the shared and global memory */ const int atomIndexLocal = get_local_id(ZZ); - const int atomIndexOffset = get_group_id(XX) * atomsPerBlock; + const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; /* Some sizes which are defines and not consts because they go into the array size */ @@ -230,18 +231,23 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe const int ithy = get_local_id(YY); const int ithz = get_local_id(XX); - const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) - + get_local_id(0); + assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0) + <= MAX_INT); + const int threadLocalId = + (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + + get_local_id(0)); /* These are the spline contribution indices in shared memory */ - const int splineIndex = (get_local_id(1) * get_local_size(0) - + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */ - const int lineIndex = threadLocalId; /* And to all the block's particles */ + assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT); + const int splineIndex = + (int)(get_local_id(1) * get_local_size(0) + + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */ + const int lineIndex = threadLocalId; /* And to all the block's particles */ /* Staging the atom gridline indices, DIM * atomsPerBlock threads */ const int localGridlineIndicesIndex = threadLocalId; const int globalGridlineIndicesIndex = - get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex; + (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex; const int globalCheckIndices = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM); if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices) @@ -250,8 +256,8 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0); } /* Staging the spline parameters, DIM * order * atomsPerBlock threads */ - const int localSplineParamsIndex = threadLocalId; - const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex; + const int localSplineParamsIndex = threadLocalId; + const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex; const int globalCheckSplineParams = pme_gpu_check_atom_data_index( globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order); if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams) @@ -263,9 +269,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe } barrier(CLK_LOCAL_MEM_FENCE); - float fx = 0.0f; - float fy = 0.0f; - float fz = 0.0f; + float fx = 0.0F; + float fy = 0.0F; + float fz = 0.0F; const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); @@ -371,7 +377,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe for (int i = 0; i < numIter; i++) { const int outputIndexLocal = i * iterThreads + threadLocalId; - const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal; + const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal; const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM); if (globalOutputCheck) diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp index 800f43ea9e..79ef8b6d21 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -142,7 +142,6 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t* deviceInfo) const std::string commonDefines = gmx::formatString( "-Dwarp_size=%zd " "-Dorder=%d " - "-DatomsPerWarp=%zd " "-DthreadsPerAtom=%d " // forwarding from pme_grid.h, used for spline computation table sizes only "-Dc_pmeMaxUnitcellShift=%f " @@ -158,10 +157,10 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t* deviceInfo) "-DDIM=%d -DXX=%d -DYY=%d -DZZ=%d " // decomposition parameter placeholders "-DwrapX=true -DwrapY=true ", - warpSize, c_pmeGpuOrder, warpSize / c_pmeSpreadGatherThreadsPerAtom, - c_pmeSpreadGatherThreadsPerAtom, static_cast(c_pmeMaxUnitcellShift), - static_cast(c_usePadding), static_cast(c_skipNeutralAtoms), c_virialAndEnergyCount, - spreadWorkGroupSize, solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ); + warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom, + static_cast(c_pmeMaxUnitcellShift), static_cast(c_usePadding), + static_cast(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize, + solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ); try { /* TODO when we have a proper MPI-aware logging module, diff --git a/src/gromacs/ewald/pme_gpu_utils.clh b/src/gromacs/ewald/pme_gpu_utils.clh index 5f611e9510..1cec1a1cee 100644 --- a/src/gromacs/ewald/pme_gpu_utils.clh +++ b/src/gromacs/ewald/pme_gpu_utils.clh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -78,8 +78,6 @@ inline int getSplineParamIndexBase(int warpIndex, int atomWarpIndex) * \param[in] paramIndexBase Must be result of getSplineParamIndexBase(). * \param[in] dimIndex Dimension index (from 0 to 2) * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1) - * \param[in] order PME order - * \param[in] atomsPerWarp Number of atoms processed by a warp * * \returns Index into theta or dtheta array using GPU layout. */ @@ -93,7 +91,6 @@ inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex /*! \brief * A function for checking the global atom data indices against the atom data array sizes. * - * \param[in] atomDataIndexGlobal The atom data index. * \param[in] nAtomData The atom data array element count. * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise. * @@ -113,7 +110,7 @@ inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_ */ inline int pme_gpu_check_atom_charge(const float coefficient) { - return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1; + return c_skipNeutralAtoms ? (coefficient != 0.0F) : 1; } #endif diff --git a/src/gromacs/ewald/pme_program.cl b/src/gromacs/ewald/pme_program.cl index 493fb5f8f2..091658d489 100644 --- a/src/gromacs/ewald/pme_program.cl +++ b/src/gromacs/ewald/pme_program.cl @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -50,7 +50,6 @@ */ // Assert placeholders, to not rip them out from OpenCL implementation - hopefully they come in handy some day with OpenCL 2 -#define static_assert(a, b) #define assert(a) #define PmeOpenCLKernelParams PmeGpuKernelParamsBase @@ -58,6 +57,7 @@ /* SPREAD/SPLINE */ #define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom) +#define atomsPerWarp (warp_size / threadsPerAtom) // spline/spread fused #define computeSplines 1 diff --git a/src/gromacs/ewald/pme_solve.clh b/src/gromacs/ewald/pme_solve.clh index ac38117d38..2c4dbde609 100644 --- a/src/gromacs/ewald/pme_solve.clh +++ b/src/gromacs/ewald/pme_solve.clh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -49,9 +49,10 @@ * \author Aleksei Iupinov */ -#include "pme_gpu_types.h" #include "gromacs/gpu_utils/vectype_ops.clh" +#include "pme_gpu_types.h" + /*! \brief * PME complex grid solver kernel function. * Please see the file description for additional defines which this kernel expects. @@ -68,7 +69,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer __global float2* __restrict__ gm_grid) { /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */ - int majorDim, middleDim, minorDim; + int majorDim; + int middleDim; + int minorDim; if (gridOrdering == YZX) { majorDim = YY; @@ -90,17 +93,19 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim]; /* Various grid sizes and indices */ - const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused - const int localSizeMinor = kernelParams.grid.complexGridSizePadded[minorDim]; - const int localSizeMiddle = kernelParams.grid.complexGridSizePadded[middleDim]; - const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim]; - const int localCountMinor = kernelParams.grid.complexGridSize[minorDim]; - const int nMajor = kernelParams.grid.realGridSize[majorDim]; - const int nMiddle = kernelParams.grid.realGridSize[middleDim]; - const int nMinor = kernelParams.grid.realGridSize[minorDim]; - const int maxkMajor = (nMajor + 1) / 2; // X or Y - const int maxkMiddle = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX - const int maxkMinor = (nMinor + 1) / 2; // Z or X => only check for YZX + const int localOffsetMinor = 0; // unused + const int localOffsetMajor = 0; // unused + const int localOffsetMiddle = 0; // unused + const int localSizeMinor = kernelParams.grid.complexGridSizePadded[minorDim]; + const int localSizeMiddle = kernelParams.grid.complexGridSizePadded[middleDim]; + const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim]; + const int localCountMinor = kernelParams.grid.complexGridSize[minorDim]; + const int nMajor = kernelParams.grid.realGridSize[majorDim]; + const int nMiddle = kernelParams.grid.realGridSize[middleDim]; + const int nMinor = kernelParams.grid.realGridSize[minorDim]; + const int maxkMajor = (nMajor + 1) / 2; // X or Y + const int maxkMiddle = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX + const int maxkMinor = (nMinor + 1) / 2; // Z or X => only check for YZX /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid). * Each block handles up to c_solveMaxWorkGroupSize cells - @@ -112,19 +117,20 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer const int gridLineIndex = threadLocalId / gridLineSize; const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex; const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1); - const int activeWarps = (get_local_size(XX) / warp_size); - const int indexMinor = get_group_id(XX) * get_local_size(XX) + gridLineCellIndex; - const int indexMiddle = get_group_id(YY) * gridLinesPerBlock + gridLineIndex; - const int indexMajor = get_group_id(ZZ); + const int activeWarps = ((int)get_local_size(XX) / warp_size); + assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT); + const int indexMinor = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex; + const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex; + const int indexMajor = (int)get_group_id(ZZ); /* Optional outputs */ - float energy = 0.0f; - float virxx = 0.0f; - float virxy = 0.0f; - float virxz = 0.0f; - float viryy = 0.0f; - float viryz = 0.0f; - float virzz = 0.0f; + float energy = 0.0F; + float virxx = 0.0F; + float virxy = 0.0F; + float virxz = 0.0F; + float viryy = 0.0F; + float viryz = 0.0F; + float virzz = 0.0F; assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]); if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor) @@ -136,26 +142,28 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer const int kMajor = indexMajor + localOffsetMajor; /* Checking either X in XYZ, or Y in YZX cases */ - const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor); + const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor)); const int kMiddle = indexMiddle + localOffsetMiddle; - float mMiddle = kMiddle; + float mMiddle = (float)kMiddle; /* Checking Y in XYZ case */ if (gridOrdering == XYZ) { - mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle); + mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle)); } const int kMinor = localOffsetMinor + indexMinor; - float mMinor = kMinor; + float mMinor = (float)kMinor; /* Checking X in YZX case */ if (gridOrdering == YZX) { - mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor); + mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor)); } /* We should skip the k-space point (0,0,0) */ const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0); - float mX, mY, mZ; + float mX; + float mY; + float mZ; if (gridOrdering == YZX) { mX = mMinor; @@ -170,19 +178,20 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer } /* 0.5 correction factor for the first and last components of a Z dimension */ - float corner_fac = 1.0f; + float corner_fac = 1.0F; + const float z_corner_fac = 0.5F; if (gridOrdering == YZX) { if ((kMiddle == 0) | (kMiddle == maxkMiddle)) { - corner_fac = 0.5f; + corner_fac = z_corner_fac; } } if (gridOrdering == XYZ) { if ((kMinor == 0) | (kMinor == maxkMinor)) { - corner_fac = 0.5f; + corner_fac = z_corner_fac; } } @@ -196,11 +205,11 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer + mZ * kernelParams.current.recipBox[ZZ][ZZ]; const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk; - assert(m2k != 0.0f); + assert(m2k != 0.0F); const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor]; assert(isfinite(denom)); - assert(denom != 0.0f); + assert(denom != 0.0F); const float tmp1 = exp(-kernelParams.grid.ewaldFactor * m2k); const float etermk = kernelParams.constants.elFactor * tmp1 / denom; @@ -214,9 +223,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer if (computeEnergyAndVirial) { const float tmp1k = - 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y); + 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y); - const float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f; + const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F; const float ets2 = corner_fac * tmp1k; energy = ets2; @@ -257,7 +266,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer sm_virialAndEnergy[2 * warp_size + lane] = virzz; sm_virialAndEnergy[3 * warp_size + lane] = virxy; sm_virialAndEnergy[4 * warp_size + lane] = virxz; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers) sm_virialAndEnergy[5 * warp_size + lane] = viryz; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers) sm_virialAndEnergy[6 * warp_size + lane] = energy; } barrier(CLK_LOCAL_MEM_FENCE); @@ -268,7 +279,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz); atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy); atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers) atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers) atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy); } barrier(CLK_LOCAL_MEM_FENCE); diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh index 8215614ef0..0ea1449c22 100644 --- a/src/gromacs/ewald/pme_spread.clh +++ b/src/gromacs/ewald/pme_spread.clh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -51,9 +51,10 @@ * \author Aleksei Iupinov */ +#include "gromacs/gpu_utils/vectype_ops.clh" + #include "pme_gpu_types.h" #include "pme_gpu_utils.clh" -#include "gromacs/gpu_utils/vectype_ops.clh" /* * This define affects the spline calculation behaviour in the kernel. @@ -83,12 +84,15 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar __global const float* __restrict__ gm_source, const int dataCountPerAtom) { - const size_t threadLocalIndex = - ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)); - const size_t localIndex = threadLocalIndex; - const size_t globalIndexBase = get_group_id(XX) * atomsPerBlock * dataCountPerAtom; - const size_t globalIndex = globalIndexBase + localIndex; - const int globalCheck = + assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0) + < MAX_INT); + const int threadLocalIndex = + (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + + get_local_id(0)); + const int localIndex = threadLocalIndex; + const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom; + const int globalIndex = globalIndexBase + localIndex; + const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom); if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck) { @@ -115,22 +119,25 @@ inline void pme_gpu_stage_atom_data(const struct PmeOpenCLKernelParams kernelPar * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table */ -inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, - const int atomIndexOffset, - __local const float* __restrict__ sm_coordinates, - __local const float* __restrict__ sm_coefficients, - __local float* __restrict__ sm_theta, - __local int* __restrict__ sm_gridlineIndices, - __local float* __restrict__ sm_fractCoords, - __global float* __restrict__ gm_theta, - __global float* __restrict__ gm_dtheta, - __global int* __restrict__ gm_gridlineIndices, - __global const float* __restrict__ gm_fractShiftsTable, - __global const int* __restrict__ gm_gridlineIndicesTable) +gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, + const int atomIndexOffset, + __local const float* __restrict__ sm_coordinates, + __local const float* __restrict__ sm_coefficients, + __local float* __restrict__ sm_theta, + __local int* __restrict__ sm_gridlineIndices, + __local float* __restrict__ sm_fractCoords, + __global float* __restrict__ gm_theta, + __global float* __restrict__ gm_dtheta, + __global int* __restrict__ gm_gridlineIndices, + __global const float* __restrict__ gm_fractShiftsTable, + __global const int* __restrict__ gm_gridlineIndicesTable) { /* Thread index w.r.t. block */ + assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0) + < MAX_INT); const int threadLocalIndex = - ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)); + (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + + get_local_id(0)); /* Warp index w.r.t. block - could probably be obtained easier? */ const int warpIndex = threadLocalIndex / warp_size; /* Thread index w.r.t. warp */ @@ -179,8 +186,10 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, /* Indices interpolation */ if (orderIndex == 0) { - int tableIndex, tInt; - float n, t; + int tableIndex; + int tInt; + float n; + float t; const float3 x = vload3(atomIndexLocal, sm_coordinates); /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset @@ -213,13 +222,17 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, .z * kernelParams.current.recipBox[dimIndex][ZZ]; break; + default: + assert(false); + return; + break; } const float shift = c_pmeMaxUnitcellShift; /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */ t = (t + shift) * n; tInt = (int)t; - sm_fractCoords[sharedMemoryIndex] = t - tInt; + sm_fractCoords[sharedMemoryIndex] = t - (float)tInt; tableIndex += tInt; assert(tInt >= 0); assert(tInt < c_pmeNeighborUnitcellCount * n); @@ -241,23 +254,24 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, assert(isfinite(dr)); /* dr is relative offset from lower cell limit */ - *SPLINE_DATA_PTR(order - 1) = 0.0f; + *SPLINE_DATA_PTR(order - 1) = 0.0F; *SPLINE_DATA_PTR(1) = dr; - *SPLINE_DATA_PTR(0) = 1.0f - dr; + *SPLINE_DATA_PTR(0) = 1.0F - dr; # pragma unroll order for (int k = 3; k < order; k++) { - div = 1.0f / (k - 1.0f); + div = 1.0F / ((float)k - 1.0F); *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2); # pragma unroll for (int l = 1; l < (k - 1); l++) { - *SPLINE_DATA_PTR(k - l - 1) = div - * ((dr + l) * SPLINE_DATA(k - l - 2) - + (k - l - dr) * SPLINE_DATA(k - l - 1)); + *SPLINE_DATA_PTR(k - l - 1) = + div + * ((dr + (float)l) * SPLINE_DATA(k - l - 2) + + ((float)k - (float)l - dr) * SPLINE_DATA(k - l - 1)); } - *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0); + *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0); } const int thetaIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); @@ -274,21 +288,21 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex; - const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o); + const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0F) - SPLINE_DATA(o); assert(isfinite(dtheta)); gm_dtheta[thetaGlobalIndex] = dtheta; } - div = 1.0f / (order - 1.0f); + div = 1.0F / (order - 1.0F); *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2); # pragma unroll for (int k = 1; k < (order - 1); k++) { *SPLINE_DATA_PTR(order - k - 1) = div - * ((dr + k) * SPLINE_DATA(order - k - 2) + * ((dr + (float)k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1)); } - *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0); + *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0); /* Storing the spline values (theta) */ # if !PME_GPU_PARALLEL_SPLINE @@ -319,12 +333,12 @@ inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, * sm_gridlineIndices Atom gridline indices in the shared memory. \param[in] sm_theta Atom spline * values in the shared memory. \param[out] gm_grid Global 3D grid for spreading. */ -inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams, - int atomIndexOffset, - __local const float* __restrict__ sm_coefficients, - __local const int* __restrict__ sm_gridlineIndices, - __local const float* __restrict__ sm_theta, - __global float* __restrict__ gm_grid) +gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams, + int atomIndexOffset, + __local const float* __restrict__ sm_coefficients, + __local const int* __restrict__ sm_gridlineIndices, + __local const float* __restrict__ sm_theta, + __global float* __restrict__ gm_grid) { const int nx = kernelParams.grid.realGridSize[XX]; const int ny = kernelParams.grid.realGridSize[YY]; @@ -332,7 +346,9 @@ inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams, const int pny = kernelParams.grid.realGridSizePadded[YY]; const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; - const int offx = 0, offy = 0, offz = 0; // unused for now + const int offx = 0; + const int offy = 0; + const int offz = 0; const int atomIndexLocal = get_local_id(ZZ); const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; @@ -426,7 +442,7 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void // Staging coordinates - only for spline computation __local float sm_coordinates[DIM * atomsPerBlock]; - const int atomIndexOffset = get_group_id(XX) * atomsPerBlock; + const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; /* Staging coefficients/charges for both spline and spread */ pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);