#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2015,2016,2017,2018,2019 by the GROMACS development team.
+# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team.
# Copyright (c) 2020, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# TODO: Generalize the machinery here such that it can easily be used
# also for non-release builds.
else:
+ # run OpenCL offline compile tests on clang tidy builds
+ if (context.opts.tidy and context.opts.opencl):
+ context.build_target(target='ocl_nbnxm_kernels')
+ context.build_target(target='ocl_pme_kernels')
+
context.build_target(target='tests', keep_going=True)
context.run_ctest(args=['--output-on-failure', '--label-exclude', 'SlowTest'], memcheck=context.opts.asan)
context.build_target(target='install')
# TODO: Consider what could be tested about the installed binaries.
- # run OpenCL offline compile tests on clang tidy builds
- if (context.opts.tidy and context.opts.opencl):
- context.build_target(target='ocl_nbnxm_kernels')
-
if not context.opts.mdrun_only:
context.env.prepend_path_env(os.path.join(context.workspace.build_dir, 'bin'))
context.chdir(regressiontests_path)
if (BUILD_TESTING)
add_subdirectory(tests)
endif()
+
+
+set(PME_OCL_KERNEL_SOURCES
+ "${CMAKE_CURRENT_SOURCE_DIR}/pme_gpu_utils.clh"
+ "${CMAKE_CURRENT_SOURCE_DIR}/pme_solve.clh"
+ "${CMAKE_CURRENT_SOURCE_DIR}/pme_gather.clh"
+ "${CMAKE_CURRENT_SOURCE_DIR}/pme_spread.clh")
+
+if(CLANG_TIDY_EXE)
+ set(OCL_COMPILER "${CLANG_TIDY_EXE}")
+ set(CLANG_TIDY_ARGS "-quiet;-checks=*,-readability-implicit-bool-conversion,-llvm-header-guard,-hicpp-signed-bitwise,-clang-analyzer-deadcode.DeadStores,-google-readability-todo,-clang-diagnostic-padded,-fcomment-block-commands=internal;--;${CMAKE_C_COMPILER}")
+else()
+ set(OCL_COMPILER "${CMAKE_C_COMPILER}")
+endif()
+
+# TODO: test all warp sizes on all vendor targets?
+foreach(VENDOR AMD NVIDIA INTEL)
+ foreach(WARPSIZE 16 32 64)
+ math(EXPR SPREAD_WG_SIZE "8*${WARPSIZE}")
+ math(EXPR SOLVE_WG_SIZE "8*${WARPSIZE}")
+ math(EXPR GATHER_WG_SIZE "4*${WARPSIZE}")
+ set(OBJ_FILE pme_ocl_kernel_warpSize${WARPSIZE}_${VENDOR}.o)
+ add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
+ ${CMAKE_CURRENT_SOURCE_DIR}/pme_program.cl ${CLANG_TIDY_ARGS}
+ -Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
+ -Dwarp_size=${WARPSIZE}
+ -Dorder=4
+ -DthreadsPerAtom=16
+ -Dc_pmeMaxUnitcellShift=2
+ -Dc_usePadding=true
+ -Dc_skipNeutralAtoms=false
+ -Dc_virialAndEnergyCount=7
+ -Dc_spreadWorkGroupSize=${SPREAD_WG_SIZE}
+ -Dc_solveMaxWorkGroupSize=${SOLVE_WG_SIZE}
+ -Dc_gatherWorkGroupSize=${GATHER_WG_SIZE}
+ -DDIM=3 -DXX=0 -DYY=1 -DZZ=2
+ -DwrapX=true -DwrapY=true
+ -c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
+ -Weverything -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
+ -Wno-cast-align -Wno-incompatible-pointer-types
+ # to avoid "warning: unknown command tag name" for \internal
+ -Wno-documentation-unknown-command
+ # to avoid pme_gpu_types.h:100:52: warning: padding struct 'struct PmeGpuConstParams' with 4 bytes to align 'd_virialAndEnergy'
+ -Wno-padded
+ -o${OBJ_FILE}
+ )
+ list(APPEND PME_OCL_KERNELS ${OBJ_FILE})
+ endforeach()
+endforeach()
+
+add_custom_target(ocl_pme_kernels DEPENDS ${PME_OCL_KERNELS} )
+gmx_set_custom_target_output(ocl_pme_kernels ${PME_OCL_KERNELS})
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
case XX: return realGridSizeFP[XX];
case YY: return realGridSizeFP[YY];
case ZZ: return realGridSizeFP[ZZ];
+ default: assert(false); break;
}
assert(false);
- return 0.0f;
+ return 0.0F;
}
/*! \brief Reduce the partial force contributions.
{
/* These are the atom indices - for the shared and global memory */
const int atomIndexLocal = get_local_id(ZZ);
- const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+ const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Some sizes which are defines and not consts because they go into the array size */
const int ithy = get_local_id(YY);
const int ithz = get_local_id(XX);
- const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
- + get_local_id(0);
+ assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+ <= MAX_INT);
+ const int threadLocalId =
+ (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+ + get_local_id(0));
/* These are the spline contribution indices in shared memory */
- const int splineIndex = (get_local_id(1) * get_local_size(0)
- + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */
- const int lineIndex = threadLocalId; /* And to all the block's particles */
+ assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
+ const int splineIndex =
+ (int)(get_local_id(1) * get_local_size(0)
+ + get_local_id(0)); /* Relative to the current particle , 0..15 for order 4 */
+ const int lineIndex = threadLocalId; /* And to all the block's particles */
/* Staging the atom gridline indices, DIM * atomsPerBlock threads */
const int localGridlineIndicesIndex = threadLocalId;
const int globalGridlineIndicesIndex =
- get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
+ (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
const int globalCheckIndices =
pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
}
/* Staging the spline parameters, DIM * order * atomsPerBlock threads */
- const int localSplineParamsIndex = threadLocalId;
- const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
+ const int localSplineParamsIndex = threadLocalId;
+ const int globalSplineParamsIndex = (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
const int globalCheckSplineParams = pme_gpu_check_atom_data_index(
globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
}
barrier(CLK_LOCAL_MEM_FENCE);
- float fx = 0.0f;
- float fy = 0.0f;
- float fz = 0.0f;
+ float fx = 0.0F;
+ float fy = 0.0F;
+ float fz = 0.0F;
const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
for (int i = 0; i < numIter; i++)
{
const int outputIndexLocal = i * iterThreads + threadLocalId;
- const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
+ const int outputIndexGlobal = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
const int globalOutputCheck =
pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
if (globalOutputCheck)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const std::string commonDefines = gmx::formatString(
"-Dwarp_size=%zd "
"-Dorder=%d "
- "-DatomsPerWarp=%zd "
"-DthreadsPerAtom=%d "
// forwarding from pme_grid.h, used for spline computation table sizes only
"-Dc_pmeMaxUnitcellShift=%f "
"-DDIM=%d -DXX=%d -DYY=%d -DZZ=%d "
// decomposition parameter placeholders
"-DwrapX=true -DwrapY=true ",
- warpSize, c_pmeGpuOrder, warpSize / c_pmeSpreadGatherThreadsPerAtom,
- c_pmeSpreadGatherThreadsPerAtom, static_cast<float>(c_pmeMaxUnitcellShift),
- static_cast<int>(c_usePadding), static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount,
- spreadWorkGroupSize, solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
+ warpSize, c_pmeGpuOrder, c_pmeSpreadGatherThreadsPerAtom,
+ static_cast<float>(c_pmeMaxUnitcellShift), static_cast<int>(c_usePadding),
+ static_cast<int>(c_skipNeutralAtoms), c_virialAndEnergyCount, spreadWorkGroupSize,
+ solveMaxWorkGroupSize, gatherWorkGroupSize, DIM, XX, YY, ZZ);
try
{
/* TODO when we have a proper MPI-aware logging module,
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
* \param[in] dimIndex Dimension index (from 0 to 2)
* \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
- * \param[in] order PME order
- * \param[in] atomsPerWarp Number of atoms processed by a warp
*
* \returns Index into theta or dtheta array using GPU layout.
*/
/*! \brief
* A function for checking the global atom data indices against the atom data array sizes.
*
- * \param[in] atomDataIndexGlobal The atom data index.
* \param[in] nAtomData The atom data array element count.
* \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
*
*/
inline int pme_gpu_check_atom_charge(const float coefficient)
{
- return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+ return c_skipNeutralAtoms ? (coefficient != 0.0F) : 1;
}
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*/
// Assert placeholders, to not rip them out from OpenCL implementation - hopefully they come in handy some day with OpenCL 2
-#define static_assert(a, b)
#define assert(a)
#define PmeOpenCLKernelParams PmeGpuKernelParamsBase
/* SPREAD/SPLINE */
#define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom)
+#define atomsPerWarp (warp_size / threadsPerAtom)
// spline/spread fused
#define computeSplines 1
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* \author Aleksei Iupinov <a.yupinov@gmail.com>
*/
-#include "pme_gpu_types.h"
#include "gromacs/gpu_utils/vectype_ops.clh"
+#include "pme_gpu_types.h"
+
/*! \brief
* PME complex grid solver kernel function.
* Please see the file description for additional defines which this kernel expects.
__global float2* __restrict__ gm_grid)
{
/* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
- int majorDim, middleDim, minorDim;
+ int majorDim;
+ int middleDim;
+ int minorDim;
if (gridOrdering == YZX)
{
majorDim = YY;
gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
/* Various grid sizes and indices */
- const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
- const int localSizeMinor = kernelParams.grid.complexGridSizePadded[minorDim];
- const int localSizeMiddle = kernelParams.grid.complexGridSizePadded[middleDim];
- const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim];
- const int localCountMinor = kernelParams.grid.complexGridSize[minorDim];
- const int nMajor = kernelParams.grid.realGridSize[majorDim];
- const int nMiddle = kernelParams.grid.realGridSize[middleDim];
- const int nMinor = kernelParams.grid.realGridSize[minorDim];
- const int maxkMajor = (nMajor + 1) / 2; // X or Y
- const int maxkMiddle = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
- const int maxkMinor = (nMinor + 1) / 2; // Z or X => only check for YZX
+ const int localOffsetMinor = 0; // unused
+ const int localOffsetMajor = 0; // unused
+ const int localOffsetMiddle = 0; // unused
+ const int localSizeMinor = kernelParams.grid.complexGridSizePadded[minorDim];
+ const int localSizeMiddle = kernelParams.grid.complexGridSizePadded[middleDim];
+ const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim];
+ const int localCountMinor = kernelParams.grid.complexGridSize[minorDim];
+ const int nMajor = kernelParams.grid.realGridSize[majorDim];
+ const int nMiddle = kernelParams.grid.realGridSize[middleDim];
+ const int nMinor = kernelParams.grid.realGridSize[minorDim];
+ const int maxkMajor = (nMajor + 1) / 2; // X or Y
+ const int maxkMiddle = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
+ const int maxkMinor = (nMinor + 1) / 2; // Z or X => only check for YZX
/* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
* Each block handles up to c_solveMaxWorkGroupSize cells -
const int gridLineIndex = threadLocalId / gridLineSize;
const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1);
- const int activeWarps = (get_local_size(XX) / warp_size);
- const int indexMinor = get_group_id(XX) * get_local_size(XX) + gridLineCellIndex;
- const int indexMiddle = get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
- const int indexMajor = get_group_id(ZZ);
+ const int activeWarps = ((int)get_local_size(XX) / warp_size);
+ assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT);
+ const int indexMinor = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex;
+ const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
+ const int indexMajor = (int)get_group_id(ZZ);
/* Optional outputs */
- float energy = 0.0f;
- float virxx = 0.0f;
- float virxy = 0.0f;
- float virxz = 0.0f;
- float viryy = 0.0f;
- float viryz = 0.0f;
- float virzz = 0.0f;
+ float energy = 0.0F;
+ float virxx = 0.0F;
+ float virxy = 0.0F;
+ float virxz = 0.0F;
+ float viryy = 0.0F;
+ float viryz = 0.0F;
+ float virzz = 0.0F;
assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
const int kMajor = indexMajor + localOffsetMajor;
/* Checking either X in XYZ, or Y in YZX cases */
- const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor);
+ const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor));
const int kMiddle = indexMiddle + localOffsetMiddle;
- float mMiddle = kMiddle;
+ float mMiddle = (float)kMiddle;
/* Checking Y in XYZ case */
if (gridOrdering == XYZ)
{
- mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
+ mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle));
}
const int kMinor = localOffsetMinor + indexMinor;
- float mMinor = kMinor;
+ float mMinor = (float)kMinor;
/* Checking X in YZX case */
if (gridOrdering == YZX)
{
- mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
+ mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor));
}
/* We should skip the k-space point (0,0,0) */
const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
- float mX, mY, mZ;
+ float mX;
+ float mY;
+ float mZ;
if (gridOrdering == YZX)
{
mX = mMinor;
}
/* 0.5 correction factor for the first and last components of a Z dimension */
- float corner_fac = 1.0f;
+ float corner_fac = 1.0F;
+ const float z_corner_fac = 0.5F;
if (gridOrdering == YZX)
{
if ((kMiddle == 0) | (kMiddle == maxkMiddle))
{
- corner_fac = 0.5f;
+ corner_fac = z_corner_fac;
}
}
if (gridOrdering == XYZ)
{
if ((kMinor == 0) | (kMinor == maxkMinor))
{
- corner_fac = 0.5f;
+ corner_fac = z_corner_fac;
}
}
+ mZ * kernelParams.current.recipBox[ZZ][ZZ];
const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
- assert(m2k != 0.0f);
+ assert(m2k != 0.0F);
const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor]
* gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
assert(isfinite(denom));
- assert(denom != 0.0f);
+ assert(denom != 0.0F);
const float tmp1 = exp(-kernelParams.grid.ewaldFactor * m2k);
const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
if (computeEnergyAndVirial)
{
const float tmp1k =
- 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
+ 2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
- const float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f;
+ const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
const float ets2 = corner_fac * tmp1k;
energy = ets2;
sm_virialAndEnergy[2 * warp_size + lane] = virzz;
sm_virialAndEnergy[3 * warp_size + lane] = virxy;
sm_virialAndEnergy[4 * warp_size + lane] = virxz;
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
sm_virialAndEnergy[5 * warp_size + lane] = viryz;
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
sm_virialAndEnergy[6 * warp_size + lane] = energy;
}
barrier(CLK_LOCAL_MEM_FENCE);
atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy);
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* \author Aleksei Iupinov <a.yupinov@gmail.com>
*/
+#include "gromacs/gpu_utils/vectype_ops.clh"
+
#include "pme_gpu_types.h"
#include "pme_gpu_utils.clh"
-#include "gromacs/gpu_utils/vectype_ops.clh"
/*
* This define affects the spline calculation behaviour in the kernel.
__global const float* __restrict__ gm_source,
const int dataCountPerAtom)
{
- const size_t threadLocalIndex =
- ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0));
- const size_t localIndex = threadLocalIndex;
- const size_t globalIndexBase = get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
- const size_t globalIndex = globalIndexBase + localIndex;
- const int globalCheck =
+ assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+ < MAX_INT);
+ const int threadLocalIndex =
+ (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+ + get_local_id(0));
+ const int localIndex = threadLocalIndex;
+ const int globalIndexBase = (int)get_group_id(XX) * atomsPerBlock * dataCountPerAtom;
+ const int globalIndex = globalIndexBase + localIndex;
+ const int globalCheck =
pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
{
* \param[in] gm_fractShiftsTable Atom fractional coordinates correction table
* \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table
*/
-inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
- const int atomIndexOffset,
- __local const float* __restrict__ sm_coordinates,
- __local const float* __restrict__ sm_coefficients,
- __local float* __restrict__ sm_theta,
- __local int* __restrict__ sm_gridlineIndices,
- __local float* __restrict__ sm_fractCoords,
- __global float* __restrict__ gm_theta,
- __global float* __restrict__ gm_dtheta,
- __global int* __restrict__ gm_gridlineIndices,
- __global const float* __restrict__ gm_fractShiftsTable,
- __global const int* __restrict__ gm_gridlineIndicesTable)
+gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
+ const int atomIndexOffset,
+ __local const float* __restrict__ sm_coordinates,
+ __local const float* __restrict__ sm_coefficients,
+ __local float* __restrict__ sm_theta,
+ __local int* __restrict__ sm_gridlineIndices,
+ __local float* __restrict__ sm_fractCoords,
+ __global float* __restrict__ gm_theta,
+ __global float* __restrict__ gm_dtheta,
+ __global int* __restrict__ gm_gridlineIndices,
+ __global const float* __restrict__ gm_fractShiftsTable,
+ __global const int* __restrict__ gm_gridlineIndicesTable)
{
/* Thread index w.r.t. block */
+ assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0)
+ < MAX_INT);
const int threadLocalIndex =
- ((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0));
+ (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
+ + get_local_id(0));
/* Warp index w.r.t. block - could probably be obtained easier? */
const int warpIndex = threadLocalIndex / warp_size;
/* Thread index w.r.t. warp */
/* Indices interpolation */
if (orderIndex == 0)
{
- int tableIndex, tInt;
- float n, t;
+ int tableIndex;
+ int tInt;
+ float n;
+ float t;
const float3 x = vload3(atomIndexLocal, sm_coordinates);
/* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
.z
* kernelParams.current.recipBox[dimIndex][ZZ];
break;
+ default:
+ assert(false);
+ return;
+ break;
}
const float shift = c_pmeMaxUnitcellShift;
/* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
t = (t + shift) * n;
tInt = (int)t;
- sm_fractCoords[sharedMemoryIndex] = t - tInt;
+ sm_fractCoords[sharedMemoryIndex] = t - (float)tInt;
tableIndex += tInt;
assert(tInt >= 0);
assert(tInt < c_pmeNeighborUnitcellCount * n);
assert(isfinite(dr));
/* dr is relative offset from lower cell limit */
- *SPLINE_DATA_PTR(order - 1) = 0.0f;
+ *SPLINE_DATA_PTR(order - 1) = 0.0F;
*SPLINE_DATA_PTR(1) = dr;
- *SPLINE_DATA_PTR(0) = 1.0f - dr;
+ *SPLINE_DATA_PTR(0) = 1.0F - dr;
# pragma unroll order
for (int k = 3; k < order; k++)
{
- div = 1.0f / (k - 1.0f);
+ div = 1.0F / ((float)k - 1.0F);
*SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
# pragma unroll
for (int l = 1; l < (k - 1); l++)
{
- *SPLINE_DATA_PTR(k - l - 1) = div
- * ((dr + l) * SPLINE_DATA(k - l - 2)
- + (k - l - dr) * SPLINE_DATA(k - l - 1));
+ *SPLINE_DATA_PTR(k - l - 1) =
+ div
+ * ((dr + (float)l) * SPLINE_DATA(k - l - 2)
+ + ((float)k - (float)l - dr) * SPLINE_DATA(k - l - 1));
}
- *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+ *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0);
}
const int thetaIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o);
const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
- const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
+ const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0F) - SPLINE_DATA(o);
assert(isfinite(dtheta));
gm_dtheta[thetaGlobalIndex] = dtheta;
}
- div = 1.0f / (order - 1.0f);
+ div = 1.0F / (order - 1.0F);
*SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
# pragma unroll
for (int k = 1; k < (order - 1); k++)
{
*SPLINE_DATA_PTR(order - k - 1) = div
- * ((dr + k) * SPLINE_DATA(order - k - 2)
+ * ((dr + (float)k) * SPLINE_DATA(order - k - 2)
+ (order - k - dr) * SPLINE_DATA(order - k - 1));
}
- *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
+ *SPLINE_DATA_PTR(0) = div * (1.0F - dr) * SPLINE_DATA(0);
/* Storing the spline values (theta) */
# if !PME_GPU_PARALLEL_SPLINE
* sm_gridlineIndices Atom gridline indices in the shared memory. \param[in] sm_theta Atom spline
* values in the shared memory. \param[out] gm_grid Global 3D grid for spreading.
*/
-inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
- int atomIndexOffset,
- __local const float* __restrict__ sm_coefficients,
- __local const int* __restrict__ sm_gridlineIndices,
- __local const float* __restrict__ sm_theta,
- __global float* __restrict__ gm_grid)
+gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelParams,
+ int atomIndexOffset,
+ __local const float* __restrict__ sm_coefficients,
+ __local const int* __restrict__ sm_gridlineIndices,
+ __local const float* __restrict__ sm_theta,
+ __global float* __restrict__ gm_grid)
{
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
const int pny = kernelParams.grid.realGridSizePadded[YY];
const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
- const int offx = 0, offy = 0, offz = 0; // unused for now
+ const int offx = 0;
+ const int offy = 0;
+ const int offz = 0;
const int atomIndexLocal = get_local_id(ZZ);
const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
// Staging coordinates - only for spline computation
__local float sm_coordinates[DIM * atomsPerBlock];
- const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+ const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
/* Staging coefficients/charges for both spline and spread */
pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients, 1);