*
* \param[in] kernelParams Input PME GPU data in constant memory.
* \param[in] gm_splineModuli B-Spline moduli.
- * \param[out] gm_virialAndEnergy Reduced virial and enrgy (only with computeEnergyAndVirial == true)
- * \param[in,out] gm_grid Fourier grid to transform.
+ * \param[out] gm_virialAndEnergy Reduced virial and energy (only with computeEnergyAndVirial ==
+ * true) \param[in,out] gm_grid Fourier grid to transform.
*/
__attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1)))
-__kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams,
- __global const float * __restrict__ gm_splineModuli,
- __global float * __restrict__ gm_virialAndEnergy,
- __global float2 * __restrict__ gm_grid)
+__kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams,
+ __global const float* __restrict__ gm_splineModuli,
+ __global float* __restrict__ gm_virialAndEnergy,
+ __global float2* __restrict__ gm_grid)
{
/* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
int majorDim, middleDim, minorDim;
minorDim = ZZ;
}
- __global const float * __restrict__ gm_splineValueMajor = gm_splineModuli + kernelParams.grid.splineValuesOffset[majorDim];
- __global const float * __restrict__ gm_splineValueMiddle = gm_splineModuli + kernelParams.grid.splineValuesOffset[middleDim];
- __global const float * __restrict__ gm_splineValueMinor = gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
+ __global const float* __restrict__ gm_splineValueMajor =
+ gm_splineModuli + kernelParams.grid.splineValuesOffset[majorDim];
+ __global const float* __restrict__ gm_splineValueMiddle =
+ gm_splineModuli + kernelParams.grid.splineValuesOffset[middleDim];
+ __global const float* __restrict__ gm_splineValueMinor =
+ gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
/* Various grid sizes and indices */
- const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; //unused
+ const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
const int localSizeMinor = kernelParams.grid.complexGridSizePadded[minorDim];
const int localSizeMiddle = kernelParams.grid.complexGridSizePadded[middleDim];
const int localCountMiddle = kernelParams.grid.complexGridSize[middleDim];
float virzz = 0.0f;
assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
- if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor) & (gridLineIndex < gridLinesPerBlock))
+ if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
+ & (gridLineIndex < gridLinesPerBlock))
{
/* The offset should be equal to the global thread index for coalesced access */
- const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
- __global float2 * __restrict__ gm_gridCell = gm_grid + gridIndex;
+ const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
+ __global float2* __restrict__ gm_gridCell = gm_grid + gridIndex;
- const int kMajor = indexMajor + localOffsetMajor;
+ const int kMajor = indexMajor + localOffsetMajor;
/* Checking either X in XYZ, or Y in YZX cases */
- const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor);
+ const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor);
- const int kMiddle = indexMiddle + localOffsetMiddle;
- float mMiddle = kMiddle;
+ const int kMiddle = indexMiddle + localOffsetMiddle;
+ float mMiddle = kMiddle;
/* Checking Y in XYZ case */
if (gridOrdering == XYZ)
{
mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
}
- const int kMinor = localOffsetMinor + indexMinor;
- float mMinor = kMinor;
+ const int kMinor = localOffsetMinor + indexMinor;
+ float mMinor = kMinor;
/* Checking X in YZX case */
if (gridOrdering == YZX)
{
mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
}
/* We should skip the k-space point (0,0,0) */
- const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
+ const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
- float mX, mY, mZ;
+ float mX, mY, mZ;
if (gridOrdering == YZX)
{
mX = mMinor;
if (notZeroPoint)
{
const float mhxk = mX * kernelParams.current.recipBox[XX][XX];
- const float mhyk = mX * kernelParams.current.recipBox[XX][YY] + mY * kernelParams.current.recipBox[YY][YY];
- const float mhzk = mX * kernelParams.current.recipBox[XX][ZZ] + mY * kernelParams.current.recipBox[YY][ZZ] + mZ * kernelParams.current.recipBox[ZZ][ZZ];
+ const float mhyk = mX * kernelParams.current.recipBox[XX][YY]
+ + mY * kernelParams.current.recipBox[YY][YY];
+ const float mhzk = mX * kernelParams.current.recipBox[XX][ZZ]
+ + mY * kernelParams.current.recipBox[YY][ZZ]
+ + mZ * kernelParams.current.recipBox[ZZ][ZZ];
- const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
+ const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
assert(m2k != 0.0f);
- const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
+ const float denom = m2k * M_PI_F * kernelParams.current.boxVolume * gm_splineValueMajor[kMajor]
+ * gm_splineValueMiddle[kMiddle] * gm_splineValueMinor[kMinor];
assert(isfinite(denom));
assert(denom != 0.0f);
- const float tmp1 = exp(-kernelParams.grid.ewaldFactor * m2k);
- const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
+ const float tmp1 = exp(-kernelParams.grid.ewaldFactor * m2k);
+ const float etermk = kernelParams.constants.elFactor * tmp1 / denom;
- float2 gridValue = *gm_gridCell;
- const float2 oldGridValue = gridValue;
+ float2 gridValue = *gm_gridCell;
+ const float2 oldGridValue = gridValue;
- gridValue.x *= etermk;
- gridValue.y *= etermk;
- *gm_gridCell = gridValue;
+ gridValue.x *= etermk;
+ gridValue.y *= etermk;
+ *gm_gridCell = gridValue;
if (computeEnergyAndVirial)
{
- const float tmp1k = 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
+ const float tmp1k =
+ 2.0f * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);
const float vfactor = (kernelParams.grid.ewaldFactor + 1.0f / m2k) * 2.0f;
const float ets2 = corner_fac * tmp1k;
- energy = ets2;
+ energy = ets2;
- const float ets2vf = ets2 * vfactor;
+ const float ets2vf = ets2 * vfactor;
- virxx = ets2vf * mhxk * mhxk - ets2;
- virxy = ets2vf * mhxk * mhyk;
- virxz = ets2vf * mhxk * mhzk;
- viryy = ets2vf * mhyk * mhyk - ets2;
- viryz = ets2vf * mhyk * mhzk;
- virzz = ets2vf * mhzk * mhzk - ets2;
+ virxx = ets2vf * mhxk * mhxk - ets2;
+ virxy = ets2vf * mhxk * mhyk;
+ virxz = ets2vf * mhxk * mhzk;
+ viryy = ets2vf * mhyk * mhyk - ets2;
+ viryz = ets2vf * mhyk * mhzk;
+ virzz = ets2vf * mhzk * mhzk - ets2;
}
}
}
if (componentIndex < c_virialAndEnergyCount)
{
const int targetIndex = componentIndex * warp_size + lane;
- #pragma unroll
+#pragma unroll
for (int reductionStride = warp_size >> 1; reductionStride >= 1; reductionStride >>= 1)
{
if (lane < reductionStride)
{
- sm_virialAndEnergy[targetIndex] += sm_virialAndEnergy[targetIndex + reductionStride];
+ sm_virialAndEnergy[targetIndex] +=
+ sm_virialAndEnergy[targetIndex + reductionStride];
}
#ifdef _NVIDIA_SOURCE_
- /* FIXME: this execution happens within execution width aka warp, but somehow NVIDIA OpenCL of all things
- * fails without the memory barrier here. #2519
+ /* FIXME: this execution happens within execution width aka warp, but somehow
+ * NVIDIA OpenCL of all things fails without the memory barrier here. #2519
*/
barrier(CLK_LOCAL_MEM_FENCE);
#endif