--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements helper routines for PME gather and spline routines.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+
+#include "pme.cuh"
+#include "pme_gpu_utils.h"
+#include "pme_grid.h"
+
+//! Controls whether the atom and charge data is prefetched into shared memory or loaded per thread from global
+static const bool c_useAtomDataPrefetch = true;
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam[in] T Data type (float/int/...)
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \param[out] sm_destination Shared memory array for output.
+ * \param[in] gm_source Global memory array for input.
+ */
+template<typename T,
+ const int atomsPerBlock,
+ const int dataCountPerAtom>
+__device__ __forceinline__
+void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams,
+ T * __restrict__ sm_destination,
+ const T * __restrict__ gm_source)
+{
+    static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+ const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
+ const int localIndex = threadLocalIndex;
+ const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
+ const int globalIndex = globalIndexBase + localIndex;
+ const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
+ if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
+ {
+ assert(isfinite(float(gm_source[globalIndex])));
+ sm_destination[localIndex] = gm_source[globalIndex];
+ }
+}
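+
+/* For illustration: a typical call of the helper above stages one float per atom,
+ * e.g. the charges, into a shared-memory array sized for the block:
+ *
+ *     __shared__ float sm_coefficients[atomsPerBlock];
+ *     pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
+ *                                                      kernelParams.atoms.d_coefficients);
+ *
+ * The gather kernel below does exactly this when c_useAtomDataPrefetch is enabled.
+ */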
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam[in] order PME interpolation order.
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for
+ * in the sizes of the shared memory arrays.
+ * \tparam[in] atomsPerWarp Number of atoms processed by a warp
+ * \tparam[in] writeSmDtheta Bool controlling whether the theta derivative should be written to shared memory. Enables calculation of dtheta if set.
+ * \tparam[in] writeGlobal A boolean which tells if the theta values and gridline indices should be written to global memory. Enables calculation of dtheta if set.
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
+ * \param[in] atomX Atom coordinate of atom processed by thread.
+ * \param[in] atomCharge Atom charge/coefficient of atom processed by thread.
+ * \param[out] sm_theta Atom spline values in the shared memory.
+ * \param[out] sm_dtheta Derivative of atom spline values in shared memory.
+ * \param[out] sm_gridlineIndices Atom gridline indices in the shared memory.
+ */
+template <const int order,
+ const int atomsPerBlock,
+ const int atomsPerWarp,
+ const bool writeSmDtheta,
+ const bool writeGlobal>
+__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams,
+ const int atomIndexOffset,
+ const float3 atomX,
+ const float atomCharge,
+ float * __restrict__ sm_theta,
+ float * __restrict__ sm_dtheta,
+ int * __restrict__ sm_gridlineIndices)
+{
+ /* Global memory pointers for output */
+ float * __restrict__ gm_theta = kernelParams.atoms.d_theta;
+ float * __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
+ int * __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+
+ /* Fractional coordinates */
+ __shared__ float sm_fractCoords[atomsPerBlock * DIM];
+
+ /* Thread index w.r.t. block */
+ const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+ + (threadIdx.y * blockDim.x) + threadIdx.x;
+    /* Warp index w.r.t. block - could probably be obtained more easily? */
+ const int warpIndex = threadLocalId / warp_size;
+ /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
+    const int atomWarpIndex = threadIdx.z % atomsPerWarp;
+ /* Atom index w.r.t. block/shared memory */
+ const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+
+ /* Atom index w.r.t. global memory */
+ const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+ /* Spline contribution index in one dimension */
+ const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
+ const int orderIndex = threadLocalIdXY / DIM;
+ /* Dimension index */
+ const int dimIndex = threadLocalIdXY % DIM;
+
+ /* Multi-purpose index of rvec/ivec atom data */
+ const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
+ float splineData[order];
+
+ const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
+ const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+
+ /* we have 4 threads per atom, but can only use 3 here for the dimensions */
+ if (localCheck && globalCheck)
+ {
+ /* Indices interpolation */
+
+ if (orderIndex == 0)
+ {
+ int tableIndex, tInt;
+ float n, t;
+ assert(atomIndexLocal < DIM*atomsPerBlock);
+ /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
+ * puts them into local memory(!) instead of accessing the constant memory directly.
+ * That's the reason for the switch, to unroll explicitly.
+ * The commented parts correspond to the 0 components of the recipbox.
+ */
+ switch (dimIndex)
+ {
+ case XX:
+ tableIndex = kernelParams.grid.tablesOffsets[XX];
+ n = kernelParams.grid.realGridSizeFP[XX];
+ t = atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+ break;
+
+ case YY:
+ tableIndex = kernelParams.grid.tablesOffsets[YY];
+ n = kernelParams.grid.realGridSizeFP[YY];
+ t = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + */ atomX.y * kernelParams.current.recipBox[dimIndex][YY] + atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+ break;
+
+ case ZZ:
+ tableIndex = kernelParams.grid.tablesOffsets[ZZ];
+ n = kernelParams.grid.realGridSizeFP[ZZ];
+ t = /*atomX.x * kernelParams.current.recipBox[dimIndex][XX] + atomX.y * kernelParams.current.recipBox[dimIndex][YY] + */ atomX.z * kernelParams.current.recipBox[dimIndex][ZZ];
+ break;
+ }
+ const float shift = c_pmeMaxUnitcellShift;
+ /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+ t = (t + shift) * n;
+ tInt = (int)t;
+ assert(sharedMemoryIndex < atomsPerBlock*DIM);
+ sm_fractCoords[sharedMemoryIndex] = t - tInt;
+ tableIndex += tInt;
+ assert(tInt >= 0);
+ assert(tInt < c_pmeNeighborUnitcellCount * n);
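+                /* Illustration (assuming, e.g., c_pmeMaxUnitcellShift == 2): a fractional
+                 * coordinate t = -0.25 on a grid of n = 64 becomes t = (-0.25 + 2) * 64 = 112,
+                 * so tInt = 112 and sm_fractCoords gets 0.0; the table lookups below
+                 * compensate for the shift when producing the final gridline index. */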
+
+ // TODO have shared table for both parameters to share the fetch, as index is always same?
+ // TODO compare texture/LDG performance
+ sm_fractCoords[sharedMemoryIndex] +=
+ fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
+ kernelParams.fractShiftsTableTexture,
+ tableIndex);
+ sm_gridlineIndices[sharedMemoryIndex] =
+ fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
+ kernelParams.gridlineIndicesTableTexture,
+ tableIndex);
+ if (writeGlobal)
+ {
+ gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
+ }
+ }
+
+ /* B-spline calculation */
+
+ const int chargeCheck = pme_gpu_check_atom_charge(atomCharge);
+ if (chargeCheck)
+ {
+ float div;
+            int o = orderIndex; // orderIndex is always 0 here (see localCheck); o is looped over [0, order) below
+
+ const float dr = sm_fractCoords[sharedMemoryIndex];
+ assert(isfinite(dr));
+
+ /* dr is relative offset from lower cell limit */
+ splineData[order-1] = 0.0f;
+ splineData[1] = dr;
+ splineData[0] = 1.0f - dr;
+
+#pragma unroll
+ for (int k = 3; k < order; k++)
+ {
+ div = 1.0f / (k - 1.0f);
+ splineData[k - 1] = div * dr * splineData[k - 2];
+#pragma unroll
+ for (int l = 1; l < (k - 1); l++)
+ {
+ splineData[k - l - 1] = div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1] );
+ }
+ splineData[0] = div * (1.0f - dr) * splineData[0];
+ }
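+
+            /* At this point splineData holds the B-spline values of order (order - 1).
+             * The derivative below uses the relation dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u - 1),
+             * which is why dtheta is computed before the final order-raising pass further down. */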
+
+ const int thetaIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+ const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+ /* only calculate dtheta if we are saving it to shared or global memory */
+ if (writeSmDtheta || writeGlobal)
+ {
+ /* Differentiation and storing the spline derivatives (dtheta) */
+#pragma unroll
+ for (o = 0; o < order; o++)
+ {
+ const int thetaIndex = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+
+ const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0f) - splineData[o];
+ assert(isfinite(dtheta));
+ assert(thetaIndex < order*DIM*atomsPerBlock);
+ if (writeSmDtheta)
+ {
+ sm_dtheta[thetaIndex] = dtheta;
+ }
+ if (writeGlobal)
+ {
+ const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+ gm_dtheta[thetaGlobalIndex] = dtheta;
+ }
+ }
+ }
+
+ div = 1.0f / (order - 1.0f);
+ splineData[order - 1] = div * dr * splineData[order - 2];
+#pragma unroll
+ for (int k = 1; k < (order - 1); k++)
+ {
+ splineData[order - k - 1] = div * ((dr + k) * splineData[order - k - 2] + (order - k - dr) * splineData[order - k - 1]);
+ }
+ splineData[0] = div * (1.0f - dr) * splineData[0];
+
+ /* Storing the spline values (theta) */
+#pragma unroll
+ for (o = 0; o < order; o++)
+ {
+ const int thetaIndex = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+ assert(thetaIndex < order*DIM*atomsPerBlock);
+ sm_theta[thetaIndex] = splineData[o];
+ assert(isfinite(sm_theta[thetaIndex]));
+ if (writeGlobal)
+ {
+ const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+ gm_theta[thetaGlobalIndex] = splineData[o];
+ }
+ }
+ }
+ }
+}
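+
+/* A sketch of a typical instantiation, as used by the gather kernel below when the splines
+ * are recalculated: writeSmDtheta = true so the force reduction can use the derivatives,
+ * and writeGlobal = false since nothing needs to be stored for a later kernel:
+ *
+ *     calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
+ *             kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
+ */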
#include <cassert>
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+
#include "pme.cuh"
+#include "pme_calculate_splines.cuh"
#include "pme_gpu_utils.h"
+#include "pme_grid.h"
/*! \brief
* An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 * \tparam[in] overwriteForces True: the forces are written to the output buffer; False: the forces are added non-atomically to the output buffer (e.g. to the bonded forces).
* \tparam[in] wrapX Tells if the grid is wrapped in the X dimension.
* \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension.
+ * \tparam[in] readGlobal Tells if we should read spline values from global memory
+ * \tparam[in] useOrderThreads Tells if we should use order threads per atom (order*order used if false)
* \param[in] kernelParams All the PME GPU data.
*/
template <
const int order,
const bool overwriteForces,
const bool wrapX,
- const bool wrapY
+ const bool wrapY,
+ const bool readGlobal,
+ const bool useOrderThreads
>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP)
__global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
/* Global memory pointers */
const float * __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients;
const float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
+ float * __restrict__ gm_forces = kernelParams.atoms.d_forces;
+
+ /* Global memory pointers for readGlobal */
const float * __restrict__ gm_theta = kernelParams.atoms.d_theta;
const float * __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
const int * __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
- float * __restrict__ gm_forces = kernelParams.atoms.d_forces;
+
+ float3 atomX;
+ float atomCharge;
/* Some sizes */
- const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
- const int atomDataSize = c_pmeSpreadGatherThreadsPerAtom; /* Number of data components and threads for a single atom */
- const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
+ const int atomsPerBlock = useOrderThreads ? (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom) :
+ (c_gatherMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom);
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+
+ /* Number of data components and threads for a single atom */
+ const int atomDataSize = useOrderThreads ? c_pmeSpreadGatherThreadsPerAtom4ThPerAtom : c_pmeSpreadGatherThreadsPerAtom;
+ const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
const int blockSize = atomsPerBlock * atomDataSize;
assert(blockSize == blockDim.x * blockDim.y * blockDim.z);
- const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
/* These are the atom indices - for the shared and global memory */
const int atomIndexLocal = threadIdx.z;
{
return;
}
-
+    // e.g. 4 warps per block and 8 atoms per warp (order threads per atom), times DIM (3) and order (4) spline parameters per atom
const int splineParamsSize = atomsPerBlock * DIM * order;
const int gridlineIndicesSize = atomsPerBlock * DIM;
__shared__ int sm_gridlineIndices[gridlineIndicesSize];
- __shared__ float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */
+ __shared__ float sm_theta[splineParamsSize];
+ __shared__ float sm_dtheta[splineParamsSize];
- /* Spline Y/Z coordinates */
- const int ithy = threadIdx.y;
+ /* Spline Z coordinates */
const int ithz = threadIdx.x;
/* These are the spline contribution indices in shared memory */
- const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x; /* Relative to the current particle , 0..15 for order 4 */
- const int lineIndex = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
-
- int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
- + (threadIdx.y * blockDim.x)
- + threadIdx.x;
-
- /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
- const int localGridlineIndicesIndex = threadLocalId;
- const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
- const int globalCheckIndices = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
- if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x; /* Relative to the current particle: 0..15 for order 4 with order*order threads */
+ const int lineIndex = (threadIdx.z * (blockDim.x * blockDim.y)) + splineIndex; /* And to all the block's particles */
+
+    const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
+ const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
+
+ if (readGlobal)
{
- sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
- assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+ /* Read splines */
+ const int localGridlineIndicesIndex = threadLocalId;
+ const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
+ const int globalCheckIndices = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
+ if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+ {
+ sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
+ assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+ }
+        /* With order threads per atom, a loop is needed so that all spline data values get
+           loaded; each thread has to load several values. With order*order threads per atom,
+           each thread loads exactly one value, so the loop body executes only once. */
+
+        const int iMin = 0;
+        const int iMax = useOrderThreads ? 3 : 1; /* iMax == DIM when using order threads per atom */
+
+ for (int i = iMin; i < iMax; i++)
+ {
+ int localSplineParamsIndex = threadLocalId + i*threadLocalIdMax; /* i will always be zero for order*order threads per atom */
+ int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
+ int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
+ if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ {
+ sm_theta[localSplineParamsIndex] = gm_theta[globalSplineParamsIndex];
+ sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
+ assert(isfinite(sm_theta[localSplineParamsIndex]));
+ assert(isfinite(sm_dtheta[localSplineParamsIndex]));
+ }
+ }
+ __syncthreads();
}
- /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
- const int localSplineParamsIndex = threadLocalId;
- const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
- const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
- if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ else
{
- sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
- sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
- assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
- assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
- }
- __syncthreads();
+        /* Recalculate splines */
+ if (c_useAtomDataPrefetch)
+ {
+            // Charges
+ __shared__ float sm_coefficients[atomsPerBlock];
+ // Coordinates
+ __shared__ float sm_coordinates[DIM * atomsPerBlock];
+ /* Staging coefficients/charges */
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+
+ /* Staging coordinates */
+ pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+ __syncthreads();
+            atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
+            atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
+            atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+ atomCharge = sm_coefficients[atomIndexLocal];
+ }
+ else
+ {
+ atomCharge = gm_coefficients[atomIndexGlobal];
+            atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
+            atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
+            atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
+ }
+ calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(kernelParams, atomIndexOffset,
+ atomX, atomCharge,
+ sm_theta, sm_dtheta,
+ sm_gridlineIndices);
+ __syncwarp();
+ }
float fx = 0.0f;
float fy = 0.0f;
float fz = 0.0f;
const int warpIndex = atomIndexLocal / atomsPerWarp;
const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
- const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
- const float2 tdy = sm_splineParams[splineIndexY];
const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
- const float2 tdz = sm_splineParams[splineIndexZ];
+    const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);
+ int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
- int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
- if (wrapY & (iy >= ny))
- {
- iy -= ny;
- }
- int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+
if (iz >= nz)
{
iz -= nz;
}
- const int constOffset = iy * pnz + iz;
+ int constOffset, iy;
-#pragma unroll
- for (int ithx = 0; (ithx < order); ithx++)
+ const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
+ const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
+ for (int ithy = ithyMin; ithy < ithyMax; ithy++)
{
- int ix = ixBase + ithx;
- if (wrapX & (ix >= nx))
+ const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+        const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);
+
+ iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+ if (wrapY & (iy >= ny))
{
- ix -= nx;
+ iy -= ny;
+ }
+ constOffset = iy * pnz + iz;
+
+#pragma unroll
+ for (int ithx = 0; (ithx < order); ithx++)
+ {
+ int ix = ixBase + ithx;
+ if (wrapX & (ix >= nx))
+ {
+ ix -= nx;
+ }
+ const int gridIndexGlobal = ix * pny * pnz + constOffset;
+ assert(gridIndexGlobal >= 0);
+ const float gridValue = gm_grid[gridIndexGlobal];
+ assert(isfinite(gridValue));
+ const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+            const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
+ const float fxy1 = tdz.x * gridValue;
+ const float fz1 = tdz.y * gridValue;
+ fx += tdx.y * tdy.x * fxy1;
+ fy += tdx.x * tdy.y * fxy1;
+ fz += tdx.x * tdy.x * fz1;
}
- const int gridIndexGlobal = ix * pny * pnz + constOffset;
- assert(gridIndexGlobal >= 0);
- const float gridValue = gm_grid[gridIndexGlobal];
- assert(isfinite(gridValue));
- const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
- const float2 tdx = sm_splineParams[splineIndexX];
- const float fxy1 = tdz.x * gridValue;
- const float fz1 = tdz.y * gridValue;
- fx += tdx.y * tdy.x * fxy1;
- fy += tdx.x * tdy.y * fxy1;
- fz += tdx.x * tdy.x * fz1;
}
}
}
//! Kernel instantiations
-template __global__ void pme_gather_kernel<4, true, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_gather_kernel<4, false, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, false, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, true, false, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, false, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, false, true, true, false, false>(const PmeGpuCudaKernelParams);
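+
+// The template arguments are, in order: PME order, overwriteForces, wrapX, wrapY,
+// readGlobal and useOrderThreads, matching the kernel template parameters above.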
* The assumption is currently that any thread processes only a single atom's contributions.
* TODO: this assumption leads to minimum execution width of 16. See Redmine #2516
*/
-constexpr int c_pmeSpreadGatherThreadsPerAtom = (c_pmeGpuOrder * c_pmeGpuOrder);
+constexpr int c_pmeSpreadGatherThreadsPerAtom = c_pmeGpuOrder * c_pmeGpuOrder;
+
+//! Number of threads per atom when order threads are used
+constexpr int c_pmeSpreadGatherThreadsPerAtom4ThPerAtom = c_pmeGpuOrder;
/*! \brief Minimum execution width of the PME spread and gather kernels.
*
* Due to the one thread per atom and order=4 implementation constraints, order^2 threads
 * should execute without needing synchronization. See c_pmeSpreadGatherThreadsPerAtom
*/
-constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
+constexpr int c_pmeSpreadGatherMinWarpSize = c_pmeSpreadGatherThreadsPerAtom;
+
+//! Minimum warp size if order threads per atom are used instead of order^2
+constexpr int c_pmeSpreadGatherMinWarpSize4ThPerAtom = c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
/*! \brief
* Atom data alignment (in terms of number of atoms).
 * The number of atoms which fits in the padded GPU buffers has to be divisible by this.
* There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
*/
-constexpr int c_pmeAtomDataAlignment = 32;
+constexpr int c_pmeAtomDataAlignment = 64;
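+// With this alignment, e.g. a system of 100 atoms is padded to 128 atoms in the GPU buffers.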
/*
* The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
 * This constant depends on the templated order parameter (2 atoms per warp for order 4 and warp_size of 32).
* It is mostly used for spline data layout tweaked for coalesced access.
*/
-constexpr int c_pmeSpreadGatherAtomsPerWarp = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
+constexpr int c_pmeSpreadGatherAtomsPerWarp = (warp_size / c_pmeSpreadGatherThreadsPerAtom);
+
+//! Number of atoms per warp when order threads per atom are used
+constexpr int c_pmeSpreadGatherAtomsPerWarp4ThPerAtom = (warp_size / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom);
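+// E.g. for order 4 and a warp size of 32, this gives 8 atoms per warp with order threads
+// per atom, versus 2 atoms per warp with order*order threads per atom.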
//! Spreading max block size in threads
constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
#include "pme_internal.h"
#include "pme_solve.h"
+
+/*! \brief
+ * CUDA only.
+ * Controls if we should use order (i.e. 4) threads per atom for the GPU
+ * or order*order (i.e. 16) threads per atom.
+ */
+constexpr bool c_useOrderThreadsPerAtom = false;
+/*! \brief
+ * CUDA only.
+ * Controls if we should recalculate the splines in the gather or
+ * save the values in the spread and reload in the gather.
+ */
+constexpr bool c_recalculateSplines = false;
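+
+/* For instance, with both flags left false as above, the gather uses order*order (16)
+ * threads per atom and reads the spline data that the spread wrote to global memory,
+ * i.e. the behavior of the previous implementation. */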
+
/*! \internal \brief
* Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
*
int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
- return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
+ if (c_useOrderThreadsPerAtom)
+ {
+ return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
+ }
+ else
+ {
+ return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
+ }
}
void pme_gpu_synchronize(const PmeGpu *pmeGpu)
return std::pair<int, int>(colCount, minRowCount);
}
+/*! \brief
+ * Returns a pointer to the appropriate spline and spread kernel based on the input bool values
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] useOrderThreadsPerAtom bool controlling if we should use order or order*order threads per atom
+ * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSplineAndSpreadKernelPtr(const PmeGpu *pmeGpu,
+ bool useOrderThreadsPerAtom,
+ bool writeSplinesToGlobal)
+{
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ if (writeSplinesToGlobal)
+ {
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
+ }
+ }
+ else
+ {
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+ }
+ }
+
+ return kernelPtr;
+}
+
+/*! \brief
+ * Returns a pointer to the appropriate spline kernel based on the input bool values
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] useOrderThreadsPerAtom bool controlling if we should use order or order*order threads per atom
+ * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSplineKernelPtr(const PmeGpu *pmeGpu,
+ bool useOrderThreadsPerAtom,
+ bool gmx_unused writeSplinesToGlobal)
+{
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ GMX_ASSERT(writeSplinesToGlobal, "Spline data should always be written to global memory when just calculating splines");
+
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+ }
+ return kernelPtr;
+}
+
+/*! \brief
+ * Returns a pointer to the appropriate spread kernel based on the input bool values
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] useOrderThreadsPerAtom bool controlling if we should use order or order*order threads per atom
+ * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory
+ *
+ * \return Pointer to CUDA kernel
+ */
+static auto selectSpreadKernelPtr(const PmeGpu *pmeGpu,
+ bool useOrderThreadsPerAtom,
+ bool writeSplinesToGlobal)
+{
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ if (writeSplinesToGlobal)
+ {
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+ }
+ }
+ else
+ {
+        /* If we are not saving the spline data, we need to recalculate it
+           using the spline and spread kernel */
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+ }
+ }
+ return kernelPtr;
+}
+
void pme_gpu_spread(const PmeGpu *pmeGpu,
GpuEventSynchronizer *xReadyOnDevice,
int gmx_unused gridIndex,
const int order = pmeGpu->common->pme_order;
GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
- const int atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+ const bool writeGlobal = pmeGpu->settings.copyAllOutputs;
+#if GMX_GPU == GMX_GPU_OPENCL
+ GMX_ASSERT(!c_useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
+ GMX_ASSERT(!c_recalculateSplines, "Recalculating splines not supported in OpenCL");
+#endif
+ const int atomsPerBlock = c_useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+ blockSize / c_pmeSpreadGatherThreadsPerAtom;
+
// TODO: pick smaller block size in runtime if needed
// (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
// If doing so, change atomsPerBlock in the kernels as well.
// TODO: test varying block sizes on modern arch-s as well
// TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
- //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
+ //(for spline data mostly)
GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
KernelLaunchConfig config;
- config.blockSize[0] = config.blockSize[1] = order;
+ config.blockSize[0] = order;
+ config.blockSize[1] = c_useOrderThreadsPerAtom ? 1 : order;
config.blockSize[2] = atomsPerBlock;
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
if (spreadCharges)
{
timingId = gtPME_SPLINEANDSPREAD;
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+        kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
+ c_useOrderThreadsPerAtom,
+ writeGlobal || (!c_recalculateSplines));
}
else
{
timingId = gtPME_SPLINE;
- kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+        kernelPtr = selectSplineKernelPtr(pmeGpu,
+ c_useOrderThreadsPerAtom,
+ writeGlobal || (!c_recalculateSplines));
}
}
else
{
timingId = gtPME_SPREAD;
- kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+        kernelPtr = selectSpreadKernelPtr(pmeGpu,
+ c_useOrderThreadsPerAtom,
+ writeGlobal || (!c_recalculateSplines));
}
+
pme_gpu_start_timing(pmeGpu, timingId);
auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
#if c_canEmbedBuffers
}
}
+/*! \brief
+ * Returns a pointer to the appropriate gather kernel based on the input values
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] useOrderThreadsPerAtom bool controlling if we should use order or order*order threads per atom
+ * \param[in] readSplinesFromGlobal bool controlling if we should read spline data from global memory
+ * \param[in] forceTreatment Controls if the forces from the gather should increment or replace the input forces.
+ *
+ * \return Pointer to CUDA kernel
+ */
+inline auto selectGatherKernelPtr(const PmeGpu *pmeGpu,
+ bool useOrderThreadsPerAtom,
+ bool readSplinesFromGlobal,
+ PmeForceOutputHandling forceTreatment)
+
+{
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+
+ if (readSplinesFromGlobal)
+ {
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4 :
+ pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplinesThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pmeGpu->programHandle_->impl_->gatherKernelReadSplines :
+ pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelReadSplines;
+ }
+ }
+ else
+ {
+ if (useOrderThreadsPerAtom)
+ {
+ kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4 :
+ pmeGpu->programHandle_->impl_->gatherReduceWithInputKernelThPerAtom4;
+ }
+ else
+ {
+ kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pmeGpu->programHandle_->impl_->gatherKernel :
+ pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
+ }
+ }
+ return kernelPtr;
+}
+
+
void pme_gpu_gather(PmeGpu *pmeGpu,
PmeForceOutputHandling forceTreatment,
const float *h_grid)
pme_gpu_copy_input_gather_atom_data(pmeGpu);
}
+    /* Spline data is read back from global memory when all outputs are copied, as is done for the unit tests */
+ const bool readGlobal = pmeGpu->settings.copyAllOutputs;
const size_t blockSize = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
- const int atomsPerBlock = blockSize / c_pmeSpreadGatherThreadsPerAtom;
+#if GMX_GPU == GMX_GPU_OPENCL
+ GMX_ASSERT(!c_useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
+ GMX_ASSERT(!c_recalculateSplines, "Recalculating splines not supported in OpenCL");
+#endif
+ const int atomsPerBlock = c_useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+ blockSize / c_pmeSpreadGatherThreadsPerAtom;
+
GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
-
- const int order = pmeGpu->common->pme_order;
+ const int order = pmeGpu->common->pme_order;
GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
KernelLaunchConfig config;
- config.blockSize[0] = config.blockSize[1] = order;
+ config.blockSize[0] = order;
+ config.blockSize[1] = c_useOrderThreadsPerAtom ? 1 : order;
config.blockSize[2] = atomsPerBlock;
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
// TODO test different cache configs
int timingId = gtPME_GATHER;
+    PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr(pmeGpu,
+ c_useOrderThreadsPerAtom,
+ readGlobal || (!c_recalculateSplines),
+ forceTreatment);
// TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
- PmeGpuProgramImpl::PmeKernelHandle kernelPtr = (forceTreatment == PmeForceOutputHandling::Set) ?
- pmeGpu->programHandle_->impl_->gatherKernel :
- pmeGpu->programHandle_->impl_->gatherReduceWithInputKernel;
pme_gpu_start_timing(pmeGpu, timingId);
auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
const bool computeSplines,
const bool spreadCharges,
const bool wrapX,
- const bool wrapY
+ const bool wrapY,
+ const bool writeGlobal,
+ const bool orderThreads
>
void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
extern template
-void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
template<
GridOrdering gridOrdering,
const int order,
const bool overwriteForces,
const bool wrapX,
- const bool wrapY
+ const bool wrapY,
+ const bool readGlobal,
+ const bool orderThreads
>
void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
extern template
-void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
extern template
-void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>(const PmeGpuCudaKernelParams);
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, true>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, false>(const PmeGpuCudaKernelParams);
+extern template
+void pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, false>(const PmeGpuCudaKernelParams);
PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
{
solveMaxWorkGroupSize = c_solveMaxThreadsPerBlock;
gatherWorkGroupSize = c_gatherMaxThreadsPerBlock;
- splineAndSpreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY>;
- splineKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY>;
- spreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY>;
- gatherKernel = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY>;
- gatherReduceWithInputKernel = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY>;
- solveXYZKernel = pme_solve_kernel<GridOrdering::XYZ, false>;
- solveXYZEnergyKernel = pme_solve_kernel<GridOrdering::XYZ, true>;
- solveYZXKernel = pme_solve_kernel<GridOrdering::YZX, false>;
- solveYZXEnergyKernel = pme_solve_kernel<GridOrdering::YZX, true>;
+/*!
+ * Not all combinations of the splineAndSpread, spline and spread kernels are required.
+ * If only the spline kernel runs (without the spread), it makes no sense not to write the spline data to global memory.
+ * Similarly, running the spread kernel alone (without the spline) implies that the spline data must be read from global memory.
+ */
+ splineAndSpreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, false>;
+ splineAndSpreadKernelThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, true>;
+ splineAndSpreadKernelWriteSplines = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, false>;
+ splineAndSpreadKernelWriteSplinesThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, true>;
+ splineKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, false>;
+ splineKernelThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, true>;
+ spreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, false>;
+ spreadKernelThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, true>;
+ gatherKernel = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, false>;
+ gatherKernelThPerAtom4 = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, false, true>;
+ gatherKernelReadSplines = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, false>;
+ gatherKernelReadSplinesThPerAtom4 = pme_gather_kernel<c_pmeOrder, true, c_wrapX, c_wrapY, true, true>;
+ gatherReduceWithInputKernel = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, false>;
+ gatherReduceWithInputKernelThPerAtom4 = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, false, true>;
+ gatherReduceWithInputKernelReadSplines = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, false>;
+ gatherReduceWithInputKernelReadSplinesThPerAtom4 = pme_gather_kernel<c_pmeOrder, false, c_wrapX, c_wrapY, true, true>;
+ solveXYZKernel = pme_solve_kernel<GridOrdering::XYZ, false>;
+ solveXYZEnergyKernel = pme_solve_kernel<GridOrdering::XYZ, true>;
+ solveYZXKernel = pme_solve_kernel<GridOrdering::YZX, false>;
+ solveYZXEnergyKernel = pme_solve_kernel<GridOrdering::YZX, true>;
}
PmeGpuProgramImpl::~PmeGpuProgramImpl()
//@{
/**
* Spread/spline kernels are compiled only for order of 4.
+ * There are multiple versions of each kernel, parametrized according to:
+ * - the number of threads per atom: either order (4) or order*order (16) threads per atom are supported;
+ * - whether the spline data is written by the spline/spread kernel and loaded in the gather,
+ *   or recalculated in the gather.
* Spreading kernels also have hardcoded X/Y indices wrapping parameters,
* as a placeholder for implementing 1/2D decomposition.
*/
size_t spreadWorkGroupSize;
PmeKernelHandle splineKernel;
+ PmeKernelHandle splineKernelThPerAtom4;
PmeKernelHandle spreadKernel;
+ PmeKernelHandle spreadKernelThPerAtom4;
PmeKernelHandle splineAndSpreadKernel;
+ PmeKernelHandle splineAndSpreadKernelThPerAtom4;
+ PmeKernelHandle splineAndSpreadKernelWriteSplines;
+ PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4;
//@}
//@{
/** Same for gather: hardcoded X/Y unwrap parameters, order of 4, plus
* it can either reduce with previous forces in the host buffer, or ignore them.
+ * Similarly to the spread, either order (4) or order*order (16) threads per atom can be used,
+ * and the splines can either be recalculated in the gather or read back as written by the spread.
*/
size_t gatherWorkGroupSize;
PmeKernelHandle gatherReduceWithInputKernel;
+ PmeKernelHandle gatherReduceWithInputKernelThPerAtom4;
PmeKernelHandle gatherKernel;
+ PmeKernelHandle gatherKernelThPerAtom4;
+ PmeKernelHandle gatherReduceWithInputKernelReadSplines;
+ PmeKernelHandle gatherReduceWithInputKernelReadSplinesThPerAtom4;
+ PmeKernelHandle gatherKernelReadSplines;
+ PmeKernelHandle gatherKernelReadSplinesThPerAtom4;
//@}
//@{
// TODO use a map with string key instead?
if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernel"))
{
- splineKernel = kernel;
+ splineKernel = kernel;
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
{
- splineAndSpreadKernel = kernel;
+ splineAndSpreadKernel = kernel;
+ splineAndSpreadKernelWriteSplines = kernel;
checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
{
- spreadKernel = kernel;
+ spreadKernel = kernel;
checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
{
- gatherKernel = kernel;
+ gatherKernel = kernel;
+ gatherKernelReadSplines = kernel;
checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
{
- gatherReduceWithInputKernel = kernel;
+ gatherReduceWithInputKernel = kernel;
+ gatherReduceWithInputKernelReadSplines = kernel;
checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "pme.cuh"
+#include "pme_calculate_splines.cuh"
#include "pme_gpu_utils.h"
#include "pme_grid.h"
-/*
- * This define affects the spline calculation behaviour in the kernel.
- * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing (order) spline values and derivatives).
- * 1: (order) threads do redundant work on this same task, each one stores only a single theta and single dtheta into global arrays.
- * The only efficiency difference is less global store operations, countered by more redundant spline computation.
- *
- * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
- */
-#define PME_GPU_PARALLEL_SPLINE 0
-
-
-/*! \brief
- * General purpose function for loading atom-related data from global to shared memory.
- *
- * \tparam[in] T Data type (float/int/...)
- * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
- * \param[in] kernelParams Input PME CUDA data in constant memory.
- * \param[out] sm_destination Shared memory array for output.
- * \param[in] gm_source Global memory array for input.
- */
-template<typename T,
- const int atomsPerBlock,
- const int dataCountPerAtom>
-__device__ __forceinline__
-void pme_gpu_stage_atom_data(const PmeGpuCudaKernelParams kernelParams,
- T * __restrict__ sm_destination,
- const T * __restrict__ gm_source)
-{
- static_assert(c_usePadding, "With padding disabled, index checking should be fixed to account for spline theta/dtheta per-warp alignment");
- const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
- const int threadLocalIndex = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x) + threadIdx.x;
- const int localIndex = threadLocalIndex;
- const int globalIndexBase = blockIndex * atomsPerBlock * dataCountPerAtom;
- const int globalIndex = globalIndexBase + localIndex;
- const int globalCheck = pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
- if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
- {
- assert(isfinite(float(gm_source[globalIndex])));
- sm_destination[localIndex] = gm_source[globalIndex];
- }
-}
-
-/*! \brief
- * PME GPU spline parameter and gridline indices calculation.
- * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
- * First stage of the whole kernel.
- *
- * \tparam[in] order PME interpolation order.
- * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for
- * in the sizes of the shared memory arrays.
- * \param[in] kernelParams Input PME CUDA data in constant memory.
- * \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
- * \param[in] sm_coordinates Atom coordinates in the shared memory.
- * \param[in] sm_coefficients Atom charges/coefficients in the shared memory.
- * \param[out] sm_theta Atom spline values in the shared memory.
- * \param[out] sm_gridlineIndices Atom gridline indices in the shared memory.
- */
-template <const int order,
- const int atomsPerBlock>
-__device__ __forceinline__ void calculate_splines(const PmeGpuCudaKernelParams kernelParams,
- const int atomIndexOffset,
- const float3 * __restrict__ sm_coordinates,
- const float * __restrict__ sm_coefficients,
- float * __restrict__ sm_theta,
- int * __restrict__ sm_gridlineIndices)
-{
- /* Global memory pointers for output */
- float * __restrict__ gm_theta = kernelParams.atoms.d_theta;
- float * __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
- int * __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
-
- const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
-
- /* Fractional coordinates */
- __shared__ float sm_fractCoords[atomsPerBlock * DIM];
-
- /* Thread index w.r.t. block */
- const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
- + (threadIdx.y * blockDim.x) + threadIdx.x;
- /* Warp index w.r.t. block - could probably be obtained easier? */
- const int warpIndex = threadLocalId / warp_size;
- /* Thread index w.r.t. warp */
- const int threadWarpIndex = threadLocalId % warp_size;
- /* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
- const int atomWarpIndex = threadWarpIndex % atomsPerWarp;
- /* Atom index w.r.t. block/shared memory */
- const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
-
- /* Atom index w.r.t. global memory */
- const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
- /* Spline contribution index in one dimension */
- const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
- /* Dimension index */
- const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp;
-
- /* Multi-purpose index of rvec/ivec atom data */
- const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
-
- /* Spline parameters need a working buffer.
- * With PME_GPU_PARALLEL_SPLINE == 0 it is just a local array of (order) values for some of the threads, which is fine;
- * With PME_GPU_PARALLEL_SPLINE == 1 (order) times more threads are involved, so the shared memory is used to avoid overhead.
- * The buffer's size, striding and indexing are adjusted accordingly.
- * The buffer is accessed with SPLINE_DATA_PTR and SPLINE_DATA macros.
- */
-#if PME_GPU_PARALLEL_SPLINE
- const int splineDataStride = atomsPerBlock * DIM;
- const int splineDataIndex = sharedMemoryIndex;
- __shared__ float sm_splineData[splineDataStride * order];
- float *splineDataPtr = sm_splineData;
-#else
- const int splineDataStride = 1;
- const int splineDataIndex = 0;
- float splineData[splineDataStride * order];
- float *splineDataPtr = splineData;
-#endif
-
-#define SPLINE_DATA_PTR(i) (splineDataPtr + ((i) * splineDataStride + splineDataIndex))
-#define SPLINE_DATA(i) (*SPLINE_DATA_PTR(i))
-
- const int localCheck = (dimIndex < DIM) && (orderIndex < (PME_GPU_PARALLEL_SPLINE ? order : 1));
- const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
-
- if (localCheck && globalCheck)
- {
- /* Indices interpolation */
-
- if (orderIndex == 0)
- {
- int tableIndex, tInt;
- float n, t;
- const float3 x = sm_coordinates[atomIndexLocal];
- /* Accessing fields in fshOffset/nXYZ/recipbox/... with dimIndex offset
- * puts them into local memory(!) instead of accessing the constant memory directly.
- * That's the reason for the switch, to unroll explicitly.
- * The commented parts correspond to the 0 components of the recipbox.
- */
- switch (dimIndex)
- {
- case XX:
- tableIndex = kernelParams.grid.tablesOffsets[XX];
- n = kernelParams.grid.realGridSizeFP[XX];
- t = x.x * kernelParams.current.recipBox[dimIndex][XX] + x.y * kernelParams.current.recipBox[dimIndex][YY] + x.z * kernelParams.current.recipBox[dimIndex][ZZ];
- break;
-
- case YY:
- tableIndex = kernelParams.grid.tablesOffsets[YY];
- n = kernelParams.grid.realGridSizeFP[YY];
- t = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + */ x.y * kernelParams.current.recipBox[dimIndex][YY] + x.z * kernelParams.current.recipBox[dimIndex][ZZ];
- break;
-
- case ZZ:
- tableIndex = kernelParams.grid.tablesOffsets[ZZ];
- n = kernelParams.grid.realGridSizeFP[ZZ];
- t = /*x.x * kernelParams.current.recipbox[dimIndex][XX] + x.y * kernelParams.current.recipbox[dimIndex][YY] + */ x.z * kernelParams.current.recipBox[dimIndex][ZZ];
- break;
- }
- const float shift = c_pmeMaxUnitcellShift;
- /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
- t = (t + shift) * n;
- tInt = (int)t;
- sm_fractCoords[sharedMemoryIndex] = t - tInt;
- tableIndex += tInt;
- assert(tInt >= 0);
- assert(tInt < c_pmeNeighborUnitcellCount * n);
-
- // TODO have shared table for both parameters to share the fetch, as index is always same?
- // TODO compare texture/LDG performance
- sm_fractCoords[sharedMemoryIndex] +=
- fetchFromParamLookupTable(kernelParams.grid.d_fractShiftsTable,
- kernelParams.fractShiftsTableTexture,
- tableIndex);
- sm_gridlineIndices[sharedMemoryIndex] =
- fetchFromParamLookupTable(kernelParams.grid.d_gridlineIndicesTable,
- kernelParams.gridlineIndicesTableTexture,
- tableIndex);
- gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] = sm_gridlineIndices[sharedMemoryIndex];
- }
-
- /* B-spline calculation */
-
- const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
- if (chargeCheck)
- {
- float div;
- int o = orderIndex; // This is an index that is set once for PME_GPU_PARALLEL_SPLINE == 1
-
- const float dr = sm_fractCoords[sharedMemoryIndex];
- assert(isfinite(dr));
-
- /* dr is relative offset from lower cell limit */
- *SPLINE_DATA_PTR(order - 1) = 0.0f;
- *SPLINE_DATA_PTR(1) = dr;
- *SPLINE_DATA_PTR(0) = 1.0f - dr;
-
-#pragma unroll
- for (int k = 3; k < order; k++)
- {
- div = 1.0f / (k - 1.0f);
- *SPLINE_DATA_PTR(k - 1) = div * dr * SPLINE_DATA(k - 2);
-#pragma unroll
- for (int l = 1; l < (k - 1); l++)
- {
- *SPLINE_DATA_PTR(k - l - 1) = div * ((dr + l) * SPLINE_DATA(k - l - 2) + (k - l - dr) * SPLINE_DATA(k - l - 1));
- }
- *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
- }
-
- const int thetaIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
- const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
-
- /* Differentiation and storing the spline derivatives (dtheta) */
-#if !PME_GPU_PARALLEL_SPLINE
- // With PME_GPU_PARALLEL_SPLINE == 1, o is already set to orderIndex;
- // With PME_GPU_PARALLEL_SPLINE == 0, we loop o over range(order).
-#pragma unroll
- for (o = 0; o < order; o++)
-#endif
- {
- const int thetaIndex = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
- const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
-
- const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
- assert(isfinite(dtheta));
- gm_dtheta[thetaGlobalIndex] = dtheta;
- }
-
- div = 1.0f / (order - 1.0f);
- *SPLINE_DATA_PTR(order - 1) = div * dr * SPLINE_DATA(order - 2);
-#pragma unroll
- for (int k = 1; k < (order - 1); k++)
- {
- *SPLINE_DATA_PTR(order - k - 1) = div * ((dr + k) * SPLINE_DATA(order - k - 2) + (order - k - dr) * SPLINE_DATA(order - k - 1));
- }
- *SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
-
- /* Storing the spline values (theta) */
-#if !PME_GPU_PARALLEL_SPLINE
- // See comment for the loop above
-#pragma unroll
- for (o = 0; o < order; o++)
-#endif
- {
- const int thetaIndex = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
- const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
-
- sm_theta[thetaIndex] = SPLINE_DATA(o);
- assert(isfinite(sm_theta[thetaIndex]));
- gm_theta[thetaGlobalIndex] = SPLINE_DATA(o);
- }
- }
- }
-}
-
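+/* For reference: calculate_splines constructs the cardinal B-spline weights with
+ * the standard recursion shown below (an illustrative single-thread sketch, not
+ * part of this kernel; assumes a compile-time order >= 3 and dr in [0, 1)):
+ *
+ *   float data[order];
+ *   data[order - 1] = 0.0f;
+ *   data[1]         = dr;
+ *   data[0]         = 1.0f - dr;
+ *   for (int k = 3; k <= order; k++)
+ *   {
+ *       const float div = 1.0f / (k - 1.0f);
+ *       data[k - 1]     = div * dr * data[k - 2];
+ *       for (int l = 1; l < k - 1; l++)
+ *       {
+ *           data[k - l - 1] = div * ((dr + l) * data[k - l - 2]
+ *                                    + (k - l - dr) * data[k - l - 1]);
+ *       }
+ *       data[0] = div * (1.0f - dr) * data[0];
+ *   }
+ *
+ * The kernel interleaves the last recursion step (k == order) with the
+ * differentiation that produces the dtheta values.
+ */
+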
/*! \brief
* Charge spreading onto the grid.
* This corresponds to the CPU function spread_coefficients_bsplines_thread().
- * Second stage of the whole kernel.
+ * Optional second stage of the spline_and_spread_kernel.
*
* \tparam[in] order PME interpolation order.
* \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
* \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] useOrderThreads A boolean which tells if we should use order threads per atom (order*order threads are used if false).
* \param[in] kernelParams Input PME CUDA data in constant memory.
* \param[in] atomIndexOffset Starting atom index for the execution block w.r.t. global memory.
- * \param[in] sm_coefficients Atom coefficents/charges in the shared memory.
+ * \param[in] atomCharge Atom charge/coefficient of the atom processed by this thread.
* \param[in] sm_gridlineIndices Atom gridline indices in the shared memory.
* \param[in] sm_theta Atom spline values in the shared memory.
*/
template <
- const int order, const bool wrapX, const bool wrapY>
+ const int order, const bool wrapX, const bool wrapY, const bool useOrderThreads >
__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
int atomIndexOffset,
- const float * __restrict__ sm_coefficients,
+ const float * atomCharge,
const int * __restrict__ sm_gridlineIndices,
const float * __restrict__ sm_theta)
{
float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
- const int atomsPerWarp = c_pmeSpreadGatherAtomsPerWarp;
+ const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom : c_pmeSpreadGatherAtomsPerWarp;
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
const int nz = kernelParams.grid.realGridSize[ZZ];
const int pny = kernelParams.grid.realGridSizePadded[YY];
const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
const int offx = 0, offy = 0, offz = 0;
const int atomIndexLocal = threadIdx.z;
const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
- const int chargeCheck = pme_gpu_check_atom_charge(sm_coefficients[atomIndexLocal]);
+ const int chargeCheck = pme_gpu_check_atom_charge(*atomCharge);
if (chargeCheck & globalCheck)
{
- // Spline Y/Z coordinates
- const int ithy = threadIdx.y;
+ // Spline Z coordinates
const int ithz = threadIdx.x;
+
const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX] - offx;
- int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy + ithy;
- if (wrapY & (iy >= ny))
- {
- iy -= ny;
- }
- int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
+ const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY] - offy;
+ int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] - offz + ithz;
if (iz >= nz)
{
iz -= nz;
}
-
/* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
/* Warp index w.r.t. block - could probably be obtained more easily? */
const int warpIndex = atomIndexLocal / atomsPerWarp;
const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
const float thetaZ = sm_theta[splineIndexZ];
- const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
- const float thetaY = sm_theta[splineIndexY];
- const float constVal = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
- assert(isfinite(constVal));
- const int constOffset = iy * pnz + iz;
-#pragma unroll
- for (int ithx = 0; (ithx < order); ithx++)
+ /* With order threads per atom, each thread loops over all order values of ithy;
+ * with order*order threads, the loop collapses to the single value ithy = threadIdx.y */
+ const int ithyMin = useOrderThreads ? 0 : threadIdx.y;
+ const int ithyMax = useOrderThreads ? order : threadIdx.y + 1;
+ for (int ithy = ithyMin; ithy < ithyMax; ithy++)
{
- int ix = ixBase + ithx;
- if (wrapX & (ix >= nx))
+ int iy = iyBase + ithy;
+ if (wrapY & (iy >= ny))
+ {
+ iy -= ny;
+ }
+
+ const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+ const float thetaY = sm_theta[splineIndexY];
+ const float Val = thetaZ * thetaY * (*atomCharge);
+ assert(isfinite(Val));
+ const int offset = iy * pnz + iz;
+
+#pragma unroll
+ for (int ithx = 0; (ithx < order); ithx++)
{
- ix -= nx;
+ int ix = ixBase + ithx;
+ if (wrapX & (ix >= nx))
+ {
+ ix -= nx;
+ }
+ const int gridIndexGlobal = ix * pny * pnz + offset;
+ const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+ const float thetaX = sm_theta[splineIndexX];
+ assert(isfinite(thetaX));
+ assert(isfinite(gm_grid[gridIndexGlobal]));
+ atomicAdd(gm_grid + gridIndexGlobal, thetaX * Val);
}
- const int gridIndexGlobal = ix * pny * pnz + constOffset;
- const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
- const float thetaX = sm_theta[splineIndexX];
- assert(isfinite(thetaX));
- assert(isfinite(gm_grid[gridIndexGlobal]));
- atomicAdd(gm_grid + gridIndexGlobal, thetaX * constVal);
}
}
}
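+
+/* Illustrative block-geometry sketch (hypothetical host-side code, inferred from
+ * the thread indexing in this file rather than copied from the actual launch code):
+ * threadIdx.x walks the order spline values along Z, threadIdx.y those along Y
+ * (only when order*order threads per atom are used), and threadIdx.z enumerates
+ * the atoms of a block:
+ *
+ *   dim3 blockSize(order, useOrderThreads ? 1 : order, atomsPerBlock);
+ */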
/*! \brief
* A spline computation and charge spreading kernel function.
*
+ * Two tuning parameters can be used for additional performance. For small systems and for debugging,
+ * writeGlobal should be used, removing the need to recalculate the theta values in the gather kernel.
+ * Similarly, useOrderThreads is beneficial for large systems: order threads per atom give higher
+ * performance than order*order threads.
+ *
* \tparam[in] order PME interpolation order.
* \tparam[in] computeSplines A boolean which tells if the spline parameter and
* gridline indices' computation should be performed.
* \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed.
* \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
* \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] writeGlobal A boolean which tells if the theta values and gridline indices should be written to global memory.
+ * \tparam[in] useOrderThreads A boolean which tells if we should use order threads per atom (order*order used if false).
* \param[in] kernelParams Input PME CUDA data in constant memory.
*/
template <
const int order,
const bool computeSplines,
const bool spreadCharges,
const bool wrapX,
- const bool wrapY
+ const bool wrapY,
+ const bool writeGlobal,
+ const bool useOrderThreads
>
__launch_bounds__(c_spreadMaxThreadsPerBlock)
CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE
__global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
{
- const int atomsPerBlock = c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
+ const int atomsPerBlock = useOrderThreads ? c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom :
+ c_spreadMaxThreadsPerBlock / c_pmeSpreadGatherThreadsPerAtom;
// Gridline indices, ivec
- __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
- // Charges
- __shared__ float sm_coefficients[atomsPerBlock];
+ __shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
// Spline values
- __shared__ float sm_theta[atomsPerBlock * DIM * order];
+ __shared__ float sm_theta[atomsPerBlock * DIM * order];
+ float dtheta; // Per-thread spline derivative, produced by calculate_splines
+
+ const int atomsPerWarp = useOrderThreads ? c_pmeSpreadGatherAtomsPerWarp4ThPerAtom :
+ c_pmeSpreadGatherAtomsPerWarp;
- const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
- const int atomIndexOffset = blockIndex * atomsPerBlock;
+ float3 atomX; // Coordinates of the atom processed by this thread
+ float atomCharge; // Charge/coefficient of the atom processed by this thread
+
+ const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
+ const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+ /* Thread index w.r.t. block */
+ const int threadLocalId = (threadIdx.z * (blockDim.x * blockDim.y))
+ + (threadIdx.y * blockDim.x) + threadIdx.x;
+ /* Warp index w.r.t. block - could probably be obtained more easily? */
+ const int warpIndex = threadLocalId / warp_size;
+
+ /* Atom index w.r.t. warp */
+ const int atomWarpIndex = threadIdx.z % atomsPerWarp;
+ /* Atom index w.r.t. block/shared memory */
+ const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+ /* Atom index w.r.t. global memory */
+ const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Early return for fully empty blocks at the end
* (should only happen for billions of input atoms)
*/
if (atomIndexOffset >= kernelParams.atoms.nAtoms)
{
return;
}
-
- /* Staging coefficients/charges for both spline and spread */
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+ /* Charges, required for both spline and spread */
+ if (c_useAtomDataPrefetch)
+ {
+ __shared__ float sm_coefficients[atomsPerBlock];
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, kernelParams.atoms.d_coefficients);
+ __syncthreads();
+ atomCharge = sm_coefficients[atomIndexLocal];
+ }
+ else
+ {
+ atomCharge = kernelParams.atoms.d_coefficients[atomIndexGlobal];
+ }
if (computeSplines)
{
- /* Staging coordinates */
- __shared__ float sm_coordinates[DIM * atomsPerBlock];
- pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
-
- __syncthreads();
- calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
- sm_coefficients, sm_theta, sm_gridlineIndices);
+ if (c_useAtomDataPrefetch)
+ {
+ // Coordinates
+ __shared__ float sm_coordinates[DIM * atomsPerBlock];
+
+ /* Staging coordinates */
+ pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates, kernelParams.atoms.d_coordinates);
+ __syncthreads();
+ atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
+ atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
+ atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+ }
+ else
+ {
+ atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
+ atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
+ atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
+ }
+ calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>
+ (kernelParams, atomIndexOffset, atomX, atomCharge,
+ sm_theta, &dtheta, sm_gridlineIndices);
__syncwarp();
}
else
{
    /* Staging the data for spread from global memory
     * (when splines are not computed here, theta values and gridline indices are
     * assumed to already be in global memory with proper layout, as after running
     * the spline kernel) */
    pme_gpu_stage_atom_data<float, atomsPerBlock, DIM * order>(kernelParams, sm_theta, kernelParams.atoms.d_theta);
    pme_gpu_stage_atom_data<int, atomsPerBlock, DIM>(kernelParams, sm_gridlineIndices, kernelParams.atoms.d_gridlineIndices);
    __syncthreads();
}

/* Spreading */
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY>(kernelParams, atomIndexOffset, sm_coefficients, sm_gridlineIndices, sm_theta);
+ spread_charges<order, wrapX, wrapY, useOrderThreads>(kernelParams, atomIndexOffset, &atomCharge,
+ sm_gridlineIndices, sm_theta);
}
}
//! Kernel instantiations
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true>(const PmeGpuCudaKernelParams);
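+// writeGlobal = true, useOrderThreads = true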
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, true>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, true>(const PmeGpuCudaKernelParams);
+
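+// writeGlobal = false, useOrderThreads = true (splines recalculated in gather)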
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, true>(const PmeGpuCudaKernelParams);
+
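+// writeGlobal = true, useOrderThreads = false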
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, false>(const PmeGpuCudaKernelParams);
+
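+// writeGlobal = false, useOrderThreads = false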
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, false>(const PmeGpuCudaKernelParams);
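+
+/* Hypothetical host-side selection sketch (illustrative only; the real launch code
+ * lives elsewhere in the PME GPU module). For the combined spline-and-spread case
+ * (computeSplines = spreadCharges = true), the two runtime tuning choices map onto
+ * the instantiations above as:
+ *
+ *   auto kernelPtr =
+ *       writeGlobal
+ *           ? (useOrderThreads ? pme_spline_and_spread_kernel<4, true, true, true, true, true, true>
+ *                              : pme_spline_and_spread_kernel<4, true, true, true, true, true, false>)
+ *           : (useOrderThreads ? pme_spline_and_spread_kernel<4, true, true, true, true, false, true>
+ *                              : pme_spline_and_spread_kernel<4, true, true, true, true, false, false>);
+ */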