/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Implements PME force gathering in CUDA.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
#include "gmxpre.h"

#include <cassert>

#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"

#include "pme.cuh"
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"
/*! \brief
 * An inline CUDA function: unroll the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
__device__ __forceinline__ float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
    }
    assert(false);
    return 0.0f;
}
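/* Note on read_grid_size: indexing an array with a runtime dimIndex can force the
 * compiler to spill the data to local memory; the switch above turns each access into a
 * constant-index load that can stay in registers or constant cache, which is the
 * "unrolling" the brief above refers to.
 */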
/*! \brief Reduce the partial force contributions.
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomDataSize       The number of partial force contributions for each atom (currently
 *                                order^2 is used).
 * \tparam[in] blockSize          The CUDA block size.
 *
 * \param[out] sm_forces          Shared memory array with the output forces (number of elements
 *                                is number of atoms per block).
 * \param[in]  atomIndexLocal     Local atom index.
 * \param[in]  splineIndex        Spline index.
 * \param[in]  lineIndex          Line index (same as threadLocalId).
 * \param[in]  realGridSizeFP     Local grid size constant.
 * \param[in]  fx                 Input force partial component X.
 * \param[in]  fy                 Input force partial component Y.
 * \param[in]  fz                 Input force partial component Z.
 */
template<int order, int atomDataSize, int blockSize>
__device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_forces,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndex,
                                                   const int    lineIndex,
                                                   const float* realGridSizeFP,
                                                   float& fx, // NOLINT(google-runtime-references)
                                                   float& fy, // NOLINT(google-runtime-references)
                                                   float& fz) // NOLINT(google-runtime-references)
{
    if (gmx::isPowerOfTwo(order)) // Only for orders of power of 2
    {
        const unsigned int activeMask = c_fullWarpMask;

        // A tricky shuffle reduction inspired by reduce_force_j_warp_shfl
        // TODO: find out if this is the best in terms of transactions count
        static_assert(order == 4, "Only order of 4 is implemented");
        static_assert(atomDataSize <= warp_size,
                      "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
        const int width = atomDataSize;

        fx += __shfl_down_sync(activeMask, fx, 1, width);
        fy += __shfl_up_sync(activeMask, fy, 1, width);
        fz += __shfl_down_sync(activeMask, fz, 1, width);

        if (splineIndex & 1)
        {
            fx = fy;
        }

        fx += __shfl_down_sync(activeMask, fx, 2, width);
        fz += __shfl_up_sync(activeMask, fz, 2, width);

        if (splineIndex & 2)
        {
            fx = fz;
        }

        // By now fx contains intermediate quad sums of all 3 components:
        // splineIndex    0            1            2 and 3      4            5            6 and 7      8...
        // sum of...      fx0 to fx3   fy0 to fy3   fz0 to fz3   fx4 to fx7   fy4 to fy7   fz4 to fz7   etc.

        // We have to just further reduce those groups of 4
        for (int delta = 4; delta < atomDataSize; delta <<= 1)
        {
            fx += __shfl_down_sync(activeMask, fx, delta, width);
        }

        const int dimIndex = splineIndex;
        if (dimIndex < DIM)
        {
            const float n = read_grid_size(realGridSizeFP, dimIndex);
            float* __restrict__ sm_forcesAtomIndexOffset =
                    reinterpret_cast<float*>(&sm_forces[atomIndexLocal]);
            sm_forcesAtomIndexOffset[dimIndex] = fx * n;
        }
    }
    else
    {
        // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to
        // fit into smemPerDim elements which are stored separately (first 2 dimensions only)
        const int         smemPerDim   = warp_size;
        const int         smemReserved = (DIM)*smemPerDim;
        __shared__ float  sm_forceReduction[smemReserved + blockSize];
        __shared__ float* sm_forceTemp[DIM];

        const int numWarps = blockSize / smemPerDim;
        const int minStride =
                max(1, atomDataSize / numWarps); // order 4: 128 threads => 4, 256 threads => 2, etc

#pragma unroll
        for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
        {
            int elementIndex = smemReserved + lineIndex;
            // Store input force contributions
            sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
            // sync here because two warps write data that the first one consumes below
            __syncthreads();
            // Reduce to fit into smemPerDim (warp size)
#pragma unroll
            for (int redStride = atomDataSize / 2; redStride > minStride; redStride >>= 1)
            {
                if (splineIndex < redStride)
                {
                    sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
                }
            }
            __syncthreads();
            // Last iteration - packing everything to be nearby, storing convenience pointer
            sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
            int redStride          = minStride;
            if (splineIndex < redStride)
            {
                const int packedIndex = atomIndexLocal * redStride + splineIndex;
                sm_forceTemp[dimIndex][packedIndex] =
                        sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
            }
            __syncthreads();
        }

        assert((blockSize / warp_size) >= DIM);
        // assert (atomsPerBlock <= warp_size);

        const int warpIndex = lineIndex / warp_size;
        const int dimIndex  = warpIndex;

        // First 3 warps can now process 1 dimension each
        if (dimIndex < DIM)
        {
            int sourceIndex = lineIndex % warp_size;
#pragma unroll
            for (int redStride = minStride / 2; redStride > 1; redStride >>= 1)
            {
                if (!(splineIndex & redStride))
                {
                    sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
                }
                __syncwarp();
            }

            const float n         = read_grid_size(realGridSizeFP, dimIndex);
            const int   atomIndex = sourceIndex / minStride;

            if (sourceIndex == minStride * atomIndex)
            {
                float* __restrict__ sm_forcesAtomIndexOffset =
                        reinterpret_cast<float*>(&sm_forces[atomIndex]);
                sm_forcesAtomIndexOffset[dimIndex] =
                        (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
            }
        }
    }
}
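/* Worked example of the shuffle path above (order 4, atomDataSize 16): each atom owns 16
 * consecutive lanes, each holding partial fx, fy and fz. The first stage folds neighbours
 * at distance 1, and the (splineIndex & 1) swap parks the fy pair sums in fx on odd lanes;
 * the second stage does the same at distance 2 for fz. After the delta loop, lanes 0, 1
 * and 2 of each 16-lane group hold the atom's total x, y and z components, which is why a
 * single store per dimIndex < DIM suffices.
 */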
/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \tparam[in] order              The PME order (must be 4).
 * \tparam[in] atomsPerWarp       The number of atoms per GPU warp.
 * \tparam[in] wrapX              Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY              Tells if the grid is wrapped in the Y dimension.
 * \param[out] fx                 The force partial component in the X dimension.
 * \param[out] fy                 The force partial component in the Y dimension.
 * \param[out] fz                 The force partial component in the Z dimension.
 * \param[in]  ithyMin            The thread minimum index in the Y dimension.
 * \param[in]  ithyMax            The thread maximum index in the Y dimension.
 * \param[in]  ixBase             The grid line index base value in the X dimension.
 * \param[in]  iz                 The grid line index in the Z dimension.
 * \param[in]  nx                 The grid real size in the X dimension.
 * \param[in]  ny                 The grid real size in the Y dimension.
 * \param[in]  pny                The padded grid real size in the Y dimension.
 * \param[in]  pnz                The padded grid real size in the Z dimension.
 * \param[in]  atomIndexLocal     The atom index for this thread.
 * \param[in]  splineIndexBase    The base value of the spline parameter index.
 * \param[in]  tdz                The theta and dtheta in the Z dimension.
 * \param[in]  sm_gridlineIndices Shared memory array of grid line indices.
 * \param[in]  sm_theta           Shared memory array of atom theta values.
 * \param[in]  sm_dtheta          Shared memory array of atom dtheta values.
 * \param[in]  gm_grid            Global memory array of the grid to use.
 */
template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
                                                   float* __restrict__ fy,
                                                   float* __restrict__ fz,
                                                   const int    ithyMin,
                                                   const int    ithyMax,
                                                   const int    ixBase,
                                                   const int    iz,
                                                   const int    nx,
                                                   const int    ny,
                                                   const int    pny,
                                                   const int    pnz,
                                                   const int    atomIndexLocal,
                                                   const int    splineIndexBase,
                                                   const float2 tdz,
                                                   const int* __restrict__ sm_gridlineIndices,
                                                   const float* __restrict__ sm_theta,
                                                   const float* __restrict__ sm_dtheta,
                                                   const float* __restrict__ gm_grid)
{
#pragma unroll
    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
    {
        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
        const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);

        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
        if (wrapY & (iy >= ny))
        {
            iy -= ny;
        }
        const int constOffset = iy * pnz + iz;

#pragma unroll
        for (int ithx = 0; (ithx < order); ithx++)
        {
            int ix = ixBase + ithx;
            if (wrapX & (ix >= nx))
            {
                ix -= nx;
            }
            const int gridIndexGlobal = ix * pny * pnz + constOffset;
            assert(gridIndexGlobal >= 0);
            const float gridValue = gm_grid[gridIndexGlobal];
            assert(isfinite(gridValue));
            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
            const float2 tdx  = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
            const float  fxy1 = tdz.x * gridValue;
            const float  fz1  = tdz.y * gridValue;
            *fx += tdx.y * tdy.x * fxy1;
            *fy += tdx.x * tdy.y * fxy1;
            *fz += tdx.x * tdy.x * fz1;
        }
    }
}
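/* In the loop above, .x of each float2 holds the spline value (theta) and .y its
 * derivative (dtheta). Each Cartesian force component therefore multiplies the grid value
 * by the spline derivative along its own dimension and the plain spline values along the
 * other two - the chain rule applied to the separable B-spline interpolation.
 */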
/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces       Shared memory array with the output forces.
 * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
 * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
 * \param[in] recipBox            The reciprocal box.
 * \param[in] scale               The scale to use when calculating the forces. For gm_coefficientsB
 *                                (when using multiple coefficients on a single grid) the scale will
 *                                be (1.0 - scale).
 * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
 *                                or FEP in state A if a single grid is used
 *                                (\p multiCoefficientsSingleGrid == true). If two separate grids
 *                                are used this should be the coefficients of the grid in question.
 *                                For FEP in state B with two separate grids, pass the B-state
 *                                coefficients; a gm_coefficientsB array is not passed separately.
 */
__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                            const int   forceIndexLocal,
                                                            const int   forceIndexGlobal,
                                                            const float recipBox[DIM][DIM],
                                                            const float scale,
                                                            const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = sm_forces[forceIndexLocal];
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    sm_forces[forceIndexLocal] = result;
}
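/* The reciprocal-box multiply above is deliberately triangular: GROMACS stores the
 * simulation box as a triangular matrix, so only the six recipBox elements used here can
 * be non-zero and the remaining products are skipped.
 */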
/*! \brief
 * A CUDA kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 *
 * \tparam[in] order          The PME order (must be 4 currently).
 * \tparam[in] wrapX          Tells if the grid is wrapped in the X dimension.
 * \tparam[in] wrapY          Tells if the grid is wrapped in the Y dimension.
 * \tparam[in] numGrids       The number of grids to use in the kernel. Can be 1 or 2.
 * \tparam[in] readGlobal     Tells if we should read spline values from global memory.
 * \tparam[in] threadsPerAtom How many threads work on each atom.
 *
 * \param[in] kernelParams    All the PME GPU data.
 */
template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
        void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
    assert(numGrids == 1 || numGrids == 2);

    /* Global memory pointers */
    const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0];
    const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
    const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
    const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
    static_assert(sizeof(*kernelParams.atoms.d_forces) == 3 * sizeof(float));
    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);

    /* Global memory pointers for readGlobal */
    const float* __restrict__ gm_theta           = kernelParams.atoms.d_theta;
    const float* __restrict__ gm_dtheta          = kernelParams.atoms.d_dtheta;
    const int* __restrict__   gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;

    float3 atomX;
    float  atomCharge;
    const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

    /* Number of data components and threads for a single atom */
    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
    const int atomDataSize        = threadsPerAtomValue;
    const int atomsPerBlock       = c_gatherMaxThreadsPerBlock / atomDataSize;
    // Number of atoms processed by a single warp in spread and gather
    const int atomsPerWarp = warp_size / atomDataSize;
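    // For example, with order 4 and ThreadsPerAtom::OrderSquared, atomDataSize is 16, so a
    // 32-thread warp covers 2 atoms; with ThreadsPerAtom::Order it is 4 and a warp covers 8.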
    const int blockSize = atomsPerBlock * atomDataSize;
    assert(blockSize == blockDim.x * blockDim.y * blockDim.z);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = threadIdx.z;
    const int atomIndexOffset = blockIndex * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
    /* Early return for fully empty blocks at the end
     * (should only happen for billions of input atoms)
     */
    if (atomIndexOffset >= kernelParams.atoms.nAtoms)
    {
        return;
    }
    // 4 warps per block, 8 atoms per warp *3 *4
    const int splineParamsSize    = atomsPerBlock * DIM * order;
    const int gridlineIndicesSize = atomsPerBlock * DIM;
    __shared__ int   sm_gridlineIndices[gridlineIndicesSize];
    __shared__ float sm_theta[splineParamsSize];
    __shared__ float sm_dtheta[splineParamsSize];
    /* Spline Z coordinates */
    const int ithz = threadIdx.x;

    /* These are the spline contribution indices in shared memory */
    const int splineIndex = threadIdx.y * blockDim.x + threadIdx.x;
    const int lineIndex   = (threadIdx.z * (blockDim.x * blockDim.y))
                          + splineIndex; /* And to all the block's particles */

    const int threadLocalId =
            (threadIdx.z * (blockDim.x * blockDim.y)) + blockDim.x * threadIdx.y + threadIdx.x;
    const int threadLocalIdMax = blockDim.x * blockDim.y * blockDim.z;
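    // Thread layout recap: threadIdx.x indexes the spline contribution in Z, threadIdx.y
    // the contribution in Y (a single slot when ThreadsPerAtom::Order), and threadIdx.z
    // selects the atom within the block; lineIndex/threadLocalId flatten this to a
    // block-wide id.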
    if (readGlobal)
    {
        /* Read splines */
        const int localGridlineIndicesIndex = threadLocalId;
        const int globalGridlineIndicesIndex = blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
        if (localGridlineIndicesIndex < gridlineIndicesSize)
        {
            sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
            assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
        }
        /* The loop needed for order threads per atom to make sure we load all data values, as each thread must load multiple values
           with order*order threads per atom, it is only required for each thread to load one data value */

        const int iMin = 0;
        const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;

        for (int i = iMin; i < iMax; i++)
        {
            int localSplineParamsIndex =
                    threadLocalId
                    + i * threadLocalIdMax; /* i will always be zero for order*order threads per atom */
            int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
            if (localSplineParamsIndex < splineParamsSize)
            {
                sm_theta[localSplineParamsIndex]  = gm_theta[globalSplineParamsIndex];
                sm_dtheta[localSplineParamsIndex] = gm_dtheta[globalSplineParamsIndex];
                assert(isfinite(sm_theta[localSplineParamsIndex]));
                assert(isfinite(sm_dtheta[localSplineParamsIndex]));
            }
        }
        __syncthreads();
    }
    else
    {
        const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
        /* Recalculate splines */
        if (c_useAtomDataPrefetch)
        {
            // Charges
            __shared__ float sm_coefficients[atomsPerBlock];
            // Coordinates
            __shared__ float3 sm_coordinates[atomsPerBlock];
            /* Staging coefficients/charges */
            pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficientsA);

            /* Staging coordinates */
            pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
            __syncthreads();
            atomX      = sm_coordinates[atomIndexLocal];
            atomCharge = sm_coefficients[atomIndexLocal];
        }
        else
        {
            atomX      = gm_coordinates[atomIndexGlobal];
            atomCharge = gm_coefficientsA[atomIndexGlobal];
        }
        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids>(
                kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
        __syncwarp();
    }
    float fx = 0.0f;
    float fy = 0.0f;
    float fz = 0.0f;

    const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
    const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
    const float2 tdz       = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);

    int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    if (iz >= nz)
    {
        iz -= nz;
    }

    const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
    const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
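    // With ThreadsPerAtom::Order, each thread owns one Z contribution and loops over all
    // order Y contributions; with ThreadsPerAtom::OrderSquared, the (Y, Z) pair is fully
    // determined by the thread index, so the Y loop collapses to a single iteration.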
    if (chargeCheck)
    {
        sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                              &fy,
                                                              &fz,
                                                              ithyMin,
                                                              ithyMax,
                                                              ixBase,
                                                              iz,
                                                              nx,
                                                              ny,
                                                              pny,
                                                              pnz,
                                                              atomIndexLocal,
                                                              splineIndexBase,
                                                              tdz,
                                                              sm_gridlineIndices,
                                                              sm_theta,
                                                              sm_dtheta,
                                                              gm_gridA);
    }
    // Reduction of partial force contributions
    __shared__ float3 sm_forces[atomsPerBlock];
    reduce_atom_forces<order, atomDataSize, blockSize>(
            sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
    __syncthreads();
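    // The __syncthreads() above is required because sm_forces is filled by the reducing
    // threads but read below by a different set of threads (one per atom), and the same
    // shared buffer is reused for the second grid when numGrids == 2.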
    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(
                sm_forces, forceIndexLocal, forceIndexGlobal, kernelParams.current.recipBox, scale, gm_coefficientsA);
    }
    __syncwarp();
    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            int   outputIndexLocal     = i * iterThreads + threadLocalId;
            int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
            float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
            gm_forces[outputIndexGlobal] = outputForceComponent;
        }
    }
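    // Note the plain assignment above: the first (or only) grid overwrites gm_forces,
    // while the numGrids == 2 pass below accumulates into it with +=.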
    if (numGrids == 2)
    {
        /* We must sync here since the same shared memory is used as above. */
        __syncthreads();
        fx = 0.0f;
        fy = 0.0f;
        fz = 0.0f;
        const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
                                                                  &fy,
                                                                  &fz,
                                                                  ithyMin,
                                                                  ithyMax,
                                                                  ixBase,
                                                                  iz,
                                                                  nx,
                                                                  ny,
                                                                  pny,
                                                                  pnz,
                                                                  atomIndexLocal,
                                                                  splineIndexBase,
                                                                  tdz,
                                                                  sm_gridlineIndices,
                                                                  sm_theta,
                                                                  sm_dtheta,
                                                                  gm_gridB);
        }
        // Reduction of partial force contributions
        reduce_atom_forces<order, atomDataSize, blockSize>(
                sm_forces, atomIndexLocal, splineIndex, lineIndex, kernelParams.grid.realGridSizeFP, fx, fy, fz);
        __syncthreads();
        /* Calculating the final forces with no component branching, atomsPerBlock threads */
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces,
                                        forceIndexLocal,
                                        forceIndexGlobal,
                                        kernelParams.current.recipBox,
                                        1.0F - scale,
                                        gm_coefficientsB);
        }
        __syncwarp();
        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                int   outputIndexLocal     = i * iterThreads + threadLocalId;
                int   outputIndexGlobal    = blockIndex * blockForcesSize + outputIndexLocal;
                float outputForceComponent = (reinterpret_cast<float*>(sm_forces)[outputIndexLocal]);
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}
//! Kernel instantiations
// clang-format off
template __global__ void pme_gather_kernel<4, true, true, 1, true,  ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, true,  ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true,  ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, true,  ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order>        (const PmeGpuCudaKernelParams);
template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
// clang-format on
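// The eight instantiations above span the template matrix used with order 4: one or two
// grids, splines read from global memory or recalculated, and order or order^2 threads
// per atom; X/Y wrapping is always enabled here.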